# ATllll's picture
# Update src/streamlit_app.py
# 70d6a4d verified
import streamlit as st
import fitz # PyMuPDF
import pandas as pd
import nltk
import re
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
# Keep NLTK data in a writable directory under the current working directory;
# the default per-user/system locations can raise a permission error.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)  # no-op when it already exists

# Register the directory before downloading, and only once: Streamlit re-runs
# this script on every interaction while the imported nltk module (and its
# search-path list) stays alive, so an unconditional append keeps growing it.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# 'punkt' serves older NLTK releases; 'punkt_tab' is the tokenizer data that
# word_tokenize loads on NLTK 3.9 and later. 'stopwords' backs the keyword filter.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, download_dir=nltk_data_dir)
# Page chrome: browser-tab title, wide layout, heading and a one-line intro.
st.set_page_config(page_title="BERT Resume Matcher", layout="wide")
st.title("🤖 AI Resume Matcher using BERT")
st.markdown("Upload resumes and a job description — see similarity scores using **semantic NLP** and keyword matching.")
def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Binary file-like object whose ``read()`` returns the PDF
            bytes (the app passes Streamlit upload objects).

    Returns:
        str: The page texts joined without a separator; an empty string when
        no page yields any text.

    Raises:
        Any exception PyMuPDF raises for input it cannot open or parse is
        propagated to the caller.
    """
    # The context manager closes the document even if a page fails to extract.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def extract_keywords(text):
    """Return the set of distinct candidate skill keywords found in *text*.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``. A
    token is kept only when it is purely alphabetic, longer than two
    characters, and is neither an English NLTK stopword nor one of the
    job-posting filler words listed below. Tokens containing digits or
    punctuation are therefore discarded.

    Args:
        text: Free-form text such as a job description or a resume.

    Returns:
        set[str]: Lower-cased keywords; empty when nothing qualifies.
    """
    # Job-posting filler that says nothing about actual skills.
    custom_stopwords = {
        'basic', 'knowledge', 'either', 'ctc', 'good', 'lpa', 'per', 'month',
        'year', 'strong', 'skills', 'required', 'looking', 'fresher',
        'candidate', 'experience', 'preferred', 'concepts',
    }
    # One merged set means a single membership test per token.
    ignored = set(stopwords.words('english')) | custom_stopwords

    # isalpha() already guarantees there is nothing non-alphabetic to strip,
    # so no further per-token cleaning is needed.
    return {
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and len(token) > 2 and token not in ignored
    }
# Column the results table is ranked by; defined once so the row dicts and the
# sort call cannot drift apart.
SEMANTIC_SCORE_COLUMN = "Semantic Score (0–100)"


@st.cache_resource(show_spinner=False)
def _load_model():
    """Load the Sentence-BERT model once and reuse it across script reruns."""
    return SentenceTransformer('all-MiniLM-L6-v2')


# Upload UI
uploaded_files = st.file_uploader("📤 Upload Resumes (PDF)", type="pdf", accept_multiple_files=True)
job_desc = st.text_area("📝 Paste Job Description Here", height=200)

if st.button("🚀 Match Resumes"):
    if uploaded_files and job_desc.strip():
        # Extract text per upload; a file that fails is reported and skipped so
        # the remaining resumes are still scored.
        resume_texts = []
        resume_names = []
        for uploaded in uploaded_files:
            try:
                text = extract_text_from_pdf(uploaded)
            except Exception as e:
                st.error(f"❌ Error processing {uploaded.name}: {str(e)}")
            else:
                resume_texts.append(text)
                resume_names.append(uploaded.name)

        if not resume_texts:
            # Every upload failed: with no rows the results table would have no
            # score column and sorting by it would raise KeyError.
            st.warning("⚠️ None of the uploaded resumes could be read, so there is nothing to match.")
        else:
            with st.spinner("🔍 Computing similarity..."):
                model = _load_model()

                # Encode the job description (index 0) together with the resumes.
                # NOTE(review): the encoder presumably truncates inputs beyond its
                # maximum sequence length, so long resumes may be scored on their
                # opening portion only -- confirm against the model card.
                embeddings = model.encode([job_desc] + resume_texts, convert_to_tensor=True)
                semantic_scores = util.cos_sim(embeddings[0], embeddings[1:]).flatten().tolist()

                # Keyword overlap between the job description and each resume.
                job_keywords = extract_keywords(job_desc)
                results = []
                for name, text, score in zip(resume_names, resume_texts, semantic_scores):
                    resume_keywords = extract_keywords(text)
                    matched = job_keywords & resume_keywords
                    missing = job_keywords - resume_keywords
                    # A job description with no usable keywords counts as 0% match.
                    match_ratio = len(matched) / len(job_keywords) if job_keywords else 0
                    results.append({
                        "Resume": name,
                        SEMANTIC_SCORE_COLUMN: round(score * 100, 2),
                        "Skill Match (%)": round(match_ratio * 100, 2),
                        "Matched Keywords": ", ".join(sorted(matched)),
                        "Missing Keywords": ", ".join(sorted(missing)),
                    })

                # Best semantic match first.
                results_df = (
                    pd.DataFrame(results)
                    .sort_values(by=SEMANTIC_SCORE_COLUMN, ascending=False)
                    .reset_index(drop=True)
                )

            st.success("✅ Matching complete!")
            st.dataframe(results_df)

            # Download CSV
            csv = results_df.to_csv(index=False).encode('utf-8')
            st.download_button("📥 Download Results as CSV", csv, "resume_match_results.csv", "text/csv")
    else:
        st.warning("⚠️ Please upload resumes and enter a job description before matching.")