import streamlit as st
import fitz # PyMuPDF
import pandas as pd
import nltk
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
# ✅ FIXED: use a local writable directory for NLTK downloads to avoid permission errors
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data') # Changed from home dir to CWD
os.makedirs(nltk_data_dir, exist_ok=True) # Create if doesn't exist
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.data.path.append(nltk_data_dir)
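
# Optional sanity check (illustrative, not part of the original app):
# nltk.data.find raises LookupError if a resource is not on the search path,
# so uncommenting these lines verifies the downloads above actually resolve.
# nltk.data.find('tokenizers/punkt')
# nltk.data.find('corpora/stopwords')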
# Set up Streamlit page
st.set_page_config(page_title="BERT Resume Matcher", layout="wide")
st.title("π€ AI Resume Matcher using BERT")
st.markdown("Upload resumes and a job description β see similarity scores using **semantic NLP** and keyword matching.")
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Extract plain text from every page of an uploaded PDF."""
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()  # release the document handle
    return text
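
# Illustrative usage (hypothetical variable names): Streamlit's file_uploader
# yields in-memory buffers, so .read() hands PyMuPDF the raw PDF bytes:
#   text = extract_text_from_pdf(uploaded_files[0])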
# Function to extract cleaned keywords from text
def extract_keywords(text):
    """Tokenize text and return a set of lowercased candidate skill keywords."""
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    # Custom stopwords (non-skill filler words)
    custom_stopwords = {
        'basic', 'knowledge', 'either', 'ctc', 'good', 'lpa', 'per', 'month',
        'year', 'strong', 'skills', 'required', 'looking', 'fresher',
        'candidate', 'experience', 'preferred', 'concepts'
    }
    # Keep alphabetic tokens only; isalpha() already excludes punctuation and digits
    words = [word for word in tokens if word.isalpha()]
    keywords = [
        word for word in words
        if word not in stop_words and word not in custom_stopwords and len(word) > 2
    ]
    return set(keywords)
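
# Illustrative example (hypothetical input, not from the original code):
#   extract_keywords("Strong knowledge of Python and SQL required")
# would yield {'python', 'sql'}: 'strong', 'knowledge', and 'required' fall in
# the custom stopword list, and common words like 'of' and 'and' are removed.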
# Upload UI
uploaded_files = st.file_uploader("📤 Upload Resumes (PDF)", type="pdf", accept_multiple_files=True)
job_desc = st.text_area("📋 Paste Job Description Here", height=200)
if st.button("🔍 Match Resumes"):
    if uploaded_files and job_desc.strip():
        resume_texts = []
        resume_names = []
        for file in uploaded_files:
            try:
                text = extract_text_from_pdf(file)
                resume_texts.append(text)
                resume_names.append(file.name)
            except Exception as e:
                st.error(f"❌ Error processing {file.name}: {str(e)}")
        # Stop early if every PDF failed to parse; an empty results DataFrame
        # would otherwise crash the sort_values call below
        if not resume_texts:
            st.stop()
        # Load Sentence-BERT model
        with st.spinner("🔄 Computing similarity..."):
            model = SentenceTransformer('all-MiniLM-L6-v2')

            # Encode job description and resumes in one batch
            all_docs = [job_desc] + resume_texts
            embeddings = model.encode(all_docs, convert_to_tensor=True)
            job_embedding = embeddings[0]
            resume_embeddings = embeddings[1:]
            semantic_scores = util.cos_sim(job_embedding, resume_embeddings).flatten().tolist()
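
            # cos_sim computes cosine similarity, cos(a, b) = a·b / (|a||b|),
            # between the job embedding and each resume embedding. Values lie
            # in [-1, 1] but are effectively 0-1 for natural text, so the *100
            # below yields a rough percentage-style score.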
            # Extract job keywords
            job_keywords = extract_keywords(job_desc)
            results = []
            for i, resume_text in enumerate(resume_texts):
                resume_keywords = extract_keywords(resume_text)
                matched = job_keywords & resume_keywords
                missing = job_keywords - resume_keywords
                # Share of job-description keywords found in the resume
                match_ratio = len(matched) / len(job_keywords) if job_keywords else 0
                results.append({
                    "Resume": resume_names[i],
                    "Semantic Score (0–100)": round(semantic_scores[i] * 100, 2),
                    "Skill Match (%)": round(match_ratio * 100, 2),
                    "Matched Keywords": ", ".join(sorted(matched)),
                    "Missing Keywords": ", ".join(sorted(missing))
                })
            results_df = pd.DataFrame(results).sort_values(
                by="Semantic Score (0–100)", ascending=False
            ).reset_index(drop=True)
st.success("β
Matching complete!")
st.dataframe(results_df)
# Download CSV
csv = results_df.to_csv(index=False).encode('utf-8')
st.download_button("π₯ Download Results as CSV", csv, "resume_match_results.csv", "text/csv")
else:
st.warning("β οΈ Please upload resumes and enter a job description before matching.")
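
# To run locally (illustrative; assumes this file is saved as app.py and that
# streamlit, pymupdf, pandas, nltk, and sentence-transformers are installed):
#   streamlit run app.py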