"""Streamlit app that ranks uploaded PDF resumes against a job description
using SentenceTransformer embeddings plus simple keyword overlap.

Run with: streamlit run <path to this file>
"""

import os

import fitz  # PyMuPDF
import nltk
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

# Keep NLTK data inside the working directory so the app also works in
# sandboxed deployments (e.g. Streamlit Cloud) without a writable home dir.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
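# Note: newer NLTK releases (3.8.2+) resolve word_tokenize through a
# 'punkt_tab' resource instead of 'punkt'; fetching it as well is a cheap
# safeguard if your installed NLTK version expects it.
nltk.download('punkt_tab', download_dir=nltk_data_dir)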

st.set_page_config(page_title="BERT Resume Matcher", layout="wide")
st.title("🤖 AI Resume Matcher using BERT")
st.markdown("Upload resumes and a job description to see similarity scores using **semantic NLP** and keyword matching.")

def extract_text_from_pdf(pdf_file):
    """Return the embedded text of an uploaded PDF, concatenated across pages."""
    # fitz (PyMuPDF) can open an in-memory byte stream directly.
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text
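
# Caveat: page.get_text() only returns text embedded in the PDF. Scanned,
# image-only resumes come back as empty strings; handling those would need
# an OCR pass (e.g. pytesseract), which is out of scope here.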

def extract_keywords(text):
    """Return a set of candidate skill keywords extracted from free text."""
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))

    # Generic job-posting filler words that would otherwise inflate the match.
    custom_stopwords = {
        'basic', 'knowledge', 'either', 'ctc', 'good', 'lpa', 'per', 'month',
        'year', 'strong', 'skills', 'required', 'looking', 'fresher',
        'candidate', 'experience', 'preferred', 'concepts'
    }

    # word.isalpha() guarantees letter-only tokens, so no further regex
    # scrubbing is needed.
    words = [word for word in tokens if word.isalpha()]
    keywords = [
        word for word in words
        if word not in stop_words and word not in custom_stopwords and len(word) > 2
    ]
    return set(keywords)
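
# Loading SentenceTransformer weights takes several seconds, and a Streamlit
# script reruns top-to-bottom on every interaction. Caching the model with
# st.cache_resource (available in Streamlit >= 1.18) loads it once per
# process. A minimal sketch using the same checkpoint the app already uses:
@st.cache_resource
def load_model():
    return SentenceTransformer('all-MiniLM-L6-v2')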

uploaded_files = st.file_uploader("📤 Upload Resumes (PDF)", type="pdf", accept_multiple_files=True)
job_desc = st.text_area("📋 Paste Job Description Here", height=200)

if st.button("🔍 Match Resumes"):
    if uploaded_files and job_desc.strip():
        resume_texts = []
        resume_names = []

        for file in uploaded_files:
            try:
                text = extract_text_from_pdf(file)
                resume_texts.append(text)
                resume_names.append(file.name)
            except Exception as e:
                st.error(f"❌ Error processing {file.name}: {e}")

        # Bail out if every upload failed; an empty result set would otherwise
        # crash the sort on the results DataFrame below.
        if not resume_texts:
            st.error("❌ None of the uploaded PDFs could be read.")
            st.stop()

        with st.spinner("🔄 Computing similarity..."):
            # Cached via st.cache_resource, so only the first run pays
            # the load cost.
            model = load_model()

            # Encode the job description and all resumes in one batch; index 0
            # is the job description, the rest are resumes.
            all_docs = [job_desc] + resume_texts
            embeddings = model.encode(all_docs, convert_to_tensor=True)

            job_embedding = embeddings[0]
            resume_embeddings = embeddings[1:]
            # util.cos_sim returns a 1 x N tensor of cosine similarities.
            semantic_scores = util.cos_sim(job_embedding, resume_embeddings).flatten().tolist()

            # Keyword overlap between the job description and each resume.
            job_keywords = extract_keywords(job_desc)
            results = []

            for i, resume_text in enumerate(resume_texts):
                resume_keywords = extract_keywords(resume_text)
                matched = job_keywords & resume_keywords
                missing = job_keywords - resume_keywords
                match_ratio = len(matched) / len(job_keywords) if job_keywords else 0

                results.append({
                    "Resume": resume_names[i],
                    "Semantic Score (0–100)": round(semantic_scores[i] * 100, 2),
                    "Skill Match (%)": round(match_ratio * 100, 2),
                    "Matched Keywords": ", ".join(sorted(matched)),
                    "Missing Keywords": ", ".join(sorted(missing))
                })

            results_df = (
                pd.DataFrame(results)
                .sort_values(by="Semantic Score (0–100)", ascending=False)
                .reset_index(drop=True)
            )

        st.success("✅ Matching complete!")
        st.dataframe(results_df)

        # Offer the ranked table as a CSV download.
        csv = results_df.to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download Results as CSV", csv, "resume_match_results.csv", "text/csv")
    else:
        st.warning("⚠️ Please upload resumes and enter a job description before matching.")
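
# Dependencies implied by the imports above (a minimal requirements.txt
# sketch; pin versions as needed):
#   streamlit
#   PyMuPDF               # imported as fitz
#   pandas
#   nltk
#   sentence-transformers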