# resume_search / app.py
# Author: Anthony Thomas
# Commit: Update app.py (03791a4, verified)
# --- Imports ---------------------------------------------------------------
# Standard library
import os
import re

# Third-party
import docx  # python-docx: .docx parsing
import nltk
import PyPDF2
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Ensure required NLTK corpora are available locally (idempotent; quiet).
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)
# punkt_tab is required by the punkt tokenizer on NLTK >= 3.8.2; the extra
# download is a quiet no-op/failure on older NLTK versions, so it is safe.
for _corpus in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(_corpus, download_dir=nltk_data_dir, quiet=True)
# Load sentence transformer model
@st.cache_resource
def load_model():
    """Load the sentence-embedding model once per process.

    Streamlit re-executes this script on every user interaction; without
    st.cache_resource the model would be re-instantiated on each rerun.
    """
    return SentenceTransformer('all-mpnet-base-v2')

model = load_model()
# Functions for resume processing
def read_resume(file):
    """Extract plain text from an uploaded resume file.

    Args:
        file: A file-like object with a ``name`` attribute (e.g. a
            Streamlit UploadedFile). Supported extensions: .pdf, .docx, .txt.

    Returns:
        The extracted text, or "" when the extension is unsupported or an
        error occurred (errors are surfaced to the UI via st.error).
    """
    ext = os.path.splitext(file.name)[1].lower()
    content = ""
    try:
        if ext == ".pdf":
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # extract_text() returns None for image-only/empty pages;
                # guard so concatenation does not raise TypeError.
                content += (page.extract_text() or "") + "\n"
        elif ext == ".docx":
            doc = docx.Document(file)
            content = '\n'.join(para.text for para in doc.paragraphs)
        elif ext == ".txt":
            content = file.read().decode('utf-8')
    except Exception as e:
        # Best-effort: report the problem in the UI and fall through to "".
        st.error(f"Error reading {file.name}: {e}")
    return content
# Preprocessing text
def preprocess_text(text):
    """Normalize text: lowercase, keep letters only, drop stopwords, lemmatize.

    Returns a single space-joined string of lemmatized tokens.
    """
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    kept = (tok for tok in cleaned.split() if tok not in stop_words)
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)
def compute_similarity(job_description, resumes):
    """Rank resumes by semantic similarity to a job description.

    Args:
        job_description: Raw job-description text.
        resumes: Mapping of resume name -> raw resume text.

    Returns:
        List of (resume_name, similarity_score) tuples sorted by score
        descending. Empty list when no resumes were supplied.
    """
    if not resumes:
        # cosine_similarity raises on an empty matrix; preserve the
        # loop-based behavior of returning an empty ranking.
        return []
    jd_embedding = model.encode(preprocess_text(job_description))
    names = list(resumes.keys())
    # Batch-encode all resumes in a single call: identical embeddings,
    # much faster than encoding one resume at a time.
    resume_embeddings = model.encode(
        [preprocess_text(resumes[name]) for name in names]
    )
    scores = cosine_similarity([jd_embedding], resume_embeddings)[0]
    return sorted(zip(names, scores), key=lambda pair: pair[1], reverse=True)
# --- Streamlit UI ----------------------------------------------------------
st.title("Resume Search and Ranking Tool")

# Upload files and job description
uploaded_files = st.file_uploader("Upload resumes (PDF, DOCX, TXT)", accept_multiple_files=True)
job_description = st.text_area("Enter Job Description:")

if st.button("Rank Resumes"):
    if uploaded_files and job_description:
        # Read every upload; skip files that yielded no text.
        resumes = {}
        for uploaded_file in uploaded_files:
            content = read_resume(uploaded_file)
            if content:
                resumes[uploaded_file.name] = content
        if resumes:
            ranked_resumes = compute_similarity(job_description, resumes)
            st.success("Resumes ranked successfully!")
            for rank, (resume_name, score) in enumerate(ranked_resumes, start=1):
                st.write(f"{rank}. {resume_name} - Similarity: {score:.2f}")
        else:
            # Previously the app reported success even when every upload
            # failed to parse; make that case explicit.
            st.error("No readable text could be extracted from the uploaded files.")
    else:
        st.warning("Please upload resumes and enter a job description.")