# CV-Extractor / resume_recommender.py
# Aqdas's picture
# Update resume_recommender.py
# 7cf3d3b verified
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
# Single source of truth for the pretrained checkpoint, so the tokenizer and
# the encoder are always loaded from the same weights/vocabulary.
_BERT_CHECKPOINT = 'bert-base-uncased'

# Both objects are created once at import time and reused by
# get_bert_embedding() on every call.
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)
@lru_cache(maxsize=1)
def _load_nltk_resources():
    """Return ``(stop_words, lemmatizer)``, preparing the NLTK data only once.

    The corpora are looked up locally first; ``nltk.download`` is only used
    as a fallback when a corpus is missing. The result is cached for the
    lifetime of the process so repeated calls to ``preprocess_text`` do not
    hit the network or rebuild the stop-word set.
    """
    required = (
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text: str) -> str:
    """Lowercase *text*, drop English stop words and lemmatize what remains.

    The text is split on whitespace only (punctuation stays attached to the
    neighbouring word), each token is lowercased, tokens found in NLTK's
    English stop-word list are removed, and the survivors are passed through
    ``WordNetLemmatizer.lemmatize`` before being re-joined with single spaces.

    Args:
        text: Raw text to normalise.

    Returns:
        The normalised text; an empty string when nothing survives filtering.
    """
    stop_words, lemmatizer = _load_nltk_resources()
    # Lowercase each token exactly once, then filter and lemmatize lazily.
    lowered_words = (word.lower() for word in text.split())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in lowered_words
        if word not in stop_words
    )
def get_bert_embedding(text: str):
    """Embed *text* as the mean of its BERT token vectors.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    512 tokens), run through the module-level ``model``, and the last hidden
    states are averaged over the real tokens only.

    Padding positions are excluded from the average via the attention mask.
    Averaging over ``[PAD]`` positions would make the embedding depend on how
    short the text is rather than on what it says, so no padding is requested
    here; the mask-weighted mean stays correct even if padding is introduced
    again later.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array with one row (one embedding per input text) and one
        column per hidden dimension of the model.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)

    # Inference only: skip building the autograd graph to save memory/time.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Broadcast the (batch, tokens) mask over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    # Guard against division by zero should a sequence ever be fully masked.
    token_counts = mask.sum(dim=1).clamp(min=1)
    mean_embedding = (hidden_states * mask).sum(dim=1) / token_counts
    return mean_embedding.cpu().numpy()
def recommend_resume(resume_df: pd.DataFrame, job_desc: str) -> pd.DataFrame:
'''Take Resume and return the recommended resumes align with the job description'''
recommended_resumes = pd.DataFrame()
rows = resume_df.shape[0]
job_description_embedding = get_bert_embedding(job_desc)
all_similarity_scores = []
for row in range(rows):
print('This is row', row)
print("This is row skill: ",resume_df['skills'].iloc[row])
resume_skill_str = " ".join(resume_df['skills'].iloc[row])
resume_aboutSec_str = " ".join(resume_df['about_section'].iloc[row])
resume_text = preprocess_text(" ".join(resume_skill_str + " " + resume_aboutSec_str))
resume_embedding = get_bert_embedding(resume_text)
similarity_score = cosine_similarity(resume_embedding, job_description_embedding).flatten()[0]
experience_match = resume_df['past_company_experience'].iloc[row] >= 0
print('similarity_score: ',similarity_score, 'experience_match: ',experience_match )
all_similarity_scores.append(similarity_score)
matching_threshold = 0.0
if experience_match:
if similarity_score >= matching_threshold:
recommended_resumes = pd.concat([recommended_resumes, resume_df.iloc[[row]]])
print("Resume matches the job description.")
else:
print("similarity score is out of threshold")
continue
else:
print("Resume does not match the job description.")
print('one complete')
recommended_resumes['similarity_score'] = all_similarity_scores
return recommended_resumes