Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import nltk
|
| 3 |
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
|
@@ -8,21 +9,17 @@ import re
|
|
| 8 |
import matplotlib.pyplot as plt
|
| 9 |
import seaborn as sns
|
| 10 |
import spacy
|
| 11 |
-
import re
|
| 12 |
-
import pandas as pd
|
| 13 |
-
import matplotlib.pyplot as plt
|
| 14 |
-
import seaborn as sns
|
| 15 |
|
|
|
|
| 16 |
nltk.download('punkt')
|
| 17 |
|
| 18 |
-
|
| 19 |
# Patterns used when scanning resume text.
# A bare number: 1-2 integer digits with an optional 1-2 digit fraction.
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
# Email address. The TLD class is letters only; the earlier "[A-Z|a-z]"
# also accepted a literal "|" character.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# A string made of exactly ten digits.
float_digit_regex = re.compile(r'^\d{10}$')
# Ten digits followed by any character, or any character followed by ten
# digits. The assignment previously lacked the re.compile( call and was a
# syntax error.
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
|
| 24 |
-
|
| 25 |
|
|
|
|
| 26 |
def extract_text_from_pdf(pdf_file):
|
| 27 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 28 |
text = ""
|
|
@@ -30,97 +27,74 @@ def extract_text_from_pdf(pdf_file):
|
|
| 30 |
text += pdf_reader.pages[page_num].extract_text()
|
| 31 |
return text
|
| 32 |
|
| 33 |
-
|
| 34 |
def tokenize_text(text, nlp_model):
    """Run ``nlp_model`` over ``text`` and return its entities as pairs.

    The pipeline is called with the "tagger" and "parser" components
    disabled. Every entity in ``doc.ents`` becomes a
    ``(lower-cased entity text, entity label)`` tuple, in document order.
    """
    parsed = nlp_model(text, disable=["tagger", "parser"])
    entity_pairs = []
    for entity in parsed.ents:
        entity_pairs.append((entity.text.lower(), entity.label_))
    return entity_pairs
|
| 38 |
|
| 39 |
-
|
| 40 |
# Label-then-number ("CGPA: 8.5", "GPA-3.7", "C.G.P.A 9") or number-then-label
# ("8.5 CGPA"). The separator is optional whitespace, an optional ":" or "-",
# then optional whitespace, so "CGPA:8.5" and "CGPA - 8.5" are both accepted.
_CGPA_PATTERN = re.compile(
    r'\b(?:CGPA|GPA|C\.G\.P\.A\.?|C\.G\.PA|Cumulative GPA)\s*[:\-]?\s*'
    r'([0-9]+(?:\.[0-9]+)?)\b'
    r'|\b([0-9]+(?:\.[0-9]+)?)\s*(?:CGPA|GPA)\b',
    re.IGNORECASE,
)


def extract_cgpa(resume_text):
    """Return the first CGPA/GPA value found in ``resume_text`` as a float.

    Args:
        resume_text: Plain text of a resume. ``None`` or an empty string is
            treated as "nothing to search".

    Returns:
        The matched number converted with ``float``, or ``None`` when no
        CGPA/GPA mention is found.
    """
    if not resume_text:
        return None

    match = _CGPA_PATTERN.search(resume_text)
    if match is None:
        return None

    # Exactly one of the two capture groups is populated, depending on
    # whether the label came before or after the number.
    return float(match.group(1) or match.group(2))
|
| 54 |
|
| 55 |
-
|
| 56 |
def extract_skills(text, skills_keywords):
    """Return the lower-cased keywords that occur in ``text`` as whole terms.

    Matching is case-insensitive. A keyword counts as present only when it
    is not directly preceded or followed by a word character, so "java"
    does not match inside "javascript".

    Args:
        text: Text to search.
        skills_keywords: Iterable of keyword strings.

    Returns:
        List of matching keywords, lower-cased, in the order given.
    """
    # Lower-case the text once instead of once per keyword.
    haystack = text.lower()
    found = []
    for skill in skills_keywords:
        needle = skill.lower()
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it can never match after a keyword ending in punctuation such
        # as "c++" or "c#". For purely alphanumeric keywords the two forms
        # are equivalent.
        if re.search(r'(?<!\w)' + re.escape(needle) + r'(?!\w)', haystack):
            found.append(needle)
    return found
|
| 60 |
|
| 61 |
-
|
| 62 |
def preprocess_text(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)
|
| 64 |
|
| 65 |
-
|
| 66 |
-
def load_data(results):
    """Build a pandas DataFrame from ``results`` and return it."""
    return pd.DataFrame(results)
|
| 69 |
-
|
| 70 |
def train_doc2vec_model(documents, *, vector_size=20, min_count=2, epochs=50):
    """Train and return a Doc2Vec model on ``documents``.

    Args:
        documents: Tagged documents to train on. The collection is passed
            to both ``build_vocab`` and ``train``, so it presumably needs to
            be re-iterable (e.g. a list rather than a one-shot generator).
        vector_size: Forwarded to ``Doc2Vec``; defaults to the previously
            hard-coded 20.
        min_count: Forwarded to ``Doc2Vec``; defaults to the previously
            hard-coded 2.
        epochs: Forwarded to ``Doc2Vec``; defaults to the previously
            hard-coded 50.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model
|
| 76 |
|
| 77 |
-
|
| 78 |
def calculate_similarity(model, text1, text2):
    """Return the cosine similarity between inferred vectors of two texts.

    Each text is tokenized with ``preprocess_text`` and passed to
    ``model.infer_vector`` (``text1`` first, then ``text2``). The result is
    the first element of ``model.dv.cosine_similarities`` for the pair.
    """
    first_vec, second_vec = (
        model.infer_vector(preprocess_text(raw)) for raw in (text1, text2)
    )
    similarities = model.dv.cosine_similarities(first_vec, [second_vec])
    return similarities[0]
|
| 82 |
|
| 83 |
-
|
| 84 |
def accuracy_calculation(true_positives, false_positives, false_negatives):
    """Return ``true_positives`` divided by the sum of all three counts.

    When the three counts sum to zero, 0 is returned instead of dividing.
    """
    denominator = true_positives + false_positives + false_negatives
    if denominator == 0:
        return 0
    return true_positives / denominator
|
| 88 |
|
| 89 |
-
|
| 90 |
# Streamlit Frontend
|
| 91 |
st.markdown("# Resume Matching Tool 📃📃")
|
| 92 |
st.markdown("An application to match resumes with a job description.")
|
| 93 |
|
| 94 |
# Sidebar - File Upload for Resumes
|
| 95 |
st.sidebar.markdown("## Upload Resumes PDF")
|
| 96 |
-
resumes_files = st.sidebar.file_uploader(
|
| 97 |
-
"Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
|
| 98 |
|
| 99 |
if resumes_files:
|
| 100 |
# Sidebar - File Upload for Job Descriptions
|
| 101 |
st.sidebar.markdown("## Upload Job Description PDF")
|
| 102 |
-
job_descriptions_file = st.sidebar.file_uploader(
|
| 103 |
-
"Upload Job Description PDF", type=["pdf"])
|
| 104 |
-
|
| 105 |
-
# Get skills keywords from user input
|
| 106 |
-
skills_keywords_input = st.sidebar.text_input(
|
| 107 |
-
"Enter skills keywords separated by commas (e.g., python, java, machine learning):")
|
| 108 |
-
skills_keywords = [skill.strip()
|
| 109 |
-
for skill in skills_keywords_input.split(',') if skill.strip()]
|
| 110 |
|
| 111 |
if job_descriptions_file:
|
|
|
|
| 112 |
nlp_model_path = "en_Resume_Matching_Keywords"
|
| 113 |
nlp = spacy.load(nlp_model_path)
|
| 114 |
-
|
| 115 |
# Backend Processing
|
| 116 |
job_description_text = extract_text_from_pdf(job_descriptions_file)
|
| 117 |
-
resumes_texts = [extract_text_from_pdf(
|
| 118 |
-
resume_file) for resume_file in resumes_files]
|
| 119 |
job_description_text = extract_text_from_pdf(job_descriptions_file)
|
| 120 |
job_description_tokens = tokenize_text(job_description_text, nlp)
|
| 121 |
|
| 122 |
-
# st.subheader("Matching Keywords")
|
| 123 |
-
|
| 124 |
# Initialize counters
|
| 125 |
overall_skill_matches = 0
|
| 126 |
overall_qualification_matches = 0
|
|
@@ -193,8 +167,7 @@ if resumes_files:
|
|
| 193 |
overall_qualification_matches += qualificationMatch
|
| 194 |
|
| 195 |
# Add count of matched skills for this resume to the list
|
| 196 |
-
skills_counts_all_resumes.append(
|
| 197 |
-
[resume_text.count(skill.lower()) for skill in job_skills])
|
| 198 |
|
| 199 |
# Create a dictionary for the current resume and append to the results list
|
| 200 |
result_dict = {
|
|
@@ -214,20 +187,22 @@ if resumes_files:
|
|
| 214 |
# Display overall matches
|
| 215 |
st.subheader("Overall Matches")
|
| 216 |
st.write(f"Total Skill Matches: {overall_skill_matches}")
|
| 217 |
-
st.write(
|
| 218 |
-
f"Total Qualification Matches: {overall_qualification_matches}")
|
| 219 |
st.write(f"Job Qualifications: {job_qualifications}")
|
| 220 |
st.write(f"Job Skills: {job_skills}")
|
| 221 |
|
| 222 |
# Display individual results in a table
|
| 223 |
-
results_df =
|
| 224 |
st.subheader("Individual Results")
|
| 225 |
st.dataframe(results_df)
|
| 226 |
-
tagged_resumes = [TaggedDocument(words=preprocess_text(
|
| 227 |
-
text), tags=[str(i)]) for i, text in enumerate(resumes_texts)]
|
| 228 |
model_resumes = train_doc2vec_model(tagged_resumes)
|
| 229 |
-
|
| 230 |
st.subheader("\nHeatmap:")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
if skills_keywords:
|
| 233 |
# Calculate the similarity score between each skill keyword and the resume text
|
|
@@ -235,20 +210,16 @@ if resumes_files:
|
|
| 235 |
for resume_text in resumes_texts:
|
| 236 |
resume_text_similarity_scores = []
|
| 237 |
for skill in skills_keywords:
|
| 238 |
-
similarity_score = calculate_similarity(
|
| 239 |
-
model_resumes, resume_text, skill)
|
| 240 |
resume_text_similarity_scores.append(similarity_score)
|
| 241 |
skills_similarity_scores.append(resume_text_similarity_scores)
|
| 242 |
|
| 243 |
# Create a DataFrame with the similarity scores and set the index to the names of the PDFs
|
| 244 |
-
skills_similarity_df = pd.DataFrame(
|
| 245 |
-
skills_similarity_scores, columns=skills_keywords, index=[resume_file.name for resume_file in resumes_files])
|
| 246 |
|
| 247 |
# Plot the heatmap
|
| 248 |
fig, ax = plt.subplots(figsize=(12, 8))
|
| 249 |
-
|
| 250 |
-
sns.heatmap(skills_similarity_df,
|
| 251 |
-
cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
|
| 252 |
ax.set_title('Heatmap for Skills Similarity')
|
| 253 |
ax.set_xlabel('Skills')
|
| 254 |
ax.set_ylabel('Resumes')
|
|
@@ -264,4 +235,4 @@ if resumes_files:
|
|
| 264 |
else:
|
| 265 |
st.warning("Please upload the Job Description PDF to proceed.")
|
| 266 |
else:
|
| 267 |
-
st.warning("Please upload Resumes PDF to proceed.")
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
import streamlit as st
|
| 3 |
import nltk
|
| 4 |
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
|
|
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
import seaborn as sns
|
| 11 |
import spacy
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
+
# Download necessary NLTK data
|
| 14 |
nltk.download('punkt')
|
| 15 |
|
| 16 |
+
# Define regular expressions for pattern matching
|
| 17 |
# A bare number: 1-2 integer digits with an optional 1-2 digit fraction.
float_regex = re.compile(r'^\d{1,2}(\.\d{1,2})?$')
# Email address. The TLD class is letters only; the earlier "[A-Z|a-z]"
# also accepted a literal "|" character.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# A string made of exactly ten digits.
float_digit_regex = re.compile(r'^\d{10}$')
# Ten digits followed by any character, or any character followed by ten digits.
email_with_phone_regex = re.compile(r'(\d{10}).|.(\d{10})')
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
# Function to extract text from a PDF file
|
| 23 |
def extract_text_from_pdf(pdf_file):
|
| 24 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 25 |
text = ""
|
|
|
|
| 27 |
text += pdf_reader.pages[page_num].extract_text()
|
| 28 |
return text
|
| 29 |
|
| 30 |
+
# Function to tokenize text using the NLP model
|
| 31 |
def tokenize_text(text, nlp_model):
    """Run ``nlp_model`` over ``text`` and return its entities as pairs.

    The pipeline is called with the "tagger" and "parser" components
    disabled. Every entity in ``doc.ents`` becomes a
    ``(lower-cased entity text, entity label)`` tuple, in document order.
    """
    parsed = nlp_model(text, disable=["tagger", "parser"])
    entity_pairs = []
    for entity in parsed.ents:
        entity_pairs.append((entity.text.lower(), entity.label_))
    return entity_pairs
|
| 35 |
|
| 36 |
+
# Function to extract CGPA from a resume
|
| 37 |
# Label-then-number ("CGPA: 8.5", "GPA-3.7", "C.G.P.A 9") or number-then-label
# ("8.5 CGPA"). The separator is optional whitespace, an optional ":" or "-",
# then optional whitespace, so "CGPA:8.5" and "CGPA - 8.5" are both accepted.
_CGPA_PATTERN = re.compile(
    r'\b(?:CGPA|GPA|C\.G\.P\.A\.?|C\.G\.PA|Cumulative GPA)\s*[:\-]?\s*'
    r'([0-9]+(?:\.[0-9]+)?)\b'
    r'|\b([0-9]+(?:\.[0-9]+)?)\s*(?:CGPA|GPA)\b',
    re.IGNORECASE,
)


def extract_cgpa(resume_text):
    """Return the first CGPA/GPA value found in ``resume_text`` as a float.

    Args:
        resume_text: Plain text of a resume. ``None`` or an empty string is
            treated as "nothing to search".

    Returns:
        The matched number converted with ``float``, or ``None`` when no
        CGPA/GPA mention is found.
    """
    if not resume_text:
        return None

    match = _CGPA_PATTERN.search(resume_text)
    if match is None:
        return None

    # Exactly one of the two capture groups is populated, depending on
    # whether the label came before or after the number.
    return float(match.group(1) or match.group(2))
|
| 45 |
|
| 46 |
+
# Function to extract skills from a resume
|
| 47 |
def extract_skills(text, skills_keywords):
    """Return the lower-cased keywords that occur in ``text`` as whole terms.

    Matching is case-insensitive. A keyword counts as present only when it
    is not directly preceded or followed by a word character, so "java"
    does not match inside "javascript".

    Args:
        text: Text to search.
        skills_keywords: Iterable of keyword strings.

    Returns:
        List of matching keywords, lower-cased, in the order given.
    """
    # Lower-case the text once instead of once per keyword.
    haystack = text.lower()
    found = []
    for skill in skills_keywords:
        needle = skill.lower()
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it can never match after a keyword ending in punctuation such
        # as "c++" or "c#". For purely alphanumeric keywords the two forms
        # are equivalent.
        if re.search(r'(?<!\w)' + re.escape(needle) + r'(?!\w)', haystack):
            found.append(needle)
    return found
|
| 50 |
|
| 51 |
+
# Function to preprocess text
|
| 52 |
def preprocess_text(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)
|
| 54 |
|
| 55 |
+
# Function to train a Doc2Vec model
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
def train_doc2vec_model(documents, *, vector_size=20, min_count=2, epochs=50):
    """Train and return a Doc2Vec model on ``documents``.

    Args:
        documents: Tagged documents to train on. The collection is passed
            to both ``build_vocab`` and ``train``, so it presumably needs to
            be re-iterable (e.g. a list rather than a one-shot generator).
        vector_size: Forwarded to ``Doc2Vec``; defaults to the previously
            hard-coded 20.
        min_count: Forwarded to ``Doc2Vec``; defaults to the previously
            hard-coded 2.
        epochs: Forwarded to ``Doc2Vec``; defaults to the previously
            hard-coded 50.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model
|
| 61 |
|
| 62 |
+
# Function to calculate similarity between two texts
|
| 63 |
def calculate_similarity(model, text1, text2):
    """Return the cosine similarity between inferred vectors of two texts.

    Each text is tokenized with ``preprocess_text`` and passed to
    ``model.infer_vector`` (``text1`` first, then ``text2``). The result is
    the first element of ``model.dv.cosine_similarities`` for the pair.
    """
    first_vec, second_vec = (
        model.infer_vector(preprocess_text(raw)) for raw in (text1, text2)
    )
    similarities = model.dv.cosine_similarities(first_vec, [second_vec])
    return similarities[0]
|
| 67 |
|
| 68 |
+
# Function to calculate accuracy
|
| 69 |
def accuracy_calculation(true_positives, false_positives, false_negatives):
    """Return ``true_positives`` divided by the sum of all three counts.

    When the three counts sum to zero, 0 is returned instead of dividing.
    """
    denominator = true_positives + false_positives + false_negatives
    if denominator == 0:
        return 0
    return true_positives / denominator
|
| 73 |
|
|
|
|
| 74 |
# Streamlit Frontend
|
| 75 |
st.markdown("# Resume Matching Tool 📃📃")
|
| 76 |
st.markdown("An application to match resumes with a job description.")
|
| 77 |
|
| 78 |
# Sidebar - File Upload for Resumes
|
| 79 |
st.sidebar.markdown("## Upload Resumes PDF")
|
| 80 |
+
resumes_files = st.sidebar.file_uploader("Upload Resumes PDF", type=["pdf"], accept_multiple_files=True)
|
|
|
|
| 81 |
|
| 82 |
if resumes_files:
|
| 83 |
# Sidebar - File Upload for Job Descriptions
|
| 84 |
st.sidebar.markdown("## Upload Job Description PDF")
|
| 85 |
+
job_descriptions_file = st.sidebar.file_uploader("Upload Job Description PDF", type=["pdf"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
if job_descriptions_file:
|
| 88 |
+
# Load the pre-trained NLP model
|
| 89 |
nlp_model_path = "en_Resume_Matching_Keywords"
|
| 90 |
nlp = spacy.load(nlp_model_path)
|
| 91 |
+
|
| 92 |
# Backend Processing
|
| 93 |
job_description_text = extract_text_from_pdf(job_descriptions_file)
|
| 94 |
+
resumes_texts = [extract_text_from_pdf(resume_file) for resume_file in resumes_files]
|
|
|
|
| 95 |
job_description_text = extract_text_from_pdf(job_descriptions_file)
|
| 96 |
job_description_tokens = tokenize_text(job_description_text, nlp)
|
| 97 |
|
|
|
|
|
|
|
| 98 |
# Initialize counters
|
| 99 |
overall_skill_matches = 0
|
| 100 |
overall_qualification_matches = 0
|
|
|
|
| 167 |
overall_qualification_matches += qualificationMatch
|
| 168 |
|
| 169 |
# Add count of matched skills for this resume to the list
|
| 170 |
+
skills_counts_all_resumes.append([resume_text.count(skill.lower()) for skill in job_skills])
|
|
|
|
| 171 |
|
| 172 |
# Create a dictionary for the current resume and append to the results list
|
| 173 |
result_dict = {
|
|
|
|
| 187 |
# Display overall matches
|
| 188 |
st.subheader("Overall Matches")
|
| 189 |
st.write(f"Total Skill Matches: {overall_skill_matches}")
|
| 190 |
+
st.write(f"Total Qualification Matches: {overall_qualification_matches}")
|
|
|
|
| 191 |
st.write(f"Job Qualifications: {job_qualifications}")
|
| 192 |
st.write(f"Job Skills: {job_skills}")
|
| 193 |
|
| 194 |
# Display individual results in a table
|
| 195 |
+
results_df = pd.DataFrame(results_list)
|
| 196 |
st.subheader("Individual Results")
|
| 197 |
st.dataframe(results_df)
|
| 198 |
+
tagged_resumes = [TaggedDocument(words=preprocess_text(text), tags=[str(i)]) for i, text in enumerate(resumes_texts)]
|
|
|
|
| 199 |
model_resumes = train_doc2vec_model(tagged_resumes)
|
| 200 |
+
|
| 201 |
st.subheader("\nHeatmap:")
|
| 202 |
+
|
| 203 |
+
# Get skills keywords from user input
|
| 204 |
+
skills_keywords_input = st.text_input("Enter skills keywords separated by commas (e.g., python, java, machine learning):")
|
| 205 |
+
skills_keywords = [skill.strip() for skill in skills_keywords_input.split(',') if skill.strip()]
|
| 206 |
|
| 207 |
if skills_keywords:
|
| 208 |
# Calculate the similarity score between each skill keyword and the resume text
|
|
|
|
| 210 |
for resume_text in resumes_texts:
|
| 211 |
resume_text_similarity_scores = []
|
| 212 |
for skill in skills_keywords:
|
| 213 |
+
similarity_score = calculate_similarity(model_resumes, resume_text, skill)
|
|
|
|
| 214 |
resume_text_similarity_scores.append(similarity_score)
|
| 215 |
skills_similarity_scores.append(resume_text_similarity_scores)
|
| 216 |
|
| 217 |
# Create a DataFrame with the similarity scores and set the index to the names of the PDFs
|
| 218 |
+
skills_similarity_df = pd.DataFrame(skills_similarity_scores, columns=skills_keywords, index=[resume_file.name for resume_file in resumes_files])
|
|
|
|
| 219 |
|
| 220 |
# Plot the heatmap
|
| 221 |
fig, ax = plt.subplots(figsize=(12, 8))
|
| 222 |
+
sns.heatmap(skills_similarity_df, cmap='YlGnBu', annot=True, fmt=".2f", ax=ax)
|
|
|
|
|
|
|
| 223 |
ax.set_title('Heatmap for Skills Similarity')
|
| 224 |
ax.set_xlabel('Skills')
|
| 225 |
ax.set_ylabel('Resumes')
|
|
|
|
| 235 |
else:
|
| 236 |
st.warning("Please upload the Job Description PDF to proceed.")
|
| 237 |
else:
|
| 238 |
+
st.warning("Please upload Resumes PDF to proceed.")
|