|
|
import gradio as gr |
|
|
import os |
|
|
import io |
|
|
import re |
|
|
from docx import Document |
|
|
from PyPDF2 import PdfReader |
|
|
import pandas as pd |
|
|
import spacy |
|
|
from collections import Counter |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
# Load the spaCy language model once at import time. The original code left
# `nlp` unbound when loading failed, so every later nlp(...) call raised a
# NameError; fall back through smaller models so `nlp` is always defined.
nlp = None
for _model_name in ("en_core_web_lg", "en_core_web_sm"):
    try:
        nlp = spacy.load(_model_name)
        print(f"SpaCy model '{_model_name}' loaded successfully.")
        break
    except Exception as e:
        print(f"Error loading spaCy model '{_model_name}': {e}. "
              "Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
if nlp is None:
    # Last resort: blank English pipeline. No vectors/NER (embedding and
    # entity extraction degrade), but the app stays importable.
    nlp = spacy.blank("en")
|
|
|
|
|
|
|
|
# Canonical lowercase vocabulary of skills, tools, domains, soft skills and
# job titles searched for in both documents (word-boundary matching is done
# by extract_skills, so multi-word phrases are fine here).
predefined_skills_list = {
    # Languages, frameworks and tooling
    "python", "tensorflow", "pytorch", "scikit-learn", "numpy", "pandas",
    "docker", "kubernetes", "aws", "git", "sql", "java", "r", "tableau",
    "jupyter", "vscode", "bert", "spacy", "nltk", "opencv", "cnns",
    "xgboost", "spark", "hadoop", "azure", "gcp",
    # Techniques and application domains
    "mlops", "agile", "feature engineering", "model deployment",
    "machine learning", "deep learning", "nlp", "computer vision",
    "data analysis", "predictive modeling", "fraud detection",
    "recommendation system", "sentiment analysis", "ab testing",
    "ai", "artificial intelligence", "data science", "big data",
    "software development", "web development", "mobile development",
    "databases", "cloud computing", "networking", "cybersecurity",
    # Soft skills
    "project management", "communication", "teamwork", "leadership",
    "problem solving", "critical thinking", "creativity",
    # Job titles / roles
    "machine learning engineer", "data scientist", "ai engineer",
    "deep learning engineer", "senior machine learning engineer",
    "junior data scientist", "data engineer", "software engineer",
    "full stack", "frontend", "backend",
}
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF, or '' on any error."""
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PdfReader(handle)
            # extract_text() may return None for image-only pages; treat as "".
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Best-effort: log and degrade to an empty string rather than crash.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
|
|
|
|
|
def extract_text_from_docx(docx_path):
    """Return all paragraph text of a .docx joined by newlines, or '' on any error."""
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(p.text for p in paragraphs)
    except Exception as e:
        # Best-effort: log and degrade to an empty string rather than crash.
        print(f"Error reading DOCX {docx_path}: {e}")
        return ""
|
|
|
|
|
def get_file_content(file_obj):
    """Return the text content of an uploaded file (PDF, DOCX or TXT).

    Args:
        file_obj: A Gradio file object exposing a ``.name`` path, or None.

    Returns:
        Extracted text; "" for None input or an unsupported extension.
    """
    if file_obj is None:
        return ""
    file_path = file_obj.name
    # Compare extensions case-insensitively: the original endswith() checks
    # silently rejected upper-case names like "RESUME.PDF".
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    if ext == '.docx':
        return extract_text_from_docx(file_path)
    if ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    return ""
|
|
|
|
|
|
|
|
def preprocess_text(text):
    """Normalize text for vectorization: lowercase, collapse whitespace,
    lemmatize, and drop stop-words/punctuation/whitespace tokens.

    Non-string input yields "".
    """
    if not isinstance(text, str):
        return ""
    normalised = re.sub(r'\s+', ' ', text.lower()).strip()
    doc = nlp(normalised)
    kept = (
        tok.lemma_
        for tok in doc
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    )
    return " ".join(kept)
|
|
|
|
|
|
|
|
def extract_skills(text_doc, skill_keywords=None):
    """Find known skills and named entities in a spaCy-parsed document.

    Args:
        text_doc: A spaCy ``Doc`` (anything exposing ``.text`` and ``.ents``).
        skill_keywords: Iterable of lowercase skill phrases to search for.

    Returns:
        (skills, entities): ``skills`` is a sorted, de-duplicated list of
        keywords found via word-boundary matching; ``entities`` maps bucket
        names ("organizations", "locations", "dates", "people") to lists of
        ORG/GPE/DATE/PERSON entity texts.
    """
    if skill_keywords is None:
        skill_keywords = set()
    doc_text = text_doc.text.lower()
    found = {
        skill
        for skill in skill_keywords
        if re.search(r'\b' + re.escape(skill) + r'\b', doc_text)
    }
    # Map spaCy NER labels onto the bucket names the rest of the app expects.
    label_to_bucket = {
        "ORG": "organizations",
        "GPE": "locations",
        "DATE": "dates",
        "PERSON": "people",
    }
    entities = {}
    for ent in text_doc.ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket:
            entities.setdefault(bucket, []).append(ent.text)
    # Sorted for deterministic output: the original list(set(...)) order
    # varied between runs, making reports non-reproducible.
    return sorted(found), entities
|
|
|
|
|
def extract_experience_and_education(text):
    """Heuristically pull years of experience and highest degree from text.

    Returns:
        (years_experience, education_level): the maximum number of years
        mentioned (0 when none found) and one of "Ph.D.", "Master's",
        "Bachelor's", "Associate's" or "Not Specified".
    """
    years_experience = 0
    education_level = "Not Specified"
    text_lower = text.lower()
    # Match both "5+ years of experience" and "5 yrs exp". The original
    # pattern only captured digits in the first alternative, so findall
    # returned '' for "N yrs exp" matches and they were silently dropped.
    exp_pattern = re.compile(
        r'(\d+)\s*(?:\+|plus)?\s*years?\s+of\s+experience'
        r'|(\d+)\s*yrs?\s+exp'
    )
    found_years = [
        int(group)
        for match in exp_pattern.finditer(text_lower)
        for group in match.groups()
        if group
    ]
    if found_years:
        years_experience = max(found_years)
    # Check degrees from highest to lowest so the best one wins.
    if "phd" in text_lower or "doctorate" in text_lower:
        education_level = "Ph.D."
    elif "master" in text_lower or "m.s." in text_lower or "msc" in text_lower:
        education_level = "Master's"
    elif "bachelor" in text_lower or "b.s." in text_lower or "bsc" in text_lower:
        education_level = "Bachelor's"
    elif "associate" in text_lower:
        education_level = "Associate's"
    return years_experience, education_level
|
|
|
|
|
|
|
|
def get_text_embeddings(text):
    """Return a dense embedding vector for *text* via the spaCy model.

    Falls back to the mean of individual token vectors when the doc has no
    vector of its own, and to a zero vector of the model's dimensionality
    when nothing has one (or the text is empty).
    """
    dim = nlp.vocab.vectors.shape[1]
    if not text:
        return np.zeros(dim)
    doc = nlp(text)
    if doc.has_vector:
        return doc.vector
    # Original evaluated this comprehension twice (once for the truthiness
    # test, once for np.mean); build the list a single time instead.
    token_vectors = [token.vector for token in doc if token.has_vector]
    if token_vectors:
        return np.mean(token_vectors, axis=0)
    return np.zeros(dim)
|
|
|
|
|
def calculate_cosine_similarity(vec1, vec2):
    """Cosine similarity of two 1-D vectors; 0.0 when either is all-zero.

    Computed directly with NumPy (dot product over the product of norms),
    which produces the identical value to sklearn's pairwise helper while
    avoiding the reshape round trip for a single pair of vectors.
    """
    vec1 = np.asarray(vec1, dtype=float).ravel()
    vec2 = np.asarray(vec2, dtype=float).ravel()
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero norm means a zero vector — same guard as the original
    # np.all(vec == 0) checks, without two full passes over the data.
    if norm_product == 0.0:
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)
|
|
|
|
|
|
|
|
def analyze_document(doc_text):
    """Run the full NLP pipeline over one document and bundle the results.

    Returns a dict with the raw/cleaned text, the spaCy Doc, extracted
    skills and entities, experience/education heuristics, and an embedding.
    """
    spacy_doc = nlp(doc_text)
    cleaned = preprocess_text(doc_text)
    skills, entities = extract_skills(spacy_doc, skill_keywords=predefined_skills_list)
    experience_years, education = extract_experience_and_education(doc_text)
    embedding = get_text_embeddings(cleaned)
    return {
        "raw_text": doc_text,
        "cleaned_text": cleaned,
        "spacy_doc": spacy_doc,
        "extracted_skills": skills,
        "general_entities": entities,
        "years_experience": experience_years,
        "education_level": education,
        "text_embedding": embedding,
    }
|
|
|
|
|
|
|
|
def calculate_match_scores(cv_data, jd_data):
    """Compare an analyzed CV against an analyzed job description.

    Args:
        cv_data, jd_data: dicts produced by analyze_document().

    Returns:
        dict with overall/skill match percentages, matched/missing/extra
        skill lists, top TF-IDF keywords per document plus their overlap,
        and experience/education comparison statuses.
    """
    results = {}

    # --- Overall semantic similarity (embedding cosine, as a percent) ---
    overall_similarity = calculate_cosine_similarity(
        cv_data["text_embedding"], jd_data["text_embedding"])
    results["overall_match_score"] = round(overall_similarity * 100, 2)

    # --- Skill overlap ---
    cv_skills = set(cv_data["extracted_skills"])
    jd_skills = set(jd_data["extracted_skills"])
    matched_skills = list(cv_skills & jd_skills)
    results["matched_skills"] = matched_skills
    results["missing_skills"] = list(jd_skills - cv_skills)
    results["extra_skills_in_cv"] = list(cv_skills - jd_skills)
    # Percentage of the JD's skills present in the CV (0 when JD lists none).
    if jd_skills:
        skill_match_percentage = len(matched_skills) / len(jd_skills) * 100
    else:
        skill_match_percentage = 0.0
    results["skill_match_percentage"] = round(skill_match_percentage, 2)

    # --- Top TF-IDF keywords per document ---
    top_cv_keywords, top_jd_keywords = [], []
    try:
        corpus = [cv_data["cleaned_text"], jd_data["cleaned_text"]]
        tfidf_vectorizer = TfidfVectorizer(max_features=100)
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        cv_tfidf_scores = {feature_names[i]: tfidf_matrix[0, i] for i in tfidf_matrix[0].nonzero()[1]}
        jd_tfidf_scores = {feature_names[i]: tfidf_matrix[1, i] for i in tfidf_matrix[1].nonzero()[1]}
        top_cv_keywords = [k for k, _ in sorted(cv_tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:15]]
        top_jd_keywords = [k for k, _ in sorted(jd_tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:15]]
    except ValueError:
        # TfidfVectorizer raises "empty vocabulary" when both cleaned texts
        # are empty or stop-words only; degrade to empty keyword lists
        # instead of crashing the whole analysis.
        pass
    results["top_cv_keywords"] = top_cv_keywords
    results["top_jd_keywords"] = top_jd_keywords
    results["common_keywords"] = list(set(top_cv_keywords) & set(top_jd_keywords))

    # --- Years-of-experience comparison ---
    cv_exp_years = cv_data["years_experience"]
    jd_exp_years = jd_data["years_experience"]
    results["cv_years_experience"] = cv_exp_years
    results["jd_years_experience"] = jd_exp_years
    exp_status = "Not specified by Job"
    if jd_exp_years > 0:
        if cv_exp_years >= jd_exp_years:
            exp_status = "Meets or Exceeds Requirement"
        else:
            exp_status = f"Below Requirement (Needs {jd_exp_years - cv_exp_years} more years)"
    results["experience_match_status"] = exp_status

    # --- Education comparison ---
    cv_edu = cv_data["education_level"]
    jd_edu = jd_data["education_level"]
    results["cv_education_level"] = cv_edu
    results["jd_education_level"] = jd_edu
    edu_match_status = "Not Specified by Job"
    if jd_edu != "Not Specified":
        # Rank degrees; unknown/unspecified CV levels rank 0 (below all).
        edu_order = {"Associate's": 1, "Bachelor's": 2, "Master's": 3, "Ph.D.": 4}
        if edu_order.get(cv_edu, 0) >= edu_order.get(jd_edu, 0):
            edu_match_status = "Meets or Exceeds Requirement"
        else:
            edu_match_status = "Below Requirement"
    results["education_match_status"] = edu_match_status

    return results
|
|
|
|
|
|
|
|
def perform_cv_job_analysis(cv_text, job_desc_text):
    """Analyze both raw texts and return the combined match-score report."""
    return calculate_match_scores(
        analyze_document(cv_text),
        analyze_document(job_desc_text),
    )
|
|
|
|
|
|
|
|
def create_overall_match_plot(score):
    """Draw a single horizontal bar (0-100 scale) for the overall match score."""
    figure, axis = plt.subplots(figsize=(6, 2))
    sns.set_style("whitegrid")
    axis.barh(["Overall Match"], [score], color='skyblue')
    axis.set_xlim(0, 100)
    # Annotate the numeric score just past the right edge of the bar.
    axis.text(score + 2, 0, f'{score}%', va='center', color='black', fontsize=12)
    axis.set_title("Overall CV-Job Description Match Score", fontsize=14)
    axis.set_xlabel("Match Percentage", fontsize=12)
    axis.get_yaxis().set_visible(False)
    plt.tight_layout()
    return figure
|
|
|
|
|
def create_skill_match_plot(matched_skills, missing_skills):
    """Pie chart of matched vs missing skills; None when both lists are empty."""
    sizes = [len(matched_skills), len(missing_skills)]
    # Nothing to plot — bail out before doing any figure work.
    if sum(sizes) == 0:
        return None
    labels = ['Matched Skills', 'Missing Skills']
    colors = ['#66b3ff', '#ff9999']
    # Separate the wedges slightly only when both slices are non-empty.
    explode = (0.05, 0.05) if sizes[0] > 0 and sizes[1] > 0 else (0, 0)
    figure, axis = plt.subplots(figsize=(7, 7))
    axis.pie(sizes, explode=explode, labels=labels, colors=colors,
             autopct='%1.1f%%', shadow=True, startangle=90,
             textprops={'fontsize': 12})
    axis.axis('equal')  # keep the pie circular
    axis.set_title("Skill Match Breakdown", fontsize=14)
    plt.tight_layout()
    return figure
|
|
|
|
|
def create_top_keywords_plot(cv_keywords, jd_keywords):
    """Side-by-side bar charts of the top-10 keywords for the CV and the JD.

    Note: the incoming keyword lists are already de-duplicated upstream, so
    the Counter counts are all 1 — the bars mainly show which terms made the
    top-keywords list.
    """
    figure, axes = plt.subplots(1, 2, figsize=(16, 6))
    sns.set_style("whitegrid")
    panels = (
        (cv_keywords, axes[0], 'viridis', 'Top Keywords in CV'),
        (jd_keywords, axes[1], 'plasma', 'Top Keywords in Job Description'),
    )
    for keywords, axis, palette, title in panels:
        frame = pd.DataFrame(Counter(keywords).most_common(10), columns=['Keyword', 'Count'])
        if frame.empty:
            continue  # leave the panel blank when there are no keywords
        sns.barplot(x='Count', y='Keyword', data=frame, ax=axis, palette=palette)
        axis.set_title(title, fontsize=14)
        axis.set_xlabel('Frequency/Importance', fontsize=12)
        axis.set_ylabel('')
    plt.tight_layout()
    return figure
|
|
|
|
|
|
|
|
def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
    """Gradio click handler: validate inputs, run the analysis, build the report.

    Args:
        cv_file_obj: uploaded CV file (gr.File value) or None.
        cv_text_input: pasted CV text. NOTE(review): despite the UI label
            saying pasted text "overrides file upload", the uploaded file
            actually takes precedence below — confirm intended priority.
        jd_text_input: pasted job-description text.

    Returns:
        (html_report, overall_plot, skill_plot, keywords_plot, status_msg);
        the three plot slots are None when validation or analysis fails.
    """
    # Resolve the CV text source: file upload wins over pasted text.
    cv_content = ""
    if cv_file_obj is not None:
        cv_content = get_file_content(cv_file_obj)
    elif cv_text_input:
        cv_content = cv_text_input
    # Early validation: return an inline red error banner and empty plots.
    if not cv_content:
        return (f"<h4><p style='color:red;'>π¨ Error: Please upload a CV file or paste your CV text.</p></h4>",
                None, None, None, "Analysis Failed")
    if not jd_text_input:
        return (f"<h4><p style='color:red;'>π¨ Error: Please paste the Job Description text.</p></h4>",
                None, None, None, "Analysis Failed")
    try:
        # Full pipeline: analyze both documents and score the match.
        analysis_results = perform_cv_job_analysis(cv_content, jd_text_input)

        # Human-readable fallbacks for empty result lists.
        matched_skills_str = ', '.join(analysis_results['matched_skills']) if analysis_results['matched_skills'] else 'None found matching job description.'
        missing_skills_str = ', '.join(analysis_results['missing_skills']) if analysis_results['missing_skills'] else 'π₯³ None! Your CV has all specified skills.'
        extra_skills_str = ', '.join(analysis_results['extra_skills_in_cv']) if analysis_results['extra_skills_in_cv'] else 'None. (This is often fine, showing broader capability.)'
        common_keywords_str = ', '.join(analysis_results['common_keywords']) if analysis_results['common_keywords'] else 'No significant common keywords beyond skills.'
        cv_keywords_str = ', '.join(analysis_results['top_cv_keywords']) if analysis_results['top_cv_keywords'] else 'N/A'
        jd_keywords_str = ', '.join(analysis_results['top_jd_keywords']) if analysis_results['top_jd_keywords'] else 'N/A'

        # Assemble the report as one HTML fragment (rendered by gr.HTML).
        # NOTE(review): several literals below contain mojibake (e.g. "π‘")
        # where emoji were mis-decoded; preserved byte-for-byte here.
        html_output = f"""
<h2 style='text-align: center;'>π‘ Analysis Results Summary π‘</h2>
<div style='display: flex; justify-content: space-around; flex-wrap: wrap; text-align: center; margin-bottom: 20px;'>
<div style='background-color: #e0f7fa; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);'>
<h3>Overall Match Score</h3>
<h1 style='color: #007bb6;'>{analysis_results['overall_match_score']}%</h1>
</div>
<div style='background-color: #e8f5e9; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);'>
<h3>Skill Match</h3>
<h1 style='color: #43a047;'>{analysis_results['skill_match_percentage']}%</h1>
</div>
<div style='background-color: #fff3e0; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);'>
<h3>Experience Match</h3>
<h1 style='color: #fb8c00;'>{analysis_results['experience_match_status']}</h1>
</div>
</div>
<hr style='border-top: 2px solid #bbb; margin: 20px 0;'/>
<h2 style='text-align: center;'>π Detailed Breakdown</h2>
<h4>Skills Analysis</h4>
<p><strong>β
Matched Skills:</strong> {matched_skills_str}</p>
<p><strong>β Missing Skills (from Job Description):</strong> {missing_skills_str}</p>
<p><strong>π‘ Extra Skills in CV (not in Job Description):</strong> {extra_skills_str}</p>
<h4>Keyword Relevance (Top TF-IDF Terms)</h4>
<p><strong>π€ Top Common Keywords:</strong> {common_keywords_str}</p>
<p><strong>π Top Keywords in Your CV:</strong> {cv_keywords_str}</p>
<p><strong>π― Top Keywords in Job Description:</strong> {jd_keywords_str}</p>
<h4>Experience & Education Comparison</h4>
<p><strong>π€ Your CV's Experience:</strong> <code>{analysis_results['cv_years_experience']}</code> years</p>
<p><strong>πΌ Job's Required Experience:</strong> <code>{analysis_results['jd_years_experience']}</code> years</p>
<p style='color:green;'><strong>Status:</strong> {analysis_results['experience_match_status']}</p>
<p><strong>π Your CV's Education:</strong> <code>{analysis_results['cv_education_level']}</code></p>
<p><strong>π Job's Required Education:</strong> <code>{analysis_results['jd_education_level']}</code></p>
<p style='color:green;'><strong>Status:</strong> {analysis_results['education_match_status']}</p>
"""

        # Build the three matplotlib figures for the gr.Plot outputs.
        overall_plot = create_overall_match_plot(analysis_results['overall_match_score'])
        skill_plot = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
        keywords_plot = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])

        return html_output, overall_plot, skill_plot, keywords_plot, "Analysis Complete!"
    except Exception as e:
        # Surface unexpected failures in the report panel with a collapsible
        # traceback instead of letting the Gradio callback error out.
        import traceback
        error_traceback = traceback.format_exc()
        return (f"<h4><p style='color:red;'>An unexpected error occurred during analysis: {e}</p></h4>"
                f"<details><summary>Click for details</summary><pre>{error_traceback}</pre></details>",
                None, None, None, "Analysis Failed")
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: two-column layout — inputs (CV upload/paste + JD paste) on the
# left, the HTML report and three plots on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:

    # Push the page content down (presumably a workaround for host chrome
    # overlapping the top of the app — verify against the deployment).
    gr.HTML("<style>#root{padding-top: 100px !important;}</style>")

    # Intro banner. NOTE(review): the heading contains mojibake ("π¨βπΌ")
    # where emoji were mis-decoded; preserved byte-for-byte here.
    gr.Markdown(
        """
# π¨βπΌ CV-Job Match Analyzer π
Welcome! This tool helps you understand how well a CV matches a job description.
Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
"""
    )

    with gr.Row():
        # Left column: inputs and action buttons.
        with gr.Column(scale=1):
            gr.Markdown("## **1. Your CV**")
            cv_file_obj = gr.File(label="Upload CV (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
            # NOTE(review): the label claims pasted text overrides the file,
            # but analyze_cv_match gives the file precedence — confirm.
            cv_text_input = gr.Textbox(label="Or paste CV text here (overrides file upload)", lines=10, placeholder="Paste your CV content here...")
            gr.Markdown("## **2. Job Description**")
            jd_text_input = gr.Textbox(label="Paste the Job Description text here", lines=10, placeholder="Paste the job description content here...")
            with gr.Row():
                analyze_button = gr.Button("β¨ Analyze CV Match β¨", variant="primary", scale=1)
                clear_button = gr.ClearButton([cv_file_obj, cv_text_input, jd_text_input], scale=1)

        # Right column: report and visualizations.
        with gr.Column(scale=2):
            output_html = gr.HTML(label="Analysis Report")
            gr.Markdown("## **π Visual Insights**")
            output_overall_plot = gr.Plot(label="Overall Match Score")
            output_skill_plot = gr.Plot(label="Skill Match Breakdown")
            output_keywords_plot = gr.Plot(label="Top Keywords")

    # Wire the button to the handler. NOTE(review): the handler's 5th return
    # value (the status string) is routed into a throwaway gr.State created
    # inline, so it is never displayed to the user — intentional?
    analyze_button.click(
        fn=analyze_cv_match,
        inputs=[cv_file_obj, cv_text_input, jd_text_input],
        outputs=[output_html, output_overall_plot, output_skill_plot, output_keywords_plot, gr.State(value="")],
    )

# Launched unconditionally at import time (typical for hosted Spaces apps);
# a __main__ guard would be needed if this module were ever imported.
demo.launch()