Update app.py
Browse files
app.py
CHANGED
|
@@ -14,15 +14,11 @@ import seaborn as sns
|
|
| 14 |
import numpy as np
|
| 15 |
|
| 16 |
# --- SpaCy Model Loading ---
|
| 17 |
-
# For Gradio on Hugging Face Spaces, the model is usually installed via requirements.txt
|
| 18 |
-
# so spacy.load() will find it.
|
| 19 |
try:
|
| 20 |
nlp = spacy.load("en_core_web_lg")
|
| 21 |
print("SpaCy model loaded successfully.")
|
| 22 |
except Exception as e:
|
| 23 |
print(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
|
| 24 |
-
# In a Gradio app, you might raise an error or display a message in the UI
|
| 25 |
-
# For now, let's just print to logs if it fails to load at startup.
|
| 26 |
|
| 27 |
# --- Global Predefined Skills ---
|
| 28 |
predefined_skills_list = set([
|
|
@@ -46,12 +42,8 @@ predefined_skills_list.update([
|
|
| 46 |
"data engineer", "software engineer", "full stack", "frontend", "backend"
|
| 47 |
])
|
| 48 |
|
| 49 |
-
# --- Text Extraction Functions
|
| 50 |
-
# Gradio's gr.File component provides a file path to the temporary uploaded file.
|
| 51 |
def extract_text_from_pdf(pdf_path):
|
| 52 |
-
"""
|
| 53 |
-
Extracts text from a PDF file given its path.
|
| 54 |
-
"""
|
| 55 |
try:
|
| 56 |
with open(pdf_path, 'rb') as file:
|
| 57 |
reader = PdfReader(file)
|
|
@@ -60,26 +52,22 @@ def extract_text_from_pdf(pdf_path):
|
|
| 60 |
text += page.extract_text() or ""
|
| 61 |
return text
|
| 62 |
except Exception as e:
|
| 63 |
-
print(f"Error reading PDF {pdf_path}: {e}")
|
| 64 |
return ""
|
| 65 |
|
| 66 |
def extract_text_from_docx(docx_path):
|
| 67 |
-
"""
|
| 68 |
-
Extracts text from a DOCX file given its path.
|
| 69 |
-
"""
|
| 70 |
try:
|
| 71 |
document = Document(docx_path)
|
| 72 |
text = "\n".join([paragraph.text for paragraph in document.paragraphs])
|
| 73 |
return text
|
| 74 |
except Exception as e:
|
| 75 |
-
print(f"Error reading DOCX {docx_path}: {e}")
|
| 76 |
return ""
|
| 77 |
|
| 78 |
def get_file_content(file_obj):
|
| 79 |
-
"""Helper to get content from Gradio's file component."""
|
| 80 |
if file_obj is None:
|
| 81 |
return ""
|
| 82 |
-
file_path = file_obj.name
|
| 83 |
if file_path.endswith('.pdf'):
|
| 84 |
return extract_text_from_pdf(file_path)
|
| 85 |
elif file_path.endswith('.docx'):
|
|
@@ -90,7 +78,7 @@ def get_file_content(file_obj):
|
|
| 90 |
else:
|
| 91 |
return ""
|
| 92 |
|
| 93 |
-
# --- Text Preprocessing Functions
|
| 94 |
def preprocess_text(text):
|
| 95 |
if not isinstance(text, str): return ""
|
| 96 |
text = text.lower()
|
|
@@ -99,7 +87,7 @@ def preprocess_text(text):
|
|
| 99 |
processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
|
| 100 |
return " ".join(processed_tokens)
|
| 101 |
|
| 102 |
-
# --- Information Extraction
|
| 103 |
def extract_skills(text_doc, skill_keywords=None):
|
| 104 |
extracted_skills = []
|
| 105 |
if skill_keywords is None: skill_keywords = set()
|
|
@@ -130,7 +118,7 @@ def extract_experience_and_education(text):
|
|
| 130 |
elif "associate" in text_lower: education_level = "Associate's"
|
| 131 |
return years_experience, education_level
|
| 132 |
|
| 133 |
-
# --- Feature Engineering
|
| 134 |
def get_text_embeddings(text):
|
| 135 |
if not text: return np.zeros(nlp.vocab.vectors.shape[1])
|
| 136 |
doc = nlp(text)
|
|
@@ -143,7 +131,7 @@ def calculate_cosine_similarity(vec1, vec2):
|
|
| 143 |
vec2 = vec2.reshape(1, -1)
|
| 144 |
return cosine_similarity(vec1, vec2)[0][0]
|
| 145 |
|
| 146 |
-
# --- Main Processing Pipeline
|
| 147 |
def analyze_document(doc_text):
|
| 148 |
doc_spacy = nlp(doc_text)
|
| 149 |
cleaned_text = preprocess_text(doc_text)
|
|
@@ -157,7 +145,7 @@ def analyze_document(doc_text):
|
|
| 157 |
"text_embedding": text_embedding
|
| 158 |
}
|
| 159 |
|
| 160 |
-
# --- Matching and Scoring Logic
|
| 161 |
def calculate_match_scores(cv_data, jd_data):
|
| 162 |
results = {}
|
| 163 |
overall_similarity = calculate_cosine_similarity(cv_data["text_embedding"], jd_data["text_embedding"])
|
|
@@ -206,14 +194,14 @@ def calculate_match_scores(cv_data, jd_data):
|
|
| 206 |
results["education_match_status"] = edu_match_status
|
| 207 |
return results
|
| 208 |
|
| 209 |
-
# --- Overall Analysis Orchestrator
|
| 210 |
def perform_cv_job_analysis(cv_text, job_desc_text):
|
| 211 |
cv_analysis_data = analyze_document(cv_text)
|
| 212 |
job_desc_analysis_data = analyze_document(job_desc_text)
|
| 213 |
match_results = calculate_match_scores(cv_analysis_data, job_desc_analysis_data)
|
| 214 |
return match_results
|
| 215 |
|
| 216 |
-
# --- Visualization Functions
|
| 217 |
def create_overall_match_plot(score):
|
| 218 |
fig, ax = plt.subplots(figsize=(6, 2))
|
| 219 |
sns.set_style("whitegrid")
|
|
@@ -259,12 +247,7 @@ def create_top_keywords_plot(cv_keywords, jd_keywords):
|
|
| 259 |
|
| 260 |
# --- Main Gradio Interface Function ---
|
| 261 |
def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
|
| 262 |
-
"""
|
| 263 |
-
This function will be called by Gradio's Interface.
|
| 264 |
-
It takes Gradio inputs and returns Gradio outputs (HTML, plots).
|
| 265 |
-
"""
|
| 266 |
cv_content = ""
|
| 267 |
-
# Prioritize file upload over text area if both are provided
|
| 268 |
if cv_file_obj is not None:
|
| 269 |
cv_content = get_file_content(cv_file_obj)
|
| 270 |
elif cv_text_input:
|
|
@@ -277,7 +260,6 @@ def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
|
|
| 277 |
None, None, None, "Analysis Failed")
|
| 278 |
try:
|
| 279 |
analysis_results = perform_cv_job_analysis(cv_content, jd_text_input)
|
| 280 |
-
# Generate HTML output for KPIs and detailed breakdown
|
| 281 |
html_output = f"""
|
| 282 |
<h2 style='text-align: center;'>💡 Analysis Results Summary 💡</h2>
|
| 283 |
<div style='display: flex; justify-content: space-around; flex-wrap: wrap; text-align: center; margin-bottom: 20px;'>
|
|
@@ -312,23 +294,23 @@ def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
|
|
| 312 |
<p><strong>π Job's Required Education:</strong> <code>{analysis_results['jd_education_level']}</code></p>
|
| 313 |
<p style='color:green;'><strong>Status:</strong> {analysis_results['education_match_status']}</p>
|
| 314 |
"""
|
| 315 |
-
# Generate plots
|
| 316 |
overall_plot = create_overall_match_plot(analysis_results['overall_match_score'])
|
| 317 |
skill_plot = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
|
| 318 |
keywords_plot = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])
|
| 319 |
-
# Return all outputs in the correct order
|
| 320 |
return html_output, overall_plot, skill_plot, keywords_plot, "Analysis Complete!"
|
| 321 |
except Exception as e:
|
| 322 |
import traceback
|
| 323 |
error_traceback = traceback.format_exc()
|
| 324 |
-
# Return empty plots/HTML on error
|
| 325 |
return (f"<h4><p style='color:red;'>An unexpected error occurred during analysis: {e}</p></h4>"
|
| 326 |
f"<details><summary>Click for details</summary><pre>{error_traceback}</pre></details>",
|
| 327 |
None, None, None, "Analysis Failed")
|
| 328 |
|
| 329 |
# --- Gradio Interface Definition ---
|
| 330 |
-
# Use gr.Blocks for more flexibility in layout
|
| 331 |
with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
gr.Markdown(
|
| 333 |
"""
|
| 334 |
# 👨‍💼 CV-Job Match Analyzer π
|
|
@@ -336,29 +318,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:
|
|
| 336 |
Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
|
| 337 |
"""
|
| 338 |
)
|
| 339 |
-
with gr.Row():
|
| 340 |
-
with gr.Column(scale=1, min_width=400):
|
| 341 |
-
gr.Markdown("## **1. Your CV**")
|
| 342 |
cv_file_obj = gr.File(label="Upload CV (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
|
| 343 |
cv_text_input = gr.Textbox(label="Or paste CV text here (overrides file upload)", lines=10, placeholder="Paste your CV content here...")
|
| 344 |
-
gr.Markdown("## **2. Job Description**")
|
| 345 |
jd_text_input = gr.Textbox(label="Paste the Job Description text here", lines=10, placeholder="Paste the job description content here...")
|
| 346 |
-
with gr.Row():
|
| 347 |
analyze_button = gr.Button("✨ Analyze CV Match ✨", variant="primary", scale=1)
|
| 348 |
clear_button = gr.ClearButton([cv_file_obj, cv_text_input, jd_text_input], scale=1)
|
| 349 |
-
with gr.Column(scale=2, min_width=600):
|
| 350 |
-
output_html = gr.HTML(label="Analysis Report")
|
| 351 |
-
gr.Markdown("## **π Visual Insights**")
|
| 352 |
-
output_overall_plot = gr.Plot(label="Overall Match Score")
|
| 353 |
output_skill_plot = gr.Plot(label="Skill Match Breakdown")
|
| 354 |
output_keywords_plot = gr.Plot(label="Top Keywords")
|
| 355 |
-
|
| 356 |
analyze_button.click(
|
| 357 |
fn=analyze_cv_match,
|
| 358 |
inputs=[cv_file_obj, cv_text_input, jd_text_input],
|
| 359 |
outputs=[output_html, output_overall_plot, output_skill_plot, output_keywords_plot, gr.State(value="")],
|
| 360 |
-
# The last output for gr.State is a dummy. It needs to match the number of return values from analyze_cv_match.
|
| 361 |
-
# Gradio uses the last return value for the status bar if it's a string, or you can explicitly link it.
|
| 362 |
)
|
| 363 |
|
| 364 |
demo.launch()
|
|
|
|
| 14 |
import numpy as np
|
| 15 |
|
| 16 |
# --- SpaCy Model Loading ---
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
nlp = spacy.load("en_core_web_lg")
|
| 19 |
print("SpaCy model loaded successfully.")
|
| 20 |
except Exception as e:
|
| 21 |
print(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# --- Global Predefined Skills ---
|
| 24 |
predefined_skills_list = set([
|
|
|
|
| 42 |
"data engineer", "software engineer", "full stack", "frontend", "backend"
|
| 43 |
])
|
| 44 |
|
| 45 |
+
# --- Text Extraction Functions ---
|
|
|
|
| 46 |
def extract_text_from_pdf(pdf_path):
|
|
|
|
|
|
|
|
|
|
| 47 |
try:
|
| 48 |
with open(pdf_path, 'rb') as file:
|
| 49 |
reader = PdfReader(file)
|
|
|
|
| 52 |
text += page.extract_text() or ""
|
| 53 |
return text
|
| 54 |
except Exception as e:
|
| 55 |
+
print(f"Error reading PDF {pdf_path}: {e}")
|
| 56 |
return ""
|
| 57 |
|
| 58 |
def extract_text_from_docx(docx_path):
|
|
|
|
|
|
|
|
|
|
| 59 |
try:
|
| 60 |
document = Document(docx_path)
|
| 61 |
text = "\n".join([paragraph.text for paragraph in document.paragraphs])
|
| 62 |
return text
|
| 63 |
except Exception as e:
|
| 64 |
+
print(f"Error reading DOCX {docx_path}: {e}")
|
| 65 |
return ""
|
| 66 |
|
| 67 |
def get_file_content(file_obj):
|
|
|
|
| 68 |
if file_obj is None:
|
| 69 |
return ""
|
| 70 |
+
file_path = file_obj.name
|
| 71 |
if file_path.endswith('.pdf'):
|
| 72 |
return extract_text_from_pdf(file_path)
|
| 73 |
elif file_path.endswith('.docx'):
|
|
|
|
| 78 |
else:
|
| 79 |
return ""
|
| 80 |
|
| 81 |
+
# --- Text Preprocessing Functions ---
|
| 82 |
def preprocess_text(text):
|
| 83 |
if not isinstance(text, str): return ""
|
| 84 |
text = text.lower()
|
|
|
|
| 87 |
processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
|
| 88 |
return " ".join(processed_tokens)
|
| 89 |
|
| 90 |
+
# --- Information Extraction ---
|
| 91 |
def extract_skills(text_doc, skill_keywords=None):
|
| 92 |
extracted_skills = []
|
| 93 |
if skill_keywords is None: skill_keywords = set()
|
|
|
|
| 118 |
elif "associate" in text_lower: education_level = "Associate's"
|
| 119 |
return years_experience, education_level
|
| 120 |
|
| 121 |
+
# --- Feature Engineering ---
|
| 122 |
def get_text_embeddings(text):
|
| 123 |
if not text: return np.zeros(nlp.vocab.vectors.shape[1])
|
| 124 |
doc = nlp(text)
|
|
|
|
| 131 |
vec2 = vec2.reshape(1, -1)
|
| 132 |
return cosine_similarity(vec1, vec2)[0][0]
|
| 133 |
|
| 134 |
+
# --- Main Processing Pipeline ---
|
| 135 |
def analyze_document(doc_text):
|
| 136 |
doc_spacy = nlp(doc_text)
|
| 137 |
cleaned_text = preprocess_text(doc_text)
|
|
|
|
| 145 |
"text_embedding": text_embedding
|
| 146 |
}
|
| 147 |
|
| 148 |
+
# --- Matching and Scoring Logic ---
|
| 149 |
def calculate_match_scores(cv_data, jd_data):
|
| 150 |
results = {}
|
| 151 |
overall_similarity = calculate_cosine_similarity(cv_data["text_embedding"], jd_data["text_embedding"])
|
|
|
|
| 194 |
results["education_match_status"] = edu_match_status
|
| 195 |
return results
|
| 196 |
|
| 197 |
+
# --- Overall Analysis Orchestrator ---
|
| 198 |
def perform_cv_job_analysis(cv_text, job_desc_text):
|
| 199 |
cv_analysis_data = analyze_document(cv_text)
|
| 200 |
job_desc_analysis_data = analyze_document(job_desc_text)
|
| 201 |
match_results = calculate_match_scores(cv_analysis_data, job_desc_analysis_data)
|
| 202 |
return match_results
|
| 203 |
|
| 204 |
+
# --- Visualization Functions ---
|
| 205 |
def create_overall_match_plot(score):
|
| 206 |
fig, ax = plt.subplots(figsize=(6, 2))
|
| 207 |
sns.set_style("whitegrid")
|
|
|
|
| 247 |
|
| 248 |
# --- Main Gradio Interface Function ---
|
| 249 |
def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
cv_content = ""
|
|
|
|
| 251 |
if cv_file_obj is not None:
|
| 252 |
cv_content = get_file_content(cv_file_obj)
|
| 253 |
elif cv_text_input:
|
|
|
|
| 260 |
None, None, None, "Analysis Failed")
|
| 261 |
try:
|
| 262 |
analysis_results = perform_cv_job_analysis(cv_content, jd_text_input)
|
|
|
|
| 263 |
html_output = f"""
|
| 264 |
<h2 style='text-align: center;'>💡 Analysis Results Summary 💡</h2>
|
| 265 |
<div style='display: flex; justify-content: space-around; flex-wrap: wrap; text-align: center; margin-bottom: 20px;'>
|
|
|
|
| 294 |
<p><strong>π Job's Required Education:</strong> <code>{analysis_results['jd_education_level']}</code></p>
|
| 295 |
<p style='color:green;'><strong>Status:</strong> {analysis_results['education_match_status']}</p>
|
| 296 |
"""
|
|
|
|
| 297 |
overall_plot = create_overall_match_plot(analysis_results['overall_match_score'])
|
| 298 |
skill_plot = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
|
| 299 |
keywords_plot = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])
|
|
|
|
| 300 |
return html_output, overall_plot, skill_plot, keywords_plot, "Analysis Complete!"
|
| 301 |
except Exception as e:
|
| 302 |
import traceback
|
| 303 |
error_traceback = traceback.format_exc()
|
|
|
|
| 304 |
return (f"<h4><p style='color:red;'>An unexpected error occurred during analysis: {e}</p></h4>"
|
| 305 |
f"<details><summary>Click for details</summary><pre>{error_traceback}</pre></details>",
|
| 306 |
None, None, None, "Analysis Failed")
|
| 307 |
|
| 308 |
# --- Gradio Interface Definition ---
|
|
|
|
| 309 |
with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:
|
| 310 |
+
|
| 311 |
+
# THIS IS THE NEW LINE TO ADD EMPTY SPACE AT THE TOP
|
| 312 |
+
gr.Markdown("<br>" * 5)
|
| 313 |
+
|
| 314 |
gr.Markdown(
|
| 315 |
"""
|
| 316 |
# 👨‍💼 CV-Job Match Analyzer π
|
|
|
|
| 318 |
Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
|
| 319 |
"""
|
| 320 |
)
|
| 321 |
+
with gr.Row():
|
| 322 |
+
with gr.Column(scale=1, min_width=400):
|
| 323 |
+
gr.Markdown("## **1. Your CV**")
|
| 324 |
cv_file_obj = gr.File(label="Upload CV (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
|
| 325 |
cv_text_input = gr.Textbox(label="Or paste CV text here (overrides file upload)", lines=10, placeholder="Paste your CV content here...")
|
| 326 |
+
gr.Markdown("## **2. Job Description**")
|
| 327 |
jd_text_input = gr.Textbox(label="Paste the Job Description text here", lines=10, placeholder="Paste the job description content here...")
|
| 328 |
+
with gr.Row():
|
| 329 |
analyze_button = gr.Button("✨ Analyze CV Match ✨", variant="primary", scale=1)
|
| 330 |
clear_button = gr.ClearButton([cv_file_obj, cv_text_input, jd_text_input], scale=1)
|
| 331 |
+
with gr.Column(scale=2, min_width=600):
|
| 332 |
+
output_html = gr.HTML(label="Analysis Report")
|
| 333 |
+
gr.Markdown("## **π Visual Insights**")
|
| 334 |
+
output_overall_plot = gr.Plot(label="Overall Match Score")
|
| 335 |
output_skill_plot = gr.Plot(label="Skill Match Breakdown")
|
| 336 |
output_keywords_plot = gr.Plot(label="Top Keywords")
|
| 337 |
+
|
| 338 |
analyze_button.click(
|
| 339 |
fn=analyze_cv_match,
|
| 340 |
inputs=[cv_file_obj, cv_text_input, jd_text_input],
|
| 341 |
outputs=[output_html, output_overall_plot, output_skill_plot, output_keywords_plot, gr.State(value="")],
|
|
|
|
|
|
|
| 342 |
)
|
| 343 |
|
| 344 |
demo.launch()
|