Update app.py
Browse files
app.py
CHANGED
|
@@ -126,4 +126,233 @@ def get_text_embeddings(text):
|
|
| 126 |
else: return np.mean([token.vector for token in doc if token.has_vector], axis=0) if [token.vector for token in doc if token.has_vector] else np.zeros(nlp.vocab.vectors.shape[1])
|
| 127 |
|
| 128 |
def calculate_cosine_similarity(vec1, vec2):
|
| 129 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
else: return np.mean([token.vector for token in doc if token.has_vector], axis=0) if [token.vector for token in doc if token.has_vector] else np.zeros(nlp.vocab.vectors.shape[1])
|
| 127 |
|
| 128 |
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector; a NumPy array or any array-like of numbers.
        vec2: Second vector; a NumPy array or any array-like of numbers.

    Returns:
        The similarity as a plain ``float``. ``0.0`` is returned when either
        vector is empty or contains only zeros.
    """
    # Coerce up front so lists and tuples work too; ``.reshape`` only exists
    # on arrays.
    first = np.asarray(vec1)
    second = np.asarray(vec2)

    # Short-circuit for all-zero (or empty) vectors instead of handing them
    # to the similarity routine.
    if not first.any() or not second.any():
        return 0.0

    # cosine_similarity works on 2-D inputs, so present each vector as a
    # single-row matrix and read the one resulting cell.
    similarity = cosine_similarity(first.reshape(1, -1), second.reshape(1, -1))
    return float(similarity[0][0])
|
| 135 |
+
|
| 136 |
+
# --- Main Processing Pipeline ---
|
| 137 |
+
def analyze_document(doc_text):
    """Run one document through every analysis step and bundle the results.

    Args:
        doc_text: Raw text of the document (a CV or a job description).

    Returns:
        A dict with the keys ``raw_text``, ``cleaned_text``, ``spacy_doc``,
        ``extracted_skills``, ``general_entities``, ``years_experience``,
        ``education_level`` and ``text_embedding``.
    """
    parsed = nlp(doc_text)
    normalized = preprocess_text(doc_text)
    # Skills are looked up in the parsed document; experience and education
    # are read from the raw text.
    skills, entities = extract_skills(parsed, skill_keywords=predefined_skills_list)
    experience, education = extract_experience_and_education(doc_text)

    analysis = {
        "raw_text": doc_text,
        "cleaned_text": normalized,
        "spacy_doc": parsed,
    }
    analysis["extracted_skills"] = skills
    analysis["general_entities"] = entities
    analysis["years_experience"] = experience
    analysis["education_level"] = education
    # The embedding is computed from the cleaned text, not the raw text.
    analysis["text_embedding"] = get_text_embeddings(normalized)
    return analysis
|
| 149 |
+
|
| 150 |
+
# --- Matching and Scoring Logic ---
|
| 151 |
+
def _top_tfidf_terms(tfidf_matrix, row, feature_names, limit=15):
    """Return up to ``limit`` terms of ``row``, highest TF-IDF weight first."""
    weights = {
        feature_names[col]: tfidf_matrix[row, col]
        for col in tfidf_matrix[row].nonzero()[1]
    }
    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:limit]]


def calculate_match_scores(cv_data, jd_data):
    """Compare an analysed CV with an analysed job description.

    Args:
        cv_data: Analysis dict for the CV. The keys read here are
            ``text_embedding``, ``extracted_skills``, ``cleaned_text``,
            ``years_experience`` and ``education_level``.
        jd_data: Analysis dict for the job description, same keys.

    Returns:
        A dict with the overall and skill match percentages, the matched,
        missing and extra skill lists, the top TF-IDF keywords of each
        document and their overlap, and the experience and education
        comparison (raw values plus a status string for each).
    """
    results = {}

    # --- Overall semantic similarity, expressed as a percentage ---
    overall_similarity = calculate_cosine_similarity(
        cv_data["text_embedding"], jd_data["text_embedding"]
    )
    results["overall_match_score"] = round(overall_similarity * 100, 2)

    # --- Skill overlap ---
    cv_skills = set(cv_data["extracted_skills"])
    jd_skills = set(jd_data["extracted_skills"])
    # Sets iterate in arbitrary order; sorting keeps the report identical
    # from one run to the next.
    matched_skills = sorted(cv_skills & jd_skills)
    results["matched_skills"] = matched_skills
    results["missing_skills"] = sorted(jd_skills - cv_skills)
    results["extra_skills_in_cv"] = sorted(cv_skills - jd_skills)

    if jd_skills:
        skill_match_percentage = len(matched_skills) / len(jd_skills) * 100
    else:
        # No skills requested by the job: nothing to match against.
        skill_match_percentage = 0.0
    results["skill_match_percentage"] = round(skill_match_percentage, 2)

    # --- Keyword relevance (TF-IDF over the two cleaned texts) ---
    corpus = [cv_data["cleaned_text"], jd_data["cleaned_text"]]
    tfidf_vectorizer = TfidfVectorizer(max_features=100)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    except ValueError:
        # Raised when no usable term is left in either text (empty
        # vocabulary). Report "no keywords" rather than failing the whole
        # comparison.
        results["top_cv_keywords"] = []
        results["top_jd_keywords"] = []
    else:
        feature_names = tfidf_vectorizer.get_feature_names_out()
        results["top_cv_keywords"] = _top_tfidf_terms(tfidf_matrix, 0, feature_names)
        results["top_jd_keywords"] = _top_tfidf_terms(tfidf_matrix, 1, feature_names)
    results["common_keywords"] = sorted(
        set(results["top_cv_keywords"]) & set(results["top_jd_keywords"])
    )

    # --- Experience ---
    cv_exp_years = cv_data["years_experience"]
    jd_exp_years = jd_data["years_experience"]
    results["cv_years_experience"] = cv_exp_years
    results["jd_years_experience"] = jd_exp_years
    exp_status = "Not specified by Job"
    # A requirement of 0 years is treated as "no requirement stated".
    if jd_exp_years > 0:
        if cv_exp_years >= jd_exp_years:
            exp_status = "Meets or Exceeds Requirement"
        else:
            exp_status = f"Below Requirement (Needs {jd_exp_years - cv_exp_years} more years)"
    results["experience_match_status"] = exp_status

    # --- Education ---
    cv_edu = cv_data["education_level"]
    jd_edu = jd_data["education_level"]
    results["cv_education_level"] = cv_edu
    results["jd_education_level"] = jd_edu
    edu_match_status = "Not Specified by Job"
    if jd_edu != "Not Specified":
        # Higher number = higher degree. Labels missing from the table rank 0.
        edu_order = {"Associate's": 1, "Bachelor's": 2, "Master's": 3, "Ph.D.": 4}
        if edu_order.get(cv_edu, 0) >= edu_order.get(jd_edu, 0):
            edu_match_status = "Meets or Exceeds Requirement"
        else:
            edu_match_status = "Below Requirement"
    results["education_match_status"] = edu_match_status

    return results
|
| 198 |
+
|
| 199 |
+
# --- Overall Analysis Orchestrator ---
|
| 200 |
+
def perform_cv_job_analysis(cv_text, job_desc_text):
    """Analyse a CV and a job description, then score one against the other.

    Args:
        cv_text: Raw CV text.
        job_desc_text: Raw job description text.

    Returns:
        The result dict produced by ``calculate_match_scores``.
    """
    # The CV is analysed first, then the job description.
    cv_profile, jd_profile = (
        analyze_document(text) for text in (cv_text, job_desc_text)
    )
    return calculate_match_scores(cv_profile, jd_profile)
|
| 205 |
+
|
| 206 |
+
# --- Visualization Functions ---
|
| 207 |
+
def create_overall_match_plot(score):
    """Draw the overall match score as a single horizontal bar.

    Args:
        score: Match percentage; the x-axis is fixed to the range 0-100.

    Returns:
        The matplotlib figure containing the bar.
    """
    # The style has to be active before the axes are created; setting it
    # afterwards leaves this figure unstyled.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(6, 2))

    ax.barh(["Overall Match"], [score], color='skyblue')
    ax.set_xlim(0, 100)

    # Near the right edge there is no room after the bar, so the label is
    # drawn inside the bar instead of running off the figure.
    if score > 85:
        ax.text(score - 2, 0, f'{score}%', va='center', ha='right', color='black', fontsize=12)
    else:
        ax.text(score + 2, 0, f'{score}%', va='center', color='black', fontsize=12)

    ax.set_title("Overall CV-Job Description Match Score", fontsize=14)
    ax.set_xlabel("Match Percentage", fontsize=12)
    # Only one bar is shown, so the category axis carries no information.
    ax.get_yaxis().set_visible(False)
    fig.tight_layout()
    return fig
|
| 218 |
+
|
| 219 |
+
def create_skill_match_plot(matched_skills, missing_skills):
    """Draw a pie chart of matched versus missing skills.

    Args:
        matched_skills: Skills found in both documents.
        missing_skills: Skills requested by the job but absent from the CV.

    Returns:
        The matplotlib figure, or ``None`` when both collections are empty.
    """
    matched_count = len(matched_skills)
    missing_count = len(missing_skills)

    # Nothing to draw without at least one skill.
    if matched_count + missing_count == 0:
        return None

    # The wedges are pulled apart only when both of them are visible.
    if matched_count > 0 and missing_count > 0:
        wedge_offsets = (0.05, 0.05)
    else:
        wedge_offsets = (0, 0)

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.pie(
        [matched_count, missing_count],
        explode=wedge_offsets,
        labels=['Matched Skills', 'Missing Skills'],
        colors=['#66b3ff', '#ff9999'],
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        textprops={'fontsize': 12},
    )
    ax.axis('equal')
    ax.set_title("Skill Match Breakdown", fontsize=14)
    plt.tight_layout()
    return fig
|
| 231 |
+
|
| 232 |
+
def create_top_keywords_plot(cv_keywords, jd_keywords):
    """Draw side-by-side bar charts of the top keywords of both documents.

    Args:
        cv_keywords: Keywords taken from the CV.
        jd_keywords: Keywords taken from the job description.

    Returns:
        The matplotlib figure holding both panels (CV left, job right).
    """
    # The style has to be active before the axes are created; setting it
    # afterwards leaves this figure unstyled.
    sns.set_style("whitegrid")
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    panels = (
        (axes[0], cv_keywords, 'viridis', 'Top Keywords in CV'),
        (axes[1], jd_keywords, 'plasma', 'Top Keywords in Job Description'),
    )
    for ax, keywords, palette, title in panels:
        # NOTE(review): the bar length is the number of occurrences in the
        # list that was passed in. analyze_cv_match passes de-duplicated
        # keyword lists, so every bar has length 1 there - confirm whether
        # TF-IDF weights were meant to be plotted instead.
        frame = pd.DataFrame(Counter(keywords).most_common(10), columns=['Keyword', 'Count'])
        if not frame.empty:
            sns.barplot(x='Count', y='Keyword', data=frame, ax=ax, palette=palette)
        # Titles and labels are applied even to a panel without keywords so
        # an empty panel is still identifiable.
        ax.set_title(title, fontsize=14)
        ax.set_xlabel('Frequency/Importance', fontsize=12)
        ax.set_ylabel('')

    fig.tight_layout()
    return fig
|
| 249 |
+
|
| 250 |
+
# --- Main Gradio Interface Function ---
|
| 251 |
+
def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
    """Gradio callback: analyse a CV against a job description.

    Args:
        cv_file_obj: Uploaded CV file, or ``None`` when nothing was uploaded.
        cv_text_input: CV text pasted by the user; when non-blank it takes
            precedence over the uploaded file.
        jd_text_input: Job description text pasted by the user.

    Returns:
        A 5-tuple ``(html_report, overall_plot, skill_plot, keywords_plot,
        status_message)``. On invalid input or on any error the three plots
        are ``None``, the status is ``"Analysis Failed"`` and the HTML
        carries the error message.
    """
    import html
    import traceback

    def esc(value):
        # Everything interpolated into the report is escaped so document
        # content cannot inject markup.
        return html.escape(str(value))

    def joined(items, fallback):
        return ', '.join(esc(item) for item in items) if items else fallback

    def status_color(status):
        # Green only for a met requirement; a shortfall must not look like
        # a success.
        text = str(status)
        if text.startswith("Meets"):
            return "green"
        if text.startswith("Below"):
            return "red"
        return "gray"

    # The CV textbox is labelled "overrides file upload", so pasted text wins
    # whenever it is non-blank; the file is the fallback.
    if cv_text_input and cv_text_input.strip():
        cv_content = cv_text_input
    elif cv_file_obj is not None:
        cv_content = get_file_content(cv_file_obj)
    else:
        cv_content = ""

    if not cv_content or not cv_content.strip():
        return ("<h4><p style='color:red;'>π¨ Error: Please upload a CV file or paste your CV text.</p></h4>",
                None, None, None, "Analysis Failed")
    if not jd_text_input or not jd_text_input.strip():
        return ("<h4><p style='color:red;'>π¨ Error: Please paste the Job Description text.</p></h4>",
                None, None, None, "Analysis Failed")

    try:
        analysis_results = perform_cv_job_analysis(cv_content, jd_text_input)

        matched_skills_str = joined(analysis_results['matched_skills'], 'None found matching job description.')
        missing_skills_str = joined(analysis_results['missing_skills'], 'π₯³ None! Your CV has all specified skills.')
        extra_skills_str = joined(analysis_results['extra_skills_in_cv'], 'None. (This is often fine, showing broader capability.)')
        common_keywords_str = joined(analysis_results['common_keywords'], 'No significant common keywords beyond skills.')
        cv_keywords_str = joined(analysis_results['top_cv_keywords'], 'N/A')
        jd_keywords_str = joined(analysis_results['top_jd_keywords'], 'N/A')

        overall_score = esc(analysis_results['overall_match_score'])
        skill_score = esc(analysis_results['skill_match_percentage'])
        exp_status = esc(analysis_results['experience_match_status'])
        exp_color = status_color(analysis_results['experience_match_status'])
        edu_status = esc(analysis_results['education_match_status'])
        edu_color = status_color(analysis_results['education_match_status'])
        cv_years = esc(analysis_results['cv_years_experience'])
        jd_years = esc(analysis_results['jd_years_experience'])
        cv_education = esc(analysis_results['cv_education_level'])
        jd_education = esc(analysis_results['jd_education_level'])

        # NOTE(review): the symbols in front of several labels below (and in
        # the error messages above) look like mis-encoded emoji; restore them
        # from a correctly encoded copy of this file.
        html_output = f"""
        <h2 style='text-align: center;'>π‘ Analysis Results Summary π‘</h2>
        <div style='display: flex; justify-content: space-around; flex-wrap: wrap; text-align: center; margin-bottom: 20px;'>
        <div style='background-color: #e0f7fa; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);'>
        <h3>Overall Match Score</h3>
        <h1 style='color: #007bb6;'>{overall_score}%</h1>
        </div>
        <div style='background-color: #e8f5e9; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);'>
        <h3>Skill Match</h3>
        <h1 style='color: #43a047;'>{skill_score}%</h1>
        </div>
        <div style='background-color: #fff3e0; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);'>
        <h3>Experience Match</h3>
        <h1 style='color: #fb8c00;'>{exp_status}</h1>
        </div>
        </div>
        <hr style='border-top: 2px solid #bbb; margin: 20px 0;'/>
        <h2 style='text-align: center;'>π Detailed Breakdown</h2>
        <h4>Skills Analysis</h4>
        <p><strong>β Matched Skills:</strong> {matched_skills_str}</p>
        <p><strong>β Missing Skills (from Job Description):</strong> {missing_skills_str}</p>
        <p><strong>π‘ Extra Skills in CV (not in Job Description):</strong> {extra_skills_str}</p>
        <h4>Keyword Relevance (Top TF-IDF Terms)</h4>
        <p><strong>π€ Top Common Keywords:</strong> {common_keywords_str}</p>
        <p><strong>π Top Keywords in Your CV:</strong> {cv_keywords_str}</p>
        <p><strong>π― Top Keywords in Job Description:</strong> {jd_keywords_str}</p>
        <h4>Experience & Education Comparison</h4>
        <p><strong>π€ Your CV's Experience:</strong> <code>{cv_years}</code> years</p>
        <p><strong>πΌ Job's Required Experience:</strong> <code>{jd_years}</code> years</p>
        <p style='color:{exp_color};'><strong>Status:</strong> {exp_status}</p>
        <p><strong>π Your CV's Education:</strong> <code>{cv_education}</code></p>
        <p><strong>π Job's Required Education:</strong> <code>{jd_education}</code></p>
        <p style='color:{edu_color};'><strong>Status:</strong> {edu_status}</p>
        """

        # Generate plots
        overall_plot = create_overall_match_plot(analysis_results['overall_match_score'])
        skill_plot = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
        keywords_plot = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])
        return html_output, overall_plot, skill_plot, keywords_plot, "Analysis Complete!"
    except Exception as e:
        # Top-level UI boundary: report the failure in the page instead of
        # letting the callback crash. Message and traceback are escaped
        # because they can contain "<" and "&".
        error_traceback = traceback.format_exc()
        return (f"<h4><p style='color:red;'>An unexpected error occurred during analysis: {esc(e)}</p></h4>"
                f"<details><summary>Click for details</summary><pre>{esc(error_traceback)}</pre></details>",
                None, None, None, "Analysis Failed")
|
| 321 |
+
|
| 322 |
+
# --- Gradio Interface Definition ---
|
| 323 |
+
# Layout: inputs (CV + job description) in the left column, the HTML report
# and the three plots in the right column.
with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:

    # Cleaner fix: Add CSS padding to the top of the app to push it down
    gr.HTML("<style>#root{padding-top: 50px !important;}</style>")

    gr.Markdown(
"""
# π¨βπΌ CV-Job Match Analyzer π
Welcome! This tool helps you understand how well a CV matches a job description.
Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
"""
    )
    with gr.Row():
        with gr.Column(scale=1, min_width=400):
            gr.Markdown("## **1. Your CV**")
            cv_file_obj = gr.File(
                label="Upload CV (PDF, DOCX, TXT)",
                file_types=[".pdf", ".docx", ".txt"],
            )
            cv_text_input = gr.Textbox(
                label="Or paste CV text here (overrides file upload)",
                lines=10,
                placeholder="Paste your CV content here...",
            )
            gr.Markdown("## **2. Job Description**")
            jd_text_input = gr.Textbox(
                label="Paste the Job Description text here",
                lines=10,
                placeholder="Paste the job description content here...",
            )
            with gr.Row():
                analyze_button = gr.Button("β¨ Analyze CV Match β¨", variant="primary", scale=1)
                clear_button = gr.ClearButton([cv_file_obj, cv_text_input, jd_text_input], scale=1)
        with gr.Column(scale=2, min_width=600):
            output_html = gr.HTML(label="Analysis Report")
            gr.Markdown("## **π Visual Insights**")
            output_overall_plot = gr.Plot(label="Overall Match Score")
            output_skill_plot = gr.Plot(label="Skill Match Breakdown")
            output_keywords_plot = gr.Plot(label="Top Keywords")

    # analyze_cv_match returns five values; the last one is a status message
    # that is kept in a non-visible State rather than shown in the page.
    analysis_status = gr.State(value="")

    analyze_button.click(
        fn=analyze_cv_match,
        inputs=[cv_file_obj, cv_text_input, jd_text_input],
        outputs=[output_html, output_overall_plot, output_skill_plot, output_keywords_plot, analysis_status],
    )

# Start the server only when the file is executed as a script, so importing
# the module (tests, tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()
|