Bur3hani committed on
Commit
85fdd68
Β·
verified Β·
1 Parent(s): 0c610e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -286
app.py CHANGED
@@ -1,9 +1,9 @@
1
- import streamlit as st
2
  import os
3
  import io
4
  import re
5
  from docx import Document
6
- from PyPDF2 import PdfReader # PyPDF2 is used. For more robust PDF parsing, consider 'PyMuPDF' (fitz)
7
  import pandas as pd
8
  import spacy
9
  from collections import Counter
@@ -14,25 +14,17 @@ import seaborn as sns
14
  import numpy as np
15
 
16
  # --- SpaCy Model Loading ---
17
- # Use st.cache_resource to load the model only once and reuse it across sessions.
18
- @st.cache_resource
19
- def load_spacy_model():
20
- """
21
- Loads the English spaCy model.
22
- The model should be pre-downloaded via requirements.txt for Hugging Face Spaces.
23
- """
24
- try:
25
- nlp_model = spacy.load("en_core_web_lg")
26
- return nlp_model
27
- except Exception as e:
28
- st.error(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
29
- st.stop() # Stop the app if model fails to load
30
-
31
- nlp = load_spacy_model()
32
- print("SpaCy model loaded successfully.") # This will appear in your Space logs
33
-
34
-
35
- # --- Global Predefined Skills (could be loaded from a file for larger lists) ---
36
  predefined_skills_list = set([
37
  "python", "tensorflow", "pytorch", "scikit-learn", "numpy", "pandas",
38
  "docker", "kubernetes", "aws", "git", "sql", "java", "r", "tableau",
@@ -50,262 +42,181 @@ predefined_skills_list = set([
50
  ])
51
  predefined_skills_list.update([
52
  "machine learning engineer", "data scientist", "ai engineer", "deep learning engineer",
53
- "senior machine learning engineer", "junior data scientist", # Adding common job titles too
54
  "data engineer", "software engineer", "full stack", "frontend", "backend"
55
  ])
56
 
57
 
58
- # --- Text Extraction Functions (Adjusted for Streamlit file_object) ---
 
59
 
60
- def extract_text_from_pdf(file_object):
61
  """
62
- Extracts text from a PDF file-like object using PyPDF2.
63
  """
64
  try:
65
- reader = PdfReader(file_object)
66
- text = ""
67
- for page in reader.pages:
68
- text += page.extract_text() or "" # Handle pages with no extractable text
 
69
  return text
70
  except Exception as e:
71
- st.error(f"Error reading PDF: {e}")
72
  return ""
73
 
74
- def extract_text_from_docx(file_object):
75
  """
76
- Extracts text from a DOCX file-like object using python-docx.
77
  """
78
  try:
79
- document = Document(file_object)
80
  text = "\n".join([paragraph.text for paragraph in document.paragraphs])
81
  return text
82
  except Exception as e:
83
- st.error(f"Error reading DOCX: {e}")
84
  return ""
85
 
86
- # --- Text Preprocessing Functions ---
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
 
88
  def preprocess_text(text):
89
- """
90
- Applies standard NLP preprocessing steps.
91
- """
92
- if not isinstance(text, str):
93
- return ""
94
  text = text.lower()
95
  text = re.sub(r'\s+', ' ', text).strip()
96
  doc = nlp(text)
97
- processed_tokens = [
98
- token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space
99
- ]
100
  return " ".join(processed_tokens)
101
 
102
- # --- Information Extraction (NER & Keyword Extraction) ---
103
-
104
  def extract_skills(text_doc, skill_keywords=None):
105
- """
106
- Extracts skills using spaCy's NER and a custom keyword list.
107
- Args:
108
- text_doc (spacy.tokens.Doc): spaCy Doc object of the text.
109
- skill_keywords (set): An optional set of predefined skill keywords.
110
- Returns:
111
- list: A list of extracted skills.
112
- """
113
  extracted_skills = []
114
- if skill_keywords is None:
115
- skill_keywords = set() # Should not be None if global is used
116
-
117
  doc_text = text_doc.text.lower()
118
  for skill in skill_keywords:
119
  if re.search(r'\b' + re.escape(skill) + r'\b', doc_text):
120
  extracted_skills.append(skill)
121
-
122
  entities = {}
123
  for ent in text_doc.ents:
124
- if ent.label_ == "ORG":
125
- entities.setdefault("organizations", []).append(ent.text)
126
- elif ent.label_ == "GPE":
127
- entities.setdefault("locations", []).append(ent.text)
128
- elif ent.label_ == "DATE":
129
- entities.setdefault("dates", []).append(ent.text)
130
- elif ent.label_ == "PERSON":
131
- entities.setdefault("people", []).append(ent.text)
132
-
133
  return list(set(extracted_skills)), entities
134
 
135
  def extract_experience_and_education(text):
136
- """
137
- Attempts to extract experience years and education level using regex and simple rules.
138
- This is a simplified approach and can be complex for diverse CVs.
139
- """
140
  years_experience = 0
141
  education_level = "Not Specified"
142
-
143
  exp_matches = re.findall(r'(\d+)\s*(?:\+|plus)?\s*years?\s+of\s+experience|\d+\s*yrs?\s+exp', text.lower())
144
  if exp_matches:
145
  try:
146
  years_experience = max([int(re.findall(r'\d+', m)[0]) for m in exp_matches if re.findall(r'\d+', m)])
147
- except (ValueError, IndexError):
148
- pass
149
-
150
  text_lower = text.lower()
151
- if "phd" in text_lower or "doctorate" in text_lower:
152
- education_level = "Ph.D."
153
- elif "master" in text_lower or "m.s." in text_lower or "msc" in text_lower:
154
- education_level = "Master's"
155
- elif "bachelor" in text_lower or "b.s." in text_lower or "bsc" in text_lower:
156
- education_level = "Bachelor's"
157
- elif "associate" in text_lower:
158
- education_level = "Associate's"
159
-
160
  return years_experience, education_level
161
 
162
- # --- Feature Engineering ---
163
-
164
  def get_text_embeddings(text):
165
- """
166
- Generates sentence embeddings for a given text using spaCy's pre-trained vectors.
167
- """
168
- if not text:
169
- return np.zeros(nlp.vocab.vectors.shape[1])
170
  doc = nlp(text)
171
- if doc.has_vector:
172
- return doc.vector
173
- else:
174
- # Fallback if no vector for doc (shouldn't happen with en_core_web_lg)
175
- return np.mean([token.vector for token in doc if token.has_vector], axis=0) if [token.vector for token in doc if token.has_vector] else np.zeros(nlp.vocab.vectors.shape[1])
176
 
177
  def calculate_cosine_similarity(vec1, vec2):
178
- """
179
- Calculates cosine similarity between two vectors.
180
- Handles potential division by zero if vectors are zero vectors.
181
- """
182
- if np.all(vec1 == 0) or np.all(vec2 == 0):
183
- return 0.0
184
  vec1 = vec1.reshape(1, -1)
185
  vec2 = vec2.reshape(1, -1)
186
  return cosine_similarity(vec1, vec2)[0][0]
187
 
188
- # --- Main Processing Pipeline for a Document (CV or Job Description) ---
189
-
190
  def analyze_document(doc_text):
191
- """
192
- Processes a document (CV or Job Description) for analysis.
193
- """
194
  doc_spacy = nlp(doc_text)
195
  cleaned_text = preprocess_text(doc_text)
196
  extracted_skills, general_entities = extract_skills(doc_spacy, skill_keywords=predefined_skills_list)
197
  years_exp, education_level = extract_experience_and_education(doc_text)
198
  text_embedding = get_text_embeddings(cleaned_text)
199
-
200
  return {
201
- "raw_text": doc_text,
202
- "cleaned_text": cleaned_text,
203
- "spacy_doc": doc_spacy,
204
- "extracted_skills": extracted_skills,
205
- "general_entities": general_entities,
206
- "years_experience": years_exp,
207
- "education_level": education_level,
208
  "text_embedding": text_embedding
209
  }
210
 
211
- # --- Matching and Scoring Logic ---
212
-
213
  def calculate_match_scores(cv_data, jd_data):
214
- """
215
- Calculates various match scores and identifies keyword overlaps.
216
- """
217
  results = {}
218
-
219
- # 1. Overall Semantic Similarity (using embeddings)
220
- overall_similarity = calculate_cosine_similarity(
221
- cv_data["text_embedding"],
222
- jd_data["text_embedding"]
223
- )
224
  results["overall_match_score"] = round(overall_similarity * 100, 2)
225
-
226
- # 2. Skill Matching
227
  cv_skills = set(cv_data["extracted_skills"])
228
  jd_skills = set(jd_data["extracted_skills"])
229
-
230
  matched_skills = list(cv_skills.intersection(jd_skills))
231
  missing_skills = list(jd_skills.difference(cv_skills))
232
  extra_skills_in_cv = list(cv_skills.difference(jd_skills))
233
-
234
  results["matched_skills"] = matched_skills
235
  results["missing_skills"] = missing_skills
236
  results["extra_skills_in_cv"] = extra_skills_in_cv
237
-
238
- if jd_skills:
239
- skill_match_percentage = len(matched_skills) / len(jd_skills) * 100
240
- else:
241
- skill_match_percentage = 0.0
242
  results["skill_match_percentage"] = round(skill_match_percentage, 2)
243
-
244
- # 3. Keyword Overlap (using TF-IDF for important words beyond specific skills)
245
  corpus = [cv_data["cleaned_text"], jd_data["cleaned_text"]]
246
  tfidf_vectorizer = TfidfVectorizer(max_features=100)
247
  tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
248
  feature_names = tfidf_vectorizer.get_feature_names_out()
249
-
250
  cv_tfidf_scores = {feature_names[i]: tfidf_matrix[0, i] for i in tfidf_matrix[0].nonzero()[1]}
251
  jd_tfidf_scores = {feature_names[i]: tfidf_matrix[1, i] for i in tfidf_matrix[1].nonzero()[1]}
252
-
253
  top_cv_keywords = sorted(cv_tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:15]
254
  top_jd_keywords = sorted(jd_tfidf_scores.items(), key=lambda x: x[1], reverse=True)[:15]
255
-
256
  results["top_cv_keywords"] = [k for k,v in top_cv_keywords]
257
  results["top_jd_keywords"] = [k for k,v in top_jd_keywords]
258
-
259
  common_keywords = set(results["top_cv_keywords"]).intersection(set(results["top_jd_keywords"]))
260
  results["common_keywords"] = list(common_keywords)
261
-
262
- # 4. Experience Matching
263
  cv_exp_years = cv_data["years_experience"]
264
  jd_exp_years = jd_data["years_experience"]
265
  results["cv_years_experience"] = cv_exp_years
266
  results["jd_years_experience"] = jd_exp_years
267
-
268
  exp_status = "Not specified by Job"
269
  if jd_exp_years > 0:
270
- if cv_exp_years >= jd_exp_years:
271
- exp_status = "Meets or Exceeds Requirement"
272
- else:
273
- exp_status = f"Below Requirement (Needs {jd_exp_years - cv_exp_years} more years)"
274
  results["experience_match_status"] = exp_status
275
-
276
- # 5. Education Matching (simplified)
277
  cv_edu = cv_data["education_level"]
278
  jd_edu = jd_data["education_level"]
279
  results["cv_education_level"] = cv_edu
280
  results["jd_education_level"] = jd_edu
281
-
282
  edu_match_status = "Not Specified by Job"
283
- if jd_edu != "Not Specified": # Only compare if JD specifies
284
  edu_order = {"Associate's": 1, "Bachelor's": 2, "Master's": 3, "Ph.D.": 4}
285
- if edu_order.get(cv_edu, 0) >= edu_order.get(jd_edu, 0):
286
- edu_match_status = "Meets or Exceeds Requirement"
287
- else:
288
- edu_match_status = "Below Requirement"
289
  results["education_match_status"] = edu_match_status
290
-
291
  return results
292
 
293
- # --- Overall Analysis Orchestrator ---
294
  def perform_cv_job_analysis(cv_text, job_desc_text):
295
- """
296
- Orchestrates the entire analysis process from raw text to results.
297
- """
298
  cv_analysis_data = analyze_document(cv_text)
299
  job_desc_analysis_data = analyze_document(job_desc_text)
300
  match_results = calculate_match_scores(cv_analysis_data, job_desc_analysis_data)
301
  return match_results
302
 
303
- # --- Visualization Functions (Adjusted for Streamlit) ---
304
- # Each visualization function now returns a matplotlib figure object
305
- # and Streamlit's st.pyplot() is used to display it, then figure is closed.
306
-
307
  def create_overall_match_plot(score):
308
- """Returns a matplotlib figure for overall match."""
309
  fig, ax = plt.subplots(figsize=(6, 2))
310
  sns.set_style("whitegrid")
311
  ax.barh(["Overall Match"], [score], color='skyblue')
@@ -318,153 +229,138 @@ def create_overall_match_plot(score):
318
  return fig
319
 
320
  def create_skill_match_plot(matched_skills, missing_skills):
321
- """Returns a matplotlib figure for skill match breakdown."""
322
  labels = ['Matched Skills', 'Missing Skills']
323
  sizes = [len(matched_skills), len(missing_skills)]
324
  colors = ['#66b3ff', '#ff9999']
325
  explode = (0.05, 0.05) if sizes[0] > 0 and sizes[1] > 0 else (0,0)
326
-
327
- if sum(sizes) == 0:
328
- return None # Indicate no plot can be made
329
-
330
  fig, ax = plt.subplots(figsize=(7, 7))
331
- ax.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
332
- shadow=True, startangle=90, textprops={'fontsize': 12})
333
  ax.axis('equal')
334
  ax.set_title("Skill Match Breakdown", fontsize=14)
335
  plt.tight_layout()
336
  return fig
337
 
338
  def create_top_keywords_plot(cv_keywords, jd_keywords):
339
- """Returns a matplotlib figure for top keywords."""
340
  fig, axes = plt.subplots(1, 2, figsize=(16, 6))
341
  sns.set_style("whitegrid")
342
-
343
  cv_df = pd.DataFrame(Counter(cv_keywords).most_common(10), columns=['Keyword', 'Count'])
344
  if not cv_df.empty:
345
  sns.barplot(x='Count', y='Keyword', data=cv_df, ax=axes[0], palette='viridis')
346
  axes[0].set_title('Top Keywords in CV', fontsize=14)
347
  axes[0].set_xlabel('Frequency/Importance', fontsize=12)
348
  axes[0].set_ylabel('')
349
-
350
  jd_df = pd.DataFrame(Counter(jd_keywords).most_common(10), columns=['Keyword', 'Count'])
351
  if not jd_df.empty:
352
  sns.barplot(x='Count', y='Keyword', data=jd_df, ax=axes[1], palette='plasma')
353
  axes[1].set_title('Top Keywords in Job Description', fontsize=14)
354
  axes[1].set_xlabel('Frequency/Importance', fontsize=12)
355
  axes[1].set_ylabel('')
356
-
357
  plt.tight_layout()
358
  return fig
359
 
360
- # --- Streamlit Application Layout ---
361
-
362
- st.set_page_config(page_title="CV-Job Match Analyzer", layout="wide", icon="πŸ‘¨β€πŸ’Ό")
363
-
364
- st.title("πŸ‘¨β€πŸ’Ό CV-Job Match Analyzer πŸ“ˆ")
365
- st.markdown("""
366
- Welcome! This tool helps you understand how well a CV matches a job description.
367
- Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
368
- """)
369
-
370
- # Input for CV
371
- st.header("1. Upload Your CV")
372
- uploaded_cv_file = st.file_uploader("Choose a CV file (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"], key="cv_uploader")
373
- cv_text_area = st.text_area("Or paste your CV text here (overrides file upload)", height=250, key="cv_text_area")
374
-
375
- cv_content = ""
376
- if uploaded_cv_file is not None:
377
- if uploaded_cv_file.name.endswith('.pdf'):
378
- cv_content = extract_text_from_pdf(uploaded_cv_file)
379
- elif uploaded_cv_file.name.endswith('.docx'):
380
- cv_content = extract_text_from_docx(uploaded_cv_file)
381
- elif uploaded_cv_file.name.endswith('.txt'):
382
- cv_content = uploaded_cv_file.read().decode("utf-8")
383
- st.success("CV file uploaded and parsed successfully!")
384
- elif cv_text_area: # If text area has content and no file uploaded
385
- cv_content = cv_text_area
386
-
387
- # Input for Job Description
388
- st.header("2. Input Job Description")
389
- job_desc_text_area = st.text_area("Paste the Job Description text here", height=250, key="jd_text_area")
390
-
391
- # Analyze Button
392
- st.markdown("---")
393
- if st.button("✨ Analyze CV Match ✨", use_container_width=True):
394
  if not cv_content:
395
- st.error("🚨 Please upload a CV file or paste your CV text to proceed.")
396
- if not job_desc_text_area:
397
- st.error("🚨 Please paste the Job Description text to proceed.")
398
-
399
- if cv_content and job_desc_text_area:
400
- with st.spinner("πŸš€ Analyzing your documents... This might take a moment!"):
401
- try:
402
- analysis_results = perform_cv_job_analysis(cv_content, job_desc_text_area)
403
-
404
- st.subheader("πŸ’‘ Analysis Results Summary πŸ’‘")
405
-
406
- # Display KPIs in columns
407
- col1, col2, col3 = st.columns(3)
408
- with col1:
409
- st.metric(label="Overall Match Score", value=f"{analysis_results['overall_match_score']}%")
410
- with col2:
411
- st.metric(label="Skill Match", value=f"{analysis_results['skill_match_percentage']}%")
412
- with col3:
413
- exp_status = analysis_results['experience_match_status']
414
- if "Meets" in exp_status or "Exceeds" in exp_status:
415
- st.metric(label="Experience Match", value=exp_status, delta="Good!")
416
- else:
417
- st.metric(label="Experience Match", value=exp_status, delta="Needs attention")
418
-
419
- st.markdown("---")
420
-
421
- st.subheader("πŸ“Š Visual Insights")
422
-
423
- # Overall Match Plot
424
- fig_overall = create_overall_match_plot(analysis_results['overall_match_score'])
425
- st.pyplot(fig_overall)
426
- plt.close(fig_overall) # Close figure to free memory
427
-
428
- # Skill Match Plot
429
- fig_skill = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
430
- if fig_skill:
431
- st.pyplot(fig_skill)
432
- plt.close(fig_skill)
433
- else:
434
- st.info("No specific skills identified in the job description for a detailed skill match breakdown.")
435
-
436
- # Top Keywords Plot
437
- fig_keywords = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])
438
- st.pyplot(fig_keywords)
439
- plt.close(fig_keywords)
440
-
441
- st.markdown("---")
442
- st.subheader("πŸ“ Detailed Breakdown")
443
-
444
- st.markdown("#### Skills Analysis")
445
- st.write(f"**βœ… Matched Skills:**", ", ".join(analysis_results['matched_skills']) if analysis_results['matched_skills'] else "None found matching job description.")
446
- st.write(f"**❌ Missing Skills (from Job Description):**", ", ".join(analysis_results['missing_skills']) if analysis_results['missing_skills'] else "πŸ₯³ None! Your CV has all specified skills.")
447
- st.write(f"**πŸ’‘ Extra Skills in CV (not in Job Description):**", ", ".join(analysis_results['extra_skills_in_cv']) if analysis_results['extra_skills_in_cv'] else "None. (This is often fine, showing broader capability.)")
448
-
449
- st.markdown("#### Keyword Relevance (Top TF-IDF Terms)")
450
- st.write(f"**🀝 Top Common Keywords:**", ", ".join(analysis_results['common_keywords']) if analysis_results['common_keywords'] else "No significant common keywords beyond skills.")
451
- st.write(f"**πŸ” Top Keywords in Your CV:**", ", ".join(analysis_results['top_cv_keywords']) if analysis_results['top_cv_keywords'] else "N/A")
452
- st.write(f"**🎯 Top Keywords in Job Description:**", ", ".join(analysis_results['top_jd_keywords']) if analysis_results['top_jd_keywords'] else "N/A")
453
-
454
-
455
- st.markdown("#### Experience & Education Comparison")
456
- st.write(f"**πŸ‘€ Your CV's Experience:** `{analysis_results['cv_years_experience']}` years")
457
- st.write(f"**πŸ’Ό Job's Required Experience:** `{analysis_results['jd_years_experience']}` years")
458
- st.info(f"**Status:** {analysis_results['experience_match_status']}")
459
-
460
- st.write(f"**πŸŽ“ Your CV's Education:** `{analysis_results['cv_education_level']}`")
461
- st.write(f"**πŸ“š Job's Required Education:** `{analysis_results['jd_education_level']}`")
462
- st.info(f"**Status:** {analysis_results['education_match_status']}")
463
-
464
-
465
- except Exception as e:
466
- st.error(f"An unexpected error occurred during analysis: {e}")
467
- st.exception(e) # Show full traceback in Streamlit debug logs
468
-
469
- st.markdown("---")
470
- st.markdown("Developed with ❀️ for Data Science by your mentor")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
  import os
3
  import io
4
  import re
5
  from docx import Document
6
+ from PyPDF2 import PdfReader
7
  import pandas as pd
8
  import spacy
9
  from collections import Counter
 
14
  import numpy as np
15
 
16
  # --- SpaCy Model Loading ---
17
+ # For Gradio on Hugging Face Spaces, the model is usually installed via requirements.txt
18
+ # so spacy.load() will find it.
19
+ try:
20
+ nlp = spacy.load("en_core_web_lg")
21
+ print("SpaCy model loaded successfully.")
22
+ except Exception as e:
23
+ print(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
24
+ # In a Gradio app, you might raise an error or display a message in the UI
25
+ # For now, let's just print to logs if it fails to load at startup.
26
+
27
+ # --- Global Predefined Skills ---
 
 
 
 
 
 
 
 
28
  predefined_skills_list = set([
29
  "python", "tensorflow", "pytorch", "scikit-learn", "numpy", "pandas",
30
  "docker", "kubernetes", "aws", "git", "sql", "java", "r", "tableau",
 
42
  ])
43
  predefined_skills_list.update([
44
  "machine learning engineer", "data scientist", "ai engineer", "deep learning engineer",
45
+ "senior machine learning engineer", "junior data scientist",
46
  "data engineer", "software engineer", "full stack", "frontend", "backend"
47
  ])
48
 
49
 
50
+ # --- Text Extraction Functions (Adapted for file paths in Gradio's File component) ---
51
+ # Gradio's gr.File component provides a file path to the temporary uploaded file.
52
 
53
def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file given its filesystem path.

    Args:
        pdf_path: Path to the PDF (Gradio's File component supplies a
            temporary-file path).

    Returns:
        str: Concatenated text of every page, or "" on any read/parse error.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            # extract_text() may return None for image-only pages; join()
            # avoids quadratic string concatenation on long documents.
            pages = [page.extract_text() or "" for page in reader.pages]
        return "".join(pages)
    except Exception as e:
        # Best-effort: log and return empty text rather than crashing the UI.
        print(f"Error reading PDF {pdf_path}: {e}")  # Will print to Gradio logs
        return ""
67
 
68
def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file given its path.

    Returns the document's paragraphs joined with newlines, or "" if the
    file cannot be opened or parsed.
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        return "\n".join(p.text for p in paragraphs)
    except Exception as e:
        # Surface the failure in the Gradio logs and degrade gracefully.
        print(f"Error reading DOCX {docx_path}: {e}")  # Will print to Gradio logs
        return ""
79
 
80
def get_file_content(file_obj):
    """Return the text content of a file uploaded via Gradio's File component.

    Args:
        file_obj: Object Gradio passes for a ``gr.File`` input (its ``.name``
            attribute holds the temp-file path), or None if nothing uploaded.

    Returns:
        str: Extracted text; "" for no file or an unsupported extension.
    """
    if file_obj is None:
        return ""
    file_path = file_obj.name  # Gradio file component gives path in .name attribute
    # Compare the extension case-insensitively so "CV.PDF" etc. are handled;
    # the original endswith('.pdf') checks silently rejected upper-case names.
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == '.pdf':
        return extract_text_from_pdf(file_path)
    elif suffix == '.docx':
        return extract_text_from_docx(file_path)
    elif suffix == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        return ""
94
 
95
+ # --- Text Preprocessing Functions (same as before) ---
96
def preprocess_text(text):
    """Normalise text for TF-IDF/embedding: lower-case, collapse whitespace,
    lemmatise, and drop stop-words, punctuation and whitespace tokens."""
    if not isinstance(text, str):
        return ""
    normalised = re.sub(r'\s+', ' ', text.lower()).strip()
    doc = nlp(normalised)
    kept = (
        tok.lemma_
        for tok in doc
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    )
    return " ".join(kept)
103
 
104
+ # --- Information Extraction (NER & Keyword Extraction) (same as before) ---
 
105
def extract_skills(text_doc, skill_keywords=None):
    """Find known skill phrases and group named entities in a spaCy Doc.

    Args:
        text_doc: spaCy ``Doc`` (only ``.text`` and ``.ents`` are used here).
        skill_keywords: Optional set of lower-case skill phrases to match
            as whole words.

    Returns:
        tuple: (deduplicated list of matched skills, dict mapping entity
        buckets — organizations/locations/dates/people — to text lists).
    """
    keywords = skill_keywords if skill_keywords is not None else set()
    haystack = text_doc.text.lower()
    found = {
        kw for kw in keywords
        if re.search(r'\b' + re.escape(kw) + r'\b', haystack)
    }

    # Map spaCy entity labels to the bucket names used downstream;
    # labels outside this map are ignored.
    buckets = {
        "ORG": "organizations",
        "GPE": "locations",
        "DATE": "dates",
        "PERSON": "people",
    }
    entities = {}
    for ent in text_doc.ents:
        bucket = buckets.get(ent.label_)
        if bucket is not None:
            entities.setdefault(bucket, []).append(ent.text)

    return list(found), entities
119
 
120
def extract_experience_and_education(text):
    """Heuristically pull years-of-experience and education level from text.

    Args:
        text: Raw CV or job-description text.

    Returns:
        tuple: (maximum years of experience found, education level string —
        one of "Ph.D.", "Master's", "Bachelor's", "Associate's",
        "Not Specified").
    """
    text_lower = text.lower()

    # Capture the digits in BOTH alternatives. The original pattern only
    # grouped the "N years of experience" form, so findall() returned ''
    # for "N yrs exp" matches and they were silently dropped.
    pattern = r'(\d+)\s*(?:\+|plus)?\s*years?\s+of\s+experience|(\d+)\s*yrs?\s+exp'
    years_found = [
        int(m.group(1) or m.group(2))
        for m in re.finditer(pattern, text_lower)
        if m.group(1) or m.group(2)
    ]
    years_experience = max(years_found) if years_found else 0

    # Highest qualification wins: check the most advanced degrees first.
    if "phd" in text_lower or "doctorate" in text_lower:
        education_level = "Ph.D."
    elif "master" in text_lower or "m.s." in text_lower or "msc" in text_lower:
        education_level = "Master's"
    elif "bachelor" in text_lower or "b.s." in text_lower or "bsc" in text_lower:
        education_level = "Bachelor's"
    elif "associate" in text_lower:
        education_level = "Associate's"
    else:
        education_level = "Not Specified"

    return years_experience, education_level
134
 
135
+ # --- Feature Engineering (same as before) ---
 
136
def get_text_embeddings(text):
    """Return the spaCy document vector for *text* (zeros for empty input)."""
    vector_dim = nlp.vocab.vectors.shape[1]
    if not text:
        return np.zeros(vector_dim)
    doc = nlp(text)
    if doc.has_vector:
        return doc.vector
    # Fallback: average the token vectors (unlikely to trigger with
    # en_core_web_lg, which has vectors for most tokens).
    token_vectors = [tok.vector for tok in doc if tok.has_vector]
    if token_vectors:
        return np.mean(token_vectors, axis=0)
    return np.zeros(vector_dim)
 
 
 
141
 
142
def calculate_cosine_similarity(vec1, vec2):
    """Cosine similarity of two 1-D vectors; 0.0 when either is all-zero."""
    # Zero vectors have no direction — short-circuit to avoid a division
    # by zero inside cosine_similarity.
    if not np.any(vec1) or not np.any(vec2):
        return 0.0
    return cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
147
 
148
+ # --- Main Processing Pipeline for a Document (CV or Job Description) (same as before) ---
 
149
def analyze_document(doc_text):
    """Run the full NLP pipeline over one document (CV or job description).

    Returns a dict carrying the raw and cleaned text, the spaCy Doc,
    extracted skills/entities, experience and education heuristics, and
    an embedding vector for similarity scoring.
    """
    spacy_doc = nlp(doc_text)
    cleaned = preprocess_text(doc_text)
    skills, entities = extract_skills(spacy_doc, skill_keywords=predefined_skills_list)
    experience_years, education = extract_experience_and_education(doc_text)
    return {
        "raw_text": doc_text,
        "cleaned_text": cleaned,
        "spacy_doc": spacy_doc,
        "extracted_skills": skills,
        "general_entities": entities,
        "years_experience": experience_years,
        "education_level": education,
        "text_embedding": get_text_embeddings(cleaned),
    }
161
 
162
+ # --- Matching and Scoring Logic (same as before) ---
 
163
def calculate_match_scores(cv_data, jd_data):
    """Compare an analysed CV against an analysed job description.

    Args:
        cv_data: dict produced by ``analyze_document`` for the CV.
        jd_data: dict produced by ``analyze_document`` for the job posting.

    Returns:
        dict: overall/skill percentages, skill overlaps, top TF-IDF
        keywords, and experience/education comparison statuses.
    """
    results = {}

    # 1. Semantic similarity of the two embeddings, as a percentage.
    semantic = calculate_cosine_similarity(
        cv_data["text_embedding"], jd_data["text_embedding"]
    )
    results["overall_match_score"] = round(semantic * 100, 2)

    # 2. Skill-set overlap.
    cv_skills = set(cv_data["extracted_skills"])
    jd_skills = set(jd_data["extracted_skills"])
    results["matched_skills"] = list(cv_skills & jd_skills)
    results["missing_skills"] = list(jd_skills - cv_skills)
    results["extra_skills_in_cv"] = list(cv_skills - jd_skills)
    pct = (len(results["matched_skills"]) / len(jd_skills) * 100) if jd_skills else 0.0
    results["skill_match_percentage"] = round(pct, 2)

    # 3. Top TF-IDF keywords of each document and their intersection.
    corpus = [cv_data["cleaned_text"], jd_data["cleaned_text"]]
    vectorizer = TfidfVectorizer(max_features=100)
    matrix = vectorizer.fit_transform(corpus)
    names = vectorizer.get_feature_names_out()

    def _top_terms(row):
        # Rank this row's non-zero TF-IDF terms, keep the 15 strongest.
        scores = {names[i]: matrix[row, i] for i in matrix[row].nonzero()[1]}
        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:15]
        return [term for term, _ in ranked]

    results["top_cv_keywords"] = _top_terms(0)
    results["top_jd_keywords"] = _top_terms(1)
    results["common_keywords"] = list(
        set(results["top_cv_keywords"]) & set(results["top_jd_keywords"])
    )

    # 4. Years-of-experience comparison.
    cv_years = cv_data["years_experience"]
    jd_years = jd_data["years_experience"]
    results["cv_years_experience"] = cv_years
    results["jd_years_experience"] = jd_years
    exp_status = "Not specified by Job"
    if jd_years > 0:
        if cv_years >= jd_years:
            exp_status = "Meets or Exceeds Requirement"
        else:
            exp_status = f"Below Requirement (Needs {jd_years - cv_years} more years)"
    results["experience_match_status"] = exp_status

    # 5. Education comparison via a simple ordinal ranking.
    cv_edu = cv_data["education_level"]
    jd_edu = jd_data["education_level"]
    results["cv_education_level"] = cv_edu
    results["jd_education_level"] = jd_edu
    edu_status = "Not Specified by Job"
    if jd_edu != "Not Specified":
        rank = {"Associate's": 1, "Bachelor's": 2, "Master's": 3, "Ph.D.": 4}
        if rank.get(cv_edu, 0) >= rank.get(jd_edu, 0):
            edu_status = "Meets or Exceeds Requirement"
        else:
            edu_status = "Below Requirement"
    results["education_match_status"] = edu_status

    return results
210
 
211
+ # --- Overall Analysis Orchestrator (same as before) ---
212
def perform_cv_job_analysis(cv_text, job_desc_text):
    """Analyse both raw texts and return their match-score report."""
    cv_profile = analyze_document(cv_text)
    jd_profile = analyze_document(job_desc_text)
    return calculate_match_scores(cv_profile, jd_profile)
217
 
218
+ # --- Visualization Functions (Returns figure object) ---
 
 
 
219
  def create_overall_match_plot(score):
 
220
  fig, ax = plt.subplots(figsize=(6, 2))
221
  sns.set_style("whitegrid")
222
  ax.barh(["Overall Match"], [score], color='skyblue')
 
229
  return fig
230
 
231
def create_skill_match_plot(matched_skills, missing_skills):
    """Pie chart of matched vs. missing skills; None when both are empty."""
    counts = [len(matched_skills), len(missing_skills)]
    if sum(counts) == 0:
        return None  # nothing to plot
    # Only separate the wedges when both slices are actually present.
    both_present = counts[0] > 0 and counts[1] > 0
    fig, ax = plt.subplots(figsize=(7, 7))
    ax.pie(
        counts,
        explode=(0.05, 0.05) if both_present else (0, 0),
        labels=['Matched Skills', 'Missing Skills'],
        colors=['#66b3ff', '#ff9999'],
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        textprops={'fontsize': 12},
    )
    ax.axis('equal')
    ax.set_title("Skill Match Breakdown", fontsize=14)
    plt.tight_layout()
    return fig
243
 
244
def create_top_keywords_plot(cv_keywords, jd_keywords):
    """Side-by-side bar charts of the ten most frequent CV and JD keywords."""
    fig, (cv_ax, jd_ax) = plt.subplots(1, 2, figsize=(16, 6))
    sns.set_style("whitegrid")

    panels = [
        (cv_keywords, cv_ax, 'viridis', 'Top Keywords in CV'),
        (jd_keywords, jd_ax, 'plasma', 'Top Keywords in Job Description'),
    ]
    for keywords, ax, palette, title in panels:
        frame = pd.DataFrame(Counter(keywords).most_common(10),
                             columns=['Keyword', 'Count'])
        # An empty keyword list leaves its panel blank rather than erroring.
        if not frame.empty:
            sns.barplot(x='Count', y='Keyword', data=frame, ax=ax, palette=palette)
            ax.set_title(title, fontsize=14)
            ax.set_xlabel('Frequency/Importance', fontsize=12)
            ax.set_ylabel('')

    plt.tight_layout()
    return fig
261
 
262
+
263
+ # --- Main Gradio Interface Function ---
264
def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
    """
    Main Gradio handler: compare a CV against a job description.

    Args:
        cv_file_obj: Uploaded CV file object (PDF/DOCX/TXT) from gr.File, or None.
        cv_text_input: CV text pasted by the user; used only when no file is uploaded.
        jd_text_input: Job description text pasted by the user.

    Returns:
        A 4-tuple matching the interface's four output components, in order:
        (HTML report string, overall-match figure, skill-match figure,
        top-keywords figure). The three figures are None when input validation
        fails or the analysis raises.

    BUG FIX: the original returned 5 values (a trailing status string such as
    "Analysis Complete!"), but the gr.Interface below declares only 4 output
    components, so Gradio raised a return-count mismatch at runtime. All return
    paths now yield exactly 4 values.
    """
    cv_content = ""
    # Prioritize file upload over text area if both are provided
    if cv_file_obj is not None:
        cv_content = get_file_content(cv_file_obj)
    elif cv_text_input:
        cv_content = cv_text_input

    # Input validation: both a CV and a job description are required.
    if not cv_content:
        return "<h4><p style='color:red;'>🚨 Error: Please upload a CV file or paste your CV text.</p></h4>", None, None, None
    if not jd_text_input:
        return "<h4><p style='color:red;'>🚨 Error: Please paste the Job Description text.</p></h4>", None, None, None

    try:
        analysis_results = perform_cv_job_analysis(cv_content, jd_text_input)

        # Generate HTML output for KPIs and detailed breakdown
        html_output = f"""
        <h2>πŸ’‘ Analysis Results Summary πŸ’‘</h2>
        <div style='display: flex; justify-content: space-around; flex-wrap: wrap; text-align: center;'>
            <div style='background-color: #e0f7fa; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px;'>
                <h3>Overall Match Score</h3>
                <h1 style='color: #007bb6;'>{analysis_results['overall_match_score']}%</h1>
            </div>
            <div style='background-color: #e8f5e9; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px;'>
                <h3>Skill Match</h3>
                <h1 style='color: #43a047;'>{analysis_results['skill_match_percentage']}%</h1>
            </div>
            <div style='background-color: #fff3e0; padding: 15px; border-radius: 8px; margin: 5px; min-width: 200px;'>
                <h3>Experience Match</h3>
                <h1 style='color: #fb8c00;'>{analysis_results['experience_match_status']}</h1>
            </div>
        </div>
        <hr/>
        <h2>πŸ“ Detailed Breakdown</h2>
        <h4>Skills Analysis</h4>
        <p><strong>βœ… Matched Skills:</strong> {', '.join(analysis_results['matched_skills']) if analysis_results['matched_skills'] else 'None found matching job description.'}</p>
        <p><strong>❌ Missing Skills (from Job Description):</strong> {', '.join(analysis_results['missing_skills']) if analysis_results['missing_skills'] else 'πŸ₯³ None! Your CV has all specified skills.'}</p>
        <p><strong>πŸ’‘ Extra Skills in CV (not in Job Description):</strong> {', '.join(analysis_results['extra_skills_in_cv']) if analysis_results['extra_skills_in_cv'] else 'None. (This is often fine, showing broader capability.)'}</p>

        <h4>Keyword Relevance (Top TF-IDF Terms)</h4>
        <p><strong>🀝 Top Common Keywords:</strong> {', '.join(analysis_results['common_keywords']) if analysis_results['common_keywords'] else 'No significant common keywords beyond skills.'}</p>
        <p><strong>πŸ” Top Keywords in Your CV:</strong> {', '.join(analysis_results['top_cv_keywords']) if analysis_results['top_cv_keywords'] else 'N/A'}</p>
        <p><strong>🎯 Top Keywords in Job Description:</strong> {', '.join(analysis_results['top_jd_keywords']) if analysis_results['top_jd_keywords'] else 'N/A'}</p>

        <h4>Experience & Education Comparison</h4>
        <p><strong>πŸ‘€ Your CV's Experience:</strong> <code>{analysis_results['cv_years_experience']}</code> years</p>
        <p><strong>πŸ’Ό Job's Required Experience:</strong> <code>{analysis_results['jd_years_experience']}</code> years</p>
        <p style='color:green;'><strong>Status:</strong> {analysis_results['experience_match_status']}</p>

        <p><strong>πŸŽ“ Your CV's Education:</strong> <code>{analysis_results['cv_education_level']}</code></p>
        <p><strong>πŸ“š Job's Required Education:</strong> <code>{analysis_results['jd_education_level']}</code></p>
        <p style='color:green;'><strong>Status:</strong> {analysis_results['education_match_status']}</p>
        """

        # Generate the three figures consumed by the gr.Plot components.
        overall_plot = create_overall_match_plot(analysis_results['overall_match_score'])
        skill_plot = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
        keywords_plot = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])

        return html_output, overall_plot, skill_plot, keywords_plot

    except Exception as e:
        # Surface the full traceback to the user inside a collapsible section
        # so Space users can report actionable bug details.
        import traceback
        error_traceback = traceback.format_exc()
        return (f"<h4><p style='color:red;'>An unexpected error occurred during analysis: {e}</p></h4>"
                f"<details><summary>Click for details</summary><pre>{error_traceback}</pre></details>"), None, None, None
336
+
337
+
338
# --- Gradio Interface Definition ---

# Input widgets: an optional CV upload, a paste-CV fallback, and the job description.
cv_upload = gr.File(label="1. Upload Your CV (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
cv_paste = gr.Textbox(label="Or paste your CV text here", lines=10, placeholder="Paste your CV content here...")
jd_paste = gr.Textbox(label="2. Paste the Job Description text here", lines=10, placeholder="Paste the job description content here...")

# Output widgets: the HTML report followed by the three analysis charts.
report_html = gr.HTML(label="Analysis Report")
overall_chart = gr.Plot(label="Overall Match Score")
skill_chart = gr.Plot(label="Skill Match Breakdown")
keyword_chart = gr.Plot(label="Top Keywords")

# Build and start the app.
demo = gr.Interface(
    fn=analyze_cv_match,
    inputs=[cv_upload, cv_paste, jd_paste],
    outputs=[report_html, overall_chart, skill_chart, keyword_chart],
    title="πŸ‘¨β€πŸ’Ό CV-Job Match Analyzer πŸ“ˆ",
    description="Upload your CV and paste a job description to get an instant compatibility analysis with charts and key insights. "
                "Developed by your mentor (A.I.).",
    allow_flagging="never",  # disable the flagging feature
    theme=gr.themes.Soft(),  # a nice, modern theme
)
demo.launch()