Bur3hani committed on
Commit
489b15d
·
verified ·
1 Parent(s): 86f4072

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -45
app.py CHANGED
@@ -14,15 +14,11 @@ import seaborn as sns
14
  import numpy as np
15
 
16
  # --- SpaCy Model Loading ---
17
- # For Gradio on Hugging Face Spaces, the model is usually installed via requirements.txt
18
- # so spacy.load() will find it.
19
  try:
20
  nlp = spacy.load("en_core_web_lg")
21
  print("SpaCy model loaded successfully.")
22
  except Exception as e:
23
  print(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
24
- # In a Gradio app, you might raise an error or display a message in the UI
25
- # For now, let's just print to logs if it fails to load at startup.
26
 
27
  # --- Global Predefined Skills ---
28
  predefined_skills_list = set([
@@ -46,12 +42,8 @@ predefined_skills_list.update([
46
  "data engineer", "software engineer", "full stack", "frontend", "backend"
47
  ])
48
 
49
- # --- Text Extraction Functions (Adapted for file paths in Gradio's File component) ---
50
- # Gradio's gr.File component provides a file path to the temporary uploaded file.
51
  def extract_text_from_pdf(pdf_path):
52
- """
53
- Extracts text from a PDF file given its path.
54
- """
55
  try:
56
  with open(pdf_path, 'rb') as file:
57
  reader = PdfReader(file)
@@ -60,26 +52,22 @@ def extract_text_from_pdf(pdf_path):
60
  text += page.extract_text() or ""
61
  return text
62
  except Exception as e:
63
- print(f"Error reading PDF {pdf_path}: {e}") # Will print to Gradio logs
64
  return ""
65
 
66
  def extract_text_from_docx(docx_path):
67
- """
68
- Extracts text from a DOCX file given its path.
69
- """
70
  try:
71
  document = Document(docx_path)
72
  text = "\n".join([paragraph.text for paragraph in document.paragraphs])
73
  return text
74
  except Exception as e:
75
- print(f"Error reading DOCX {docx_path}: {e}") # Will print to Gradio logs
76
  return ""
77
 
78
  def get_file_content(file_obj):
79
- """Helper to get content from Gradio's file component."""
80
  if file_obj is None:
81
  return ""
82
- file_path = file_obj.name # Gradio file component gives path in .name attribute
83
  if file_path.endswith('.pdf'):
84
  return extract_text_from_pdf(file_path)
85
  elif file_path.endswith('.docx'):
@@ -90,7 +78,7 @@ def get_file_content(file_obj):
90
  else:
91
  return ""
92
 
93
- # --- Text Preprocessing Functions (same as before) ---
94
  def preprocess_text(text):
95
  if not isinstance(text, str): return ""
96
  text = text.lower()
@@ -99,7 +87,7 @@ def preprocess_text(text):
99
  processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
100
  return " ".join(processed_tokens)
101
 
102
- # --- Information Extraction (NER & Keyword Extraction) (same as before) ---
103
  def extract_skills(text_doc, skill_keywords=None):
104
  extracted_skills = []
105
  if skill_keywords is None: skill_keywords = set()
@@ -130,7 +118,7 @@ def extract_experience_and_education(text):
130
  elif "associate" in text_lower: education_level = "Associate's"
131
  return years_experience, education_level
132
 
133
- # --- Feature Engineering (same as before) ---
134
  def get_text_embeddings(text):
135
  if not text: return np.zeros(nlp.vocab.vectors.shape[1])
136
  doc = nlp(text)
@@ -143,7 +131,7 @@ def calculate_cosine_similarity(vec1, vec2):
143
  vec2 = vec2.reshape(1, -1)
144
  return cosine_similarity(vec1, vec2)[0][0]
145
 
146
- # --- Main Processing Pipeline for a Document (CV or Job Description) (same as before) ---
147
  def analyze_document(doc_text):
148
  doc_spacy = nlp(doc_text)
149
  cleaned_text = preprocess_text(doc_text)
@@ -157,7 +145,7 @@ def analyze_document(doc_text):
157
  "text_embedding": text_embedding
158
  }
159
 
160
- # --- Matching and Scoring Logic (same as before) ---
161
  def calculate_match_scores(cv_data, jd_data):
162
  results = {}
163
  overall_similarity = calculate_cosine_similarity(cv_data["text_embedding"], jd_data["text_embedding"])
@@ -206,14 +194,14 @@ def calculate_match_scores(cv_data, jd_data):
206
  results["education_match_status"] = edu_match_status
207
  return results
208
 
209
- # --- Overall Analysis Orchestrator (same as before) ---
210
  def perform_cv_job_analysis(cv_text, job_desc_text):
211
  cv_analysis_data = analyze_document(cv_text)
212
  job_desc_analysis_data = analyze_document(job_desc_text)
213
  match_results = calculate_match_scores(cv_analysis_data, job_desc_analysis_data)
214
  return match_results
215
 
216
- # --- Visualization Functions (Returns figure object) ---
217
  def create_overall_match_plot(score):
218
  fig, ax = plt.subplots(figsize=(6, 2))
219
  sns.set_style("whitegrid")
@@ -259,12 +247,7 @@ def create_top_keywords_plot(cv_keywords, jd_keywords):
259
 
260
  # --- Main Gradio Interface Function ---
261
  def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
262
- """
263
- This function will be called by Gradio's Interface.
264
- It takes Gradio inputs and returns Gradio outputs (HTML, plots).
265
- """
266
  cv_content = ""
267
- # Prioritize file upload over text area if both are provided
268
  if cv_file_obj is not None:
269
  cv_content = get_file_content(cv_file_obj)
270
  elif cv_text_input:
@@ -277,7 +260,6 @@ def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
277
  None, None, None, "Analysis Failed")
278
  try:
279
  analysis_results = perform_cv_job_analysis(cv_content, jd_text_input)
280
- # Generate HTML output for KPIs and detailed breakdown
281
  html_output = f"""
282
  <h2 style='text-align: center;'>πŸ’‘ Analysis Results Summary πŸ’‘</h2>
283
  <div style='display: flex; justify-content: space-around; flex-wrap: wrap; text-align: center; margin-bottom: 20px;'>
@@ -312,23 +294,23 @@ def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
312
  <p><strong>πŸ“š Job's Required Education:</strong> <code>{analysis_results['jd_education_level']}</code></p>
313
  <p style='color:green;'><strong>Status:</strong> {analysis_results['education_match_status']}</p>
314
  """
315
- # Generate plots
316
  overall_plot = create_overall_match_plot(analysis_results['overall_match_score'])
317
  skill_plot = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
318
  keywords_plot = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])
319
- # Return all outputs in the correct order
320
  return html_output, overall_plot, skill_plot, keywords_plot, "Analysis Complete!"
321
  except Exception as e:
322
  import traceback
323
  error_traceback = traceback.format_exc()
324
- # Return empty plots/HTML on error
325
  return (f"<h4><p style='color:red;'>An unexpected error occurred during analysis: {e}</p></h4>"
326
  f"<details><summary>Click for details</summary><pre>{error_traceback}</pre></details>",
327
  None, None, None, "Analysis Failed")
328
 
329
  # --- Gradio Interface Definition ---
330
- # Use gr.Blocks for more flexibility in layout
331
  with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:
 
 
 
 
332
  gr.Markdown(
333
  """
334
  # πŸ‘¨β€πŸ’Ό CV-Job Match Analyzer πŸ“ˆ
@@ -336,29 +318,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:
336
  Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
337
  """
338
  )
339
- with gr.Row(): # Arrange inputs and outputs in two columns
340
- with gr.Column(scale=1, min_width=400): # Left column for inputs
341
- gr.Markdown("## **1. Your CV**") # Section title for CV input
342
  cv_file_obj = gr.File(label="Upload CV (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
343
  cv_text_input = gr.Textbox(label="Or paste CV text here (overrides file upload)", lines=10, placeholder="Paste your CV content here...")
344
- gr.Markdown("## **2. Job Description**") # Section title for JD input
345
  jd_text_input = gr.Textbox(label="Paste the Job Description text here", lines=10, placeholder="Paste the job description content here...")
346
- with gr.Row(): # Buttons in a row
347
  analyze_button = gr.Button("✨ Analyze CV Match ✨", variant="primary", scale=1)
348
  clear_button = gr.ClearButton([cv_file_obj, cv_text_input, jd_text_input], scale=1)
349
- with gr.Column(scale=2, min_width=600): # Right column for outputs (plots and HTML report)
350
- output_html = gr.HTML(label="Analysis Report") # This will show the text-based KPIs and detailed breakdown
351
- gr.Markdown("## **πŸ“Š Visual Insights**") # Title for the plots section
352
- output_overall_plot = gr.Plot(label="Overall Match Score") # Plots will appear here
353
  output_skill_plot = gr.Plot(label="Skill Match Breakdown")
354
  output_keywords_plot = gr.Plot(label="Top Keywords")
355
- # Define the action when the button is clicked
356
  analyze_button.click(
357
  fn=analyze_cv_match,
358
  inputs=[cv_file_obj, cv_text_input, jd_text_input],
359
  outputs=[output_html, output_overall_plot, output_skill_plot, output_keywords_plot, gr.State(value="")],
360
- # The last output for gr.State is a dummy. It needs to match the number of return values from analyze_cv_match.
361
- # Gradio uses the last return value for the status bar if it's a string, or you can explicitly link it.
362
  )
363
 
364
  demo.launch()
 
14
  import numpy as np
15
 
16
  # --- SpaCy Model Loading ---
 
 
17
  try:
18
  nlp = spacy.load("en_core_web_lg")
19
  print("SpaCy model loaded successfully.")
20
  except Exception as e:
21
  print(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
 
 
22
 
23
  # --- Global Predefined Skills ---
24
  predefined_skills_list = set([
 
42
  "data engineer", "software engineer", "full stack", "frontend", "backend"
43
  ])
44
 
45
+ # --- Text Extraction Functions ---
 
46
  def extract_text_from_pdf(pdf_path):
 
 
 
47
  try:
48
  with open(pdf_path, 'rb') as file:
49
  reader = PdfReader(file)
 
52
  text += page.extract_text() or ""
53
  return text
54
  except Exception as e:
55
+ print(f"Error reading PDF {pdf_path}: {e}")
56
  return ""
57
 
58
  def extract_text_from_docx(docx_path):
 
 
 
59
  try:
60
  document = Document(docx_path)
61
  text = "\n".join([paragraph.text for paragraph in document.paragraphs])
62
  return text
63
  except Exception as e:
64
+ print(f"Error reading DOCX {docx_path}: {e}")
65
  return ""
66
 
67
  def get_file_content(file_obj):
 
68
  if file_obj is None:
69
  return ""
70
+ file_path = file_obj.name
71
  if file_path.endswith('.pdf'):
72
  return extract_text_from_pdf(file_path)
73
  elif file_path.endswith('.docx'):
 
78
  else:
79
  return ""
80
 
81
+ # --- Text Preprocessing Functions ---
82
  def preprocess_text(text):
83
  if not isinstance(text, str): return ""
84
  text = text.lower()
 
87
  processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
88
  return " ".join(processed_tokens)
89
 
90
+ # --- Information Extraction ---
91
  def extract_skills(text_doc, skill_keywords=None):
92
  extracted_skills = []
93
  if skill_keywords is None: skill_keywords = set()
 
118
  elif "associate" in text_lower: education_level = "Associate's"
119
  return years_experience, education_level
120
 
121
+ # --- Feature Engineering ---
122
  def get_text_embeddings(text):
123
  if not text: return np.zeros(nlp.vocab.vectors.shape[1])
124
  doc = nlp(text)
 
131
  vec2 = vec2.reshape(1, -1)
132
  return cosine_similarity(vec1, vec2)[0][0]
133
 
134
+ # --- Main Processing Pipeline ---
135
  def analyze_document(doc_text):
136
  doc_spacy = nlp(doc_text)
137
  cleaned_text = preprocess_text(doc_text)
 
145
  "text_embedding": text_embedding
146
  }
147
 
148
+ # --- Matching and Scoring Logic ---
149
  def calculate_match_scores(cv_data, jd_data):
150
  results = {}
151
  overall_similarity = calculate_cosine_similarity(cv_data["text_embedding"], jd_data["text_embedding"])
 
194
  results["education_match_status"] = edu_match_status
195
  return results
196
 
197
+ # --- Overall Analysis Orchestrator ---
198
  def perform_cv_job_analysis(cv_text, job_desc_text):
199
  cv_analysis_data = analyze_document(cv_text)
200
  job_desc_analysis_data = analyze_document(job_desc_text)
201
  match_results = calculate_match_scores(cv_analysis_data, job_desc_analysis_data)
202
  return match_results
203
 
204
+ # --- Visualization Functions ---
205
  def create_overall_match_plot(score):
206
  fig, ax = plt.subplots(figsize=(6, 2))
207
  sns.set_style("whitegrid")
 
247
 
248
  # --- Main Gradio Interface Function ---
249
  def analyze_cv_match(cv_file_obj, cv_text_input, jd_text_input):
 
 
 
 
250
  cv_content = ""
 
251
  if cv_file_obj is not None:
252
  cv_content = get_file_content(cv_file_obj)
253
  elif cv_text_input:
 
260
  None, None, None, "Analysis Failed")
261
  try:
262
  analysis_results = perform_cv_job_analysis(cv_content, jd_text_input)
 
263
  html_output = f"""
264
  <h2 style='text-align: center;'>πŸ’‘ Analysis Results Summary πŸ’‘</h2>
265
  <div style='display: flex; justify-content: space-around; flex-wrap: wrap; text-align: center; margin-bottom: 20px;'>
 
294
  <p><strong>πŸ“š Job's Required Education:</strong> <code>{analysis_results['jd_education_level']}</code></p>
295
  <p style='color:green;'><strong>Status:</strong> {analysis_results['education_match_status']}</p>
296
  """
 
297
  overall_plot = create_overall_match_plot(analysis_results['overall_match_score'])
298
  skill_plot = create_skill_match_plot(analysis_results['matched_skills'], analysis_results['missing_skills'])
299
  keywords_plot = create_top_keywords_plot(analysis_results['top_cv_keywords'], analysis_results['top_jd_keywords'])
 
300
  return html_output, overall_plot, skill_plot, keywords_plot, "Analysis Complete!"
301
  except Exception as e:
302
  import traceback
303
  error_traceback = traceback.format_exc()
 
304
  return (f"<h4><p style='color:red;'>An unexpected error occurred during analysis: {e}</p></h4>"
305
  f"<details><summary>Click for details</summary><pre>{error_traceback}</pre></details>",
306
  None, None, None, "Analysis Failed")
307
 
308
  # --- Gradio Interface Definition ---
 
309
  with gr.Blocks(theme=gr.themes.Soft(), title="CV-Job Match Analyzer") as demo:
310
+
311
+ # THIS IS THE NEW LINE TO ADD EMPTY SPACE AT THE TOP
312
+ gr.Markdown("<br>" * 5)
313
+
314
  gr.Markdown(
315
  """
316
  # πŸ‘¨β€πŸ’Ό CV-Job Match Analyzer πŸ“ˆ
 
318
  Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
319
  """
320
  )
321
+ with gr.Row():
322
+ with gr.Column(scale=1, min_width=400):
323
+ gr.Markdown("## **1. Your CV**")
324
  cv_file_obj = gr.File(label="Upload CV (PDF, DOCX, TXT)", file_types=[".pdf", ".docx", ".txt"])
325
  cv_text_input = gr.Textbox(label="Or paste CV text here (overrides file upload)", lines=10, placeholder="Paste your CV content here...")
326
+ gr.Markdown("## **2. Job Description**")
327
  jd_text_input = gr.Textbox(label="Paste the Job Description text here", lines=10, placeholder="Paste the job description content here...")
328
+ with gr.Row():
329
  analyze_button = gr.Button("✨ Analyze CV Match ✨", variant="primary", scale=1)
330
  clear_button = gr.ClearButton([cv_file_obj, cv_text_input, jd_text_input], scale=1)
331
+ with gr.Column(scale=2, min_width=600):
332
+ output_html = gr.HTML(label="Analysis Report")
333
+ gr.Markdown("## **πŸ“Š Visual Insights**")
334
+ output_overall_plot = gr.Plot(label="Overall Match Score")
335
  output_skill_plot = gr.Plot(label="Skill Match Breakdown")
336
  output_keywords_plot = gr.Plot(label="Top Keywords")
337
+
338
  analyze_button.click(
339
  fn=analyze_cv_match,
340
  inputs=[cv_file_obj, cv_text_input, jd_text_input],
341
  outputs=[output_html, output_overall_plot, output_skill_plot, output_keywords_plot, gr.State(value="")],
 
 
342
  )
343
 
344
  demo.launch()