Bur3hani committed on
Commit
cfc4901
·
verified ·
1 Parent(s): 214f5a5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +372 -0
app.py ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import io
4
+ import re
5
+ from docx import Document
6
+ from PyPDF2 import PdfReader # PyPDF2 is used. For more robust PDF parsing, consider 'PyMuPDF' (fitz)
7
+ import pandas as pd
8
+ import spacy
9
+ from collections import Counter
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ import matplotlib.pyplot as plt
13
+ import seaborn as sns
14
+ import numpy as np
15
+
16
+ # --- SpaCy Model Loading ---
17
+ # Use st.cache_resource to load the model only once and reuse it across sessions.
18
@st.cache_resource
def load_spacy_model():
    """
    Loads and returns the English spaCy pipeline "en_core_web_lg".

    Wrapped in st.cache_resource so the loaded pipeline is reused instead of
    being loaded again on every call. If loading fails for any reason, the
    error is shown in the Streamlit page and the app is stopped.
    """
    try:
        return spacy.load("en_core_web_lg")
    except Exception as e:
        # Nothing else in the app can work without the pipeline, so surface
        # the problem to the user and halt the script.
        st.error(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
        st.stop() # Stop the app if model fails to load
30
+
31
# Module-level spaCy pipeline; the NLP helper functions in this file read this
# global instead of receiving the pipeline as an argument.
nlp = load_spacy_model()
print("SpaCy model loaded successfully.") # This will appear in your Space logs
33
+
34
+
35
+ # --- Global Predefined Skills (could be loaded from a file for larger lists) ---
36
# Lower-case skill keywords and common job titles searched for in CV and job
# description text. Despite the name, this is a set.
predefined_skills_list = {
    # Tools, libraries and platforms
    "python", "tensorflow", "pytorch", "scikit-learn", "numpy", "pandas",
    "docker", "kubernetes", "aws", "git", "sql", "java", "r", "tableau",
    "jupyter", "vscode", "bert", "spacy", "nltk", "opencv", "cnns",
    "mlops", "agile", "feature engineering", "model deployment",
    "machine learning", "deep learning", "nlp", "computer vision",
    "data analysis", "predictive modeling", "fraud detection",
    "recommendation system", "sentiment analysis", "ab testing",
    "xgboost", "spark", "hadoop", "azure", "gcp",
    # Broad fields and soft skills
    "ai", "artificial intelligence", "data science", "big data",
    "software development", "web development", "mobile development",
    "databases", "cloud computing", "networking", "cybersecurity",
    "project management", "communication", "teamwork", "leadership",
    "problem solving", "critical thinking", "creativity",
    # Common job titles
    "machine learning engineer", "data scientist", "ai engineer", "deep learning engineer",
    "senior machine learning engineer", "junior data scientist",
    "data engineer", "software engineer", "full stack", "frontend", "backend",
}
56
+
57
+
58
+ # --- Text Extraction Functions (Adjusted for Streamlit file_object) ---
59
+
60
def extract_text_from_pdf(file_object):
    """
    Reads a PDF file-like object with PyPDF2 and returns its text.

    The text of all pages is concatenated with no separator. Pages for which
    PyPDF2 returns no text contribute an empty string. On any error the
    message is shown via st.error and an empty string is returned.
    """
    try:
        pdf_pages = PdfReader(file_object).pages
        return "".join(page.extract_text() or "" for page in pdf_pages)
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""
73
+
74
def extract_text_from_docx(file_object):
    """
    Reads a DOCX file-like object with python-docx and returns its text.

    Only the document's paragraphs are read; their texts are joined with
    newlines. On any error the message is shown via st.error and an empty
    string is returned.
    """
    try:
        paragraphs = Document(file_object).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as e:
        st.error(f"Error reading DOCX: {e}")
        return ""
85
+
86
+ # --- Text Preprocessing Functions ---
87
+
88
def preprocess_text(text):
    """
    Normalizes text for the keyword and embedding steps.

    The text is lower-cased, runs of whitespace are collapsed to single
    spaces, and the result is passed through the module-level spaCy
    pipeline. Stop words, punctuation and whitespace tokens are dropped and
    the remaining tokens are replaced by their lemmas, joined with spaces.
    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    normalized = re.sub(r'\s+', ' ', text.lower()).strip()

    lemmas = []
    for token in nlp(normalized):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)
101
+
102
+ # --- Information Extraction (NER & Keyword Extraction) ---
103
+
104
def extract_skills(text_doc, skill_keywords=None):
    """
    Extracts known skills and general named entities from a parsed document.

    Skills are found by a whole-word search of each keyword in the
    lower-cased document text. spaCy's NER output is used only to collect
    the general entities, not the skills.

    Args:
        text_doc (spacy.tokens.Doc): spaCy Doc object of the text. Only its
            ``text`` and ``ents`` attributes are read.
        skill_keywords (set): An optional set of predefined skill keywords.
            Keywords are compared against lower-cased text, so they are
            expected to be lower-case. When None, no skills are searched.
    Returns:
        tuple: ``(skills, entities)`` where ``skills`` is an alphabetically
        sorted list of the keywords found, and ``entities`` is a dict that
        may contain the keys "organizations", "locations", "dates" and
        "people", each mapping to the list of entity texts found in
        document order. Keys with no entities are absent.
    """
    # spaCy entity label -> key used in the returned entities dict.
    label_to_bucket = {
        "ORG": "organizations",
        "GPE": "locations",
        "DATE": "dates",
        "PERSON": "people",
    }

    doc_text = text_doc.text.lower()

    # Lookarounds instead of \b: they behave the same for keywords that start
    # and end with a word character, but also work for keywords ending in
    # punctuation such as "c++" or "c#", where \b can never match.
    # Sorting makes the result independent of set iteration order.
    found_skills = sorted({
        skill
        for skill in (skill_keywords or ())
        if re.search(r'(?<!\w)' + re.escape(skill) + r'(?!\w)', doc_text)
    })

    entities = {}
    for ent in text_doc.ents:
        bucket = label_to_bucket.get(ent.label_)
        if bucket is not None:
            entities.setdefault(bucket, []).append(ent.text)

    return found_skills, entities
134
+
135
def extract_experience_and_education(text):
    """
    Attempts to extract experience years and education level using regex and simple rules.
    This is a simplified approach and can be complex for diverse CVs.

    Args:
        text (str): Raw document text.
    Returns:
        tuple: ``(years_experience, education_level)``. ``years_experience``
        is the largest number found in phrases such as
        "5 years of experience", "5+ years of experience" or "5 yrs exp"
        (0 when none is found). ``education_level`` is one of "Ph.D.",
        "Master's", "Bachelor's", "Associate's" or "Not Specified"; the
        highest level mentioned anywhere in the text wins.
    """
    text_lower = text.lower()

    # A single capture group is shared by both phrasings so that every match
    # yields its number. With the group inside only one alternative,
    # re.findall returns "" for the other one and e.g. "5 yrs exp" is lost.
    year_counts = [
        int(number)
        for number in re.findall(
            r'(\d+)\s*(?:\+|plus)?\s*(?:years?\s+of\s+experience|yrs?\s+exp)',
            text_lower,
        )
    ]
    years_experience = max(year_counts, default=0)

    # Ordered from highest to lowest level; the first level with any marker
    # present is reported. Matching is by plain substring.
    education_markers = (
        ("Ph.D.", ("phd", "ph.d", "doctorate")),
        ("Master's", ("master", "m.s.", "msc")),
        ("Bachelor's", ("bachelor", "b.s.", "bsc")),
        ("Associate's", ("associate",)),
    )
    education_level = "Not Specified"
    for level, markers in education_markers:
        if any(marker in text_lower for marker in markers):
            education_level = level
            break

    return years_experience, education_level
161
+
162
+ # --- Feature Engineering ---
163
+
164
def get_text_embeddings(text):
    """
    Returns a vector for the given text using the module-level spaCy pipeline.

    For empty text a zero vector is returned whose length is the width of the
    pipeline's vector table. Otherwise the document vector is returned when
    the parsed document reports one; failing that, the mean of the vectors of
    the tokens that have one, or a zero vector if no token does.
    """
    if not text:
        return np.zeros(nlp.vocab.vectors.shape[1])

    parsed = nlp(text)
    if parsed.has_vector:
        return parsed.vector

    # Fallback if no vector for doc (shouldn't happen with en_core_web_lg)
    token_vectors = [tok.vector for tok in parsed if tok.has_vector]
    if token_vectors:
        return np.mean(token_vectors, axis=0)
    return np.zeros(nlp.vocab.vectors.shape[1])
176
+
177
def calculate_cosine_similarity(vec1, vec2):
    """
    Returns the cosine similarity of two vectors.

    If either vector consists only of zeros, 0.0 is returned without calling
    scikit-learn, which avoids dividing by a zero norm. Otherwise both
    vectors are reshaped to single-row matrices and the single entry of the
    resulting similarity matrix is returned.
    """
    for vec in (vec1, vec2):
        if np.all(vec == 0):
            return 0.0

    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]
187
+
188
+ # --- Main Processing Pipeline for a Document (CV or Job Description) ---
189
+
190
def analyze_document(doc_text):
    """
    Runs the full per-document analysis for a CV or a job description.

    Returns a dict with the raw text, the preprocessed text, the spaCy Doc of
    the raw text, the extracted skills and general entities, the detected
    years of experience and education level, and the embedding of the
    preprocessed text.
    """
    parsed_doc = nlp(doc_text)
    cleaned = preprocess_text(doc_text)
    # Skills are searched in the raw parsed document, not the cleaned text.
    skills, entities = extract_skills(parsed_doc, skill_keywords=predefined_skills_list)
    experience_years, education = extract_experience_and_education(doc_text)
    embedding = get_text_embeddings(cleaned)

    return {
        "raw_text": doc_text,
        "cleaned_text": cleaned,
        "spacy_doc": parsed_doc,
        "extracted_skills": skills,
        "general_entities": entities,
        "years_experience": experience_years,
        "education_level": education,
        "text_embedding": embedding,
    }
210
+
211
+ # --- Matching and Scoring Logic ---
212
+
213
def calculate_match_scores(cv_data, jd_data):
    """
    Calculates various match scores and identifies keyword overlaps.

    Args:
        cv_data (dict): Analysis of the CV, in the shape returned by
            analyze_document. The keys read are "text_embedding",
            "extracted_skills", "cleaned_text", "years_experience" and
            "education_level".
        jd_data (dict): Same structure for the job description.
    Returns:
        dict: with the keys "overall_match_score", "matched_skills",
        "missing_skills", "extra_skills_in_cv", "skill_match_percentage",
        "top_cv_keywords", "top_jd_keywords", "common_keywords",
        "cv_years_experience", "jd_years_experience",
        "experience_match_status", "cv_education_level",
        "jd_education_level" and "education_match_status".
        Skill and common-keyword lists are sorted alphabetically; the top
        keyword lists are ordered by descending TF-IDF score.
    """
    results = {}

    # 1. Overall Semantic Similarity (using embeddings)
    overall_similarity = calculate_cosine_similarity(
        cv_data["text_embedding"],
        jd_data["text_embedding"]
    )
    results["overall_match_score"] = round(overall_similarity * 100, 2)

    # 2. Skill Matching
    cv_skills = set(cv_data["extracted_skills"])
    jd_skills = set(jd_data["extracted_skills"])

    # Sorted so the lists do not depend on set iteration order.
    matched_skills = sorted(cv_skills & jd_skills)
    missing_skills = sorted(jd_skills - cv_skills)
    extra_skills_in_cv = sorted(cv_skills - jd_skills)

    results["matched_skills"] = matched_skills
    results["missing_skills"] = missing_skills
    results["extra_skills_in_cv"] = extra_skills_in_cv

    # The percentage is relative to what the job asks for; with no job skills
    # detected there is nothing to match against.
    if jd_skills:
        skill_match_percentage = len(matched_skills) / len(jd_skills) * 100
    else:
        skill_match_percentage = 0.0
    results["skill_match_percentage"] = round(skill_match_percentage, 2)

    # 3. Keyword Overlap (using TF-IDF for important words beyond specific skills)
    corpus = [cv_data["cleaned_text"], jd_data["cleaned_text"]]
    try:
        tfidf_vectorizer = TfidfVectorizer(max_features=100)
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither
        # text contains a usable term, e.g. both cleaned texts are empty.
        # Report no keywords instead of aborting the whole analysis.
        top_cv_keywords = []
        top_jd_keywords = []
    else:
        feature_names = tfidf_vectorizer.get_feature_names_out()

        def _top_terms(row_index, limit=15):
            """Return up to `limit` terms of one corpus row, highest TF-IDF first."""
            nonzero_columns = tfidf_matrix[row_index].nonzero()[1]
            scores = {feature_names[i]: tfidf_matrix[row_index, i] for i in nonzero_columns}
            ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
            return [term for term, _ in ranked[:limit]]

        top_cv_keywords = _top_terms(0)
        top_jd_keywords = _top_terms(1)

    results["top_cv_keywords"] = top_cv_keywords
    results["top_jd_keywords"] = top_jd_keywords
    results["common_keywords"] = sorted(set(top_cv_keywords) & set(top_jd_keywords))

    # 4. Experience Matching
    cv_exp_years = cv_data["years_experience"]
    jd_exp_years = jd_data["years_experience"]
    results["cv_years_experience"] = cv_exp_years
    results["jd_years_experience"] = jd_exp_years

    # 0 years on the job side means no requirement was detected.
    exp_status = "Not specified by Job"
    if jd_exp_years > 0:
        if cv_exp_years >= jd_exp_years:
            exp_status = "Meets or Exceeds Requirement"
        else:
            exp_status = f"Below Requirement (Needs {jd_exp_years - cv_exp_years} more years)"
    results["experience_match_status"] = exp_status

    # 5. Education Matching (simplified)
    cv_edu = cv_data["education_level"]
    jd_edu = jd_data["education_level"]
    results["cv_education_level"] = cv_edu
    results["jd_education_level"] = jd_edu

    edu_match_status = "Not Specified by Job"
    if jd_edu != "Not Specified": # Only compare if JD specifies
        # Levels not in the table (including "Not Specified") rank as 0.
        edu_order = {"Associate's": 1, "Bachelor's": 2, "Master's": 3, "Ph.D.": 4}
        if edu_order.get(cv_edu, 0) >= edu_order.get(jd_edu, 0):
            edu_match_status = "Meets or Exceeds Requirement"
        else:
            edu_match_status = "Below Requirement"
    results["education_match_status"] = edu_match_status

    return results
292
+
293
+ # --- Overall Analysis Orchestrator ---
294
def perform_cv_job_analysis(cv_text, job_desc_text):
    """
    Runs the complete pipeline on raw CV and job description text.

    Each text is analyzed on its own (CV first, then job description) and the
    two analyses are compared; the comparison results dict is returned.
    """
    return calculate_match_scores(
        analyze_document(cv_text),
        analyze_document(job_desc_text),
    )
302
+
303
+ # --- Visualization Functions (Adjusted for Streamlit) ---
304
+ # Each visualization function now returns a matplotlib figure object
305
+ # and Streamlit's st.pyplot() is used to display it, then figure is closed.
306
+
307
def create_overall_match_plot(score):
    """Builds and returns a single-bar matplotlib figure showing the overall match score on a 0-100 axis."""
    fig, ax = plt.subplots(figsize=(6, 2))
    sns.set_style("whitegrid")

    ax.barh(["Overall Match"], [score], color='skyblue')
    ax.set_xlim(0, 100)
    # Percentage label placed just to the right of the bar's end.
    ax.text(score + 2, 0, f'{score}%', va='center', color='black', fontsize=12)

    ax.set_title("Overall CV-Job Description Match Score", fontsize=14)
    ax.set_xlabel("Match Percentage", fontsize=12)
    # The single bar needs no category axis.
    ax.yaxis.set_visible(False)

    fig.tight_layout()
    return fig
319
+
320
def create_skill_match_plot(matched_skills, missing_skills):
    """Builds a pie chart figure of matched vs. missing skill counts; returns None when both lists are empty."""
    matched_count = len(matched_skills)
    missing_count = len(missing_skills)

    if matched_count + missing_count == 0:
        return None # Indicate no plot can be made

    # Pull the slices apart only when both slices actually exist.
    if matched_count > 0 and missing_count > 0:
        explode = (0.05, 0.05)
    else:
        explode = (0,0)

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.pie(
        [matched_count, missing_count],
        explode=explode,
        labels=['Matched Skills', 'Missing Skills'],
        colors=['#66b3ff', '#ff9999'],
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        textprops={'fontsize': 12},
    )
    ax.axis('equal')
    ax.set_title("Skill Match Breakdown", fontsize=14)
    fig.tight_layout()
    return fig
337
+
338
def create_top_keywords_plot(cv_keywords, jd_keywords):
    """Builds a two-panel figure with bar charts of the CV keywords (left) and job description keywords (right)."""
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    sns.set_style("whitegrid")

    # (axis, keywords, seaborn palette, panel title) for each side.
    panels = (
        (axes[0], cv_keywords, 'viridis', 'Top Keywords in CV'),
        (axes[1], jd_keywords, 'plasma', 'Top Keywords in Job Description'),
    )
    for axis, keywords, palette_name, title in panels:
        # Counts how often each keyword occurs in the given list; at most the
        # ten most common are plotted.
        frame = pd.DataFrame(Counter(keywords).most_common(10), columns=['Keyword', 'Count'])
        if frame.empty:
            # No keywords for this side: the panel is left untouched.
            continue
        sns.barplot(x='Count', y='Keyword', data=frame, ax=axis, palette=palette_name)
        axis.set_title(title, fontsize=14)
        axis.set_xlabel('Frequency/Importance', fontsize=12)
        axis.set_ylabel('')

    fig.tight_layout()
    return fig
359
+
360
+ # --- Streamlit Application Layout ---
361
+
362
# Page-level configuration. The browser-tab icon is passed as `page_icon`;
# st.set_page_config has no `icon` keyword, so passing `icon=` raises TypeError.
# NOTE(review): Streamlit documents set_page_config as needing to run before
# other Streamlit commands; confirm its position in the script is acceptable
# for the deployed Streamlit version.
st.set_page_config(page_title="CV-Job Match Analyzer", layout="wide", page_icon="👨‍💼")

# Page heading and short usage instructions.
st.title("👨‍💼 CV-Job Match Analyzer 📈")
st.markdown("""
Welcome! This tool helps you understand how well a CV matches a job description.
Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
""")
369
+
370
+ # Input for CV
371
+ st.header("1. Upload Your CV")
372
+ uploaded_cv_file = st.file_uploader("Choose a CV file (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"],