loki2910 committed on
Commit
f853783
·
verified ·
1 Parent(s): 099910c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +343 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import re
4
+ import tempfile
5
+ import traceback
6
+ from typing import Tuple, Dict
7
+
8
+ import fitz # PyMuPDF
9
+ import docx # python-docx
10
+
11
+ import numpy as np
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ import gradio as gr
15
+
16
+ # --------------------------
17
+ # Pre-load all heavy libraries and models at startup.
18
+ # --------------------------
19
# Announce start-up first: the imports and checkpoint loads below run at
# module import time, before any request is served.
print("Starting up: Loading transformer models...")
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
import torch

# The BERT tokenizer and encoder are loaded from the same checkpoint name.
_BERT_CHECKPOINT = "bert-base-uncased"

# Module-level singletons reused by get_sentence_embedding() on every call.
sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)
bert_model.eval()  # the model is only used for inference in this file
print("Transformer models loaded successfully.")
30
+
31
+ # --------------------------
32
+ # Built-in stopwords
33
+ # --------------------------
34
# English function words plus resume-domain terms ("resume", "job",
# "skills", ...) that preprocess_text() filters out of the token stream.
EN_STOPWORDS = set(
    """
    a about above after again against all am an and any are as
    at be because been before being below between both but by
    could did do does doing down during each few for from further
    had has have having he her here hers herself him himself his
    how i if in into is it its itself just me more most my
    myself no nor not now of off on once only or other ought our
    ours ourselves out over own same she should so some such than
    that the their theirs them themselves then there these they this
    those through to too under until up very was we were what when
    where which while who whom why with would you your yours yourself
    yourselves resume job description work experience skill skills applicant application
    """.split()
)
47
+
48
+ # --------------------------
49
+ # NEW FEATURE: Job Suggestions Database
50
+ # --------------------------
51
# Job title -> set of lowercase single-word skill tokens. suggest_jobs()
# intersects each set with the resume's tokens, which is why multi-word
# skills such as "machine learning" are stored as separate words.
JOB_SUGGESTIONS_DB = {
    title: set(skill_words.split())
    for title, skill_words in (
        ("Data Scientist", "python sql machine learning tensorflow pytorch analysis"),
        ("Data Analyst", "sql python excel tableau analysis statistics"),
        ("Backend Developer", "python java sql docker aws api git"),
        ("Frontend Developer", "react javascript html css git ui ux"),
        ("Full-Stack Developer", "python javascript react sql docker git"),
        ("Machine Learning Engineer", "python tensorflow pytorch machine learning docker cloud"),
        ("Project Manager", "agile scrum project management jira"),
    )
}
60
+
61
+
62
+ # --------------------------
63
+ # Utilities: text extraction
64
+ # --------------------------
65
def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        pdf_bytes: Raw contents of a PDF file.

    Returns:
        The non-empty page texts joined by newlines. If the document cannot
        be opened or read, a string of the form ``"[Error reading PDF: ...]"``
        is returned instead of raising.
    """
    try:
        # The context manager closes the document even when get_text() raises;
        # an explicit doc.close() after the loop would be skipped in that case.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_texts = [page.get_text("text") for page in doc]
        return "\n".join(text for text in page_texts if text)
    except Exception as e:
        return f"[Error reading PDF: {e}]"
73
+
74
+
75
def extract_text_from_docx_bytes(docx_bytes: bytes) -> str:
    """Extract the text of the non-empty paragraphs of an in-memory DOCX.

    Args:
        docx_bytes: Raw contents of a .docx file.

    Returns:
        The paragraph texts joined by newlines, or a string of the form
        ``"[Error reading DOCX: ...]"`` when the bytes cannot be parsed.
    """
    try:
        document = docx.Document(io.BytesIO(docx_bytes))
        return "\n".join(para.text for para in document.paragraphs if para.text)
    except Exception as e:
        return f"[Error reading DOCX: {e}]"
83
+
84
+
85
def extract_text_from_fileobj(file_obj) -> Tuple[str, str]:
    """Read an uploaded file and return ``(text, file_name)``.

    Args:
        file_obj: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object whose ``.name`` attribute holds the path of the upload.

    Returns:
        A tuple of the extracted text and the file's base name. ``.pdf`` and
        ``.docx`` files go through the dedicated extractors; anything else is
        decoded as UTF-8 with undecodable bytes dropped. On failure the text
        is ``"[Error reading uploaded file: ...]"`` (including the traceback)
        and the name is the base name if it was determined, otherwise
        ``"uploaded_file"``.
    """
    fname = "uploaded_file"
    try:
        # Check for path-like values first: pathlib.Path also has a `.name`
        # attribute, but it is only the final component, not a usable path.
        if isinstance(file_obj, (str, os.PathLike)):
            path = os.fspath(file_obj)
        else:
            path = file_obj.name
        fname = os.path.basename(path)
        with open(path, "rb") as f:
            raw_bytes = f.read()
        # splitext() yields "" for extension-less names, so a file literally
        # called "pdf" is no longer mistaken for a PDF document.
        ext = os.path.splitext(fname)[1].lower()
        if ext == ".pdf":
            return (extract_text_from_pdf_bytes(raw_bytes), fname)
        if ext == ".docx":
            return (extract_text_from_docx_bytes(raw_bytes), fname)
        return (raw_bytes.decode("utf-8", errors="ignore"), fname)
    except Exception as exc:
        return (f"[Error reading uploaded file: {exc}\n{traceback.format_exc()}]", fname)
100
+
101
+
102
+ # --------------------------
103
+ # Text preprocessing
104
+ # --------------------------
105
def preprocess_text(text: str, remove_stopwords: bool = True) -> str:
    """Normalise text into lowercase alphanumeric tokens joined by single spaces.

    Every character other than ``a-z``, ``0-9`` and whitespace is replaced by
    a space, so symbols split tokens (``"c++"`` becomes ``"c"`` and
    ``"node.js"`` becomes ``"node js"``).

    Args:
        text: Raw text; a falsy value (``None`` or ``""``) yields ``""``.
        remove_stopwords: When true, tokens found in ``EN_STOPWORDS`` are dropped.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text.lower())
    tokens = re.sub(r"[^a-z0-9\s]", " ", collapsed).split()
    if not remove_stopwords:
        return " ".join(tokens)
    return " ".join(tok for tok in tokens if tok not in EN_STOPWORDS)
115
+
116
+
117
+ # --------------------------
118
+ # Embedding helpers
119
+ # --------------------------
120
def get_sentence_embedding(text: str, mode: str = "sbert") -> np.ndarray:
    """Embed ``text`` with one of the two models loaded at start-up.

    Args:
        text: The text to embed.
        mode: ``"sbert"`` encodes with the module-level ``sentence_transformer``;
            ``"bert"`` runs ``bert_model`` and takes the hidden state at
            token position 0 of the (truncated to 512 tokens) input.

    Returns:
        The embedding for the single input text, in the 2-D form that
        ``calculate_similarity`` passes to ``cosine_similarity``.

    Raises:
        ValueError: If ``mode`` is neither ``"sbert"`` nor ``"bert"``.
    """
    if mode not in ("sbert", "bert"):
        raise ValueError("Unsupported mode")
    if mode == "sbert":
        return sentence_transformer.encode([text], show_progress_bar=False)

    encoded = bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: skip gradient tracking so .numpy() can be called directly.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    return hidden_states[:, 0, :].numpy()
131
+
132
+
133
def calculate_similarity(resume_text: str, job_text: str, mode: str = "sbert") -> float:
    """Return the resume/job similarity as a percentage in ``[0.0, 100.0]``.

    Args:
        resume_text: Resume text (the app passes the preprocessed text).
        job_text: Job-description text (the app passes the preprocessed text).
        mode: Embedding mode forwarded to ``get_sentence_embedding``.

    Returns:
        The cosine similarity of the two embeddings scaled to a percentage and
        rounded to two decimals, or ``0.0`` when either text is empty or
        whitespace-only.
    """
    # A blank string would still be embedded and produce an arbitrary score;
    # with nothing to compare, report "no match" instead.
    if not (resume_text and resume_text.strip()) or not (job_text and job_text.strip()):
        return 0.0

    r_emb = get_sentence_embedding(resume_text, mode=mode)
    j_emb = get_sentence_embedding(job_text, mode=mode)
    # Convert to a Python float before rounding so the result does not carry
    # float32 representation noise (e.g. 56.779998...).
    sim = float(cosine_similarity(r_emb, j_emb)[0][0])
    # Cosine similarity can be negative; clamp so the value always fits the
    # 0-100 range the score is displayed on.
    pct = min(max(sim * 100.0, 0.0), 100.0)
    return round(pct, 2)
138
+
139
+
140
+ # --------------------------
141
+ # Keyword analysis
142
+ # --------------------------
143
# Category -> lowercase single-word keywords checked by
# analyze_resume_keywords() when the caller supplies no keyword mapping.
# Note that "nlp" is listed under both "skills" and "concepts".
DEFAULT_KEYWORDS = dict(
    skills={
        "aws", "azure", "cloud", "docker", "git", "java",
        "nlp", "python", "pytorch", "react", "sql", "tensorflow",
    },
    concepts={"agile", "analysis", "data", "learning", "machine", "nlp", "scrum", "vision"},
    roles={"analyst", "architect", "developer", "engineer", "manager", "scientist", "software"},
)
149
+
150
+
151
def analyze_resume_keywords(resume_text: str, job_description: str, keywords: Dict = None):
    """Find keywords the job description uses but the resume lacks.

    Args:
        resume_text: Raw resume text.
        job_description: Raw job-description text.
        keywords: Mapping of category name to keywords; ``DEFAULT_KEYWORDS``
            is used when ``None``.

    Returns:
        A ``(missing, suggestions)`` tuple. ``missing`` maps each category
        that has gaps to the sorted list of its keywords that occur in the
        job description but not in the resume. ``suggestions`` is a
        newline-separated bullet list: one tailored line per missing keyword
        for the "skills", "concepts" and "roles" categories, or a single
        congratulatory line when nothing is missing.
    """
    keyword_groups = DEFAULT_KEYWORDS if keywords is None else keywords
    resume_vocab = set(preprocess_text(resume_text).split())
    job_vocab = set(preprocess_text(job_description).split())

    missing = {}
    for category, candidates in keyword_groups.items():
        absent = sorted(kw for kw in candidates if kw in job_vocab and kw not in resume_vocab)
        if absent:
            missing[category] = absent

    # Only the presence of a "skills" heading changes the wording below. It is
    # looked up in the raw text because preprocessing strips the word "skills".
    has_skills_section = "skills" in (resume_text or "").lower()

    if not missing:
        suggestions = ["Great job! Your resume contains many of the keywords found in the job description."]
    else:
        suggestions = []
        for category, absent in missing.items():
            for kw in absent:
                if category == "skills":
                    if has_skills_section:
                        suggestions.append(f"Add keyword '{kw}' to your Skills section.")
                    else:
                        suggestions.append(f"Consider creating a Skills section to include '{kw}'.")
                elif category == "concepts":
                    suggestions.append(
                        f"Try to demonstrate your knowledge of '{kw}' in your Experience or Projects section.")
                elif category == "roles":
                    suggestions.append(f"Align your Summary/Objective to mention the title '{kw}'.")

    return missing, "\n".join(f"- {line}" for line in suggestions)
184
+
185
+
186
+ # --------------------------
187
+ # NEW FEATURE: Functions to format outputs and extract text keywords
188
+ # --------------------------
189
def format_missing_keywords(missing: Dict) -> str:
    """Render the missing-keyword mapping as Markdown.

    Args:
        missing: Mapping of category name to a list of missing keywords.
            Categories with an empty list are skipped.

    Returns:
        A success message when no category has missing keywords; otherwise a
        heading followed by one list item per category.
    """
    entries = [
        f"- **Missing {category.capitalize()}:** {', '.join(keywords)}"
        for category, keywords in missing.items()
        if keywords
    ]
    if not entries:
        return "✅ No critical keywords seem to be missing. Great job!"

    # Emit the categories as list items: lines separated by a single newline
    # would be merged into one paragraph by a standard Markdown renderer.
    return "\n".join(["### 🔑 Keywords Missing From Your Resume", "", *entries]) + "\n"
198
+
199
+
200
def suggest_jobs(resume_text: str) -> str:
    """Suggest job titles whose skill sets overlap with the resume.

    A title from ``JOB_SUGGESTIONS_DB`` is suggested when at least three of
    its skill tokens occur among the resume's preprocessed tokens.

    Args:
        resume_text: Raw resume text.

    Returns:
        A Markdown heading followed by one list item per suggested title, or
        a plain hint message when no title reaches the threshold.
    """
    resume_words = set(preprocess_text(resume_text).split())
    matched_titles = [
        job_title
        for job_title, required_skills in JOB_SUGGESTIONS_DB.items()
        if len(resume_words.intersection(required_skills)) >= 3
    ]

    if not matched_titles:
        return "Could not determine strong job matches from the resume. Try adding more specific skills and technologies."

    # A blank line separates the heading from the list so both render
    # reliably as Markdown.
    lines = ["### 🚀 Job Titles You May Be a Good Fit For", ""]
    lines.extend(f"- {job_title}" for job_title in matched_titles)
    return "\n".join(lines) + "\n"
216
+
217
+
218
def extract_top_keywords(text: str, top_n: int = 15) -> str:
    """Return the highest-weighted TF-IDF terms of ``text`` as a comma-separated string.

    The vectorizer is fitted on ``text`` alone, with ``EN_STOPWORDS`` removed.

    Args:
        text: Text to analyse.
        top_n: Size of the slice taken from the end of the ascending ranking.

    Returns:
        The selected terms, highest weight first, joined by ``", "``; or a
        fixed message when ``text`` is blank or the vectorizer raises
        ``ValueError``.
    """
    if not text.strip():
        return "Not enough text provided."
    try:
        vectorizer = TfidfVectorizer(stop_words=list(EN_STOPWORDS))
        weights = vectorizer.fit_transform([text]).toarray().flatten()
        vocabulary = np.array(vectorizer.get_feature_names_out())
        # argsort is ascending: take the tail, then reverse for best-first order.
        best_first = weights.argsort()[-top_n:][::-1]
        return ", ".join(vocabulary[best_first])
    except ValueError:
        return "Could not extract keywords (text may be too short)."
231
+
232
+
233
+ # --------------------------
234
+ # Main Gradio app logic
235
+ # --------------------------
236
def _build_verdict(sim_pct: float) -> str:
    """Return the coloured HTML heading that grades a similarity percentage."""
    if sim_pct >= 80:
        return f"<h3 style='color:green;'>✅ Excellent Match ({sim_pct:.2f}%)</h3>"
    if sim_pct >= 60:
        return f"<h3 style='color:limegreen;'>👍 Good Match ({sim_pct:.2f}%)</h3>"
    if sim_pct >= 40:
        return f"<h3 style='color:orange;'>⚠️ Fair Match ({sim_pct:.2f}%)</h3>"
    return f"<h3 style='color:red;'>❌ Low Match ({sim_pct:.2f}%)</h3>"


def analyze_resume(file, job_description: str, mode: str):
    """Run the full analysis behind the "Analyze Resume" button.

    Args:
        file: The uploaded resume as delivered by the file component, or ``None``.
        job_description: Job-description text; ``None`` is treated as empty.
        mode: Embedding mode, ``"sbert"`` or ``"bert"``.

    Returns:
        A 7-tuple ``(score, verdict, missing_keywords, suggestions,
        job_suggestions, resume_keywords, jd_keywords)``. When an input is
        missing, or any step fails, the score is ``0.0``, the second element
        carries the message and the remaining elements are empty strings.
    """
    # `or ""` guards against a None description, which has no .strip().
    if file is None or not (job_description or "").strip():
        return 0.0, "Please upload a resume and paste a job description.", "", "", "", "", ""

    try:
        resume_text, _ = extract_text_from_fileobj(file)
        # The extractors report failures as "[Error ...]" strings, not exceptions.
        if resume_text.strip().startswith("[Error"):
            raise RuntimeError(resume_text)

        cleaned_resume = preprocess_text(resume_text)
        cleaned_job = preprocess_text(job_description)

        sim_pct = calculate_similarity(cleaned_resume, cleaned_job, mode=mode)
        verdict = _build_verdict(sim_pct)

        # Keyword gaps and job suggestions work from the raw text; they
        # preprocess it themselves.
        missing_dict, suggestions_text = analyze_resume_keywords(resume_text, job_description)
        missing_formatted = format_missing_keywords(missing_dict)
        job_suggestions = suggest_jobs(resume_text)

        resume_keywords_text = extract_top_keywords(cleaned_resume)
        jd_keywords_text = extract_top_keywords(cleaned_job)

        return (
            float(sim_pct),
            verdict,
            missing_formatted,
            suggestions_text,
            job_suggestions,
            resume_keywords_text,
            jd_keywords_text,
        )

    except Exception as e:
        # Write the traceback to the server log; the user only sees the message.
        print(traceback.format_exc())
        return 0.0, f"### An Error Occurred\n`{e}`", "", "", "", "", ""
274
+
275
+
276
+ # --------------------------
277
+ # Clear Button Logic
278
+ # --------------------------
279
def clear_inputs():
    """Return reset values for every component wired to the Clear button.

    The tuple lines up one-to-one with the ten ``outputs`` of
    ``clear_btn.click``: ``file_in``, ``job_desc``, ``mode``, then the seven
    result components (``score_slider``, ``score_text``, ``missing_out``,
    ``suggestions_out``, ``job_suggestions_out``, ``resume_keywords_out``,
    ``jd_keywords_out``).
    """
    input_defaults = (None, "", "sbert")
    # One None per result component; seven are needed, so the tuple has ten
    # entries to match the ten wired outputs.
    return input_defaults + (None,) * 7
281
+
282
+
283
+ # --------------------------
284
+ # Build Gradio UI
285
+ # --------------------------
286
def build_ui():
    """Assemble the Gradio Blocks interface and wire up its two buttons.

    Layout: a left column with the resume upload, the job-description box,
    the analysis-mode selector and the Clear / Analyze buttons; a right
    column with three tabs (analysis and suggestions, job suggestions, top
    keywords).

    Returns:
        The ``gr.Blocks`` application, ready for ``launch()``.
    """
    with gr.Blocks(theme=gr.themes.Default(), title="Resume ↔ Job Matcher") as demo:
        gr.Markdown("# 📄 Resume & Job Description Analyzer 🎯")
        gr.Markdown(
            "Upload a resume, paste a job description, and get an instant analysis, keyword suggestions, and potential job matches.")

        with gr.Row():
            with gr.Column(scale=2):
                file_in = gr.File(label="Upload resume (PDF or DOCX)", file_count="single",
                                  file_types=[".pdf", ".docx"])
                job_desc = gr.Textbox(lines=10, label="Job Description",
                                      placeholder="Paste the full job description here...")
                mode = gr.Radio(choices=["sbert", "bert"], value="sbert", label="Analysis Mode",
                                info="SBERT is faster, BERT is more detailed.")
                with gr.Row():
                    clear_btn = gr.Button("Clear")
                    run_btn = gr.Button("Analyze Resume", variant="primary")

            with gr.Column(scale=3):
                with gr.Tabs():
                    with gr.TabItem("📊 Analysis & Suggestions"):
                        score_slider = gr.Slider(value=0, minimum=0, maximum=100, step=0.01, interactive=False,
                                                 label="Similarity Score")
                        score_text = gr.Markdown()
                        suggestions_out = gr.Textbox(label="Suggestions to Improve Your Resume", interactive=False,
                                                     lines=5)
                        missing_out = gr.Markdown(label="Keywords Check")

                    with gr.TabItem("🚀 Job Suggestions"):
                        job_suggestions_out = gr.Markdown(label="Potential Job Roles")

                    with gr.TabItem("🔑 Top Keywords"):
                        # Top keywords are shown as plain text boxes.
                        resume_keywords_out = gr.Textbox(label="Top Resume Keywords")
                        jd_keywords_out = gr.Textbox(label="Top Job Description Keywords")

        # The order of `outputs` must match the order of the values returned
        # by analyze_resume().
        run_btn.click(
            analyze_resume,
            inputs=[file_in, job_desc, mode],
            outputs=[score_slider, score_text, missing_out, suggestions_out, job_suggestions_out, resume_keywords_out,
                     jd_keywords_out],
            show_progress='full'
        )

        # clear_inputs() must return one value per component listed here.
        clear_btn.click(
            clear_inputs,
            inputs=[],
            outputs=[file_in, job_desc, mode, score_slider, score_text, missing_out, suggestions_out,
                     job_suggestions_out, resume_keywords_out, jd_keywords_out]
        )

    return demo
338
+
339
+
340
if __name__ == "__main__":
    # Build the interface and serve it with the default launch settings.
    demo = build_ui()
    demo.launch()
    # Alternative kept for reference: demo.launch(server_name="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --find-links https://storage.googleapis.com/torch-cpu/torch_stable.html
2
+ torch
3
+ torchvision
4
+
5
+ gradio
6
+ scikit-learn
7
+ numpy
8
+ PyMuPDF
9
+ python-docx
10
+ sentence-transformers
11
+ transformers
12
+ wordcloud
13
+ matplotlib