afouda committed on
Commit
33b4426
·
verified ·
1 Parent(s): f4fe644

Add app file

Browse files
Files changed (1) hide show
  1. app.py +456 -0
app.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import csv
5
+ import tempfile
6
+ import time
7
+ from typing import List, Dict, Any, Tuple
8
+ import requests
9
+ import PyPDF2
10
+ import docx2txt
11
+ import gradio as gr
12
+ import pandas as pd
13
+
14
# Global Configuration
# The API key is read from the environment so that no credential lives in the
# repository; the UI password field can still override it per request.
# NOTE: any key that was previously committed here must be treated as leaked
# and rotated with the provider.
DEEPINFRA_API_KEY = os.environ.get("DEEPINFRA_API_KEY", "")
# OpenAI-compatible chat-completions endpoint used for every LLM call.
DEEPINFRA_BASE_URL = "https://api.deepinfra.com/v1/openai/chat/completions"
# Model used when the UI "Model" field is left empty.
DEFAULT_MODEL = "openai/gpt-oss-120b"
# Timeout, in seconds, applied to each HTTP request to the endpoint.
REQUEST_TIMEOUT_SECS = 120
19
+
20
# Prompts for LLM Calls
# Each constant below is sent verbatim as the "system" message of one chat
# request. All three instruct the model to answer with JSON only, following
# the schema embedded in the prompt.

# System prompt for job-description extraction (used by llm_extract_jd).
JD_SYSTEM = """You are an expert recruitment analyst. Extract a job description into STRICT JSON.
Rules:
- Output ONLY JSON (no markdown, no prose).
- If the JD language is not English, still output keys in English but translate skills into an additional 'skills_en' array.
- Keep items short and normalized (e.g., 'python', 'sql').
Schema:
{
"title": "",
"seniority": "",
"skills": [],
"skills_en": [],
"qualifications": [],
"responsibilities": [],
"nice_to_have": []
}
"""

# System prompt for resume extraction (used by llm_extract_resume).
RESUME_SYSTEM = """You are an expert resume parser. Extract a candidate profile into STRICT JSON.
Rules:
- Output ONLY JSON (no markdown, no prose).
- Provide 'skills_en' translated/normalized to English for matching.
- Keep arrays compact, deduplicate entries.
Schema:
{
"name": "",
"email": "",
"phone": "",
"skills": [],
"skills_en": [],
"education": [{"degree":"", "field":"", "institution":"", "year":""}],
"experience": [{"title":"", "company":"", "start_date":"", "end_date":"", "summary":""}],
"languages": []
}
"""

# System prompt for job-vs-candidate feedback (used by llm_feedback).
FEEDBACK_SYSTEM = """You are an expert technical recruiter. Compare a job and a candidate and return STRICT JSON with actionable feedback.
Respond in the job description's language.
Schema:
{
"overall_summary": "",
"strengths": [],
"weaknesses": [],
"missing_requirements": [],
"suggestions": []
}
Keep each bullet short (max ~12 words).
Output ONLY JSON.
"""
69
+
70
+ # Helper Functions
71
def _pdf_to_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, newline-joined.

    A page for which the reader yields no text contributes an empty string.
    """
    with open(path, "rb") as pdf_handle:
        pages = PyPDF2.PdfReader(pdf_handle).pages
        page_texts = [page.extract_text() or "" for page in pages]
    return "\n".join(page_texts)
78
+
79
def _txt_to_text(path: str) -> str:
    """Return the content of the text file at ``path`` decoded as UTF-8.

    Bytes that cannot be decoded are silently dropped.
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as source:
        content = source.read()
    return content
82
+
83
def _docx_to_text(path: str) -> str:
    """Return the text docx2txt extracts from ``path``, or "" when it yields nothing."""
    extracted = docx2txt.process(path)
    return extracted if extracted else ""
85
+
86
def read_file_safely(path: str) -> str:
    """Read a .pdf, .txt or .docx file and return its text without ever raising.

    The extension check is case-insensitive. An unsupported extension yields a
    bracketed "[Unsupported file type: ...]" marker and any exception raised
    while reading yields a bracketed "[Error reading file: ...]" marker, so the
    caller always receives a string.
    """
    try:
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            content = _pdf_to_text(path)
        elif lowered.endswith(".txt"):
            content = _txt_to_text(path)
        elif lowered.endswith(".docx"):
            content = _docx_to_text(path)
        else:
            filename = os.path.basename(path)
            content = f"[Unsupported file type: {filename}]"
        return content
    except Exception as exc:
        return f"[Error reading file: {exc}]"
98
+
99
def safe_json_loads(text: str) -> Any:
    """Parse JSON out of an LLM reply without ever raising.

    The reply is tried in this order:
      1. the body of the first Markdown code fence (tagged ``json`` or untagged),
      2. the whole text,
      3. the outermost ``{...}`` / ``[...]`` span, whichever opens first, so
         that JSON wrapped in prose is still recovered.

    Args:
        text: Raw model output; ``None`` or an empty string is accepted.

    Returns:
        The first candidate that decodes to a ``dict`` or ``list``; ``{}`` when
        nothing decodes to a JSON container.
    """
    if not text or not isinstance(text, str):
        return {}

    candidates = []
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1))
    candidates.append(text)

    # Outermost object/array spans, ordered by where they start in the text.
    spans = []
    for opener, closer in (("{", "}"), ("[", "]")):
        start, end = text.find(opener), text.rfind(closer)
        if 0 <= start < end:
            spans.append((start, text[start:end + 1]))
    candidates.extend(chunk for _start, chunk in sorted(spans))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, TypeError, RecursionError):
            continue
        # Callers index into the result, so bare scalars are not useful to them.
        if isinstance(parsed, (dict, list)):
            return parsed
    return {}
106
+
107
def deepinfra_chat(messages: List[Dict[str, str]], api_key: str, model: str, temperature: float = 0.2) -> str:
    """Send one chat-completion request and return the reply text.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        api_key: Bearer token for the endpoint.
        model: Model identifier placed in the request payload.
        temperature: Sampling temperature placed in the request payload.

    Returns:
        The stripped ``content`` of the first choice, or "" when the response
        carries no usable choice/message/content.

    Raises:
        RuntimeError: If ``api_key`` is empty.
        requests.HTTPError: If the endpoint answers with an error status
            (raised by ``raise_for_status``).
    """
    if not api_key:
        raise RuntimeError("Missing API Key.")
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    resp = requests.post(
        DEEPINFRA_BASE_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps(payload),
        timeout=REQUEST_TIMEOUT_SECS,
    )
    resp.raise_for_status()
    data = resp.json()

    # Walk the response defensively: an empty ``choices`` list or a null
    # ``message``/``content`` must yield "" rather than an IndexError or
    # AttributeError.
    choices = data.get("choices") if isinstance(data, dict) else None
    first = choices[0] if isinstance(choices, list) and choices else None
    message = first.get("message") if isinstance(first, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    return content.strip() if isinstance(content, str) else ""
127
+
128
def quick_contacts(text: str) -> dict:
    """Guess an e-mail address and a phone number from raw text via regexes.

    Returns a dict with keys ``email_guess`` and ``phone_guess``; each holds
    the first match found in ``text`` or ``None`` when there is no match.
    """
    patterns = {
        "email_guess": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b",
        "phone_guess": r"(\+\d{1,3}\s?)?(\(\d{1,4}\)|\d{1,4})[-.\s]?\d{1,4}[-.\s]?\d{1,9}",
    }
    guesses = {}
    for key, pattern in patterns.items():
        hit = re.search(pattern, text)
        guesses[key] = hit.group(0) if hit else None
    return guesses
137
+
138
def load_job_description(jd_text: str, jd_file) -> str:
    """Return the job-description text, preferring pasted text over a file.

    Args:
        jd_text: Text pasted into the UI; used as-is when it is not blank.
        jd_file: Uploaded file, either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            Ignored when ``jd_text`` is not blank.

    Returns:
        The pasted text, otherwise the text read from the file, otherwise "".
    """
    if jd_text and jd_text.strip():
        return jd_text
    if jd_file:
        # Accept plain paths as well as file wrappers, so the function does
        # not depend on the upload component attaching a ``.name`` attribute.
        if isinstance(jd_file, (str, os.PathLike)):
            path = os.fspath(jd_file)
        else:
            path = jd_file.name
        return read_file_safely(path)
    return ""
144
+
145
def load_resume(resume_file) -> Tuple[str, str]:
    """Read one uploaded resume.

    Args:
        resume_file: Either a filesystem path (``str`` or ``os.PathLike``) or
            an object exposing the path as ``.name``; falsy means "no file".

    Returns:
        ``(text, filename)`` where ``filename`` is the base name of the path,
        or ``("", "")`` when no file was given.
    """
    if not resume_file:
        return "", ""
    # Accept plain paths as well as file wrappers, so the function does not
    # depend on the upload component attaching a ``.name`` attribute.
    if isinstance(resume_file, (str, os.PathLike)):
        path = os.fspath(resume_file)
    else:
        path = resume_file.name
    fname = os.path.basename(path)
    text = read_file_safely(path)
    return text, fname
151
+
152
+ # LLM-based Extraction Functions
153
def llm_extract_jd(jd_text: str, api_key: str, model: str, temperature: float = 0.1) -> Dict:
    """Ask the LLM to turn a raw job description into a structured dict.

    Args:
        jd_text: Raw job-description text; only the first 20000 characters are sent.
        api_key: Bearer token forwarded to ``deepinfra_chat``.
        model: Model identifier forwarded to ``deepinfra_chat``.
        temperature: Sampling temperature forwarded to ``deepinfra_chat``.

    Returns:
        The parsed JSON object, or ``{}`` when the reply is not a JSON object
        (callers use ``.get`` on the result, so a list must not leak through).
    """
    messages = [
        {"role": "system", "content": JD_SYSTEM},
        {"role": "user", "content": (jd_text or "")[:20000]},
    ]
    raw = deepinfra_chat(messages, api_key=api_key, model=model, temperature=temperature)
    parsed = safe_json_loads(raw)
    return parsed if isinstance(parsed, dict) else {}
160
+
161
def llm_extract_resume(resume_text: str, api_key: str, model: str, temperature: float = 0.1) -> Dict:
    """Ask the LLM to turn raw resume text into a structured candidate dict.

    Args:
        resume_text: Raw resume text; only the first 20000 characters are sent.
        api_key: Bearer token forwarded to ``deepinfra_chat``.
        model: Model identifier forwarded to ``deepinfra_chat``.
        temperature: Sampling temperature forwarded to ``deepinfra_chat``.

    Returns:
        The parsed JSON object, or ``{}`` when the reply is not a JSON object.
    """
    messages = [
        {"role": "system", "content": RESUME_SYSTEM},
        {"role": "user", "content": (resume_text or "")[:20000]},
    ]
    raw = deepinfra_chat(messages, api_key=api_key, model=model, temperature=temperature)
    parsed = safe_json_loads(raw)
    return parsed if isinstance(parsed, dict) else {}
168
+
169
def llm_feedback(jd_struct: Dict, resume_struct: Dict, api_key: str, model: str, temperature: float = 0.2) -> Dict:
    """Ask the LLM for structured feedback on one candidate against one job.

    The two structures are serialised together as
    ``{"job": ..., "candidate": ...}`` and sent as the user message.

    Args:
        jd_struct: Parsed job description.
        resume_struct: Parsed candidate profile.
        api_key: Bearer token forwarded to ``deepinfra_chat``.
        model: Model identifier forwarded to ``deepinfra_chat``.
        temperature: Sampling temperature forwarded to ``deepinfra_chat``.

    Returns:
        The parsed JSON object, or ``{}`` when the reply is not a JSON object.
    """
    prompt = json.dumps({"job": jd_struct, "candidate": resume_struct}, ensure_ascii=False)
    messages = [
        {"role": "system", "content": FEEDBACK_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    raw = deepinfra_chat(messages, api_key=api_key, model=model, temperature=temperature)
    parsed = safe_json_loads(raw)
    return parsed if isinstance(parsed, dict) else {}
177
+
178
+ # =========================
179
+ # Scoring via LLM (0..10)
180
+ # =========================
181
def prompt_for_match(jd_struct: Dict[str, Any], cv_structs: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Build the chat messages that ask the LLM to score every candidate.

    Each candidate is reduced to a compact record (contact fields, up to 50
    skills, 30 experience titles, 20 degrees, 20 languages) to reduce tokens.
    Fields the resume parser returned as ``null`` or with an unexpected type
    are treated as empty instead of raising.

    Args:
        jd_struct: Parsed job description.
        cv_structs: Parsed candidate profiles.

    Returns:
        A two-element list: the system message with the scoring rules and the
        user message carrying the role and the compact candidates as JSON.
    """

    def _as_list(value: Any) -> list:
        # LLM output is not schema-checked: null or a scalar becomes [].
        return value if isinstance(value, list) else []

    # compact candidates to reduce tokens
    compact_cands = []
    for c in cv_structs:
        skills = _as_list(c.get("skills_en")) or _as_list(c.get("skills"))
        titles = [e.get("title", "") for e in _as_list(c.get("experience")) if isinstance(e, dict)]
        degrees = [e.get("degree", "") for e in _as_list(c.get("education")) if isinstance(e, dict)]
        compact_cands.append({
            "name": c.get("name", ""),
            "email": c.get("email", ""),
            "phone": c.get("phone", ""),
            "skills": skills[:50],
            "experience_titles": titles[:30],
            "education": degrees[:20],
            "languages": _as_list(c.get("languages"))[:20],
        })

    system = (
        "You are ranking candidates for a role. Output STRICT JSON ONLY:\n"
        "{ \"candidates\": [ { \"candidate\": str, \"score\": number (0-10), \"justification\": str } ] }\n"
        "Scoring criteria (weight them reasonably):\n"
        "- Must-have skills coverage and relevant years\n"
        "- Nice-to-have skills and domain fit\n"
        "- Evidence quality in work history/education\n"
        "- Language/locale requirements if any\n"
        "IMPORTANT:\n"
        "- The 'candidate' MUST EXACTLY EQUAL the resume 'name' field provided.\n"
        "- No extra keys. No markdown."
    )
    user = (
        "Role (parsed JSON):\n"
        f"{json.dumps(jd_struct, ensure_ascii=False)}\n\n"
        "Candidates (compact JSON):\n"
        f"{json.dumps(compact_cands, ensure_ascii=False)}"
    )
    return [{"role": "system", "content": system}, {"role": "user", "content": user}]
214
+
215
# Matches plain-text ranking lines such as "1. Name — 8.0/10"; the separator
# may be an em dash, an en dash or a hyphen. Groups: rank, name, score.
RANK_LINE_RE = re.compile(
    r"^\s*(\d+)\.\s*(.*?)\s*[\u2014\u2013\-]\s*([0-9]+(?:\.[0-9]+)?)\s*/\s*10\b",
    re.M,
)


def _coerce_score(value: Any) -> float:
    """Convert an LLM-provided score to a float clamped to the 0..10 scale."""
    if isinstance(value, str):
        # Tolerate decorated values such as "8/10" or "score: 7.5".
        found = re.search(r"-?\d+(?:\.\d+)?", value)
        value = found.group(0) if found else 0
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    if score != score:  # NaN never compares equal to itself
        return 0.0
    return min(10.0, max(0.0, score))


def _coerce_row(item: Any) -> Any:
    """Normalise one JSON ranking entry; return None for non-dict entries."""
    if not isinstance(item, dict):
        return None
    return {
        "candidate": str(item.get("candidate") or "").strip(),
        "score": _coerce_score(item.get("score", 0)),
        "justification": str(item.get("justification") or "").strip(),
    }


def parse_ranked_output(content: str) -> List[Dict[str, Any]]:
    """Turn the scoring reply into rows of candidate, score and justification.

    Strict JSON is preferred: either ``{"candidates": [...]}`` or a bare list
    of entries. If neither is present, lines shaped like "1. Name — 8.0/10"
    are parsed instead. If nothing can be parsed, a single ``RAW_OUTPUT`` row
    carrying the first 2000 characters of the reply is returned so the caller
    still has something to show.

    Scores that are missing, non-numeric or out of range never raise: they are
    coerced to a float within 0..10. Entries that are not JSON objects are
    skipped.
    """
    text = content or ""
    parsed = safe_json_loads(text)

    if isinstance(parsed, dict):
        items = parsed.get("candidates")
    elif isinstance(parsed, list):
        items = parsed
    else:
        items = None

    if isinstance(items, list):
        return [row for row in map(_coerce_row, items) if row is not None]

    rows: List[Dict[str, Any]] = [
        {"candidate": m.group(2).strip(), "score": _coerce_score(m.group(3)), "justification": ""}
        for m in RANK_LINE_RE.finditer(text)
    ]
    if not rows:
        rows = [{"candidate": "RAW_OUTPUT", "score": 0.0, "justification": text[:2000]}]
    return rows
246
+
247
+
248
+ # =========================
249
+ # Pipeline
250
+ # =========================
251
def _normalize_candidate(cand_struct, fallback_name, contacts, taken_names):
    """Fill missing, blank or mistyped fields of one parsed resume.

    The name falls back to ``fallback_name`` and is suffixed with " (2)",
    " (3)", ... when it is already in ``taken_names``, so that lookups keyed
    by name stay unambiguous. List fields are forced to lists, and e-mail and
    phone fall back to the regex guesses in ``contacts``.
    """
    if not isinstance(cand_struct, dict):
        cand_struct = {}

    base_name = str(cand_struct.get("name") or "").strip() or fallback_name
    unique_name, suffix = base_name, 2
    while unique_name in taken_names:
        unique_name = f"{base_name} ({suffix})"
        suffix += 1
    cand_struct["name"] = unique_name

    for key in ("skills", "skills_en", "education", "experience", "languages"):
        if not isinstance(cand_struct.get(key), list):
            cand_struct[key] = []

    # ``setdefault`` would keep an empty "" or null returned by the LLM, so
    # the fallback is applied explicitly.
    cand_struct["email"] = cand_struct.get("email") or contacts.get("email_guess") or ""
    cand_struct["phone"] = cand_struct.get("phone") or contacts.get("phone_guess") or ""
    return cand_struct


def _name_key(name) -> str:
    """Key used to match LLM-returned names: case- and whitespace-insensitive."""
    return " ".join(str(name).split()).casefold()


def process(
    jd_text,
    jd_file,
    resume_files,
    api_key_pw,
    model_name,
    temperature,
    top_n,
    w_skill,  # kept for UI compatibility (unused here)
    w_qual,   # kept for UI compatibility (unused here)
    w_resp,   # kept for UI compatibility (unused here)
):
    """Run the whole pipeline: parse the JD, parse resumes, score, and report.

    Args:
        jd_text: Job description pasted in the UI (may be empty).
        jd_file: Uploaded job-description file, used when ``jd_text`` is blank.
        resume_files: Uploaded resumes; only the first 50 are processed.
        api_key_pw: API key typed in the UI; falls back to DEEPINFRA_API_KEY.
        model_name: Model identifier; falls back to DEFAULT_MODEL when empty.
        temperature: Sampling temperature for the scoring call.
        top_n: Number of rows shown in the result table.
        w_skill, w_qual, w_resp: Unused; kept so the UI wiring stays valid.

    Returns:
        ``(metrics_md, df_show, csv_path, jd_pretty, top5_md)``: timing
        metrics as Markdown, the ranked table, the path of the exported CSV,
        the parsed JD, and Markdown details for the five best candidates.

    Raises:
        gr.Error: If the API key, the job description or the resumes are missing.
    """
    t0 = time.perf_counter()

    api_key = (api_key_pw or "").strip() or (DEEPINFRA_API_KEY or "").strip()
    if not api_key:
        raise gr.Error("Missing API key. Set DEEPINFRA_API_KEY env var or use the password field.")
    if not model_name:
        model_name = DEFAULT_MODEL

    # --- JD ---
    t_jd_start = time.perf_counter()
    jd_raw = load_job_description(jd_text or "", jd_file)
    if not jd_raw.strip():
        raise gr.Error("Please paste a Job Description or upload a JD file.")
    jd_struct = llm_extract_jd(jd_raw, api_key=api_key, model=model_name)
    if not isinstance(jd_struct, dict):
        jd_struct = {}
    t_jd = time.perf_counter() - t_jd_start

    # --- Resumes parse ---
    if not resume_files:
        raise gr.Error("Please upload at least one resume (PDF or DOCX).")

    parsed_cands = []
    name_to_file = {}
    t_parse_total = 0.0

    for f in resume_files[:50]:  # cap to avoid huge batches
        t_parse_s = time.perf_counter()
        text, fname = load_resume(f)
        contacts = quick_contacts(text)
        cand_struct = llm_extract_resume(text, api_key=api_key, model=model_name)
        cand_struct = _normalize_candidate(
            cand_struct, os.path.splitext(fname)[0], contacts, name_to_file
        )
        parsed_cands.append(cand_struct)
        name_to_file[cand_struct["name"]] = fname
        t_parse_total += time.perf_counter() - t_parse_s

    # --- Matching (single LLM call for all candidates) ---
    t_match_start = time.perf_counter()
    match_msgs = prompt_for_match(jd_struct, parsed_cands)
    raw_match = deepinfra_chat(match_msgs, api_key=api_key, model=model_name, temperature=temperature)
    ranked_rows = parse_ranked_output(raw_match)
    t_match_total = time.perf_counter() - t_match_start

    score_map = {
        _name_key(r["candidate"]): (float(r.get("score", 0.0)), r.get("justification", ""))
        for r in ranked_rows
    }

    records = []
    for c in parsed_cands:
        nm = c.get("name", "")
        # if the LLM didn't return this name, default to 0
        sc, just = score_map.get(_name_key(nm), (0.0, ""))
        records.append({
            "name": nm,
            "score": sc,
            "justification": just,
            "email": c.get("email", ""),
            "phone": c.get("phone", ""),
            "file": name_to_file.get(nm, ""),
        })
    # One stable sort (score DESC) drives the table, the CSV and the details,
    # so the three views can never disagree on the order.
    records.sort(key=lambda r: r["score"], reverse=True)

    df = pd.DataFrame(
        [
            {
                "Candidate": r["name"],
                "Score": round(r["score"], 1),
                "Email": r["email"],
                "Phone": r["phone"],
                "File": r["file"],
            }
            for r in records
        ],
        columns=["Candidate", "Score", "Email", "Phone", "File"],
    )
    df_show = df.head(int(top_n)) if top_n and isinstance(top_n, (int, float)) else df

    # CSV export: rank, candidate, Score, file, justification
    export_with_rank = [
        {
            "rank": i,
            "candidate": r["name"],
            "Score": round(r["score"], 1),
            "file": r["file"],
            "justification": r["justification"],
        }
        for i, r in enumerate(records, start=1)
    ]
    # Reserve a path and close the handle right away, so no descriptor leaks
    # and pandas can reopen the file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        csv_path = tmp.name
    pd.DataFrame(
        export_with_rank, columns=["rank", "candidate", "Score", "file", "justification"]
    ).to_csv(csv_path, index=False, encoding="utf-8")

    # Candidate Details: top 5 only (based on score)
    top5_md = "\n\n".join(
        f"### {r['name']} \u2014 {r['score']:.1f}/10\n"
        f"**File:** {r['file']}\n"
        f"**Email:** {r['email']} | **Phone:** {r['phone']}\n"
        "\n"
        f"**Justification:** {r['justification']}\n"
        for r in records[:5]
    )

    # metrics
    t_total = time.perf_counter() - t0
    avg_parse = t_parse_total / max(1, len(parsed_cands))
    metrics_md = (
        "### Processing Metrics\n"
        f"- JD parsing: {t_jd:.2f}s\n"
        f"- Resume parsing (avg): {avg_parse:.2f}s\n"
        f"- Matching (single LLM call): {t_match_total:.2f}s\n"
        f"- Total (all candidates): {t_total:.2f}s\n"
    )

    jd_pretty = {
        "title": jd_struct.get("title", ""),
        "seniority": jd_struct.get("seniority", ""),
        "skills": jd_struct.get("skills", []),
        "qualifications": jd_struct.get("qualifications", []),
        "responsibilities": jd_struct.get("responsibilities", []),
        "nice_to_have": jd_struct.get("nice_to_have", []),
    }

    return metrics_md, df_show, csv_path, jd_pretty, top5_md
387
+
388
+
389
+ # =========================
390
+ # Gradio UI
391
+ # =========================
392
# Layout: inputs and settings in the left column, results in the right one.
# The event bindings must be created inside the Blocks context.
with gr.Blocks(title="JD ↔ Resume Matcher") as demo:
    gr.Markdown("# 📌 JD ↔ Resume Matcher\nPaste a Job Description and upload resumes to rank candidates (Score 0–10), get Top-5 details, and download a CSV.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Job Description")
            jd_text = gr.Textbox(label="Paste JD (any language)", lines=12, placeholder="Paste the JD text here...")
            jd_file = gr.File(label="...or upload JD file (.pdf / .docx / .txt)", file_count="single", type="filepath")

            gr.Markdown("### 👤 Resumes")
            resumes = gr.Files(label="Upload resumes (.pdf / .docx)", file_count="multiple", type="filepath")

            with gr.Accordion("⚙️ Settings", open=False):
                api_key_pw = gr.Textbox(label="DeepInfra API Key (optional, overrides env var)", value="", type="password")
                model_name = gr.Textbox(label="Model", value=DEFAULT_MODEL)
                temperature = gr.Slider(label="Model temperature", minimum=0.0, maximum=1.0, value=0.2, step=0.05)
                top_n = gr.Slider(label="Show top N candidates (table)", minimum=1, maximum=50, value=10, step=1)

                # keep sliders (unused now) to avoid UI breaking changes
                w_skill = gr.Slider(label="(unused) Weight: Skills overlap", minimum=0.0, maximum=1.0, value=0.6, step=0.05)
                w_qual = gr.Slider(label="(unused) Weight: Qualifications match", minimum=0.0, maximum=1.0, value=0.2, step=0.05)
                w_resp = gr.Slider(label="(unused) Weight: Responsibilities match", minimum=0.0, maximum=1.0, value=0.2, step=0.05)

            run_btn = gr.Button("🔎 Rank & Score", variant="primary")
            clear_btn = gr.Button("Clear")

        with gr.Column(scale=1):
            gr.Markdown("### 📊 Results")
            metrics_md = gr.Markdown()
            ranked_df = gr.DataFrame(row_count=(5, "dynamic"), wrap=True, label="Ranked Candidates (by Score)")
            csv_out = gr.File(label="Download Ranked CSV")
            gr.Markdown("### 🧩 Parsed JD")
            jd_json = gr.JSON()
            gr.Markdown("### 🗒️ Candidate Details (Top 5)")
            details_md = gr.Markdown()

    # The input order must match the positional parameters of ``process``.
    run_btn.click(
        fn=process,
        inputs=[jd_text, jd_file, resumes, api_key_pw, model_name, temperature, top_n, w_skill, w_qual, w_resp],
        outputs=[metrics_md, ranked_df, csv_out, jd_json, details_md],
    )

    def clear_all():
        """Return reset values for the inputs and outputs wired to the Clear button."""
        # Reset key fields/outputs; sliders keep defaults
        return (
            "",              # jd_text
            None,            # jd_file
            None,            # resumes
            "",              # api_key_pw
            DEFAULT_MODEL,   # model_name
            "",              # metrics_md
            pd.DataFrame(),  # ranked_df
            None,            # csv_out
            {},              # jd_json
            "",              # details_md
        )

    # The output order must match the tuple returned by ``clear_all``.
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[jd_text, jd_file, resumes, api_key_pw, model_name, metrics_md, ranked_df, csv_out, jd_json, details_md],
    )

if __name__ == "__main__":
    demo.launch()