Spaces:

afouda
/

Rank_of_CVS

Sleeping

App Files Files Community

afouda committed on Aug 18, 2025

Commit

33b4426

verified ·

1 Parent(s): f4fe644

Add app file

Browse files

Files changed (1) hide show

app.py +456 -0

app.py ADDED Viewed

	@@ -0,0 +1,456 @@

+import os
+import re
+import json
+import csv
+import tempfile
+import time
+from typing import List, Dict, Any, Tuple
+import requests
+import PyPDF2
+import docx2txt
+import gradio as gr
+import pandas as pd
# Global Configuration
# The API key is a secret and must not live in source control. It is read from
# the DEEPINFRA_API_KEY environment variable (on Hugging Face Spaces, define it
# as a Space secret); an empty string means "not configured".
# NOTE(review): the key that was previously committed on this line is exposed
# in the repository history and should be revoked.
DEEPINFRA_API_KEY = os.environ.get("DEEPINFRA_API_KEY", "").strip()
# OpenAI-compatible chat-completions endpoint used for every LLM call.
DEEPINFRA_BASE_URL = "https://api.deepinfra.com/v1/openai/chat/completions"
# Model used when the UI "Model" field is left empty.
DEFAULT_MODEL = "openai/gpt-oss-120b"
# Per-request HTTP timeout, in seconds.
REQUEST_TIMEOUT_SECS = 120
# Prompts for LLM Calls
# Each system prompt is assembled one output line per source line; every line,
# including the last, ends with a newline.

# System prompt for turning a job description into structured JSON.
JD_SYSTEM = (
    "You are an expert recruitment analyst. Extract a job description into STRICT JSON.\n"
    "Rules:\n"
    "- Output ONLY JSON (no markdown, no prose).\n"
    "- If the JD language is not English, still output keys in English but translate skills into an additional 'skills_en' array.\n"
    "- Keep items short and normalized (e.g., 'python', 'sql').\n"
    "Schema:\n"
    "{\n"
    '  "title": "",\n'
    '  "seniority": "",\n'
    '  "skills": [],\n'
    '  "skills_en": [],\n'
    '  "qualifications": [],\n'
    '  "responsibilities": [],\n'
    '  "nice_to_have": []\n'
    "}\n"
)

# System prompt for turning a resume into a structured candidate profile.
RESUME_SYSTEM = (
    "You are an expert resume parser. Extract a candidate profile into STRICT JSON.\n"
    "Rules:\n"
    "- Output ONLY JSON (no markdown, no prose).\n"
    "- Provide 'skills_en' translated/normalized to English for matching.\n"
    "- Keep arrays compact, deduplicate entries.\n"
    "Schema:\n"
    "{\n"
    '  "name": "",\n'
    '  "email": "",\n'
    '  "phone": "",\n'
    '  "skills": [],\n'
    '  "skills_en": [],\n'
    '  "education": [{"degree":"", "field":"", "institution":"", "year":""}],\n'
    '  "experience": [{"title":"", "company":"", "start_date":"", "end_date":"", "summary":""}],\n'
    '  "languages": []\n'
    "}\n"
)

# System prompt for job-vs-candidate feedback, answered in the JD's language.
FEEDBACK_SYSTEM = (
    "You are an expert technical recruiter. Compare a job and a candidate and return STRICT JSON with actionable feedback.\n"
    "Respond in the job description's language.\n"
    "Schema:\n"
    "{\n"
    '  "overall_summary": "",\n'
    '  "strengths": [],\n'
    '  "weaknesses": [],\n'
    '  "missing_requirements": [],\n'
    '  "suggestions": []\n'
    "}\n"
    "Keep each bullet short (max ~12 words).\n"
    "Output ONLY JSON.\n"
)
# Helper Functions
def _pdf_to_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, newline-joined.

    A page whose extraction yields nothing contributes an empty string, so the
    number of joined segments always equals the number of pages.
    """
    with open(path, "rb") as pdf_handle:
        pdf_pages = PyPDF2.PdfReader(pdf_handle).pages
        # Extraction must happen while the file is still open.
        return "\n".join([page.extract_text() or "" for page in pdf_pages])
+def _txt_to_text(path: str) -> str:
+    with open(path, "r", encoding="utf-8", errors="ignore") as f:
+        return f.read()
def _docx_to_text(path: str) -> str:
    """Extract the text of the .docx file at ``path``; never returns ``None``."""
    extracted = docx2txt.process(path)
    return extracted if extracted else ""
def read_file_safely(path: str) -> str:
    """Extract text from a .pdf, .txt or .docx file without ever raising.

    The extension check is case-insensitive. Instead of raising, problems are
    reported in-band as a bracketed message:

    - ``[Unsupported file type: <basename>]`` for any other extension;
    - ``[Error reading file: <exception>]`` when anything goes wrong.
    """
    try:
        lowered = path.lower()
        if not lowered.endswith((".pdf", ".txt", ".docx")):
            return f"[Unsupported file type: {os.path.basename(path)}]"
        if lowered.endswith(".pdf"):
            extracted = _pdf_to_text(path)
        elif lowered.endswith(".txt"):
            extracted = _txt_to_text(path)
        else:
            extracted = _docx_to_text(path)
        return extracted
    except Exception as exc:
        return f"[Error reading file: {exc}]"
def safe_json_loads(text: str) -> Any:
    """Best-effort extraction of a JSON object or array from an LLM reply.

    Models do not always honour "output ONLY JSON", so several candidates are
    tried in order and the first one that parses to a dict or list wins:

    1. the body of the first Markdown code fence (with or without a ``json``
       language tag);
    2. the whole text;
    3. the outermost ``{...}`` and ``[...]`` spans, earliest opener first, to
       cope with prose before or after the payload.

    Args:
        text: Raw model output. ``None``, non-strings and blank strings are
            treated as "no JSON".

    Returns:
        The parsed dict or list, or ``{}`` when nothing parses. JSON scalars
        (numbers, strings, ...) are also mapped to ``{}``.
    """
    if not isinstance(text, str) or not text.strip():
        return {}

    candidates = []
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1))
    candidates.append(text)

    # Fallback: slice from the first opener to the last matching closer.
    spans = []
    for opener, closer in (("{", "}"), ("[", "]")):
        start, end = text.find(opener), text.rfind(closer)
        if 0 <= start < end:
            spans.append((start, text[start:end + 1]))
    spans.sort(key=lambda span: span[0])
    candidates.extend(snippet for _start, snippet in spans)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (TypeError, ValueError):  # JSONDecodeError subclasses ValueError
            continue
        if isinstance(parsed, (dict, list)):
            return parsed
    return {}
def deepinfra_chat(messages: List[Dict[str, str]], api_key: str, model: str, temperature: float = 0.2) -> str:
    """Send one chat-completion request to DeepInfra and return the reply text.

    Args:
        messages: Chat messages (``{"role": ..., "content": ...}`` dicts).
        api_key: Bearer token for the API.
        model: Model identifier to request.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The stripped ``content`` of the first choice, or ``""`` when the
        response has no choices or no textual content.

    Raises:
        RuntimeError: If ``api_key`` is empty.
        Exceptions raised by ``requests`` (network failures, the HTTP error
        raised by ``raise_for_status``, an undecodable body) propagate
        unchanged.
    """
    if not api_key:
        raise RuntimeError("Missing API Key.")
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    resp = requests.post(
        DEEPINFRA_BASE_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps(payload),
        timeout=REQUEST_TIMEOUT_SECS,
    )
    resp.raise_for_status()
    data = resp.json()

    # Walk the response defensively: an empty "choices" list, a null "message"
    # or non-string "content" must yield "" rather than an exception.
    choices = data.get("choices") if isinstance(data, dict) else None
    if not isinstance(choices, list) or not choices:
        return ""
    first = choices[0]
    message = first.get("message") if isinstance(first, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    return content.strip() if isinstance(content, str) else ""
# Compiled once at import time instead of on every call.
_EMAIL_RE = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")
_PHONE_RE = re.compile(r"(\+\d{1,3}\s?)?(\(\d{1,4}\)|\d{1,4})[-.\s]?\d{1,4}[-.\s]?\d{1,9}")
# The phone pattern alone accepts any run of three or more digits (years, zip
# codes, ...), so a match is only kept if its digit count is plausible.
_PHONE_MIN_DIGITS = 7
_PHONE_MAX_DIGITS = 15


def quick_contacts(text: str) -> dict:
    """Guess an e-mail address and a phone number from free text via regex.

    Args:
        text: Raw text, typically resume text; ``None`` is treated as empty.

    Returns:
        ``{"email_guess": ..., "phone_guess": ...}``. Each value is the first
        acceptable match, or ``None``. A phone match is acceptable when it
        contains between 7 and 15 digits; this is a heuristic, not validation.
    """
    text = text or ""
    email_match = _EMAIL_RE.search(text)

    phone_guess = None
    for phone_match in _PHONE_RE.finditer(text):
        candidate = phone_match.group(0)
        digit_count = sum(ch.isdigit() for ch in candidate)
        if _PHONE_MIN_DIGITS <= digit_count <= _PHONE_MAX_DIGITS:
            phone_guess = candidate
            break

    return {
        "email_guess": email_match.group(0) if email_match else None,
        "phone_guess": phone_guess,
    }
def load_job_description(jd_text: str, jd_file) -> str:
    """Return the job description, preferring pasted text over an upload.

    Args:
        jd_text: Text pasted in the UI; used verbatim when it is not blank.
        jd_file: Uploaded JD. May be a filesystem path (``str`` or
            ``os.PathLike``, which is what a ``type="filepath"`` upload
            component supplies) or a file-like object exposing the path as
            ``.name``.

    Returns:
        The pasted text, else the text read from the file, else ``""``.
    """
    if jd_text and jd_text.strip():
        return jd_text
    if jd_file:
        # Path-likes must be checked first: pathlib.Path also has a ``.name``
        # attribute, but it holds only the basename.
        if isinstance(jd_file, (str, os.PathLike)):
            jd_path = os.fspath(jd_file)
        else:
            jd_path = jd_file.name
        return read_file_safely(jd_path)
    return ""
def load_resume(resume_file) -> Tuple[str, str]:
    """Read one uploaded resume.

    Args:
        resume_file: A filesystem path (``str`` or ``os.PathLike``, which is
            what a ``type="filepath"`` upload component supplies) or a
            file-like object exposing the path as ``.name``. Falsy values
            mean "no file".

    Returns:
        ``(text, filename)`` where ``filename`` is the basename of the path,
        or ``("", "")`` when no file was given.
    """
    if not resume_file:
        return "", ""
    # Path-likes must be checked first: pathlib.Path also has a ``.name``
    # attribute, but it holds only the basename.
    if isinstance(resume_file, (str, os.PathLike)):
        resume_path = os.fspath(resume_file)
    else:
        resume_path = resume_file.name
    fname = os.path.basename(resume_path)
    text = read_file_safely(resume_path)
    return text, fname
# LLM-based Extraction Functions
def llm_extract_jd(jd_text: str, api_key: str, model: str, temperature: float = 0.1) -> Dict:
    """Ask the model to parse a job description into the ``JD_SYSTEM`` schema.

    Only the first 20000 characters of ``jd_text`` are sent. The reply is run
    through ``safe_json_loads`` and that result is returned as-is.
    """
    system_msg = {"role": "system", "content": JD_SYSTEM}
    user_msg = {"role": "user", "content": jd_text[:20000]}
    reply = deepinfra_chat(
        [system_msg, user_msg],
        api_key=api_key,
        model=model,
        temperature=temperature,
    )
    return safe_json_loads(reply)
def llm_extract_resume(resume_text: str, api_key: str, model: str, temperature: float = 0.1) -> Dict:
    """Ask the model to parse a resume into the ``RESUME_SYSTEM`` schema.

    Only the first 20000 characters of ``resume_text`` are sent. The reply is
    run through ``safe_json_loads`` and that result is returned as-is.
    """
    system_msg = {"role": "system", "content": RESUME_SYSTEM}
    user_msg = {"role": "user", "content": resume_text[:20000]}
    reply = deepinfra_chat(
        [system_msg, user_msg],
        api_key=api_key,
        model=model,
        temperature=temperature,
    )
    return safe_json_loads(reply)
def llm_feedback(jd_struct: Dict, resume_struct: Dict, api_key: str, model: str, temperature: float = 0.2) -> Dict:
    """Ask the model for structured feedback on one candidate versus the job.

    The user message is a JSON document ``{"job": ..., "candidate": ...}``
    serialized with non-ASCII characters kept as-is. The reply is run through
    ``safe_json_loads`` and that result is returned as-is.
    """
    comparison = {"job": jd_struct, "candidate": resume_struct}
    user_content = json.dumps(comparison, ensure_ascii=False)
    conversation = [
        {"role": "system", "content": FEEDBACK_SYSTEM},
        {"role": "user", "content": user_content},
    ]
    reply = deepinfra_chat(conversation, api_key=api_key, model=model, temperature=temperature)
    return safe_json_loads(reply)
# =========================
# Scoring via LLM (0..10)
# =========================
def _pluck(entries: Any, key: str) -> List[Any]:
    """Collect ``key`` from a list of dict entries, tolerating sloppy data.

    Plain-string entries are kept as-is (models sometimes return a list of
    strings instead of objects); anything else is skipped. A missing or
    non-list ``entries`` yields an empty list.
    """
    if not isinstance(entries, list):
        return []
    values = []
    for entry in entries:
        if isinstance(entry, dict):
            values.append(entry.get(key) or "")
        elif isinstance(entry, str):
            values.append(entry)
    return values


def prompt_for_match(jd_struct: Dict[str, Any], cv_structs: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """Build the chat messages asking the model to score all candidates.

    Args:
        jd_struct: Parsed job description; serialized whole into the prompt.
        cv_structs: Parsed candidate profiles. Each one is reduced to a
            compact summary (contact fields, up to 50 skills, 30 experience
            titles, 20 degrees and 20 languages) to limit prompt size. Null or
            malformed list fields are treated as empty.

    Returns:
        A two-element list: the system message with the scoring instructions
        and the user message carrying the role and candidates as JSON.
    """
    # compact candidates to reduce tokens
    compact_cands = []
    for c in cv_structs:
        compact_cands.append({
            "name": c.get("name", ""),
            "email": c.get("email", ""),
            "phone": c.get("phone", ""),
            "skills": (c.get("skills_en") or c.get("skills") or [])[:50],
            "experience_titles": _pluck(c.get("experience"), "title")[:30],
            "education": _pluck(c.get("education"), "degree")[:20],
            # "or []" guards against an explicit null from the parser.
            "languages": (c.get("languages") or [])[:20],
        })
    system = (
        "You are ranking candidates for a role. Output STRICT JSON ONLY:\n"
        "{ \"candidates\": [ { \"candidate\": str, \"score\": number (0-10), \"justification\": str } ] }\n"
        "Scoring criteria (weight them reasonably):\n"
        "- Must-have skills coverage and relevant years\n"
        "- Nice-to-have skills and domain fit\n"
        "- Evidence quality in work history/education\n"
        "- Language/locale requirements if any\n"
        "IMPORTANT:\n"
        "- The 'candidate' MUST EXACTLY EQUAL the resume 'name' field provided.\n"
        "- No extra keys. No markdown."
    )
    user = (
        "Role (parsed JSON):\n"
        f"{json.dumps(jd_struct, ensure_ascii=False)}\n\n"
        "Candidates (compact JSON):\n"
        f"{json.dumps(compact_cands, ensure_ascii=False)}"
    )
    return [{"role": "system", "content": system}, {"role": "user", "content": user}]
# Matches plain-text ranking lines such as "1. Jane Doe — 8.0/10".
RANK_LINE_RE = re.compile(r"^\s*(\d+)\.\s*(.*?)\s*[—\-]\s*([0-9]+(?:\.[0-9]+)?)\s*/\s*10\b", re.M)


def _rank_row(item: Any) -> Any:
    """Normalize one JSON ranking entry, or return ``None`` if it is unusable.

    Null fields become empty strings and a missing or non-numeric score
    becomes ``0.0``, so a single malformed entry cannot abort the ranking.
    """
    if not isinstance(item, dict):
        return None
    try:
        score = float(item.get("score", 0))
    except (TypeError, ValueError):
        score = 0.0
    return {
        "candidate": str(item.get("candidate") or "").strip(),
        "score": score,
        "justification": str(item.get("justification") or "").strip(),
    }


def parse_ranked_output(content: str) -> List[Dict[str, Any]]:
    """Turn the model's ranking reply into ``candidate/score/justification`` rows.

    Parsing strategy, in order:

    1. JSON of the form ``{"candidates": [...]}`` or a bare JSON list of
       entries; entries that are not objects are skipped.
    2. Text lines of the form ``1. Name — 8.0/10`` (justification left empty).
    3. If nothing matched, a single ``RAW_OUTPUT`` row with score 0 carrying
       the first 2000 characters of the reply for debugging.
    """
    parsed = safe_json_loads(content or "")
    items = None
    if isinstance(parsed, dict) and isinstance(parsed.get("candidates"), list):
        items = parsed["candidates"]
    elif isinstance(parsed, list):
        items = parsed
    if items is not None:
        return [row for row in map(_rank_row, items) if row is not None]

    rows: List[Dict[str, Any]] = [
        {"candidate": m.group(2).strip(), "score": float(m.group(3)), "justification": ""}
        for m in RANK_LINE_RE.finditer(content or "")
    ]
    if not rows:
        rows = [{"candidate": "RAW_OUTPUT", "score": 0.0, "justification": (content or "")[:2000]}]
    return rows
# =========================
# Pipeline
# =========================
def process(
    jd_text,
    jd_file,
    resume_files,
    api_key_pw,
    model_name,
    temperature,
    top_n,
    w_skill,   # kept for UI compatibility (unused here)
    w_qual,    # kept for UI compatibility (unused here)
    w_resp,    # kept for UI compatibility (unused here)
):
    """Parse the JD and resumes, score every candidate, and build the UI outputs.

    Args:
        jd_text: Pasted job description (takes precedence over ``jd_file``).
        jd_file: Uploaded job description file, if any.
        resume_files: Uploaded resumes; only the first 50 are processed.
        api_key_pw: API key typed in the UI; falls back to ``DEEPINFRA_API_KEY``.
        model_name: Model identifier; falls back to ``DEFAULT_MODEL``.
        temperature: Sampling temperature for the single scoring call.
        top_n: Number of rows to show in the table.
        w_skill, w_qual, w_resp: Unused; kept so the UI wiring stays stable.

    Returns:
        ``(metrics_markdown, ranked_dataframe, csv_path, parsed_jd, top5_markdown)``.

    Raises:
        gr.Error: If the API key, the job description or the resumes are missing.
    """
    t0 = time.perf_counter()
    api_key = (api_key_pw or "").strip() or (DEEPINFRA_API_KEY or "").strip()
    if not api_key:
        raise gr.Error("Missing API key. Set DEEPINFRA_API_KEY env var or use the password field.")
    if not model_name:
        model_name = DEFAULT_MODEL

    # Validate all inputs before spending any LLM call.
    jd_raw = load_job_description(jd_text or "", jd_file)
    if not jd_raw.strip():
        raise gr.Error("Please paste a Job Description or upload a JD file.")
    if not resume_files:
        raise gr.Error("Please upload at least one resume (PDF or DOCX).")

    # --- JD ---
    t_jd_start = time.perf_counter()
    jd_struct = llm_extract_jd(jd_raw, api_key=api_key, model=model_name)
    if not isinstance(jd_struct, dict):
        jd_struct = {}
    t_jd = time.perf_counter() - t_jd_start

    # --- Resumes parse ---
    parsed_cands = []
    name_to_file = {}
    t_parse_total = 0.0
    for f in resume_files[:50]:  # cap to avoid huge batches
        t_parse_s = time.perf_counter()
        text, fname = load_resume(f)
        contacts = quick_contacts(text)
        cand_struct = llm_extract_resume(text, api_key=api_key, model=model_name)
        if not isinstance(cand_struct, dict):
            cand_struct = {}
        # The parser usually returns every schema key, often empty or null, so
        # fallbacks must replace falsy values, not only missing keys.
        for list_key in ("skills", "skills_en", "education", "experience", "languages"):
            if not cand_struct.get(list_key):
                cand_struct[list_key] = []
        base_name = (
            str(cand_struct.get("name") or "").strip()
            or os.path.splitext(fname)[0]
            or f"Candidate {len(parsed_cands) + 1}"
        )
        # Scores and files are looked up by name, so names must be unique.
        unique_name, suffix = base_name, 2
        while unique_name in name_to_file:
            unique_name = f"{base_name} ({suffix})"
            suffix += 1
        cand_struct["name"] = unique_name
        cand_struct["email"] = cand_struct.get("email") or contacts["email_guess"] or ""
        cand_struct["phone"] = cand_struct.get("phone") or contacts["phone_guess"] or ""
        parsed_cands.append(cand_struct)
        name_to_file[unique_name] = fname
        t_parse_total += (time.perf_counter() - t_parse_s)

    # --- Scoring: one LLM call for all candidates ---
    t_match_start = time.perf_counter()
    match_msgs = prompt_for_match(jd_struct, parsed_cands)
    raw_match = deepinfra_chat(match_msgs, api_key=api_key, model=model_name, temperature=temperature)
    ranked_rows = parse_ranked_output(raw_match)
    t_match_total = time.perf_counter() - t_match_start
    score_map = {r["candidate"]: (float(r.get("score", 0.0)), r.get("justification", "")) for r in ranked_rows}

    table_rows, export_rows, detail_blobs = [], [], []
    for c in parsed_cands:
        nm = c.get("name", "")
        sc, just = score_map.get(nm, (0.0, ""))  # if LLM didn't return this name, default 0
        source_file = name_to_file.get(nm, "")
        table_rows.append({
            "Candidate": nm,
            "Score": round(sc, 1),
            "Email": c.get("email", ""),
            "Phone": c.get("phone", ""),
            "File": source_file,
        })
        export_rows.append({
            "candidate": nm,
            "Score": round(sc, 1),
            "file": source_file,
            "justification": just,
        })
        detail_blobs.append((
            sc,
            f"""### {nm} — {sc:.1f}/10
**File:** {source_file}
**Email:** {c.get('email','')}  |  **Phone:** {c.get('phone','')}
**Justification:** {just}
""",
        ))

    # sort by Score DESC
    df = pd.DataFrame(table_rows).sort_values("Score", ascending=False, kind="mergesort")
    df_show = df.head(int(top_n)) if top_n and isinstance(top_n, (int, float)) else df

    # CSV export: rank, candidate, Score, file, justification
    sorted_items = sorted(export_rows, key=lambda r: float(r["Score"]), reverse=True)
    export_with_rank = [
        {
            "rank": i,
            "candidate": r["candidate"],
            "Score": r["Score"],
            "file": r["file"],
            "justification": r["justification"],
        }
        for i, r in enumerate(sorted_items, start=1)
    ]
    # Write through the open handle so it is closed deterministically; the
    # file itself is kept (delete=False) because the UI serves it for download.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".csv", newline="", encoding="utf-8"
    ) as csv_file:
        pd.DataFrame(
            export_with_rank, columns=["rank", "candidate", "Score", "file", "justification"]
        ).to_csv(csv_file, index=False)
        csv_path = csv_file.name

    # Candidate Details: top 5 only (based on score)
    detail_blobs_sorted = sorted(detail_blobs, key=lambda t: t[0], reverse=True)
    top5_md = "\n\n".join(md for (_score, md) in detail_blobs_sorted[:5])

    # metrics
    t_total = time.perf_counter() - t0
    avg_parse = (t_parse_total / max(1, len(parsed_cands)))
    metrics_md = (
f"""### Processing Metrics
- JD parsing: {t_jd:.2f}s
- Resume parsing (avg): {avg_parse:.2f}s
- Matching (single LLM call): {t_match_total:.2f}s
- Total (all candidates): {t_total:.2f}s
""")
    jd_pretty = {
        "title": jd_struct.get("title", ""),
        "seniority": jd_struct.get("seniority", ""),
        "skills": jd_struct.get("skills", []),
        "qualifications": jd_struct.get("qualifications", []),
        "responsibilities": jd_struct.get("responsibilities", []),
        "nice_to_have": jd_struct.get("nice_to_have", []),
    }
    return metrics_md, df_show, csv_path, jd_pretty, top5_md
# =========================
# Gradio UI
# =========================
# Components are created in display order: inputs and settings in the left
# column, results in the right column.
with gr.Blocks(title="JD ↔ Resume Matcher") as demo:
    gr.Markdown(
        "# 📌 JD ↔ Resume Matcher\nPaste a Job Description and upload resumes to rank candidates (Score 0–10), get Top-5 details, and download a CSV."
    )
    with gr.Row():
        # Left column: inputs, settings and action buttons.
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Job Description")
            jd_text = gr.Textbox(
                label="Paste JD (any language)",
                lines=12,
                placeholder="Paste the JD text here...",
            )
            jd_file = gr.File(
                label="...or upload JD file (.pdf / .docx / .txt)",
                file_count="single",
                type="filepath",
            )
            gr.Markdown("### 👤 Resumes")
            resumes = gr.Files(
                label="Upload resumes (.pdf / .docx)",
                file_count="multiple",
                type="filepath",
            )
            with gr.Accordion("⚙️ Settings", open=False):
                api_key_pw = gr.Textbox(
                    label="DeepInfra API Key (optional, overrides env var)",
                    value="",
                    type="password",
                )
                model_name = gr.Textbox(label="Model", value=DEFAULT_MODEL)
                temperature = gr.Slider(
                    label="Model temperature",
                    minimum=0.0, maximum=1.0, value=0.2, step=0.05,
                )
                top_n = gr.Slider(
                    label="Show top N candidates (table)",
                    minimum=1, maximum=50, value=10, step=1,
                )
                # keep sliders (unused now) to avoid UI breaking changes
                w_skill = gr.Slider(
                    label="(unused) Weight: Skills overlap",
                    minimum=0.0, maximum=1.0, value=0.6, step=0.05,
                )
                w_qual = gr.Slider(
                    label="(unused) Weight: Qualifications match",
                    minimum=0.0, maximum=1.0, value=0.2, step=0.05,
                )
                w_resp = gr.Slider(
                    label="(unused) Weight: Responsibilities match",
                    minimum=0.0, maximum=1.0, value=0.2, step=0.05,
                )
            run_btn = gr.Button("🔎 Rank & Score", variant="primary")
            clear_btn = gr.Button("Clear")
        # Right column: everything produced by a run.
        with gr.Column(scale=1):
            gr.Markdown("### 📊 Results")
            metrics_md = gr.Markdown()
            ranked_df = gr.DataFrame(
                row_count=(5, "dynamic"),
                wrap=True,
                label="Ranked Candidates (by Score)",
            )
            csv_out = gr.File(label="Download Ranked CSV")
            gr.Markdown("### 🧩 Parsed JD")
            jd_json = gr.JSON()
            gr.Markdown("### 🗒️ Candidate Details (Top 5)")
            details_md = gr.Markdown()

    # "Rank & Score" feeds every input to the pipeline and fans the five
    # return values out to the result widgets.
    _run_inputs = [
        jd_text, jd_file, resumes, api_key_pw, model_name,
        temperature, top_n, w_skill, w_qual, w_resp,
    ]
    _run_outputs = [metrics_md, ranked_df, csv_out, jd_json, details_md]
    run_btn.click(fn=process, inputs=_run_inputs, outputs=_run_outputs)

    def clear_all():
        """Return reset values for the cleared widgets; sliders keep their values."""
        empty_text = ""
        no_upload = None
        return (
            empty_text,      # jd_text
            no_upload,       # jd_file
            no_upload,       # resumes
            empty_text,      # api_key_pw
            DEFAULT_MODEL,   # model_name
            empty_text,      # metrics_md
            pd.DataFrame(),  # ranked_df
            no_upload,       # csv_out
            {},              # jd_json
            empty_text,      # details_md
        )

    # The order here must match the tuple returned by clear_all().
    _clear_targets = [
        jd_text, jd_file, resumes, api_key_pw, model_name,
        metrics_md, ranked_df, csv_out, jd_json, details_md,
    ]
    clear_btn.click(fn=clear_all, inputs=[], outputs=_clear_targets)

if __name__ == "__main__":
    demo.launch()