# app.py — SGS ATS Candidate Matcher (HF Inference API ONLY, Spaces-safe) # ✅ No transformers / torch / sentence-transformers # ✅ Uses ONLY huggingface_hub.InferenceClient (works with hub 1.x) # ✅ Top 10 executive report + shortlist + exports + contacts + progress # ✅ Max CV uploads = 10 # # Space secret required: # HF_TOKEN (Settings → Secrets) # # Optional env vars: # LLM_MODEL (default: Qwen/Qwen2.5-7B-Instruct) # EMBED_MODEL (default: sentence-transformers/all-MiniLM-L6-v2) # LLM_BATCH_SIZE, LLM_MAX_TOKENS, LLM_TEMPERATURE import os import re import json import time import csv import hashlib import tempfile from typing import List, Dict, Any, Optional, Tuple import numpy as np import pandas as pd import gradio as gr from huggingface_hub import InferenceClient from huggingface_hub.errors import BadRequestError, HfHubHTTPError from pydantic import BaseModel, Field from pypdf import PdfReader import docx2txt # ========================================================= # Models (Inference API) # ========================================================= # NOTE: Meta Llama repos are often gated on Hugging Face. # If you have access, you can set LLM_MODEL to e.g. "meta-llama/Llama-3.1-8B-Instruct". 
LLM_MODEL = os.getenv("LLM_MODEL", "Qwen/Qwen2.5-7B-Instruct")
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# =========================================================
# Controls
# =========================================================
MAX_CV_UPLOADS = 10
MAX_CV_CHARS = 120_000
MAX_JD_CHARS = 60_000

CHUNK_SIZE_CHARS = 1100
CHUNK_OVERLAP_CHARS = 180

TOP_CHUNKS_PER_CV = 10  # retrieval
EVIDENCE_CHUNKS_PER_CV = 4  # sent to LLM judge

# Clamp to >= 1: the batch count in rank_app divides by this value, so an
# environment override of 0 (or a negative number) must not reach it.
LLM_BATCH_SIZE = max(1, int(os.getenv("LLM_BATCH_SIZE", "3")))
LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "2600"))
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.15"))

ALLOW_LEXICAL_FALLBACK = True


# =========================================================
# Output schemas
# =========================================================
class RequirementCheck(BaseModel):
    """One checklist row: a requirement, its status and supporting evidence."""

    requirement: str
    status: str = Field(..., description="met | partial | missing")
    evidence: str = Field(..., description="short quote <=160 chars or empty")


class CandidateLLMResult(BaseModel):
    """The judge's verdict for a single CV, keyed by its filename."""

    filename: str
    final_score: float = Field(..., description="0-100")
    fit_level: str = Field(..., description="excellent | good | maybe | weak")
    summary: str
    strengths: List[str]
    gaps: List[str]
    risks: List[str]
    checklist: List[RequirementCheck]
    top_evidence: List[str]


class LLMRankingOutput(BaseModel):
    """Top-level JSON object the LLM is asked to return for a batch."""

    ranked: List[CandidateLLMResult]
    overall_notes: str


# =========================================================
# Client
# =========================================================
_hf_client: Optional[InferenceClient] = None


def get_hf_client() -> InferenceClient:
    """Return the process-wide InferenceClient, creating it on first use.

    Raises:
        gr.Error: if the HF_TOKEN environment variable is missing or blank.
    """
    global _hf_client
    if _hf_client is not None:
        return _hf_client

    token = os.getenv("HF_TOKEN", "").strip()
    if not token:
        raise gr.Error("HF_TOKEN is not set. Add it in Space Settings → Repository secrets.")

    _hf_client = InferenceClient(token=token)
    return _hf_client
# =========================================================
# Text + files
# =========================================================
def gr_file_to_path(f: Any) -> Optional[str]:
    """Return the filesystem path behind an uploaded-file value, or None.

    Accepts a plain path string, a dict carrying a "path" key, or any object
    exposing a ``name`` attribute; anything else yields None.
    """
    if f is None:
        return None
    if isinstance(f, str):
        return f
    if isinstance(f, dict) and "path" in f:
        return f["path"]
    if hasattr(f, "name"):
        return f.name
    return None


def clean_text(t: str) -> str:
    """Drop NUL bytes, collapse runs of spaces/tabs and 3+ newlines, then strip."""
    t = (t or "").replace("\x00", " ")
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()


def read_file_to_text(file_path: str) -> str:
    """Extract text from a PDF, a DOCX, or (for any other suffix) a UTF-8 file."""
    lower = file_path.lower()
    if lower.endswith(".pdf"):
        reader = PdfReader(file_path)
        # extract_text() may return None for a page; treat that as empty.
        return "\n".join(page.extract_text() or "" for page in reader.pages).strip()
    if lower.endswith(".docx"):
        return (docx2txt.process(file_path) or "").strip()

    with open(file_path, "rb") as f:
        raw = f.read()
    # errors="ignore" cannot raise, so no second decode attempt is needed.
    return raw.decode("utf-8", errors="ignore").strip()


def file_bytes_hash(path: str) -> str:
    """Return the SHA-256 hex digest of the file at *path*, read in 1 MiB pieces."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for piece in iter(lambda: f.read(1 << 20), b""):
            digest.update(piece)
    return digest.hexdigest()


def chunk_text_safe(text: str, chunk_size: int = CHUNK_SIZE_CHARS, overlap: int = CHUNK_OVERLAP_CHARS) -> List[str]:
    """Split *text* into stripped windows of ``chunk_size`` chars sharing ``overlap`` chars.

    ``chunk_size`` is forced to at least 1 and ``overlap`` into
    ``[0, chunk_size - 1]`` so the window always advances; without that,
    ``overlap >= chunk_size`` would loop forever on the same window.
    """
    text = (text or "").strip()
    if not text:
        return []

    chunk_size = max(1, int(chunk_size))
    overlap = max(0, min(int(overlap), chunk_size - 1))

    chunks = []
    n = len(text)
    i = 0
    while i < n:
        j = min(i + chunk_size, n)
        ch = text[i:j].strip()
        if ch:
            chunks.append(ch)
        if j == n:
            break
        # j == i + chunk_size here, so the next start is strictly greater than i.
        i = j - overlap
    return chunks


def mask_pii(text: str) -> str:
    """Replace e-mail addresses with [EMAIL] and long digit runs with [PHONE]."""
    text = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "[EMAIL]", text)
    text = re.sub(r"(\+?\d[\d\-\s]{7,}\d)", "[PHONE]", text)
    return text


# =========================================================
# Contact extraction
# =========================================================
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
_PHONE_RE = re.compile(r"(?:\+?\d{1,3}[\s\-]?)?(?:\(?\d{2,4}\)?[\s\-]?)?\d{3,4}[\s\-]?\d{3,4}")
def _normalize_phone(p: str) -> str:
    """Strip everything except digits and '+' from a phone-like string."""
    return re.sub(r"[^\d+]", "", p)


def guess_name(text: str) -> str:
    """Heuristically pick the candidate's name from the first lines of a CV.

    Looks at the first 14 non-empty lines and returns the first one that has
    no '@', is at most 55 characters, has no run of 3+ digits, contains a
    Latin or Arabic letter, and is not a generic heading. Returns "" if none
    qualifies.
    """
    headings = {"curriculum vitae", "cv", "resume", "profile"}
    lines = [ln.strip() for ln in (text or "").splitlines() if ln.strip()]
    for ln in lines[:14]:
        if "@" in ln or len(ln) > 55 or re.search(r"\d{3,}", ln):
            continue
        if not re.search(r"[A-Za-z\u0600-\u06FF]", ln):
            continue
        if ln.lower() in headings:
            continue
        return ln
    return ""


def extract_contact_info(text: str) -> Dict[str, str]:
    """Return the guessed name plus the first e-mail and first plausible phone.

    A phone match is kept only when it has between 8 and 16 digits.
    """
    t = text or ""
    emails = _EMAIL_RE.findall(t)
    phones = []
    for match in _PHONE_RE.findall(t):
        normalized = _normalize_phone(match)
        digit_count = len(re.sub(r"\D", "", normalized))
        if 8 <= digit_count <= 16:
            phones.append(normalized)
    return {
        "name": guess_name(t),
        "email": emails[0] if emails else "",
        "phone": phones[0] if phones else "",
    }


# =========================================================
# Embeddings via HF Inference API (feature-extraction)
# =========================================================
def _l2norm(v: np.ndarray) -> np.ndarray:
    """Scale *v* to unit length; the epsilon avoids division by zero."""
    return v / (np.linalg.norm(v) + 1e-12)


def _pool_embedding(raw: Any) -> np.ndarray:
    """Reduce one feature-extraction response to a single unit-length vector.

    A 1-D (already pooled) response is used as is. A response with more than
    one axis is treated as per-token vectors and mean-pooled over every
    leading axis, so all texts yield vectors of the same length.
    """
    arr = np.asarray(raw, dtype=np.float32)
    if arr.ndim > 1:
        arr = arr.reshape(-1, arr.shape[-1]).mean(axis=0)
    return _l2norm(arr.reshape(-1))


def embed_texts_api(texts: List[str]) -> np.ndarray:
    """Embed each text with the HF Inference 'feature-extraction' task.

    Returns a float32 array of shape [len(texts), d] with L2-normalized rows,
    or an empty (0, 384) array when *texts* is empty.
    """
    client = get_hf_client()
    # The text is passed positionally: the client's first parameter is named
    # `text`, so an `inputs=` keyword is rejected with a TypeError.
    vecs = [_pool_embedding(client.feature_extraction(t, model=EMBED_MODEL)) for t in texts]
    return np.stack(vecs, axis=0) if vecs else np.zeros((0, 384), dtype=np.float32)


def cosine_sim_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return a @ b.T, which equals cosine similarity when rows are unit-length."""
    # assumes both are normalized
    return np.matmul(a, b.T)


# =========================================================
# Lexical fallback (no embeddings)
# =========================================================
_WORD_RE = re.compile(r"[A-Za-z\u0600-\u06FF0-9]+")


def _tokenize(text: str) -> List[str]:
    """Lower-cased Latin/Arabic/digit tokens of at least two characters."""
    return [w.lower() for w in _WORD_RE.findall(text or "") if len(w) >= 2]


def lexical_rank_chunks(jd: str, chunks: List[str], top_k: int) -> List[Tuple[int, float]]:
    """Rank chunks by the share of distinct JD tokens they contain.

    Returns up to *top_k* ``(chunk_index, score)`` pairs, best first, where
    score is ``|JD tokens ∩ chunk tokens| / |JD tokens|``. Returns [] when
    the JD has no tokens or there are no chunks.
    """
    jd_set = set(_tokenize(jd))
    if not jd_set or not chunks:
        return []

    denom = float(len(jd_set) + 1e-9)
    scores = []
    for i, ch in enumerate(chunks):
        ch_set = set(_tokenize(ch))
        scores.append((i, float(len(jd_set & ch_set)) / denom if ch_set else 0.0))
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]


# =========================================================
# LLM Judge (Ranking) with robust JSON parsing
# =========================================================
def build_llm_prompt(jd_text: str, must_haves: str, candidates: List[Dict[str, Any]]) -> str:
    """Build the judge prompt for one batch of candidates.

    The JD is cut to 4000 characters and the must-haves to 1200; *candidates*
    is serialized as JSON exactly as given.
    """
    schema_example = {
        "ranked": [
            {
                "filename": "",
                "final_score": 0,
                "fit_level": "weak",
                "summary": "one short paragraph",
                "strengths": ["max 4 items"],
                "gaps": ["max 4 items"],
                "risks": ["max 3 items"],
                "checklist": [
                    {"requirement": "SHORT label (<=8 words)", "status": "met", "evidence": "short quote <=160 chars"}
                ],
                "top_evidence": ["max 3 short quotes"],
            }
        ],
        "overall_notes": "short",
    }
    return f"""
You are an expert recruiter and ATS evaluator.

Return ONLY one JSON object, EXACTLY matching this schema:
{json.dumps(schema_example, ensure_ascii=False)}

Hard limits (MUST follow):
- strengths: max 4 bullets
- gaps: max 4 bullets
- risks: max 3 bullets
- checklist: max 6 requirements total
- requirement: SHORT label (<=8 words). Do NOT paste long JD sentences.
- evidence: <=160 chars or empty
- top_evidence: max 3 short quotes

Rules:
- Use ONLY the provided evidence_chunks. Do NOT invent experience.
- final_score 0-100 (be strict: missing must-haves should significantly reduce score)
- fit_level: excellent | good | maybe | weak
- status: met | partial | missing

Job Description (compressed):
\"\"\"{jd_text[:4000]}\"\"\"

Must-haves (optional):
\"\"\"{(must_haves or '').strip()[:1200]}\"\"\"

Candidates:
{json.dumps(candidates, ensure_ascii=False)}

Output JSON only. No markdown. No extra text.
""".strip()
def _extract_first_complete_json_object(text: str) -> Optional[str]:
    """Return the first balanced ``{...}`` span in *text*, or None.

    Scanning starts at the first '{'. Braces inside double-quoted strings are
    ignored and backslash escapes inside strings are honoured. None is
    returned when there is no '{' or the object never closes.
    """
    if not text:
        return None
    start = text.find("{")
    if start < 0:
        return None

    depth = 0
    in_str = False
    esc = False
    for i in range(start, len(text)):
        ch = text[i]
        if in_str:
            if esc:
                esc = False
            elif ch == "\\":
                esc = True
            elif ch == '"':
                in_str = False
            continue
        if ch == '"':
            in_str = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start : i + 1]
    return None


def fit_level_from_score(score: float) -> str:
    """Map a 0-100 score to excellent (>=85), good (>=70), maybe (>=55) or weak."""
    s = float(score)
    if s >= 85:
        return "excellent"
    if s >= 70:
        return "good"
    if s >= 55:
        return "maybe"
    return "weak"


def clamp(x: float, lo: float, hi: float) -> float:
    """Limit *x* to the range [lo, hi]; a NaN input yields *lo*.

    The NaN guard matters because ``min(hi, nan)`` evaluates to *hi*, which
    would otherwise turn an unparseable score into the maximum.
    """
    if x != x:  # only NaN is unequal to itself
        return lo
    return max(lo, min(hi, x))


# -------------------------
# STRICTER scoring (post-process)
# -------------------------
def apply_strict_scoring(c: "CandidateLLMResult") -> "CandidateLLMResult":
    """Rescale a candidate's score by how much of its checklist is fulfilled.

    - Fulfillment ratio counts met=1, partial=0.5, anything else=0.
    - The score is multiplied by ``0.20 + 0.80 * ratio ** 1.6``.
    - With 3+ requirements and none met, the score is capped at 35, or at 25
      when none is even partial.
    - With no checklist at all, the score is multiplied by 0.85 instead.

    The candidate is updated in place (``final_score`` rounded to 2 decimals,
    ``fit_level`` recomputed) and returned.
    """
    base = float(c.final_score)
    cl = c.checklist or []

    if not cl:
        # If model didn't produce checklist, slightly penalize (still allow ranking).
        adj = clamp(base * 0.85, 0.0, 100.0)
    else:
        statuses = [(it.status or "").strip().lower() for it in cl]
        total = len(statuses)
        met = statuses.count("met")
        partial = statuses.count("partial")

        ratio = (met + 0.5 * partial) / float(max(1, total))  # 0..1
        # Strong penalty curve: multiplier runs from 0.20 (ratio 0) to 1.00 (ratio 1).
        adj = base * (0.20 + 0.80 * (ratio ** 1.6))

        # If basically no must-haves met, cap it.
        if total >= 3 and met == 0:
            adj = min(adj, 25.0 if partial == 0 else 35.0)
        adj = clamp(adj, 0.0, 100.0)

    c.final_score = float(round(adj, 2))
    c.fit_level = fit_level_from_score(c.final_score)
    return c
def fallback_candidate(filename: str, local_score: float) -> CandidateLLMResult:
    """Build a placeholder result scored purely from the retrieval signal.

    Used when the LLM produced nothing usable for *filename*; every list field
    is left empty and the fit level is derived from the rounded local score.
    """
    # Even fallback should not look "good" if local retrieval is mid; keep.
    score = float(round(local_score, 2))
    level = fit_level_from_score(score)
    return CandidateLLMResult(
        filename=filename,
        final_score=score,
        fit_level=level,
        summary="LLM output incomplete; fallback score based on retrieval signals.",
        strengths=[],
        gaps=[],
        risks=[],
        checklist=[],
        top_evidence=[],
    )


def _llm_call_or_raise(prompt: str, temperature: float, max_tokens: int) -> str:
    """Send *prompt* to the chat model and return the stripped reply text.

    Raises:
        gr.Error: on a bad request (with hints about model name / gating) or
            on any other Hub HTTP error.
    """
    messages = [
        {"role": "system", "content": "Return ONLY valid JSON matching the schema. No markdown."},
        {"role": "user", "content": prompt},
    ]
    client = get_hf_client()
    try:
        resp = client.chat_completion(
            model=LLM_MODEL,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
    except BadRequestError as e:
        # Must be handled before HfHubHTTPError so the model-specific hints are shown.
        details = str(e)
        hint = (
            "LLM call failed. This usually means the model name is wrong or the model is gated.\n\n"
            f"Current LLM_MODEL: {LLM_MODEL}\n"
            "Try setting LLM_MODEL to a public model like:\n"
            "- Qwen/Qwen2.5-7B-Instruct\n"
            "- mistralai/Mistral-7B-Instruct-v0.3\n"
            "Or if you have Meta access:\n"
            "- meta-llama/Llama-3.1-8B-Instruct\n\n"
            f"Raw error: {details}"
        )
        raise gr.Error(hint) from e
    except HfHubHTTPError as e:
        raise gr.Error(f"HF Inference error: {e}") from e

    reply = resp.choices[0].message.content
    return (reply or "").strip()
def _parse_ranking_output(text: str) -> Optional[LLMRankingOutput]:
    """Parse raw LLM text into an LLMRankingOutput, or return None.

    Tries the whole text first, then the first balanced JSON object found in
    it. Invalid JSON and schema mismatches are both treated as "no result".
    """
    if not text:
        return None
    for payload in (text, _extract_first_complete_json_object(text)):
        if not payload:
            continue
        try:
            return LLMRankingOutput.model_validate(json.loads(payload))
        except (ValueError, TypeError):
            # json.JSONDecodeError and pydantic's ValidationError are both ValueErrors.
            continue
    return None


def _candidate_payload(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep only the fields of each batch entry that the prompt should carry."""
    return [{"filename": b["filename"], "evidence_chunks": b["evidence_chunks"]} for b in items]


def llm_judge_rank_batch(jd_text: str, must_haves: str, batch: List[Dict[str, Any]]) -> LLMRankingOutput:
    """Have the LLM judge one batch of candidates and return them ranked.

    Flow:
      1. Ask once at the configured temperature; if the reply cannot be
         parsed, ask again at temperature 0 with a larger token budget.
      2. If both replies are unusable, return retrieval-based fallbacks for
         the whole batch.
      3. Re-judge individually any batch member missing from the reply,
         falling back to its retrieval score if that also fails.

    Only filenames that belong to *batch* are kept, so entries the model made
    up never reach the ranking.

    Each entry of *batch* must provide "filename" and "evidence_chunks";
    "local_score" is optional and defaults to 50.0 for fallbacks.

    Raises:
        gr.Error: propagated from the LLM call itself.
    """
    must = must_haves or ""
    prompt = build_llm_prompt(jd_text, must, _candidate_payload(batch))

    out = _parse_ranking_output(_llm_call_or_raise(prompt, LLM_TEMPERATURE, LLM_MAX_TOKENS))
    if out is None:
        out = _parse_ranking_output(_llm_call_or_raise(prompt, 0.0, max(LLM_MAX_TOKENS, 3200)))
    if out is None:
        ranked = [fallback_candidate(b["filename"], b.get("local_score", 50.0)) for b in batch]
        return LLMRankingOutput(ranked=ranked, overall_notes="LLM parsing failed; used retrieval-based fallback.")

    expected = {b["filename"] for b in batch}
    returned = {c.filename: c for c in out.ranked if c.filename in expected}
    missing = [b for b in batch if b["filename"] not in returned]

    for b in missing:
        name = b["filename"]
        single_prompt = build_llm_prompt(jd_text, must, _candidate_payload([b]))
        single_out = _parse_ranking_output(_llm_call_or_raise(single_prompt, 0.0, min(2200, LLM_MAX_TOKENS)))
        if single_out and single_out.ranked:
            # Pin the filename so later lookups by filename still match this CV.
            returned[name] = single_out.ranked[0].model_copy(update={"filename": name})
        else:
            returned[name] = fallback_candidate(name, b.get("local_score", 50.0))

    merged_ranked = sorted(returned.values(), key=lambda x: float(x.final_score), reverse=True)
    notes = (out.overall_notes or "").strip()
    if missing:
        notes = (notes + " | Some candidates re-judged individually / fallback used.").strip(" |")
    return LLMRankingOutput(ranked=merged_ranked, overall_notes=notes)
def merge_llm_batches(batch_outputs: List[LLMRankingOutput]) -> LLMRankingOutput:
    """Combine per-batch results into one ranking sorted by final score.

    Every candidate goes through apply_strict_scoring before sorting. The
    non-empty batch notes are joined with " | " and cut to 1200 characters.
    """
    all_ranked: List[CandidateLLMResult] = []
    notes = []
    for out in batch_outputs:
        notes.append(out.overall_notes)
        all_ranked.extend(out.ranked)
    # Apply strict scoring AFTER LLM returns (prevents "missing everything but 65" cases)
    all_ranked = [apply_strict_scoring(c) for c in all_ranked]
    all_ranked = sorted(all_ranked, key=lambda x: float(x.final_score), reverse=True)
    return LLMRankingOutput(ranked=all_ranked, overall_notes=" | ".join([n for n in notes if n])[:1200])


# =========================================================
# Local scoring (retrieval-only, scaled to 0-100)
# =========================================================
def compute_retrieval_score(top_sims: List[float]) -> float:
    """Turn chunk similarities into a 0-100 score.

    Uses the five highest values: 65% of their mean plus 35% of their
    maximum, multiplied by 100 and clamped to [0, 100]. Returns 0.0 for an
    empty list.
    """
    if not top_sims:
        return 0.0
    top = sorted(top_sims, reverse=True)[:5]
    m = float(np.mean(top))
    mx = float(np.max(top))
    raw = 0.65 * m + 0.35 * mx
    return float(clamp(raw * 100.0, 0.0, 100.0))


# =========================================================
# UI rendering (SGS)
# =========================================================
def fit_badge(level: str) -> str:
    """Return the display label for a fit level; unknown levels map to 'Weak'."""
    # NOTE(review): only the bare label is returned here; if this was meant to
    # be an HTML badge, the markup is missing from this copy — confirm.
    level = (level or "").lower().strip()
    if level == "excellent":
        return 'Excellent'
    if level == "good":
        return 'Good'
    if level == "maybe":
        return 'Potential'
    return 'Weak'


def score_pill(score: float) -> str:
    """Format *score* with one decimal place."""
    s = float(score)
    # NOTE(review): `cls` is computed (thresholds 80 / 65 / 45) but never used
    # in the returned string — presumably it was a CSS class on a wrapping
    # element; confirm against the original markup.
    cls = "p-high" if s >= 80 else ("p-mid" if s >= 65 else ("p-low" if s >= 45 else "p-bad"))
    return f'{s:.1f}'
if s >= 45 else "p-bad")) return f'{s:.1f}' def candidate_card_html(rank: int, c: CandidateLLMResult) -> str: score = float(c.final_score) w = max(0, min(100, int(round(score)))) checklist_rows = "" for item in (c.checklist or [])[:6]: st = (item.status or "").lower().strip() cls = "ok" if st == "met" else ("partial" if st == "partial" else "miss") ev = (item.evidence or "").strip().replace("<", "<").replace(">", ">") req = (item.requirement or "").strip().replace("<", "<").replace(">", ">") checklist_rows += f"""
{req}
{st.upper()}
{ev if ev else "—"}
""" strengths = "".join([f"
  • {s}
  • " for s in (c.strengths or [])[:4]]) or "
  • " gaps = "".join([f"
  • {g}
  • " for g in (c.gaps or [])[:4]]) or "
  • " risks = "".join([f"
  • {r}
  • " for r in (c.risks or [])[:3]]) or "
  • " evidence_html = "" for q in (c.top_evidence or [])[:3]: q = q.replace("<", "<").replace(">", ">") evidence_html += f'
    “{q}”
    ' return f"""
    #{rank}
    {c.filename}
    {fit_badge(c.fit_level)} {score_pill(score)}
    {c.summary}
    Strengths
      {strengths}
    Gaps
      {gaps}
    Risks
    Requirements Checklist
    {checklist_rows if checklist_rows else '
    No checklist produced.
    '}
    Evidence
    {evidence_html if evidence_html else '
    No evidence produced.
    '}
    """ def _safe_int(x, default: int = 0) -> int: try: return int(x) except Exception: return default def render_single_html(ranked_dicts: List[Dict[str, Any]], idx: int) -> Tuple[str, str, int]: """Render ONE candidate card at a time to reduce DOM size / fullscreen lag.""" if not ranked_dicts: html = '''
    SGS Candidate Fit Report
    Run matching to generate results.
    ''' return html, "—", 0 idx = max(0, min(_safe_int(idx, 0), len(ranked_dicts) - 1)) c = CandidateLLMResult.model_validate(ranked_dicts[idx]) card = candidate_card_html(idx + 1, c) top_score = float(ranked_dicts[0].get("final_score", 0.0)) html = f'''
    SGS Candidate Fit Report
    Navigate candidates using ◀ / ▶ (renders one card to reduce lag)
    Candidate
    {idx+1}/{len(ranked_dicts)}
    Top Score
    {top_score:.1f}
    {card} ''' nav = f"**Showing:** {idx+1} / {len(ranked_dicts)}" return html, nav, idx def nav_prev(ranked_dicts: List[Dict[str, Any]], idx: int): return render_single_html(ranked_dicts, _safe_int(idx, 0) - 1) def nav_next(ranked_dicts: List[Dict[str, Any]], idx: int): return render_single_html(ranked_dicts, _safe_int(idx, 0) + 1) # ========================================================= # Shortlist export # ========================================================= def export_shortlist(shortlist_table: pd.DataFrame) -> Tuple[str, str, str]: if shortlist_table is None or shortlist_table.empty: raise gr.Error("No shortlist data yet. Run ranking first.") shortlisted_df = shortlist_table[shortlist_table["Shortlisted"] == True] if shortlisted_df.empty: raise gr.Error("No candidates marked as shortlisted.") tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv") shortlisted_df.to_csv(tmp.name, index=False) emails = shortlisted_df["Email"].dropna().astype(str).str.strip().tolist() emails = [e for e in emails if e] email_block = ", ".join(sorted(set(emails))) msg = f"Exported {len(shortlisted_df)} shortlisted candidate(s)." 
# =========================================================
# Mini refresh HTML (fix scroll lag after report generation)
# =========================================================
def build_mini_refresh_script() -> str:
    """Return the HTML snippet meant to trigger a layout refresh after rendering."""
    nonce = str(int(time.time() * 1000))
    # Forces a layout reflow similar to what happens when opening the accordion:
    # - dispatch resize twice across frames
    # - apply temporary will-change to hint GPU
    # - keep scroll position stable
    # NOTE(review): the template below is empty in this copy and `nonce` is
    # unused — the script markup described above appears to be missing; confirm
    # against the original before relying on this function.
    return f""" """.strip()


# =========================================================
# Main app pipeline
# =========================================================
def _write_ranking_csv(
    ranked: List["CandidateLLMResult"],
    contacts_map: Dict[str, Dict[str, str]],
    local_scores: Dict[str, float],
) -> str:
    """Write the full ranking to a temporary CSV file and return its path."""
    no_contact = {"name": "", "email": "", "phone": ""}
    # Write through the handle that created the file, so it is closed on exit
    # instead of being left open while the file is reopened by name.
    with tempfile.NamedTemporaryFile(
        "w", delete=False, suffix=".csv", newline="", encoding="utf-8"
    ) as f:
        w = csv.writer(f)
        w.writerow(
            ["Rank", "Filename", "FinalScore(0-100)", "FitLevel", "Name", "Email", "Phone", "Summary", "LocalScore"]
        )
        for ridx, c in enumerate(ranked, start=1):
            ci = contacts_map.get(c.filename, no_contact)
            w.writerow(
                [
                    ridx,
                    c.filename,
                    round(float(c.final_score), 2),
                    c.fit_level,
                    ci.get("name", ""),
                    ci.get("email", ""),
                    ci.get("phone", ""),
                    c.summary,
                    local_scores.get(c.filename, ""),
                ]
            )
        return f.name


def rank_app(
    jd_file_obj,
    cv_file_objs,
    must_haves: str,
    mask_pii_toggle: bool,
    show_contacts_toggle: bool,
    progress=gr.Progress(track_tqdm=False),
):
    """Run the whole matching pipeline for one JD and up to MAX_CV_UPLOADS CVs.

    Steps: read the JD, drop byte-identical CVs, pick the most relevant chunks
    of each CV (embeddings, or lexical overlap when embeddings fail), have the
    LLM judge the candidates in batches, apply strict scoring, then build the
    report, the CSV export and the shortlist table.

    Returns, in order:
        first card HTML, run-summary markdown, CSV path, shortlist DataFrame,
        "", "", ranked candidates as dicts, index of the shown card,
        navigation label, refresh HTML.

    Raises:
        gr.Error: for missing/unreadable inputs, too many CVs, an embedding
            failure when lexical fallback is disabled, or an empty ranking.
    """
    t0 = time.time()
    get_hf_client()  # validate token early

    progress(0.05, desc="Loading Job Description...")
    jd_path = gr_file_to_path(jd_file_obj)
    if not jd_path:
        raise gr.Error("Please upload a Job Description file (PDF/DOCX/TXT).")

    jd_text = clean_text(read_file_to_text(jd_path))[:MAX_JD_CHARS]
    if not jd_text:
        raise gr.Error("Could not extract text from the Job Description file.")

    if not cv_file_objs:
        raise gr.Error("Please upload at least 1 CV.")
    if len(cv_file_objs) > MAX_CV_UPLOADS:
        raise gr.Error(f"Maximum allowed CV uploads is {MAX_CV_UPLOADS}. You uploaded {len(cv_file_objs)}.")

    cv_paths = [p for p in (gr_file_to_path(f) for f in cv_file_objs) if p]
    if not cv_paths:
        raise gr.Error("Could not read uploaded CV files (no valid paths).")

    progress(0.10, desc="Checking duplicates...")
    seen = {}
    duplicates = []
    unique_paths = []
    for p in cv_paths:
        fname = os.path.basename(p)
        try:
            h = file_bytes_hash(p)
        except OSError:
            # Raw bytes unreadable: hash the extracted text instead.
            h = hashlib.sha256(clean_text(read_file_to_text(p)).encode("utf-8", errors="ignore")).hexdigest()
        if h in seen:
            duplicates.append((fname, seen[h]))
            continue
        seen[h] = fname
        unique_paths.append(p)

    progress(0.14, desc="Preparing retrieval engine...")
    use_embeddings = True
    jd_vec = None
    try:
        jd_vec = embed_texts_api([jd_text])  # [1,d]
    except Exception as exc:
        # Deliberate best-effort: any embedding failure switches to lexical mode.
        if not ALLOW_LEXICAL_FALLBACK:
            raise gr.Error("Embedding endpoint failed. Try again later.") from exc
        use_embeddings = False

    local_pool = []
    contacts_map: Dict[str, Dict[str, str]] = {}
    total = len(unique_paths)
    for idx, p in enumerate(unique_paths, start=1):
        filename = os.path.basename(p)
        prog = 0.14 + 0.54 * (idx / max(1, total))
        progress(prog, desc=f"Processing CVs ({idx}/{total}) — {filename}")

        raw = clean_text(read_file_to_text(p))[:MAX_CV_CHARS]
        if not raw:
            continue

        contacts_map[filename] = (
            extract_contact_info(raw) if show_contacts_toggle else {"name": "", "email": "", "phone": ""}
        )

        chunks = chunk_text_safe(raw)
        if not chunks:
            continue

        top_chunks = None
        if use_embeddings and jd_vec is not None:
            try:
                chunk_vecs = embed_texts_api(chunks)  # [n,d]
                sims = cosine_sim_matrix(jd_vec, chunk_vecs)[0]  # [n]
                idxs = np.argsort(sims)[::-1][:TOP_CHUNKS_PER_CV]
                top_chunks = [(int(i), float(sims[int(i)]), chunks[int(i)]) for i in idxs]
            except Exception:
                # Stop using embeddings for this and every remaining CV.
                use_embeddings = False
        if top_chunks is None:
            scored = lexical_rank_chunks(jd_text, chunks, TOP_CHUNKS_PER_CV)
            top_chunks = [(i, s, chunks[i]) for i, s in scored]

        local_score = compute_retrieval_score([s for _, s, _ in top_chunks])
        evidence_chunks = [txt for _, _, txt in top_chunks[:EVIDENCE_CHUNKS_PER_CV]]
        if mask_pii_toggle:
            evidence_chunks = [mask_pii(x) for x in evidence_chunks]

        local_pool.append({"filename": filename, "local_score": local_score, "evidence_chunks": evidence_chunks})

    if not local_pool:
        raise gr.Error("Could not extract usable text from the uploaded CVs.")

    progress(0.70, desc="Preparing LLM ranking...")
    local_pool = sorted(local_pool, key=lambda x: float(x["local_score"]), reverse=True)

    # Guard the divisor: a batch size below 1 would break the batch arithmetic.
    batch_size = max(1, LLM_BATCH_SIZE)
    batches = max(1, (len(local_pool) + batch_size - 1) // batch_size)
    batch_outputs: List["LLMRankingOutput"] = []
    for b in range(batches):
        batch = local_pool[b * batch_size : (b + 1) * batch_size]
        prog = 0.70 + 0.22 * ((b + 1) / batches)
        progress(prog, desc=f"LLM judging batches ({b+1}/{batches})...")
        batch_outputs.append(llm_judge_rank_batch(jd_text, must_haves or "", batch))

    progress(0.94, desc="Finalizing report...")
    judged = merge_llm_batches(batch_outputs)
    ranked = judged.ranked
    if not ranked:
        raise gr.Error("LLM returned an empty ranking.")

    # Re-sort after strict scoring (already sorted in merge, but keep safe)
    ranked = sorted(ranked, key=lambda x: float(x.final_score), reverse=True)
    ranked_dicts = [c.model_dump() for c in ranked]

    first_html, nav, idx0 = render_single_html(ranked_dicts, 0)

    # First entry wins when two CVs share a basename.
    local_scores: Dict[str, float] = {}
    for entry in local_pool:
        local_scores.setdefault(entry["filename"], entry["local_score"])
    csv_path = _write_ranking_csv(ranked, contacts_map, local_scores)

    shortlist_rows = []
    for ridx, c in enumerate(ranked, start=1):
        ci = contacts_map.get(c.filename, {"name": "", "email": "", "phone": ""})
        shortlist_rows.append(
            [
                False,
                ridx,
                c.filename,
                round(float(c.final_score), 2),
                c.fit_level,
                ci.get("name", ""),
                ci.get("email", ""),
                ci.get("phone", ""),
            ]
        )
    shortlist_df = pd.DataFrame(
        shortlist_rows, columns=["Shortlisted", "Rank", "Filename", "Score", "Fit", "Name", "Email", "Phone"]
    )

    elapsed = time.time() - t0
    # Two trailing spaces before "\n" make a Markdown hard line break.
    meta = (
        f"**LLM model:** `{LLM_MODEL}`  \n"
        f"**Embedding model:** `{EMBED_MODEL}`  \n\n"
        f"**CVs uploaded:** {len(cv_paths)} (max {MAX_CV_UPLOADS}) → **Unique processed:** {len(unique_paths)}  \n"
        f"**Ranked (ALL):** {len(ranked)}  \n"
        f"**LLM batches:** {batches} (batch size={LLM_BATCH_SIZE})  \n"
        f"**Time:** {elapsed:.2f}s  \n"
        f"**Duplicates skipped:** {len(duplicates)}  \n"
        f"**Retrieval mode:** {'Embeddings (API)' if use_embeddings else 'Lexical fallback'}  \n\n"
        f"**LLM Notes:** {(judged.overall_notes or '').strip()}"
    )

    # Mini refresh to remove scroll lag after render
    refresh_html = build_mini_refresh_script()

    progress(1.0, desc="Done ✅")
    return first_html, meta, csv_path, shortlist_df, "", "", ranked_dicts, idx0, nav, refresh_html
c.summary, local, ] ) shortlist_rows = [] for ridx, c in enumerate(ranked, start=1): ci = contacts_map.get(c.filename, {"name": "", "email": "", "phone": ""}) shortlist_rows.append( [ False, ridx, c.filename, round(float(c.final_score), 2), c.fit_level, ci.get("name", ""), ci.get("email", ""), ci.get("phone", ""), ] ) shortlist_df = pd.DataFrame( shortlist_rows, columns=["Shortlisted", "Rank", "Filename", "Score", "Fit", "Name", "Email", "Phone"] ) elapsed = time.time() - t0 meta = ( f"**LLM model:** `{LLM_MODEL}` \n" f"**Embedding model:** `{EMBED_MODEL}` \n\n" f"**CVs uploaded:** {len(cv_paths)} (max {MAX_CV_UPLOADS}) → **Unique processed:** {len(unique_paths)} \n" f"**Ranked (ALL):** {len(ranked)} \n" f"**LLM batches:** {batches} (batch size={LLM_BATCH_SIZE}) \n" f"**Time:** {elapsed:.2f}s \n" f"**Duplicates skipped:** {len(duplicates)} \n" f"**Retrieval mode:** {'Embeddings (API)' if use_embeddings else 'Lexical fallback'} \n\n" f"**LLM Notes:** {(judged.overall_notes or '').strip()}" ) # Mini refresh to remove scroll lag after render refresh_html = build_mini_refresh_script() progress(1.0, desc="Done ✅") return first_html, meta, tmp.name, shortlist_df, "", "", ranked_dicts, idx0, nav, refresh_html # ========================================================= # SGS CSS (neutral light-grey + visible borders) # + file uploader readable on both themes # + progress text white (like you asked) # ========================================================= CUSTOM_CSS = """ :root{ --sgs-blue:#0B3D91; --sgs-green:#00A651; --text:#111827; --muted: rgba(17,24,39,.70); --bg1:#f2f4f7; --bg2:#e9edf2; --line: rgba(17,24,39,.22); --line2: rgba(17,24,39,.28); --shadow: 0 14px 28px rgba(2,6,23,.10); } /* Layout */ .gradio-container{max-width:1180px !important;} /* Background */ body, .gradio-container{ background: radial-gradient(1200px 700px at 10% 10%, rgba(11,61,145,.08), transparent 55%), radial-gradient(900px 600px at 90% 20%, rgba(0,166,81,.07), transparent 60%), 
radial-gradient(800px 520px at 55% 90%, rgba(79,178,255,.07), transparent 60%), linear-gradient(180deg, var(--bg1), var(--bg2)) !important; } /* Subtle moving veil */ body:before{ content:""; position: fixed; inset: 0; pointer-events:none; background: linear-gradient(120deg, rgba(11,61,145,.06), rgba(0,166,81,.05), rgba(79,178,255,.05), rgba(11,61,145,.06) ); background-size: 320% 320%; mix-blend-mode: multiply; opacity: .35; animation: bgShift 10s ease-in-out infinite; } @keyframes bgShift{ 0%{ background-position: 0% 50%; } 50%{ background-position: 100% 50%; } 100%{ background-position: 0% 50%; } } /* Keep text dark always */ .gradio-container, .gradio-container *{ color: var(--text) !important; } /* Hero */ .hero{ border:1.2px solid var(--line2); background: linear-gradient(135deg, rgba(255,255,255,.86), rgba(247,248,250,.82)); border-radius: 22px; padding: 20px 20px 18px; display:flex; align-items:flex-end; justify-content:space-between; gap:16px; box-shadow: 0 18px 40px rgba(2,6,23,.12); margin: 12px 0 16px; position: relative; overflow: hidden; backdrop-filter: blur(10px); -webkit-backdrop-filter: blur(10px); animation: heroIn .65s ease-out both; } @keyframes heroIn{ from{ opacity:0; transform: translateY(10px); } to{ opacity:1; transform: translateY(0); } } .hero-left{max-width: 740px;} .hero *{ position: relative; z-index: 1; } .hero:before, .hero:after{ content:""; position:absolute; width: 360px; height: 360px; border-radius: 999px; filter: blur(44px); opacity: .26; pointer-events:none; animation: floaty 7s ease-in-out infinite; } .hero:before{ background: radial-gradient(circle at 35% 35%, rgba(11,61,145,.22), transparent 62%), radial-gradient(circle at 35% 35%, rgba(79,178,255,.18), transparent 70%); top:-190px; left:-170px; } .hero:after{ background: radial-gradient(circle at 60% 40%, rgba(0,166,81,.18), transparent 64%), radial-gradient(circle at 60% 40%, rgba(11,61,145,.10), transparent 72%); bottom:-220px; right:-190px; animation-delay: -2.8s; } 
@keyframes floaty{ 0%,100%{ transform: translate(0,0); } 50%{ transform: translate(18px, -12px); } } .hero-title{ font-weight: 1000; font-size: 28px; letter-spacing: -0.02em; line-height: 1.08; } .hero-title .accent{ display:inline-block; position: relative; } .hero-title .accent:after{ content:""; position:absolute; left:0; right:0; height: 10px; bottom: -7px; background: linear-gradient(90deg, rgba(11,61,145,0), rgba(11,61,145,.34), rgba(79,178,255,.34), rgba(0,166,81,.26), rgba(0,166,81,0) ); filter: blur(1px); opacity: .90; transform: scaleX(0); transform-origin: left; animation: underlineIn .9s ease-out .25s both; } @keyframes underlineIn{ from{ transform: scaleX(0); opacity: 0; } to{ transform: scaleX(1); opacity: .90; } } .hero-sub{ color: var(--muted) !important; margin-top: 8px; font-size: 13.5px; line-height: 1.55rem; max-width: 74ch; } .hero-right{ display:flex; gap:10px; flex-wrap:wrap; justify-content:flex-end; } /* KPI cards */ .kpi{ background: rgba(255,255,255,.78); border:1.2px solid var(--line); border-radius: 16px; padding: 10px 12px; min-width: 150px; backdrop-filter: blur(8px); -webkit-backdrop-filter: blur(8px); transition: transform .18s ease, box-shadow .18s ease, border-color .18s ease; } .kpi:hover{ transform: translateY(-2px); box-shadow: 0 18px 38px rgba(2,6,23,.12); border-color: var(--line2); } .kpi-label{ color:rgba(17,24,39,.78) !important; font-size:12px; font-weight:800; } .kpi-val{ font-size:18px; font-weight:1000; margin-top:2px; } /* Blocks */ .gradio-container .block{ border-radius: 18px !important; border: 1.2px solid var(--line) !important; background: rgba(255,255,255,.72) !important; box-shadow: var(--shadow); } /* Inputs */ textarea, input[type="text"]{ background: rgba(255,255,255,.90) !important; border: 1.2px solid var(--line) !important; border-radius: 14px !important; } textarea:focus, input[type="text"]:focus{ outline: none !important; box-shadow: 0 0 0 3px rgba(79,178,255,.18) !important; border-color: var(--line2) 
!important; } /* Buttons */ button.primary, .gradio-container button{ border-radius: 14px !important; border: 1px solid rgba(15,23,42,.18) !important; background: linear-gradient(90deg, rgba(11,61,145,.92), rgba(0,166,81,.78)) !important; color: #fff !important; transition: transform .15s ease, box-shadow .15s ease, filter .15s ease; } button.primary:hover, .gradio-container button:hover{ transform: translateY(-1px); box-shadow: 0 14px 35px rgba(11,61,145,.16); filter: brightness(1.05); } button.primary:active, .gradio-container button:active{ transform: translateY(0) scale(.99); } /* Tabs */ .gradio-container .tabs{ border: 1.2px solid var(--line) !important; border-radius: 18px !important; overflow: hidden; } .gradio-container .tabitem{ background: rgba(255,255,255,.70) !important; } .gradio-container .tab-nav{ background: rgba(255,255,255,.70) !important; border-bottom: 1.2px solid var(--line) !important; } /* Cards */ .cards{display:grid;grid-template-columns: 1fr; gap: 12px;} .card{ background: linear-gradient(180deg, rgba(255,255,255,.92), rgba(247,248,250,.88)); border:1.2px solid var(--line); border-radius: 18px; padding: 14px; box-shadow: var(--shadow); transition: transform .18s ease, box-shadow .18s ease, border-color .18s ease; } .card:hover{ transform: translateY(-2px); box-shadow: 0 20px 40px rgba(2,6,23,.12); border-color: var(--line2); } .card-top{display:flex;align-items:flex-start;justify-content:space-between;gap:10px;} .card-title{display:flex;gap:10px;align-items:baseline;flex-wrap:wrap;} .rank{ background: rgba(11,61,145,.10); border:1.2px solid rgba(11,61,145,.22); font-weight: 1000; border-radius: 999px; padding: 6px 10px; font-size: 12px; } .file{font-weight:1000;font-size:16px;} .card-meta{display:flex;gap:8px;align-items:center;flex-wrap:wrap;justify-content:flex-end;} /* Badges / Pills */ .badge{ display:inline-flex;align-items:center; padding: 6px 10px;border-radius: 999px;font-size:12px;font-weight:1000; border:1.2px solid var(--line); 
color: var(--text) !important; } .b-exc{ background: rgba(0,166,81,.12); border-color: rgba(0,166,81,.26); } .b-good{ background: rgba(11,61,145,.10); border-color: rgba(11,61,145,.24); } .b-maybe{ background: rgba(245,158,11,.12); border-color: rgba(245,158,11,.28); } .b-weak{ background: rgba(239,68,68,.10); border-color: rgba(239,68,68,.26); } .pill{ display:inline-flex;align-items:center;justify-content:center; min-width:60px;padding: 6px 10px;border-radius: 999px;font-weight: 1000; border:1.2px solid var(--line); background: rgba(255,255,255,.78); color: var(--text) !important; } .p-high{ background: rgba(0,166,81,.12); border-color: rgba(0,166,81,.26); } .p-mid{ background: rgba(11,61,145,.10); border-color: rgba(11,61,145,.24); } .p-low{ background: rgba(245,158,11,.12); border-color: rgba(245,158,11,.28); } .p-bad{ background: rgba(239,68,68,.10); border-color: rgba(239,68,68,.26); } /* Score bar */ .bar{ width: 100%; height: 10px; border-radius: 999px; background: rgba(17,24,39,.08); overflow: hidden; border:1.2px solid var(--line); margin: 10px 0 10px; } .fill{ height:100%; border-radius: 999px; background: linear-gradient(90deg, var(--sgs-green), #4fb2ff, var(--sgs-blue)); } .summary{font-size:13px;line-height:1.55rem;margin: 6px 0 10px;color:var(--text) !important;} .section-title{font-size:13px;font-weight:1000;margin:10px 0 6px;color:var(--text) !important;} .grid{display:grid;grid-template-columns: 1fr 1fr; gap: 14px;} @media(max-width:860px){ .grid{grid-template-columns:1fr;} .hero{flex-direction:column; align-items:flex-start;} .hero-right{justify-content:flex-start;} .kpi{min-width: 160px;} .hero-title{font-size: 24px;} } .list{margin:0;padding-left:18px;color:var(--text) !important;} .list li{margin:6px 0;line-height:1.30rem;color:var(--text) !important;} /* Quotes / Evidence */ .quotes{display:grid;gap:10px;margin-top:6px;} .quote{ background: rgba(255,255,255,.82); border:1.2px solid var(--line); border-radius: 14px; padding: 10px 12px; color: 
var(--text) !important; font-size: 13px; line-height: 1.45rem; } .quote.muted{opacity:.85;} /* Checklist */ .checklist{display:grid;gap:8px;margin-top:6px;} .checkrow{ display:grid; grid-template-columns: 1.1fr .4fr 1.5fr; gap:10px; padding:10px 12px; border-radius:14px; border:1.2px solid var(--line); background: rgba(255,255,255,.82); font-size:13px; position: relative; overflow: hidden; } .checkrow:before{ content:""; position:absolute; left:0; top:0; bottom:0; width:4px; background: rgba(17,24,39,.22); } .checkrow .req{font-weight:1000;color:var(--text) !important;} .checkrow .ev{color:rgba(17,24,39,0.88) !important;} .checkrow .st{font-weight:1000;text-align:center;letter-spacing:.4px;} /* Status colors */ .checkrow.ok:before{ background: rgba(0,166,81,.95); } .checkrow.partial:before{ background: rgba(245,158,11,.95); } .checkrow.miss:before{ background: rgba(239,68,68,.95); } .checkrow.ok .st{ color: rgba(0,120,70,1) !important; } .checkrow.partial .st{ color: rgba(150,95,10,1) !important; } .checkrow.miss .st{ color: rgba(160,20,20,1) !important; } /* ========================================================= File uploader: readable label/filename ALWAYS ========================================================= */ .gradio-container .file, .gradio-container .file-upload, .gradio-container .upload-button, .gradio-container .file-upload > div, .gradio-container [data-testid="file"]{ background: rgba(245,247,250,.92) !important; border: 1.4px solid rgba(17,24,39,.28) !important; border-radius: 16px !important; box-shadow: 0 12px 24px rgba(2,6,23,.10) !important; } .gradio-container .file *, .gradio-container .file-upload *, .gradio-container .upload-button *, .gradio-container [data-testid="file"] *{ color: #111827 !important; } .gradio-container .file-upload .file-title, .gradio-container .file-upload .file-label, .gradio-container .file-upload .label, .gradio-container .file-upload .wrap, .gradio-container .file-upload .header, .gradio-container 
[data-testid="file"] .label{ background: rgba(245,247,250,.92) !important; border-bottom: 1.4px solid rgba(17,24,39,.20) !important; } .gradio-container .file-upload .file-name, .gradio-container .file-upload .filename, .gradio-container [data-testid="file"] .file-name{ font-weight: 900 !important; } .gradio-container .file-upload button, .gradio-container [data-testid="file"] button{ background: rgba(255,255,255,.85) !important; border: 1.2px solid rgba(17,24,39,.28) !important; color: #111827 !important; } .gradio-container .file:hover, .gradio-container .file-upload:hover, .gradio-container [data-testid="file"]:hover{ border-color: rgba(17,24,39,.36) !important; box-shadow: 0 16px 32px rgba(2,6,23,.12) !important; } /* ========================================================= Progress label text = white ========================================================= */ .gradio-container .progress-text, .gradio-container .progress_label, .gradio-container .progress-label, .gradio-container .eta, .gradio-container [data-testid="progress-text"], .gradio-container [data-testid="progress-label"], .gradio-container [data-testid="progress-bar"] *{ color: #ffffff !important; text-shadow: 0 1px 2px rgba(0,0,0,.55); } /* Respect reduced motion */ @media (prefers-reduced-motion: reduce){ body:before, .hero, .hero:before, .hero:after{ animation: none !important; } } """ # ========================================================= # UI # ========================================================= theme = gr.themes.Soft( primary_hue="blue", secondary_hue="green", neutral_hue="slate", radius_size="lg", font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"], ) with gr.Blocks(title="SGS ATS Candidate Matcher", theme=theme, css=CUSTOM_CSS) as demo: gr.HTML(f"""
    Intelligent CV–JD matching for SGS
    Analyze job descriptions and candidate CVs to deliver accurate matching, structured insights, and data-driven hiring decisions — all in minutes, not weeks.
    Max CV uploads
    {MAX_CV_UPLOADS}
    Important
    Set HF_TOKEN
    """) with gr.Row(): jd_file = gr.File(label="Job Description file (PDF/DOCX/TXT)", file_types=[".pdf", ".docx", ".txt"]) cv_files = gr.File(label=f"Upload CVs (max {MAX_CV_UPLOADS})", file_count="multiple", file_types=[".pdf", ".docx", ".txt"]) with gr.Accordion("Settings", open=False): must_haves = gr.Textbox( label="Must-have requirements (optional) — one per line", lines=5, placeholder="Example:\nRecruitment lifecycle\nATS usage\nInterview scheduling\nOffer negotiation", ) mask_pii_toggle = gr.Checkbox(label="Mask PII (emails/phones) in evidence", value=True) show_contacts_toggle = gr.Checkbox(label="Extract contact info (Name / Email / Phone) from CVs", value=True) run_btn = gr.Button("Generate Candidate Fit Report", variant="primary") with gr.Tabs(): with gr.Tab("Executive Report"): ranked_state = gr.State([]) idx_state = gr.State(0) # invisible HTML output used to run the mini-refresh script after report generation mini_refresh = gr.HTML(visible=False) with gr.Row(): prev_btn = gr.Button("◀", size="sm") nav_text = gr.Markdown("—") next_btn = gr.Button("▶", size="sm") report_html = gr.HTML() meta_md = gr.Markdown() export_full = gr.File(label="Download Full Ranking CSV (includes contacts)") with gr.Tab("Shortlist & Export"): gr.Markdown("Tick **Shortlisted** candidates, then click **Export Shortlist**.") shortlist_df = gr.Dataframe( headers=["Shortlisted", "Rank", "Filename", "Score", "Fit", "Name", "Email", "Phone"], datatype=["bool", "number", "str", "number", "str", "str", "str", "str"], interactive=True, ) with gr.Row(): export_shortlist_btn = gr.Button("Export Shortlist CSV", variant="secondary") export_shortlist_file = gr.File(label="Download Shortlist CSV") export_shortlist_msg = gr.Markdown() email_list = gr.Textbox( label="Email list (copy/paste) — shortlisted only", lines=3, placeholder="Emails will appear here after exporting shortlist...", ) run_btn.click( fn=rank_app, inputs=[jd_file, cv_files, must_haves, mask_pii_toggle, 
show_contacts_toggle], outputs=[report_html, meta_md, export_full, shortlist_df, export_shortlist_msg, email_list, ranked_state, idx_state, nav_text, mini_refresh], ) prev_btn.click( fn=nav_prev, inputs=[ranked_state, idx_state], outputs=[report_html, nav_text, idx_state], ) next_btn.click( fn=nav_next, inputs=[ranked_state, idx_state], outputs=[report_html, nav_text, idx_state], ) export_shortlist_btn.click( fn=export_shortlist, inputs=[shortlist_df], outputs=[export_shortlist_file, export_shortlist_msg, email_list], ) demo.launch(server_name="0.0.0.0", server_port=7860)