Avinashnalla7 committed on
Commit
fea1ab9
·
verified ·
1 Parent(s): a917544

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +236 -74
pipeline.py CHANGED
@@ -1,22 +1,41 @@
1
  import json
2
  import os
3
  import re
 
4
  import shutil
5
  import time
6
- from dataclasses import dataclass
7
  from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Tuple
9
 
10
- from pypdf import PdfReader
 
 
11
  from openai import OpenAI
12
  from tenacity import retry, stop_after_attempt, wait_exponential
13
 
14
 
 
 
 
 
 
 
15
  ALLOWED_SCORE_KEYS = ["skill", "experience", "growth", "context_fit", "combined"]
16
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def _now_ts() -> str:
19
- return time.strftime("%Y-%m-%d %H:%M:%S")
20
 
21
 
22
  def _safe_slug(s: str, max_len: int = 80) -> str:
@@ -26,21 +45,122 @@ def _safe_slug(s: str, max_len: int = 80) -> str:
26
  return s[:max_len] if s else "UNKNOWN"
27
 
28
 
29
- def extract_text_from_pdf(pdf_path: str) -> str:
30
- reader = PdfReader(pdf_path)
31
- parts = []
32
- for page in reader.pages:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  try:
34
- t = page.extract_text() or ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  except Exception:
36
  t = ""
37
  if t.strip():
38
  parts.append(t)
39
- return "\n\n".join(parts).strip()
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def build_prompt(text: str, config: Dict[str, Any]) -> str:
43
- # You can extend this later with per-project criteria.
44
  projects = config.get("projects") or []
45
  projects_block = json.dumps(projects, ensure_ascii=False)
46
 
@@ -83,18 +203,6 @@ Resume text:
83
  """.strip()
84
 
85
 
86
- def _coerce_score(v: Any) -> float:
87
- try:
88
- f = float(v)
89
- except Exception:
90
- return 0.0
91
- if f < 0:
92
- return 0.0
93
- if f > 10:
94
- return 10.0
95
- return f
96
-
97
-
98
  def normalize_eval(raw: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any]:
99
  scores = raw.get("scores") if isinstance(raw.get("scores"), dict) else {}
100
  norm_scores = {k: _coerce_score(scores.get(k, 0)) for k in ALLOWED_SCORE_KEYS}
@@ -103,7 +211,11 @@ def normalize_eval(raw: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any
103
  project_name = best_project.get("project_name")
104
  project_score = _coerce_score(best_project.get("project_score", 0))
105
 
106
- allowed_project_names = {p.get("name") for p in (config.get("projects") or []) if isinstance(p, dict)}
 
 
 
 
107
  if project_name not in allowed_project_names:
108
  project_name = None
109
 
@@ -113,7 +225,10 @@ def normalize_eval(raw: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any
113
  tags = [str(t).strip() for t in tags if str(t).strip()]
114
  tags = tags[:25]
115
 
 
 
116
  out = {
 
117
  "candidate_name": raw.get("candidate_name"),
118
  "seniority": raw.get("seniority"),
119
  "scores": norm_scores,
@@ -121,7 +236,7 @@ def normalize_eval(raw: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any
121
  "tags": tags,
122
  "notes": raw.get("notes"),
123
  "meta": {
124
- "model": config.get("model"),
125
  "timestamp": _now_ts(),
126
  },
127
  }
@@ -135,25 +250,21 @@ def llm_evaluate(text: str, config: Dict[str, Any]) -> Dict[str, Any]:
135
  raise RuntimeError("Missing OPENAI_API_KEY (set it in HF Space Secrets).")
136
 
137
  client = OpenAI(api_key=api_key)
138
- model = config.get("model") or os.getenv("OPENAI_MODEL") or "gpt-4o-mini"
139
-
140
  prompt = build_prompt(text, config)
141
 
142
- # Use Responses API. Enforce JSON by instruction + parsing.
143
- resp = client.responses.create(
144
- model=model,
145
- input=prompt,
146
- )
147
 
148
  content = resp.output_text
149
  if not content or not content.strip():
150
  raise RuntimeError("LLM returned empty response.")
151
 
152
- # Hard parse JSON (no tolerance for garbage)
153
  try:
154
  raw = json.loads(content)
155
  except Exception as e:
156
- raise RuntimeError(f"LLM did not return valid JSON. First 200 chars: {content[:200]!r}") from e
 
 
157
 
158
  if not isinstance(raw, dict):
159
  raise RuntimeError("LLM JSON must be an object/dict at top-level.")
@@ -161,67 +272,118 @@ def llm_evaluate(text: str, config: Dict[str, Any]) -> Dict[str, Any]:
161
  return raw
162
 
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  def run_pipeline(
165
  input_files: List[str],
166
  config: Dict[str, Any],
167
- base_out_dir: Optional[str] = None,
168
  ) -> str:
169
- base_out = Path(base_out_dir or "/tmp/resume_eval_out").resolve()
170
- if base_out.exists():
171
- shutil.rmtree(base_out)
 
 
 
 
 
 
 
 
 
 
 
 
172
  base_out.mkdir(parents=True, exist_ok=True)
173
 
174
- eval_dir = base_out / "EVALUATIONS"
175
  eval_dir.mkdir(parents=True, exist_ok=True)
176
 
177
- evaluations: List[Dict[str, Any]] = []
178
- errors: List[Dict[str, Any]] = []
 
 
 
 
179
 
180
- for pdf_path in input_files:
 
 
 
 
 
 
 
 
181
  pdf_path = str(Path(pdf_path).resolve())
182
- filename = os.path.basename(pdf_path)
 
 
 
 
 
 
 
 
 
183
 
184
  try:
185
- text = extract_text_from_pdf(pdf_path)
186
- if not text:
187
- raise RuntimeError("No extractable text from PDF (scanned image / empty).")
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  raw = llm_evaluate(text, config)
190
  ev = normalize_eval(raw, config)
191
 
192
  # Add file identity
193
- ev["filename"] = filename
 
194
 
195
- # Write per-file json
196
- safe_name = _safe_slug(ev.get("candidate_name") or Path(filename).stem)
197
- out_path = eval_dir / f"{safe_name}__{Path(filename).stem}.json"
198
  out_path.write_text(json.dumps(ev, ensure_ascii=False, indent=2), encoding="utf-8")
199
 
200
- evaluations.append(ev)
 
 
201
 
202
  except Exception as e:
203
- err = {"filename": filename, "error": str(e)}
204
- errors.append(err)
205
-
206
- master = {
207
- "count": len(evaluations),
208
- "errors_count": len(errors),
209
- "evaluations": evaluations,
210
- "errors": errors,
211
- "meta": {
212
- "model": config.get("model") or os.getenv("OPENAI_MODEL") or "gpt-4o-mini",
213
- "timestamp": _now_ts(),
214
- },
215
- }
216
-
217
- (base_out / "master_index.json").write_text(
218
- json.dumps(master, ensure_ascii=False, indent=2),
219
- encoding="utf-8",
220
- )
221
 
222
- zip_path = str(base_out.parent / "results.zip")
223
- if os.path.exists(zip_path):
224
- os.remove(zip_path)
225
 
226
- shutil.make_archive(zip_path.replace(".zip", ""), "zip", str(base_out))
227
- return zip_path
 
1
  import json
2
  import os
3
  import re
4
+ import hashlib
5
  import shutil
6
  import time
7
+ from datetime import datetime, timezone
8
  from pathlib import Path
9
+ from typing import Any, Dict, List, Optional
10
 
11
+ import fitz # pymupdf
12
+ import pytesseract
13
+ from PIL import Image
14
  from openai import OpenAI
15
  from tenacity import retry, stop_after_attempt, wait_exponential
16
 
17
 
18
+ # -----------------------------
19
+ # Schema / Constants
20
+ # -----------------------------
21
+
22
+ SCHEMA_VERSION = "1.0"
23
+
24
  ALLOWED_SCORE_KEYS = ["skill", "experience", "growth", "context_fit", "combined"]
25
 
26
+ DEFAULT_MODEL = "gpt-4o-mini"
27
+
28
+ INDEX_FILENAME = "resumes_index.json"
29
+ EVAL_DIRNAME = "EVALUATIONS"
30
+ TEXT_DIRNAME = "EXTRACTED_TEXT"
31
+
32
+
33
+ # -----------------------------
34
+ # Utilities
35
+ # -----------------------------
36
 
37
  def _now_ts() -> str:
38
+ return datetime.now(timezone.utc).isoformat()
39
 
40
 
41
  def _safe_slug(s: str, max_len: int = 80) -> str:
 
45
  return s[:max_len] if s else "UNKNOWN"
46
 
47
 
48
+ def _sha256_file(path: str) -> str:
49
+ h = hashlib.sha256()
50
+ with open(path, "rb") as f:
51
+ for chunk in iter(lambda: f.read(1024 * 1024), b""):
52
+ h.update(chunk)
53
+ return h.hexdigest()
54
+
55
+
56
+ def _atomic_write_json(path: Path, obj: Any) -> None:
57
+ path.parent.mkdir(parents=True, exist_ok=True)
58
+ tmp = path.with_suffix(path.suffix + ".tmp")
59
+ tmp.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
60
+ tmp.replace(path) # atomic on same filesystem
61
+
62
+
63
+ def _load_index(index_path: Path) -> List[Dict[str, Any]]:
64
+ if not index_path.exists():
65
+ return []
66
+ try:
67
+ return json.loads(index_path.read_text(encoding="utf-8"))
68
+ except Exception:
69
+ # If corrupted, do not crash the whole pipeline. Start fresh but keep the old file.
70
+ backup = index_path.with_suffix(".corrupt.json")
71
  try:
72
+ shutil.copy2(index_path, backup)
73
+ except Exception:
74
+ pass
75
+ return []
76
+
77
+
78
+ def _index_by_sha(index: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
79
+ m: Dict[str, Dict[str, Any]] = {}
80
+ for r in index:
81
+ sha = r.get("pdf_sha256")
82
+ if sha:
83
+ m[sha] = r
84
+ return m
85
+
86
+
87
+ def _coerce_score(v: Any) -> float:
88
+ try:
89
+ f = float(v)
90
+ except Exception:
91
+ return 0.0
92
+ if f < 0:
93
+ return 0.0
94
+ if f > 10:
95
+ return 10.0
96
+ return f
97
+
98
+
99
+ # -----------------------------
100
+ # PDF text extraction + OCR fallback
101
+ # -----------------------------
102
+
103
def _pixmap_to_pil_rgb(pix: "fitz.Pixmap") -> Image.Image:
    """Convert a PyMuPDF pixmap into a PIL ``Image`` in RGB mode.

    NOTE(review): assumes the pixmap's colorspace is RGB once alpha is
    stripped (true for the default ``get_pixmap()`` rendering used by the
    OCR fallback) — confirm for grayscale/CMYK sources.
    """
    # Drop the alpha channel if present: frombytes("RGB", ...) expects
    # exactly 3 bytes per pixel in pix.samples.
    if pix.alpha:
        pix = fitz.Pixmap(pix, 0)
    return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
108
+
109
+
110
def extract_text_from_pdf(
    pdf_path: str,
    *,
    ocr_if_empty: bool = True,
    max_pages: int = 8,
    ocr_dpi: int = 200,
) -> str:
    """Extract text from a PDF, with an OCR fallback for scanned files.

    1) Try normal text extraction via PyMuPDF (first *max_pages* pages).
    2) If that yields nothing and *ocr_if_empty* is set, render each page
       at *ocr_dpi* and OCR it with pytesseract.

    Returns "" when the file cannot be opened at all.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception:
        return ""

    # Fix: close the document on ALL exit paths. Previously an unexpected
    # exception between open() and close() leaked the document handle.
    try:
        # Fast text extraction
        parts: List[str] = []
        page_count = min(len(doc), max_pages)
        for i in range(page_count):
            try:
                t = doc[i].get_text("text") or ""
            except Exception:
                t = ""
            if t.strip():
                parts.append(t)

        text = "\n\n".join(parts).strip()
        if text or not ocr_if_empty:
            return text

        # OCR fallback: render page -> PIL image -> tesseract.
        ocr_parts: List[str] = []
        for i in range(page_count):
            try:
                page = doc[i]
                pix = page.get_pixmap(dpi=ocr_dpi)
                img = _pixmap_to_pil_rgb(pix)
                ocr_txt = pytesseract.image_to_string(img) or ""
                if ocr_txt.strip():
                    ocr_parts.append(ocr_txt)
            except Exception:
                # Best-effort OCR: skip pages that fail to render/OCR.
                continue

        return "\n\n".join(ocr_parts).strip()
    finally:
        doc.close()
157
+
158
+
159
+ # -----------------------------
160
+ # LLM prompt + normalization
161
+ # -----------------------------
162
 
163
  def build_prompt(text: str, config: Dict[str, Any]) -> str:
 
164
  projects = config.get("projects") or []
165
  projects_block = json.dumps(projects, ensure_ascii=False)
166
 
 
203
  """.strip()
204
 
205
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def normalize_eval(raw: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any]:
207
  scores = raw.get("scores") if isinstance(raw.get("scores"), dict) else {}
208
  norm_scores = {k: _coerce_score(scores.get(k, 0)) for k in ALLOWED_SCORE_KEYS}
 
211
  project_name = best_project.get("project_name")
212
  project_score = _coerce_score(best_project.get("project_score", 0))
213
 
214
+ allowed_project_names = {
215
+ p.get("name")
216
+ for p in (config.get("projects") or [])
217
+ if isinstance(p, dict) and p.get("name")
218
+ }
219
  if project_name not in allowed_project_names:
220
  project_name = None
221
 
 
225
  tags = [str(t).strip() for t in tags if str(t).strip()]
226
  tags = tags[:25]
227
 
228
+ model = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL
229
+
230
  out = {
231
+ "schema_version": SCHEMA_VERSION,
232
  "candidate_name": raw.get("candidate_name"),
233
  "seniority": raw.get("seniority"),
234
  "scores": norm_scores,
 
236
  "tags": tags,
237
  "notes": raw.get("notes"),
238
  "meta": {
239
+ "model": model,
240
  "timestamp": _now_ts(),
241
  },
242
  }
 
250
  raise RuntimeError("Missing OPENAI_API_KEY (set it in HF Space Secrets).")
251
 
252
  client = OpenAI(api_key=api_key)
253
+ model = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL
 
254
  prompt = build_prompt(text, config)
255
 
256
+ resp = client.responses.create(model=model, input=prompt)
 
 
 
 
257
 
258
  content = resp.output_text
259
  if not content or not content.strip():
260
  raise RuntimeError("LLM returned empty response.")
261
 
 
262
  try:
263
  raw = json.loads(content)
264
  except Exception as e:
265
+ raise RuntimeError(
266
+ f"LLM did not return valid JSON. First 200 chars: {content[:200]!r}"
267
+ ) from e
268
 
269
  if not isinstance(raw, dict):
270
  raise RuntimeError("LLM JSON must be an object/dict at top-level.")
 
272
  return raw
273
 
274
 
275
+ # -----------------------------
276
+ # Pipeline (Notebook parity)
277
+ # -----------------------------
278
+
279
def _make_record_base(pdf_path: str, config: Dict[str, Any], project_name: str) -> Dict[str, Any]:
    """Build the skeleton audit-trail record for one input PDF.

    Hashes the file (so may raise if it is unreadable) and resolves the
    model name from config, then the OPENAI_MODEL env var, then the default.
    """
    chosen_model = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL
    record: Dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "pdf_sha256": _sha256_file(pdf_path),
        "filename": os.path.basename(pdf_path),
        "candidate_name": None,
        "project": project_name,
        "model": chosen_model,
        "status": None,  # one of: success | skipped | failed
        "error": None,
        "created_at": _now_ts(),
        "output_json": None,  # relative path under output_dir
        "extracted_text": None,  # relative path under output_dir
    }
    return record
295
+
296
+
297
def run_pipeline(
    input_files: List[str],
    config: Dict[str, Any],
    output_dir: Optional[str] = None,
) -> str:
    """Evaluate resume PDFs and write results under *output_dir*.

    Writes outputs into output_dir:
      - resumes_index.json (append-only audit trail)
      - EVALUATIONS/*.json (per resume normalized evaluation)
      - EXTRACTED_TEXT/*.txt (extracted text/OCR text)
    Implements:
      - OCR fallback
      - dedupe by pdf_sha256 unless config["rewrite"] == True
      - atomic writes to index after every file (partial progress is safe)
      - consistent schema versioning
    Returns:
      output_dir (string path)
    """
    base_out = Path(output_dir or "/tmp/resume_eval_out").resolve()
    base_out.mkdir(parents=True, exist_ok=True)

    eval_dir = base_out / EVAL_DIRNAME
    eval_dir.mkdir(parents=True, exist_ok=True)

    text_dir = base_out / TEXT_DIRNAME
    text_dir.mkdir(parents=True, exist_ok=True)

    index_path = base_out / INDEX_FILENAME
    index = _load_index(index_path)
    index_map = _index_by_sha(index)

    rewrite = bool(config.get("rewrite", False))
    projects = config.get("projects") or [{"name": "STANDARD"}]
    # Fix: guard against a truthy non-dict first entry (e.g. a plain string),
    # which previously raised AttributeError on .get().
    first_project = projects[0] if isinstance(projects[0], dict) else {}
    project_name = first_project.get("name", "STANDARD")

    # OCR knobs (configurable)
    ocr_max_pages = int(config.get("ocr_max_pages", 8))
    ocr_dpi = int(config.get("ocr_dpi", 200))

    model_name = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL

    for pdf_path in input_files or []:
        pdf_path = str(Path(pdf_path).resolve())

        # Fix: _make_record_base hashes the file and previously ran OUTSIDE
        # any try, so one missing/unreadable input crashed the whole run.
        # Record the failure and continue instead.
        try:
            rec = _make_record_base(pdf_path, config, project_name)
        except Exception as e:
            rec = {
                "schema_version": SCHEMA_VERSION,
                "pdf_sha256": None,
                "filename": os.path.basename(pdf_path),
                "candidate_name": None,
                "project": project_name,
                "model": model_name,
                "status": "failed",
                "error": f"{type(e).__name__}: {e}",
                "created_at": _now_ts(),
                "output_json": None,
                "extracted_text": None,
            }
            index.append(rec)
            _atomic_write_json(index_path, index)
            continue

        sha = rec["pdf_sha256"]

        # Dedupe by content hash unless the caller asked for a rewrite.
        if sha in index_map and not rewrite:
            rec["status"] = "skipped"
            rec["error"] = "duplicate_pdf_sha256"
            index.append(rec)
            _atomic_write_json(index_path, index)
            continue

        try:
            text = extract_text_from_pdf(
                pdf_path,
                ocr_if_empty=True,
                max_pages=ocr_max_pages,
                ocr_dpi=ocr_dpi,
            )
            if not text.strip():
                raise RuntimeError("No extractable text (even after OCR).")

            # Persist extracted text so failures downstream are debuggable.
            text_name = f"{_safe_slug(Path(pdf_path).stem)}__{sha[:12]}.txt"
            text_path = text_dir / text_name
            text_path.write_text(text, encoding="utf-8")
            rec["extracted_text"] = str(text_path.relative_to(base_out))

            raw = llm_evaluate(text, config)
            ev = normalize_eval(raw, config)

            # Add file identity
            ev["filename"] = os.path.basename(pdf_path)
            ev["pdf_sha256"] = sha

            safe_name = _safe_slug(ev.get("candidate_name") or Path(pdf_path).stem)
            out_path = eval_dir / f"{safe_name}__{sha[:12]}.json"
            out_path.write_text(json.dumps(ev, ensure_ascii=False, indent=2), encoding="utf-8")

            rec["status"] = "success"
            rec["candidate_name"] = ev.get("candidate_name")
            rec["output_json"] = str(out_path.relative_to(base_out))

        except Exception as e:
            rec["status"] = "failed"
            rec["error"] = f"{type(e).__name__}: {e}"

        index.append(rec)
        # Persist after each file so partial progress is safe
        _atomic_write_json(index_path, index)

    return str(base_out)