Spaces:

SAadmin
/

resume-evaluator

Sleeping

App Files Files Community

Avinashnalla7 committed on Feb 17

Commit

e786373

verified ·

1 Parent(s): fea1ab9

with OCR + dedupe + atomic index + schema

Browse files

Files changed (1) hide show

pipeline.py +19 -64

pipeline.py CHANGED Viewed

@@ -3,7 +3,6 @@ import os
 import re
 import hashlib
 import shutil
-import time
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional
@@ -15,25 +14,16 @@ from openai import OpenAI
 from tenacity import retry, stop_after_attempt, wait_exponential
-# -----------------------------
-# Schema / Constants
-# -----------------------------
 SCHEMA_VERSION = "1.0"
 ALLOWED_SCORE_KEYS = ["skill", "experience", "growth", "context_fit", "combined"]
-DEFAULT_MODEL = "gpt-4o-mini"
 INDEX_FILENAME = "resumes_index.json"
 EVAL_DIRNAME = "EVALUATIONS"
 TEXT_DIRNAME = "EXTRACTED_TEXT"
-# -----------------------------
-# Utilities
-# -----------------------------
 def _now_ts() -> str:
     return datetime.now(timezone.utc).isoformat()
@@ -57,7 +47,7 @@ def _atomic_write_json(path: Path, obj: Any) -> None:
     path.parent.mkdir(parents=True, exist_ok=True)
     tmp = path.with_suffix(path.suffix + ".tmp")
     tmp.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
-    tmp.replace(path)  # atomic on same filesystem
 def _load_index(index_path: Path) -> List[Dict[str, Any]]:
@@ -66,7 +56,6 @@ def _load_index(index_path: Path) -> List[Dict[str, Any]]:
     try:
         return json.loads(index_path.read_text(encoding="utf-8"))
     except Exception:
-        # If corrupted, do not crash the whole pipeline. Start fresh but keep the old file.
         backup = index_path.with_suffix(".corrupt.json")
         try:
             shutil.copy2(index_path, backup)
@@ -76,12 +65,12 @@ def _load_index(index_path: Path) -> List[Dict[str, Any]]:
 def _index_by_sha(index: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
-    m: Dict[str, Dict[str, Any]] = {}
     for r in index:
         sha = r.get("pdf_sha256")
         if sha:
-            m[sha] = r
-    return m
 def _coerce_score(v: Any) -> float:
@@ -96,12 +85,7 @@ def _coerce_score(v: Any) -> float:
     return f
-# -----------------------------
-# PDF text extraction + OCR fallback
-# -----------------------------
 def _pixmap_to_pil_rgb(pix: "fitz.Pixmap") -> Image.Image:
-    # Ensure RGB (no alpha)
     if pix.alpha:
         pix = fitz.Pixmap(pix, 0)
     return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
@@ -115,17 +99,18 @@ def extract_text_from_pdf(
     ocr_dpi: int = 200,
 ) -> str:
     """
-    1) Try normal text extraction via PyMuPDF.
-    2) If empty and ocr_if_empty: render pages -> pytesseract OCR.
     """
     try:
         doc = fitz.open(pdf_path)
     except Exception:
         return ""
-    # Fast text extraction
     parts: List[str] = []
     page_count = min(len(doc), max_pages)
     for i in range(page_count):
         try:
             t = doc[i].get_text("text") or ""
@@ -156,10 +141,6 @@ def extract_text_from_pdf(
     return "\n\n".join(ocr_parts).strip()
-# -----------------------------
-# LLM prompt + normalization
-# -----------------------------
 def build_prompt(text: str, config: Dict[str, Any]) -> str:
     projects = config.get("projects") or []
     projects_block = json.dumps(projects, ensure_ascii=False)
@@ -192,7 +173,7 @@ Rules:
 - scores are 0..10 (float allowed)
 - combined must be a reasonable aggregate of the others (not random)
 - best_project.project_name must be one of the provided projects' names OR null
-- tags should be short, e.g. "Backend", "Data", "Python", "Senior", "Leadership"
 - If uncertain, be conservative.
 Projects (for matching):
@@ -227,7 +208,7 @@ def normalize_eval(raw: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any
     model = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL
-    out = {
         "schema_version": SCHEMA_VERSION,
         "candidate_name": raw.get("candidate_name"),
         "seniority": raw.get("seniority"),
@@ -235,12 +216,8 @@ def normalize_eval(raw: Dict[str, Any], config: Dict[str, Any]) -> Dict[str, Any
         "best_project": {"project_name": project_name, "project_score": project_score},
         "tags": tags,
         "notes": raw.get("notes"),
-        "meta": {
-            "model": model,
-            "timestamp": _now_ts(),
-        },
     }
-    return out
 @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8))
@@ -254,7 +231,6 @@ def llm_evaluate(text: str, config: Dict[str, Any]) -> Dict[str, Any]:
     prompt = build_prompt(text, config)
     resp = client.responses.create(model=model, input=prompt)
     content = resp.output_text
     if not content or not content.strip():
         raise RuntimeError("LLM returned empty response.")
@@ -272,10 +248,6 @@ def llm_evaluate(text: str, config: Dict[str, Any]) -> Dict[str, Any]:
     return raw
-# -----------------------------
-# Pipeline (Notebook parity)
-# -----------------------------
 def _make_record_base(pdf_path: str, config: Dict[str, Any], project_name: str) -> Dict[str, Any]:
     filename = os.path.basename(pdf_path)
     model = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL
@@ -286,11 +258,11 @@ def _make_record_base(pdf_path: str, config: Dict[str, Any], project_name: str)
         "candidate_name": None,
         "project": project_name,
         "model": model,
-        "status": None,  # success|skipped|failed
         "error": None,
         "created_at": _now_ts(),
-        "output_json": None,      # relative path under output_dir
-        "extracted_text": None,   # relative path under output_dir
     }
@@ -299,20 +271,6 @@ def run_pipeline(
     config: Dict[str, Any],
     output_dir: Optional[str] = None,
 ) -> str:
-    """
-    Writes outputs into output_dir:
-      - resumes_index.json (append-only audit trail)
-      - EVALUATIONS/*.json (per resume normalized evaluation)
-      - EXTRACTED_TEXT/*.txt (extracted text/OCR text)
-    Implements:
-      - OCR fallback
-      - dedupe by pdf_sha256 unless config["rewrite"] == True
-      - atomic writes to index
-      - consistent schema versioning
-    Returns:
-      output_dir (string path)
-    """
     base_out = Path(output_dir or "/tmp/resume_eval_out").resolve()
     base_out.mkdir(parents=True, exist_ok=True)
@@ -330,7 +288,6 @@ def run_pipeline(
     projects = config.get("projects") or [{"name": "STANDARD"}]
     project_name = (projects[0] or {}).get("name", "STANDARD")
-    # OCR knobs (configurable)
     ocr_max_pages = int(config.get("ocr_max_pages", 8))
     ocr_dpi = int(config.get("ocr_dpi", 200))
@@ -339,7 +296,7 @@ def run_pipeline(
         rec = _make_record_base(pdf_path, config, project_name)
         sha = rec["pdf_sha256"]
-        # Dedupe
         if sha in index_map and not rewrite:
             rec["status"] = "skipped"
             rec["error"] = "duplicate_pdf_sha256"
@@ -359,14 +316,13 @@ def run_pipeline(
             # Persist extracted text
             text_name = f"{_safe_slug(Path(pdf_path).stem)}__{sha[:12]}.txt"
-            text_path = text_dir / text_name
-            text_path.write_text(text, encoding="utf-8")
-            rec["extracted_text"] = str(text_path.relative_to(base_out))
             raw = llm_evaluate(text, config)
             ev = normalize_eval(raw, config)
-            # Add file identity
             ev["filename"] = os.path.basename(pdf_path)
             ev["pdf_sha256"] = sha
@@ -383,7 +339,6 @@ def run_pipeline(
             rec["error"] = f"{type(e).__name__}: {e}"
         index.append(rec)
-        # Persist after each file so partial progress is safe
         _atomic_write_json(index_path, index)
     return str(base_out)

 import re
 import hashlib
 import shutil
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 from tenacity import retry, stop_after_attempt, wait_exponential
 SCHEMA_VERSION = "1.0"
+DEFAULT_MODEL = "gpt-4o-mini"
 ALLOWED_SCORE_KEYS = ["skill", "experience", "growth", "context_fit", "combined"]
 INDEX_FILENAME = "resumes_index.json"
 EVAL_DIRNAME = "EVALUATIONS"
 TEXT_DIRNAME = "EXTRACTED_TEXT"
 def _now_ts() -> str:
     return datetime.now(timezone.utc).isoformat()
     path.parent.mkdir(parents=True, exist_ok=True)
     tmp = path.with_suffix(path.suffix + ".tmp")
     tmp.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
+    tmp.replace(path)
 def _load_index(index_path: Path) -> List[Dict[str, Any]]:
     try:
         return json.loads(index_path.read_text(encoding="utf-8"))
     except Exception:
         backup = index_path.with_suffix(".corrupt.json")
         try:
             shutil.copy2(index_path, backup)
 def _index_by_sha(index: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
+    out: Dict[str, Dict[str, Any]] = {}
     for r in index:
         sha = r.get("pdf_sha256")
         if sha:
+            out[sha] = r
+    return out
 def _coerce_score(v: Any) -> float:
     return f
 def _pixmap_to_pil_rgb(pix: "fitz.Pixmap") -> Image.Image:
     if pix.alpha:
         pix = fitz.Pixmap(pix, 0)
     return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
     ocr_dpi: int = 200,
 ) -> str:
     """
+    1) Extract text with PyMuPDF.
+    2) If empty and ocr_if_empty: OCR first max_pages pages.
     """
     try:
         doc = fitz.open(pdf_path)
     except Exception:
         return ""
     parts: List[str] = []
     page_count = min(len(doc), max_pages)
+    # Normal extraction
     for i in range(page_count):
         try:
             t = doc[i].get_text("text") or ""
     return "\n\n".join(ocr_parts).strip()
 def build_prompt(text: str, config: Dict[str, Any]) -> str:
     projects = config.get("projects") or []
     projects_block = json.dumps(projects, ensure_ascii=False)
 - scores are 0..10 (float allowed)
 - combined must be a reasonable aggregate of the others (not random)
 - best_project.project_name must be one of the provided projects' names OR null
+- tags should be short
 - If uncertain, be conservative.
 Projects (for matching):
     model = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL
+    return {
         "schema_version": SCHEMA_VERSION,
         "candidate_name": raw.get("candidate_name"),
         "seniority": raw.get("seniority"),
         "best_project": {"project_name": project_name, "project_score": project_score},
         "tags": tags,
         "notes": raw.get("notes"),
+        "meta": {"model": model, "timestamp": _now_ts()},
     }
 @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8))
     prompt = build_prompt(text, config)
     resp = client.responses.create(model=model, input=prompt)
     content = resp.output_text
     if not content or not content.strip():
         raise RuntimeError("LLM returned empty response.")
     return raw
 def _make_record_base(pdf_path: str, config: Dict[str, Any], project_name: str) -> Dict[str, Any]:
     filename = os.path.basename(pdf_path)
     model = config.get("model") or os.getenv("OPENAI_MODEL") or DEFAULT_MODEL
         "candidate_name": None,
         "project": project_name,
         "model": model,
+        "status": None,          # success|skipped|failed
         "error": None,
         "created_at": _now_ts(),
+        "output_json": None,     # relative under output_dir
+        "extracted_text": None,  # relative under output_dir
     }
     config: Dict[str, Any],
     output_dir: Optional[str] = None,
 ) -> str:
     base_out = Path(output_dir or "/tmp/resume_eval_out").resolve()
     base_out.mkdir(parents=True, exist_ok=True)
     projects = config.get("projects") or [{"name": "STANDARD"}]
     project_name = (projects[0] or {}).get("name", "STANDARD")
     ocr_max_pages = int(config.get("ocr_max_pages", 8))
     ocr_dpi = int(config.get("ocr_dpi", 200))
         rec = _make_record_base(pdf_path, config, project_name)
         sha = rec["pdf_sha256"]
+        # dedupe
         if sha in index_map and not rewrite:
             rec["status"] = "skipped"
             rec["error"] = "duplicate_pdf_sha256"
             # Persist extracted text
             text_name = f"{_safe_slug(Path(pdf_path).stem)}__{sha[:12]}.txt"
+            tpath = text_dir / text_name
+            tpath.write_text(text, encoding="utf-8")
+            rec["extracted_text"] = str(tpath.relative_to(base_out))
             raw = llm_evaluate(text, config)
             ev = normalize_eval(raw, config)
             ev["filename"] = os.path.basename(pdf_path)
             ev["pdf_sha256"] = sha
             rec["error"] = f"{type(e).__name__}: {e}"
         index.append(rec)
         _atomic_write_json(index_path, index)
     return str(base_out)