"""NER chain evaluation — Module 8 metrics for Phase 3 S5.2. Runs `parse_resume_envelope` against the 10 annotated fixture resumes, compares output against ground-truth canonical skill names, and reports precision / recall / F1 / canonical-mapping accuracy + per-fixture latency. Modes: - full : run the complete 5-layer chain (nucha,jobbert,skillner,sbert,lexical) - lexical: run the always-on floor only (GAPGUIDE_PARSE_LAYERS=lexical) - both : run both and compare Usage: python backend/scripts/evaluate_resume_parser.py --mode both Output is printed to stdout (redirect into phase3/metrics/ner_eval_raw.txt). """ from __future__ import annotations import argparse import json import os import statistics import sys import time from pathlib import Path import django import yaml BACKEND_DIR = Path(__file__).resolve().parent.parent sys.path.insert(0, str(BACKEND_DIR)) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings") django.setup() from django.core.management import call_command # noqa: E402 FIX_DIR = BACKEND_DIR / "tests" / "fixtures" / "resumes" GT_PATH = BACKEND_DIR.parent / "phase3" / "metrics" / "resume_eval_groundtruth.yaml" def seed_catalog_and_embeddings() -> None: """Ensure skills are seeded and SBERT embeddings exist. Skipped silently if the catalog already has ≥50 skills and embeddings cover them (idempotent re-runs shouldn't re-download models). """ from apps.skills.models import Skill, SkillEmbedding skill_count = Skill.objects.count() if skill_count < 50: print(f"[seed] skill count = {skill_count}; calling seed_initial_skills…") call_command("seed_initial_skills") else: print(f"[seed] {skill_count} skills already present — skipping reseed.") missing = Skill.objects.count() - SkillEmbedding.objects.count() if missing > 0: print(f"[seed] {missing} skills missing embeddings — building now…") from sentence_transformers import SentenceTransformer model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") skills = list(Skill.objects.all()) vecs = model.encode( [f"{s.skill_name} — {s.description or s.category}" for s in skills], normalize_embeddings=True, ) for skill, vec in zip(skills, vecs): SkillEmbedding.objects.update_or_create( skill=skill, defaults={ "embedding": vec.tolist(), "source_text": skill.skill_name, }, ) print("[seed] embeddings built.") else: print("[seed] SBERT embeddings already present — skipping rebuild.") def load_groundtruth() -> dict[str, set[str]]: with open(GT_PATH, encoding="utf-8") as f: raw = yaml.safe_load(f) return {k: set(v) for k, v in raw.items()} def run_mode(mode: str, groundtruth: dict[str, set[str]]) -> dict: """Run parse_resume_envelope against each fixture under the given mode. Returns per-fixture and aggregate metrics. """ # Configure chain. if mode == "lexical": os.environ["GAPGUIDE_PARSE_LAYERS"] = "lexical" else: # full os.environ.pop("GAPGUIDE_PARSE_LAYERS", None) # Reload the module so the env-var change takes effect. import importlib from apps.accounts import resume_parser as rp importlib.reload(rp) per_fixture = [] catalog_names = set() from apps.skills.models import Skill catalog_names = set(Skill.objects.values_list("skill_name", flat=True)) total_tp = total_fp = total_fn = 0 total_predicted = 0 total_in_catalog = 0 latencies = [] for fname, gt in sorted(groundtruth.items()): pdf_path = FIX_DIR / fname if not pdf_path.exists(): print(f"[warn] missing fixture: {fname}") continue pdf_bytes = pdf_path.read_bytes() t0 = time.perf_counter() env = rp.parse_resume_envelope(pdf_bytes) dt = time.perf_counter() - t0 latencies.append(dt) predicted = {s["skill_name"] for s in env["skills"]} fired = env.get("parser_version", []) tp = predicted & gt fp = predicted - gt fn = gt - predicted in_catalog = predicted & catalog_names total_tp += len(tp) total_fp += len(fp) total_fn += len(fn) total_predicted += len(predicted) total_in_catalog += len(in_catalog) precision = len(tp) / len(predicted) if predicted else 0.0 recall = len(tp) / len(gt) if gt else 0.0 f1 = ( 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 ) mapping_acc = len(in_catalog) / len(predicted) if predicted else 1.0 per_fixture.append({ "fixture": fname, "latency_ms": round(dt * 1000, 1), "layers_fired": fired, "gt_count": len(gt), "predicted_count": len(predicted), "tp": sorted(tp), "fp": sorted(fp), "fn": sorted(fn), "precision": round(precision, 3), "recall": round(recall, 3), "f1": round(f1, 3), "mapping_accuracy": round(mapping_acc, 3), }) # Micro-averaged (pooled) metrics. mp = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0 mr = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0 mf = 2 * mp * mr / (mp + mr) if (mp + mr) > 0 else 0.0 m_map = total_in_catalog / total_predicted if total_predicted > 0 else 0.0 # Macro-averaged (per-fixture, then mean). macro_p = statistics.mean(pf["precision"] for pf in per_fixture) macro_r = statistics.mean(pf["recall"] for pf in per_fixture) macro_f = statistics.mean(pf["f1"] for pf in per_fixture) macro_map = statistics.mean(pf["mapping_accuracy"] for pf in per_fixture) return { "mode": mode, "n_fixtures": len(per_fixture), "per_fixture": per_fixture, "micro": { "precision": round(mp, 3), "recall": round(mr, 3), "f1": round(mf, 3), "mapping_accuracy": round(m_map, 3), }, "macro": { "precision": round(macro_p, 3), "recall": round(macro_r, 3), "f1": round(macro_f, 3), "mapping_accuracy": round(macro_map, 3), }, "latency": { "mean_ms": round(statistics.mean(latencies) * 1000, 1), "p50_ms": round(statistics.median(latencies) * 1000, 1), "max_ms": round(max(latencies) * 1000, 1), }, } def main() -> None: p = argparse.ArgumentParser() p.add_argument("--mode", choices=["full", "lexical", "both"], default="both") p.add_argument( "--out-json", default=str(BACKEND_DIR.parent / "phase3" / "metrics" / "ner_eval_results.json"), ) args = p.parse_args() seed_catalog_and_embeddings() gt = load_groundtruth() print(f"[eval] {len(gt)} fixtures in ground truth.") results = {} if args.mode in ("full", "both"): print("\n=== Running FULL chain (nucha,jobbert,skillner,sbert,lexical) ===") results["full"] = run_mode("full", gt) print(json.dumps(results["full"]["micro"], indent=2)) print(json.dumps(results["full"]["macro"], indent=2)) print(json.dumps(results["full"]["latency"], indent=2)) if args.mode in ("lexical", "both"): print("\n=== Running LEXICAL only (baseline floor) ===") results["lexical"] = run_mode("lexical", gt) print(json.dumps(results["lexical"]["micro"], indent=2)) print(json.dumps(results["lexical"]["macro"], indent=2)) print(json.dumps(results["lexical"]["latency"], indent=2)) out_path = Path(args.out_json) out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(json.dumps(results, indent=2), encoding="utf-8") print(f"\n[eval] wrote detailed results to {out_path}") if __name__ == "__main__": main()