Spaces:
Sleeping
Sleeping
| """NER chain evaluation — Module 8 metrics for Phase 3 S5.2. | |
| Runs `parse_resume_envelope` against the 10 annotated fixture resumes, | |
| compares output against ground-truth canonical skill names, and reports | |
| precision / recall / F1 / canonical-mapping accuracy + per-fixture latency. | |
| Modes: | |
| - full : run the complete 5-layer chain (nucha,jobbert,skillner,sbert,lexical) | |
| - lexical: run the always-on floor only (GAPGUIDE_PARSE_LAYERS=lexical) | |
| - both : run both and compare | |
| Usage: | |
| python backend/scripts/evaluate_resume_parser.py --mode both | |
| Summary metrics are printed to stdout (redirect into phase3/metrics/ner_eval_raw.txt); | |
| the detailed per-fixture results are also written as JSON to the --out-json path. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import statistics | |
| import sys | |
| import time | |
| from pathlib import Path | |
| import django | |
| import yaml | |
# Parent of the directory that contains this script (the `backend/` directory
# when the script is run as shown in the module docstring).
BACKEND_DIR = Path(__file__).resolve().parent.parent
# Put BACKEND_DIR first on sys.path so the project's packages can be imported
# when this file is executed directly as a script.
sys.path.insert(0, str(BACKEND_DIR))
# setdefault: an already-exported DJANGO_SETTINGS_MODULE takes precedence.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
# Django is configured before the import below and before any function in this
# file imports from `apps.*`.
django.setup()
from django.core.management import call_command  # noqa: E402
# Directory the fixture resumes are read from by run_mode().
FIX_DIR = BACKEND_DIR / "tests" / "fixtures" / "resumes"
# YAML file read by load_groundtruth(); it lives outside BACKEND_DIR, under the
# sibling `phase3/metrics/` directory.
GT_PATH = BACKEND_DIR.parent / "phase3" / "metrics" / "resume_eval_groundtruth.yaml"
def seed_catalog_and_embeddings() -> None:
    """Make sure the skill catalog and its SBERT embeddings are populated.

    Step 1: when fewer than 50 ``Skill`` rows exist, the
    ``seed_initial_skills`` management command is invoked; otherwise the
    reseed is skipped.

    Step 2: when there are more ``Skill`` rows than ``SkillEmbedding`` rows,
    every skill is (re-)encoded with ``all-MiniLM-L6-v2`` and stored via
    ``update_or_create``; otherwise the rebuild is skipped, so the model is
    not loaded on repeat runs.

    Both steps report what they did on stdout.
    """
    from apps.skills.models import Skill, SkillEmbedding

    n_skills = Skill.objects.count()
    if n_skills >= 50:
        print(f"[seed] {n_skills} skills already present — skipping reseed.")
    else:
        print(f"[seed] skill count = {n_skills}; calling seed_initial_skills…")
        call_command("seed_initial_skills")

    # Re-count: the seeding step above may have added rows.
    # NOTE(review): this compares row counts only; it does not check that each
    # skill has its own embedding — confirm that is sufficient.
    n_skills_now = Skill.objects.count()
    n_embedded = SkillEmbedding.objects.count()
    gap = n_skills_now - n_embedded
    if gap <= 0:
        print("[seed] SBERT embeddings already present — skipping rebuild.")
        return

    print(f"[seed] {gap} skills missing embeddings — building now…")
    # Imported only on this path so the dependency is loaded just when needed.
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    catalog = list(Skill.objects.all())
    texts = [f"{s.skill_name} — {s.description or s.category}" for s in catalog]
    vectors = encoder.encode(texts, normalize_embeddings=True)

    for entry, vector in zip(catalog, vectors):
        # NOTE(review): source_text records only the skill name although the
        # encoded text also contains the description/category — confirm intended.
        stored_fields = {
            "embedding": vector.tolist(),
            "source_text": entry.skill_name,
        }
        SkillEmbedding.objects.update_or_create(skill=entry, defaults=stored_fields)

    print("[seed] embeddings built.")
def load_groundtruth() -> dict[str, set[str]]:
    """Load the ground-truth skill names for every annotated fixture.

    Reads the YAML mapping at ``GT_PATH`` (fixture file name -> list of skill
    names) and converts each list to a set.

    Returns:
        A dict keyed by fixture file name. An empty YAML document yields an
        empty dict, and a fixture whose value is missing/null yields an empty
        set instead of raising.
    """
    with open(GT_PATH, encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    if raw is None:
        return {}

    # `skills or ()` tolerates a key with no value (YAML null), i.e. a resume
    # annotated as containing no skills.
    return {fixture: set(skills or ()) for fixture, skills in raw.items()}
def _safe_mean(values: list) -> float:
    """Return the arithmetic mean of *values*, or 0.0 when it is empty."""
    return statistics.mean(values) if values else 0.0


def _score_fixture(predicted: set, gt: set, catalog_names: set) -> tuple:
    """Score one fixture's predictions against its ground truth.

    Returns ``(scores, in_catalog_count)`` where ``scores`` holds the counts,
    the sorted tp/fp/fn name lists and the rounded precision / recall / F1 /
    mapping accuracy, and ``in_catalog_count`` is how many predicted names
    exist in the skill catalog.
    """
    tp = predicted & gt
    fp = predicted - gt
    fn = gt - predicted
    in_catalog = predicted & catalog_names

    precision = len(tp) / len(predicted) if predicted else 0.0
    recall = len(tp) / len(gt) if gt else 0.0
    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0
    # With no predictions, per-fixture mapping accuracy is reported as 1.0.
    mapping_acc = len(in_catalog) / len(predicted) if predicted else 1.0

    scores = {
        "gt_count": len(gt),
        "predicted_count": len(predicted),
        "tp": sorted(tp),
        "fp": sorted(fp),
        "fn": sorted(fn),
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3),
        "mapping_accuracy": round(mapping_acc, 3),
    }
    return scores, len(in_catalog)


def run_mode(mode: str, groundtruth: dict[str, set[str]]) -> dict:
    """Run parse_resume_envelope against each fixture under the given mode.

    ``mode == "lexical"`` sets ``GAPGUIDE_PARSE_LAYERS=lexical``; any other
    value removes that variable. The parser module is then reloaded so it
    observes the change. Fixtures are processed in sorted file-name order;
    a fixture whose file is missing under ``FIX_DIR`` is reported and skipped.

    Returns a JSON-serialisable dict with ``mode``, ``n_fixtures``, the
    ``per_fixture`` rows, ``micro`` (pooled) and ``macro`` (mean of the
    per-fixture values) metrics, and ``latency`` statistics in milliseconds.
    When no fixture could be evaluated, the macro and latency figures are
    0.0 instead of raising.
    """
    # Configure chain.
    if mode == "lexical":
        os.environ["GAPGUIDE_PARSE_LAYERS"] = "lexical"
    else:  # full
        os.environ.pop("GAPGUIDE_PARSE_LAYERS", None)

    # Reload the module so the env-var change takes effect.
    import importlib
    from apps.accounts import resume_parser as rp
    importlib.reload(rp)

    from apps.skills.models import Skill
    catalog_names = set(Skill.objects.values_list("skill_name", flat=True))

    per_fixture = []
    latencies = []
    total_tp = total_fp = total_fn = 0
    total_predicted = 0
    total_in_catalog = 0

    for fname, gt in sorted(groundtruth.items()):
        pdf_path = FIX_DIR / fname
        if not pdf_path.exists():
            print(f"[warn] missing fixture: {fname}")
            continue

        pdf_bytes = pdf_path.read_bytes()
        # Only the parser call is timed; file reading is excluded.
        t0 = time.perf_counter()
        env = rp.parse_resume_envelope(pdf_bytes)
        dt = time.perf_counter() - t0
        latencies.append(dt)

        predicted = {s["skill_name"] for s in env["skills"]}
        fired = env.get("parser_version", [])

        scores, in_catalog_count = _score_fixture(predicted, gt, catalog_names)
        total_tp += len(scores["tp"])
        total_fp += len(scores["fp"])
        total_fn += len(scores["fn"])
        total_predicted += len(predicted)
        total_in_catalog += in_catalog_count

        per_fixture.append({
            "fixture": fname,
            "latency_ms": round(dt * 1000, 1),
            "layers_fired": fired,
            **scores,
        })

    if not per_fixture:
        print(f"[warn] no fixtures evaluated for mode: {mode}")

    # Micro-averaged (pooled) metrics.
    mp = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
    mr = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
    mf = 2 * mp * mr / (mp + mr) if (mp + mr) > 0 else 0.0
    m_map = total_in_catalog / total_predicted if total_predicted > 0 else 0.0

    # Macro-averaged (per-fixture, then mean). Averages the stored (rounded)
    # per-fixture values; guarded so an empty run does not raise.
    macro_p = _safe_mean([pf["precision"] for pf in per_fixture])
    macro_r = _safe_mean([pf["recall"] for pf in per_fixture])
    macro_f = _safe_mean([pf["f1"] for pf in per_fixture])
    macro_map = _safe_mean([pf["mapping_accuracy"] for pf in per_fixture])

    if latencies:
        mean_s = statistics.mean(latencies)
        median_s = statistics.median(latencies)
        max_s = max(latencies)
    else:
        mean_s = median_s = max_s = 0.0

    return {
        "mode": mode,
        "n_fixtures": len(per_fixture),
        "per_fixture": per_fixture,
        "micro": {
            "precision": round(mp, 3),
            "recall": round(mr, 3),
            "f1": round(mf, 3),
            "mapping_accuracy": round(m_map, 3),
        },
        "macro": {
            "precision": round(macro_p, 3),
            "recall": round(macro_r, 3),
            "f1": round(macro_f, 3),
            "mapping_accuracy": round(macro_map, 3),
        },
        "latency": {
            "mean_ms": round(mean_s * 1000, 1),
            "p50_ms": round(median_s * 1000, 1),
            "max_ms": round(max_s * 1000, 1),
        },
    }
def main() -> None:
    """Command-line entry point: seed data, evaluate, print and save results.

    ``--mode`` selects which chain configuration(s) to run (``full``,
    ``lexical`` or ``both``; default ``both``). For each selected mode the
    micro, macro and latency summaries are printed as JSON; the complete
    results are written to the ``--out-json`` path, whose parent directory
    is created if needed.
    """
    default_out = BACKEND_DIR.parent / "phase3" / "metrics" / "ner_eval_results.json"

    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["full", "lexical", "both"], default="both")
    parser.add_argument("--out-json", default=str(default_out))
    args = parser.parse_args()

    seed_catalog_and_embeddings()
    groundtruth = load_groundtruth()
    print(f"[eval] {len(groundtruth)} fixtures in ground truth.")

    # (mode key, banner) pairs, in the order they are run and reported.
    runs = (
        ("full", "\n=== Running FULL chain (nucha,jobbert,skillner,sbert,lexical) ==="),
        ("lexical", "\n=== Running LEXICAL only (baseline floor) ==="),
    )

    results = {}
    for mode_key, banner in runs:
        if args.mode not in (mode_key, "both"):
            continue
        print(banner)
        results[mode_key] = run_mode(mode_key, groundtruth)
        for section in ("micro", "macro", "latency"):
            print(json.dumps(results[mode_key][section], indent=2))

    destination = Path(args.out_json)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"\n[eval] wrote detailed results to {destination}")


if __name__ == "__main__":
    main()