gapguide-api / scripts /evaluate_resume_parser.py
arifRB's picture
Deploy GapGuide backend (Docker)
ffd36e0 verified
Raw
History Blame Contribute Delete
8.13 kB
"""NER chain evaluation — Module 8 metrics for Phase 3 S5.2.
Runs `parse_resume_envelope` against the 10 annotated fixture resumes,
compares output against ground-truth canonical skill names, and reports
precision / recall / F1 / canonical-mapping accuracy + per-fixture latency.
Modes:
- full : run the complete 5-layer chain (nucha,jobbert,skillner,sbert,lexical)
- lexical: run the always-on floor only (GAPGUIDE_PARSE_LAYERS=lexical)
- both : run both and compare
Usage:
python backend/scripts/evaluate_resume_parser.py --mode both
Output is printed to stdout (redirect into phase3/metrics/ner_eval_raw.txt).
"""
from __future__ import annotations
import argparse
import json
import os
import statistics
import sys
import time
from pathlib import Path
import django
import yaml
# --- Bootstrap: the order of the statements below matters. ---
# BACKEND_DIR is the directory two levels above this file (the parent of the
# directory that contains this script).
BACKEND_DIR = Path(__file__).resolve().parent.parent
# Put BACKEND_DIR first on sys.path; presumably this is what lets the
# "config.settings" module and the "apps.*" packages imported later resolve.
sys.path.insert(0, str(BACKEND_DIR))
# Only supplies a default: an already-exported DJANGO_SETTINGS_MODULE wins.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
# Initialise Django before anything below imports from it.
django.setup()
# Deliberately imported after django.setup(), hence the E402 suppression.
from django.core.management import call_command # noqa: E402
# Directory the fixture files named in the ground truth are read from.
FIX_DIR = BACKEND_DIR / "tests" / "fixtures" / "resumes"
# Ground-truth YAML; it lives under the parent of BACKEND_DIR, not inside it.
GT_PATH = BACKEND_DIR.parent / "phase3" / "metrics" / "resume_eval_groundtruth.yaml"
def seed_catalog_and_embeddings() -> None:
    """Make sure the skill catalog and its SBERT embeddings are populated.

    Two independent steps, each reporting what it did on stdout:

    1. If fewer than 50 ``Skill`` rows exist, run the ``seed_initial_skills``
       management command; otherwise leave the catalog alone.
    2. If there are more ``Skill`` rows than ``SkillEmbedding`` rows, load the
       sentence-transformers model and (re)write an embedding for *every*
       skill via ``update_or_create``; otherwise skip, so that repeat runs do
       not load the model again.
    """
    from apps.skills.models import Skill, SkillEmbedding

    # Step 1: catalog.
    existing = Skill.objects.count()
    if existing >= 50:
        print(f"[seed] {existing} skills already present — skipping reseed.")
    else:
        print(f"[seed] skill count = {existing}; calling seed_initial_skills…")
        call_command("seed_initial_skills")

    # Step 2: embeddings. The check is a plain row-count difference, re-read
    # after the optional reseed above.
    gap = Skill.objects.count() - SkillEmbedding.objects.count()
    if gap <= 0:
        print("[seed] SBERT embeddings already present — skipping rebuild.")
        return

    print(f"[seed] {gap} skills missing embeddings — building now…")
    # Imported lazily so the heavy dependency is only loaded when needed.
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    catalog = list(Skill.objects.all())
    # NOTE(review): name and description/category are concatenated with no
    # separator — confirm this matches how embeddings are built elsewhere.
    texts = [f"{s.skill_name}{s.description or s.category}" for s in catalog]
    vectors = encoder.encode(texts, normalize_embeddings=True)

    for entry, vector in zip(catalog, vectors):
        # NOTE(review): source_text stores only the name, although the vector
        # was computed from name + description/category.
        payload = {
            "embedding": vector.tolist(),
            "source_text": entry.skill_name,
        }
        SkillEmbedding.objects.update_or_create(skill=entry, defaults=payload)

    print("[seed] embeddings built.")
def load_groundtruth() -> dict[str, set[str]]:
    """Load the ground-truth skill sets from ``GT_PATH``.

    The YAML file is expected to be a mapping of fixture file name to a list
    of canonical skill names.

    Returns:
        A dict mapping each fixture name to the set of its expected skill
        names. An empty YAML document yields an empty dict, and a fixture
        whose value is null/empty yields an empty set.

    Raises:
        OSError: if the ground-truth file cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    with open(GT_PATH, encoding="utf-8") as f:
        raw = yaml.safe_load(f)

    # safe_load returns None for an empty document; treat that as "no fixtures"
    # instead of failing on None.items().
    if raw is None:
        return {}

    # A key written with no value (``resume_01.pdf:``) loads as None; map it to
    # an empty set rather than letting set(None) raise TypeError.
    return {name: set(skills or ()) for name, skills in raw.items()}
def _precision_recall_f1(tp: int, fp: int, fn: int) -> tuple[float, float, float]:
    """Return (precision, recall, F1) from raw counts; undefined ratios are 0.0."""
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) > 0 else 0.0
    )
    return precision, recall, f1


def _mean_or_zero(values: list[float]) -> float:
    """Return the arithmetic mean of *values*, or 0.0 when there are none."""
    return statistics.mean(values) if values else 0.0


def run_mode(mode: str, groundtruth: dict[str, set[str]]) -> dict:
    """Run ``parse_resume_envelope`` on every fixture under the given mode.

    Args:
        mode: ``"lexical"`` restricts the chain via the
            ``GAPGUIDE_PARSE_LAYERS`` environment variable; any other value
            removes that variable (the "full" chain).
        groundtruth: fixture file name -> set of expected skill names.
            Fixtures are processed in sorted name order; names with no file
            under ``FIX_DIR`` are reported with a warning and skipped.

    Returns:
        A dict with keys ``mode``, ``n_fixtures``, ``per_fixture`` (one entry
        per evaluated fixture), ``micro`` (metrics pooled over all fixtures),
        ``macro`` (mean of the per-fixture metrics) and ``latency``
        (mean / median / max in milliseconds). When no fixture could be
        evaluated, every aggregate is 0.0 instead of raising.

    Side effects:
        Mutates ``GAPGUIDE_PARSE_LAYERS`` in ``os.environ`` (it is not
        restored afterwards) and reloads ``apps.accounts.resume_parser``.
    """
    # Configure chain.
    if mode == "lexical":
        os.environ["GAPGUIDE_PARSE_LAYERS"] = "lexical"
    else:  # full
        os.environ.pop("GAPGUIDE_PARSE_LAYERS", None)

    # Reload the module so the env-var change takes effect.
    import importlib

    from apps.accounts import resume_parser as rp
    importlib.reload(rp)

    from apps.skills.models import Skill
    catalog_names = set(Skill.objects.values_list("skill_name", flat=True))

    per_fixture = []
    latencies = []
    total_tp = total_fp = total_fn = 0
    total_predicted = 0
    total_in_catalog = 0
    # Unrounded per-fixture scores; the macro averages are taken over these so
    # that the 3-decimal rounding used for display does not leak into them.
    raw_precision = []
    raw_recall = []
    raw_f1 = []
    raw_mapping = []

    for fname, gt in sorted(groundtruth.items()):
        pdf_path = FIX_DIR / fname
        if not pdf_path.exists():
            print(f"[warn] missing fixture: {fname}")
            continue

        pdf_bytes = pdf_path.read_bytes()
        # Time only the parser call, not the file read.
        t0 = time.perf_counter()
        env = rp.parse_resume_envelope(pdf_bytes)
        dt = time.perf_counter() - t0
        latencies.append(dt)

        predicted = {s["skill_name"] for s in env["skills"]}
        fired = env.get("parser_version", [])

        tp = predicted & gt
        fp = predicted - gt
        fn = gt - predicted
        in_catalog = predicted & catalog_names

        total_tp += len(tp)
        total_fp += len(fp)
        total_fn += len(fn)
        total_predicted += len(predicted)
        total_in_catalog += len(in_catalog)

        # len(tp) + len(fp) == len(predicted) and len(tp) + len(fn) == len(gt),
        # so the count-based helper gives the per-fixture ratios directly.
        precision, recall, f1 = _precision_recall_f1(len(tp), len(fp), len(fn))
        # Defined as 1.0 when nothing was predicted (the pooled figure below
        # uses 0.0 in that situation).
        mapping_acc = len(in_catalog) / len(predicted) if predicted else 1.0

        raw_precision.append(precision)
        raw_recall.append(recall)
        raw_f1.append(f1)
        raw_mapping.append(mapping_acc)

        per_fixture.append({
            "fixture": fname,
            "latency_ms": round(dt * 1000, 1),
            "layers_fired": fired,
            "gt_count": len(gt),
            "predicted_count": len(predicted),
            "tp": sorted(tp),
            "fp": sorted(fp),
            "fn": sorted(fn),
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f1": round(f1, 3),
            "mapping_accuracy": round(mapping_acc, 3),
        })

    # Micro-averaged (pooled) metrics.
    mp, mr, mf = _precision_recall_f1(total_tp, total_fp, total_fn)
    m_map = total_in_catalog / total_predicted if total_predicted > 0 else 0.0

    # Macro-averaged (per-fixture, then mean); 0.0 when nothing was evaluated.
    macro_p = _mean_or_zero(raw_precision)
    macro_r = _mean_or_zero(raw_recall)
    macro_f = _mean_or_zero(raw_f1)
    macro_map = _mean_or_zero(raw_mapping)

    median_latency = statistics.median(latencies) if latencies else 0.0

    return {
        "mode": mode,
        "n_fixtures": len(per_fixture),
        "per_fixture": per_fixture,
        "micro": {
            "precision": round(mp, 3),
            "recall": round(mr, 3),
            "f1": round(mf, 3),
            "mapping_accuracy": round(m_map, 3),
        },
        "macro": {
            "precision": round(macro_p, 3),
            "recall": round(macro_r, 3),
            "f1": round(macro_f, 3),
            "mapping_accuracy": round(macro_map, 3),
        },
        "latency": {
            "mean_ms": round(_mean_or_zero(latencies) * 1000, 1),
            "p50_ms": round(median_latency * 1000, 1),
            "max_ms": round(max(latencies, default=0.0) * 1000, 1),
        },
    }
def main() -> None:
    """Command-line entry point.

    Parses ``--mode`` (``full``, ``lexical`` or ``both``; default ``both``)
    and ``--out-json``, ensures the catalog and embeddings are seeded, runs the
    requested mode(s) — full chain first, then lexical — printing each mode's
    micro, macro and latency summaries, and finally writes the complete
    results as JSON to the ``--out-json`` path (creating parent directories).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=["full", "lexical", "both"], default="both")
    default_out = BACKEND_DIR.parent / "phase3" / "metrics" / "ner_eval_results.json"
    parser.add_argument("--out-json", default=str(default_out))
    args = parser.parse_args()

    seed_catalog_and_embeddings()
    gt = load_groundtruth()
    print(f"[eval] {len(gt)} fixtures in ground truth.")

    # (mode name, banner) in the order the modes are run and reported.
    plan = (
        ("full", "\n=== Running FULL chain (nucha,jobbert,skillner,sbert,lexical) ==="),
        ("lexical", "\n=== Running LEXICAL only (baseline floor) ==="),
    )

    results = {}
    for mode_name, banner in plan:
        # "both" selects every entry; otherwise only the matching one.
        if args.mode not in (mode_name, "both"):
            continue
        print(banner)
        summary = run_mode(mode_name, gt)
        results[mode_name] = summary
        for section in ("micro", "macro", "latency"):
            print(json.dumps(summary[section], indent=2))

    out_path = Path(args.out_json)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"\n[eval] wrote detailed results to {out_path}")
if __name__ == "__main__":
main()