""" validate_pipeline.py Local validation protocol for the Redrob candidate ranking system. Runs entirely offline, no network calls, designed to be executed before any of the 3 allowed competition submissions are spent. Checks performed: 1. Probe-set NDCG@10 against hand-labeled reference candidates 2. Ablation table (component on/off, confirm monotonic improvement) 3. Honeypot injection test (synthetic violations of c1-c7, confirm suppression) 4. Top-100 diversity / homogeneity check (NEW - this revision) 5. Readiness gate (numeric threshold before spending a submission) """ import json import hashlib from collections import Counter from itertools import combinations PROBE_SET_LABELS = { "CAND_0000001": 3, "CAND_0000010": 3, "CAND_0000021": 0, "CAND_0000014": 2, "CAND_0000011": 1, } def compute_probe_ndcg10(ranked_candidate_ids: list[str], labels: dict[str, int] = PROBE_SET_LABELS) -> float: """ NDCG@10 restricted to candidates that appear in both the ranked output and the probe set. Only meaningful once the probe set is grown beyond the current 5 reference points (see TODO below). """ import math relevant_in_rank = [ (rank, labels[cid]) for rank, cid in enumerate(ranked_candidate_ids[:10], start=1) if cid in labels ] if not relevant_in_rank: return None # probe set didn't overlap with top 10 at all dcg = sum(rel / math.log2(rank + 1) for rank, rel in relevant_in_rank) ideal_order = sorted(labels.values(), reverse=True)[:10] idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal_order)) return dcg / idcg if idcg > 0 else 0.0 # ablation table def run_ablation(pipeline_fn, candidates: list[dict], jd_config: dict) -> dict: """ pipeline_fn(candidates, jd_config, **toggles) -> list[ranked_candidate_ids] Each toggle disables one component. Confirms NDCG@10 on the probe set does not improve when a component is removed -- if it does, that component is actively hurting ranking quality and is a bug, not a feature. """ configs = { "full_pipeline": dict(), "no_consistency_checks": dict(disable_consistency=True), "no_parameter_a": dict(disable_param_a=True), "bm25_only_no_features": dict(disable_features=True), } results = {} for name, toggles in configs.items(): ranked = pipeline_fn(candidates, jd_config, **toggles) results[name] = { "ndcg10": compute_probe_ndcg10(ranked), "top10_ids": ranked[:10], } return results def print_ablation_report(results: dict) -> None: print("=" * 60) print("ABLATION REPORT") print("=" * 60) baseline = results["full_pipeline"]["ndcg10"] for name, r in results.items(): flag = "" if name != "full_pipeline" and baseline is not None and r["ndcg10"] is not None: if r["ndcg10"] > baseline: flag = " <-- WARNING: removing this IMPROVED the score. Investigate." print(f"{name:30s} NDCG@10 = {r['ndcg10']}{flag}") # honeypot injection test def make_synthetic_honeypot(violation: str, base_candidate: dict) -> dict: """ Clones a real candidate and deliberately injects exactly one consistency-check violation, so each test case isolates a single check rather than confounding several at once. """ c = json.loads(json.dumps(base_candidate)) c["candidate_id"] = f"SYNTH_{violation.upper()}" if violation == "timeline_impossibility": c["skills"][0]["duration_months"] = int(c["profile"]["years_of_experience"] * 12) + 50 elif violation == "signup_anomaly": c["redrob_signals"]["signup_date"] = "2099-01-01" c["redrob_signals"]["last_active_date"] = "2026-01-01" elif violation == "salary_inversion": c["redrob_signals"]["expected_salary_range_inr_lpa"] = {"min": 50.0, "max": 10.0} elif violation == "assessment_contradiction": skill_name = c["skills"][0]["name"] c["skills"][0]["proficiency"] = "advanced" c["redrob_signals"]["skill_assessment_scores"][skill_name] = 12.0 elif violation == "engagement_mismatch": c["redrob_signals"]["connection_count"] = 0 c["redrob_signals"]["search_appearance_30d"] = 0 c["redrob_signals"]["endorsements_received"] = 0 elif violation == "langchain_dabbler": c["skills"] = [ {"name": "LangChain", "proficiency": "advanced", "endorsements": 2, "duration_months": 6}, {"name": "Prompt Engineering", "proficiency": "advanced", "endorsements": 1, "duration_months": 4}, ] c["redrob_signals"]["skill_assessment_scores"] = {} elif violation == "cv_specialist_no_nlp": c["skills"] = [ {"name": "OpenCV", "proficiency": "advanced", "endorsements": 30, "duration_months": 36}, {"name": "YOLO", "proficiency": "advanced", "endorsements": 20, "duration_months": 30}, ] return c VIOLATION_TYPES = [ "timeline_impossibility", "signup_anomaly", "salary_inversion", "assessment_contradiction", "engagement_mismatch", "langchain_dabbler", "cv_specialist_no_nlp", ] def run_honeypot_injection_test(pipeline_fn, real_candidates: list[dict], jd_config: dict, top_n: int = 100) -> dict: base = real_candidates[0] synthetic = [make_synthetic_honeypot(v, base) for v in VIOLATION_TYPES] test_pool = real_candidates + synthetic ranked = pipeline_fn(test_pool, jd_config) top_n_ids = set(ranked[:top_n]) synthetic_ids = {c["candidate_id"] for c in synthetic} leaked = synthetic_ids & top_n_ids return { "total_synthetic": len(synthetic_ids), "leaked_into_top_n": leaked, "pass": len(leaked) == 0, } # diversity check def candidate_archetype_signature(candidate: dict, feature_vector: dict) -> tuple: """ A coarse, human readable signature for clustering deliberately simple (no embeddings, no clustering library) so it stays fast and auditable. Buckets each candidate into a small discrete profile rather than computing exact distances. """ yoe_bucket = ( "junior" if candidate["profile"]["years_of_experience"] < 3 else "mid" if candidate["profile"]["years_of_experience"] < 7 else "senior" ) top_skill = max( candidate.get("skills", [{"name": "none", "duration_months": 0}]), key=lambda s: s.get("duration_months", 0) )["name"] industry = candidate["profile"].get("current_industry", "unknown") company = candidate["profile"].get("current_company", "unknown") return (yoe_bucket, top_skill, industry, company) def check_top100_diversity(top_100_candidates: list[dict], feature_vectors: dict[str, dict], max_signature_share: float = 0.25, max_single_company_share: float = 0.20) -> dict: """ Flags two specific homogeneity failure modes: (a) one archetype signature dominating > max_signature_share of the top 100 -- e.g. 30 nearly-identical profiles (b) one single employer accounting for too large a share of the top 100 -- a narrower, more specific version of (a) that's easy to misread as "we found the best company" rather than "our company-size/industry feature is too dominant". 20% is the default on a real ~100K-candidate dataset; this threshold should be loosened for small ad hoc test pools (a handful of distinct employers will trivially exceed it by chance). """ signatures = [ candidate_archetype_signature(c, feature_vectors[c["candidate_id"]]) for c in top_100_candidates ] sig_counts = Counter(signatures) n = len(top_100_candidates) company_counts = Counter(c["profile"]["current_company"] for c in top_100_candidates) flagged_signatures = { sig: count for sig, count in sig_counts.items() if count / n > max_signature_share } flagged_companies = { company: count for company, count in company_counts.items() if count / n > max_single_company_share } most_common_sig, most_common_sig_count = sig_counts.most_common(1)[0] most_common_company, most_common_company_count = company_counts.most_common(1)[0] return { "n_distinct_signatures": len(sig_counts), "most_common_signature": most_common_sig, "most_common_signature_share": round(most_common_sig_count / n, 3), "most_common_company": most_common_company, "most_common_company_share": round(most_common_company_count / n, 3), "flagged_signatures": flagged_signatures, "flagged_companies": flagged_companies, "pass": len(flagged_signatures) == 0 and len(flagged_companies) == 0, } def print_diversity_report(report: dict) -> None: print("=" * 60) print("TOP-100 DIVERSITY CHECK") print("=" * 60) print(f"Distinct archetype signatures in top 100: {report['n_distinct_signatures']}") print(f"Most common signature: {report['most_common_signature']} " f"({report['most_common_signature_share']:.1%} of top 100)") print(f"Most common employer: {report['most_common_company']} " f"({report['most_common_company_share']:.1%} of top 100)") if report["flagged_signatures"]: print("\n WARNING -- signature(s) exceeding 25% share:") for sig, count in report["flagged_signatures"].items(): print(f" {sig}: {count} candidates") if report["flagged_companies"]: print("\n WARNING -- employer(s) exceeding 20% share:") for company, count in report["flagged_companies"].items(): print(f" {company}: {count} candidates") print(f"\n PASS: {report['pass']}") # readiness gate def readiness_gate(probe_ndcg10: float, honeypot_result: dict, diversity_result: dict, ndcg10_threshold: float = 0.75) -> dict: """ The single go/no-go check run immediately before spending one of the 3 allowed submissions. All three must pass. """ checks = { "probe_ndcg10_meets_threshold": ( probe_ndcg10 is not None and probe_ndcg10 >= ndcg10_threshold ), "zero_honeypot_leakage": honeypot_result["pass"], "top100_diversity_acceptable": diversity_result["pass"], } return { "checks": checks, "ready_to_submit": all(checks.values()), } def print_readiness_report(gate_result: dict) -> None: print("=" * 60) print("SUBMISSION READINESS GATE") print("=" * 60) for check, passed in gate_result["checks"].items(): status = "PASS" if passed else "FAIL" print(f" [{status}] {check}") print() if gate_result["ready_to_submit"]: print("READY TO SUBMIT.") else: print("NOT READY -- fix failing checks above before spending a submission.") if __name__ == "__main__": print(__doc__) print( "This module is meant to be imported and driven by your own " "test harness once rank.py's pipeline function is finalized. " "See the four functions above: run_ablation, " "run_honeypot_injection_test, check_top100_diversity, and " "readiness_gate." )