# Provenance: uploaded by LordofMonarchs via huggingface_hub (commit c754148, verified), 11.8 kB.
"""
validate_pipeline.py
Local validation protocol for the Redrob candidate ranking system.
Runs entirely offline, no network calls, designed to be executed
before any of the 3 allowed competition submissions are spent.
Checks performed:
1. Probe-set NDCG@10 against hand-labeled reference candidates
2. Ablation table (component on/off, confirm monotonic improvement)
3. Honeypot injection test (synthetic violations of c1-c7, confirm suppression)
4. Top-100 diversity / homogeneity check (NEW - this revision)
5. Readiness gate (numeric threshold before spending a submission)
"""
import json
import hashlib
from collections import Counter
from itertools import combinations
# Hand-labeled reference candidates for the probe-set NDCG@10 check.
# Maps candidate ID -> integer label; compute_probe_ndcg10 uses these
# values as the per-candidate gain when no explicit labels are passed.
# NOTE(review): the labels seen here span 0-3 -- presumably a graded
# relevance scale where 3 is the best match; confirm with the labeler.
PROBE_SET_LABELS = {
"CAND_0000001": 3,
"CAND_0000010": 3,
"CAND_0000021": 0,
"CAND_0000014": 2,
"CAND_0000011": 1,
}
def compute_probe_ndcg10(ranked_candidate_ids: list[str],
                         labels: "dict[str, int] | None" = None) -> "float | None":
    """
    Compute NDCG@10 of a ranking against a set of labeled candidates.

    Gain is collected only from candidates that appear both in the first
    10 positions of ``ranked_candidate_ids`` and in ``labels``; every
    other position contributes nothing. The ideal DCG is built from the
    10 highest values in ``labels``, so the score is 1.0 only when the
    best-labeled candidates occupy the top of the ranking in label order.

    Parameters:
        ranked_candidate_ids: candidate IDs, best first.
        labels: candidate ID -> relevance label. When omitted (or None),
            the module-level PROBE_SET_LABELS is looked up at call time,
            so a probe set that is replaced after import is picked up.

    Returns:
        The NDCG@10 value, 0.0 when the ideal DCG is zero, or None when
        no labeled candidate appears in the top 10 (nothing to measure).

    A candidate ID that occurs more than once in the top 10 is credited
    only at its first (best) position; counting every occurrence would
    let a malformed ranking score above 1.0.
    """
    import math

    if labels is None:
        labels = PROBE_SET_LABELS

    credited = set()
    dcg = 0.0
    for rank, cid in enumerate(ranked_candidate_ids[:10], start=1):
        if cid not in labels or cid in credited:
            continue
        credited.add(cid)
        dcg += labels[cid] / math.log2(rank + 1)

    if not credited:
        return None  # probe set didn't overlap with top 10 at all

    ideal_order = sorted(labels.values(), reverse=True)[:10]
    idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal_order))
    return dcg / idcg if idcg > 0 else 0.0
# ablation table
def run_ablation(pipeline_fn, candidates: list[dict], jd_config: dict) -> dict:
    """
    Run the pipeline once per ablation configuration and score each run.

    pipeline_fn(candidates, jd_config, **toggles) is expected to return
    the ranked candidate IDs. The first configuration passes no toggles
    (the full pipeline); each of the others passes a single ``disable_*``
    keyword set to True.

    Returns a dict keyed by configuration name; each value holds the
    probe-set NDCG@10 of that run ("ndcg10") and the first ten ranked
    IDs ("top10_ids"). If a run with a component disabled scores higher
    than the full pipeline, that component is hurting ranking quality
    and should be treated as a bug, not a feature.
    """
    toggle_sets = (
        ("full_pipeline", {}),
        ("no_consistency_checks", {"disable_consistency": True}),
        ("no_parameter_a", {"disable_param_a": True}),
        ("bm25_only_no_features", {"disable_features": True}),
    )

    report = {}
    for label, kwargs in toggle_sets:
        ranking = pipeline_fn(candidates, jd_config, **kwargs)
        score = compute_probe_ndcg10(ranking)
        report[label] = {"ndcg10": score, "top10_ids": ranking[:10]}
    return report
def print_ablation_report(results: dict) -> None:
    """
    Print one line per ablation configuration with its NDCG@10.

    ``results`` is the mapping produced by run_ablation and must contain
    a "full_pipeline" entry, which serves as the baseline. Any other
    configuration whose score is strictly higher than the baseline gets
    a warning appended; the comparison is skipped when either score is
    None.
    """
    rule = "=" * 60
    print(rule)
    print("ABLATION REPORT")
    print(rule)

    reference = results["full_pipeline"]["ndcg10"]
    for label, entry in results.items():
        score = entry["ndcg10"]
        comparable = (
            label != "full_pipeline"
            and reference is not None
            and score is not None
        )
        if comparable and score > reference:
            note = " <-- WARNING: removing this IMPROVED the score. Investigate."
        else:
            note = ""
        print(f"{label:30s} NDCG@10 = {score}{note}")
# honeypot injection test
def make_synthetic_honeypot(violation: str, base_candidate: dict) -> dict:
    """
    Clone a real candidate and inject exactly one consistency violation.

    Each synthetic candidate isolates a single check rather than
    confounding several at once. The clone is made through a JSON round
    trip, so ``base_candidate`` is never mutated and must be
    JSON-serializable.

    Parameters:
        violation: one of the names handled below (the same names listed
            in VIOLATION_TYPES).
        base_candidate: candidate dict to clone. The skill-based
            violations modify ``skills[0]``, so the base needs at least
            one skill for those.

    Returns:
        The modified clone, with ``candidate_id`` set to
        ``SYNTH_<VIOLATION>``.

    Raises:
        ValueError: if ``violation`` is not a recognised name. Returning
            an unmodified clone here would put a clean candidate into the
            pool under a honeypot ID and make the leak test report a
            suppression failure that is not real.
    """
    c = json.loads(json.dumps(base_candidate))
    c["candidate_id"] = f"SYNTH_{violation.upper()}"

    if violation == "timeline_impossibility":
        # One skill held for longer than the whole career.
        c["skills"][0]["duration_months"] = int(c["profile"]["years_of_experience"] * 12) + 50
    elif violation == "signup_anomaly":
        # Signup date later than the last-active date.
        c["redrob_signals"]["signup_date"] = "2099-01-01"
        c["redrob_signals"]["last_active_date"] = "2026-01-01"
    elif violation == "salary_inversion":
        # Expected-salary minimum above the maximum.
        c["redrob_signals"]["expected_salary_range_inr_lpa"] = {"min": 50.0, "max": 10.0}
    elif violation == "assessment_contradiction":
        # Claimed "advanced" proficiency paired with a very low assessment score.
        skill_name = c["skills"][0]["name"]
        c["skills"][0]["proficiency"] = "advanced"
        c["redrob_signals"].setdefault("skill_assessment_scores", {})[skill_name] = 12.0
    elif violation == "engagement_mismatch":
        c["redrob_signals"]["connection_count"] = 0
        c["redrob_signals"]["search_appearance_30d"] = 0
        c["redrob_signals"]["endorsements_received"] = 0
    elif violation == "langchain_dabbler":
        # Short-tenure "advanced" claims with no assessment scores to back them.
        c["skills"] = [
            {"name": "LangChain", "proficiency": "advanced", "endorsements": 2, "duration_months": 6},
            {"name": "Prompt Engineering", "proficiency": "advanced", "endorsements": 1, "duration_months": 4},
        ]
        c["redrob_signals"]["skill_assessment_scores"] = {}
    elif violation == "cv_specialist_no_nlp":
        c["skills"] = [
            {"name": "OpenCV", "proficiency": "advanced", "endorsements": 30, "duration_months": 36},
            {"name": "YOLO", "proficiency": "advanced", "endorsements": 20, "duration_months": 30},
        ]
    else:
        raise ValueError(f"unknown honeypot violation type: {violation!r}")
    return c
# Violation names understood by make_synthetic_honeypot.
# run_honeypot_injection_test builds one synthetic candidate per entry.
VIOLATION_TYPES = [
"timeline_impossibility", "signup_anomaly", "salary_inversion",
"assessment_contradiction", "engagement_mismatch",
"langchain_dabbler", "cv_specialist_no_nlp",
]
def run_honeypot_injection_test(pipeline_fn, real_candidates: list[dict],
                                jd_config: dict, top_n: int = 100) -> dict:
    """
    Check that synthetic rule-violating candidates stay out of the top N.

    One honeypot per entry in VIOLATION_TYPES is cloned from the first
    real candidate, appended to the real pool, and the combined pool is
    ranked with pipeline_fn(pool, jd_config).

    Returns a dict with the number of honeypots injected
    ("total_synthetic"), the set of honeypot IDs found in the first
    ``top_n`` ranked IDs ("leaked_into_top_n"), and "pass", which is True
    only when that set is empty.

    NOTE(review): every honeypot is derived from real_candidates[0]. If
    that candidate would not reach the top N on its own merits, a pass
    says little about suppression -- confirm the base is a strong one.
    """
    template = real_candidates[0]
    honeypots = [make_synthetic_honeypot(kind, template) for kind in VIOLATION_TYPES]

    ranking = pipeline_fn(real_candidates + honeypots, jd_config)
    head = set(ranking[:top_n])

    honeypot_ids = {h["candidate_id"] for h in honeypots}
    leaked = honeypot_ids & head
    return {
        "total_synthetic": len(honeypot_ids),
        "leaked_into_top_n": leaked,
        "pass": not leaked,
    }
# diversity check
def candidate_archetype_signature(candidate: dict, feature_vector: dict) -> tuple:
    """
    Build a coarse, human-readable signature used to group candidates.

    Deliberately simple (no embeddings, no clustering library) so it
    stays fast and auditable: each candidate is bucketed into a small
    discrete profile rather than compared by exact distance.

    Parameters:
        candidate: candidate dict; ``profile.years_of_experience`` is
            required, everything else falls back to a placeholder.
        feature_vector: accepted for interface compatibility; it is not
            read by the current implementation.

    Returns:
        (yoe_bucket, top_skill, industry, company) where
        - yoe_bucket is "junior" (< 3 years), "mid" (< 7) or "senior";
        - top_skill is the name of the skill with the largest
          ``duration_months`` (the first one on ties), or "none" when the
          candidate has no skills;
        - industry / company come from the profile, "unknown" if absent.
    """
    profile = candidate["profile"]
    years = profile["years_of_experience"]
    if years < 3:
        yoe_bucket = "junior"
    elif years < 7:
        yoe_bucket = "mid"
    else:
        yoe_bucket = "senior"

    # `or` covers both a missing key and a present-but-empty list; the
    # latter would otherwise make max() raise ValueError.
    skills = candidate.get("skills") or [{"name": "none", "duration_months": 0}]
    top_skill = max(skills, key=lambda s: s.get("duration_months", 0))["name"]

    industry = profile.get("current_industry", "unknown")
    company = profile.get("current_company", "unknown")
    return (yoe_bucket, top_skill, industry, company)
def check_top100_diversity(top_100_candidates: list[dict],
                           feature_vectors: dict[str, dict],
                           max_signature_share: float = 0.25,
                           max_single_company_share: float = 0.20) -> dict:
    """
    Flag two specific homogeneity failure modes in the top of a ranking:

    (a) one archetype signature dominating > max_signature_share of the
        candidates -- e.g. 30 nearly-identical profiles;
    (b) one single employer accounting for > max_single_company_share of
        the candidates -- easy to misread as "we found the best company"
        rather than "our company-size/industry feature is too dominant".
        20% is the default for a real ~100K-candidate dataset; loosen it
        for small ad hoc test pools, where a handful of distinct
        employers will trivially exceed it by chance.

    Parameters:
        top_100_candidates: candidate dicts to inspect.
        feature_vectors: candidate ID -> feature vector; must contain an
            entry for every candidate passed in.
        max_signature_share: largest allowed share for one signature.
        max_single_company_share: largest allowed share for one employer.

    Returns:
        A report dict with the number of distinct signatures, the most
        common signature and employer with their shares (rounded to 3
        decimals), the flagged signatures/employers with their counts,
        the two thresholds that were applied, and "pass" (True when
        nothing was flagged).

    Candidates without ``profile.current_company`` are counted under
    "unknown", the same placeholder the signature uses. An empty input
    yields a report with "pass" set to False: there is nothing to
    certify, so the check fails closed instead of raising.
    """
    n = len(top_100_candidates)
    if n == 0:
        return {
            "n_distinct_signatures": 0,
            "most_common_signature": None,
            "most_common_signature_share": 0.0,
            "most_common_company": None,
            "most_common_company_share": 0.0,
            "flagged_signatures": {},
            "flagged_companies": {},
            "max_signature_share": max_signature_share,
            "max_single_company_share": max_single_company_share,
            "pass": False,
        }

    sig_counts = Counter(
        candidate_archetype_signature(c, feature_vectors[c["candidate_id"]])
        for c in top_100_candidates
    )
    company_counts = Counter(
        c["profile"].get("current_company", "unknown")
        for c in top_100_candidates
    )

    flagged_signatures = {
        sig: count for sig, count in sig_counts.items()
        if count / n > max_signature_share
    }
    flagged_companies = {
        company: count for company, count in company_counts.items()
        if count / n > max_single_company_share
    }

    most_common_sig, most_common_sig_count = sig_counts.most_common(1)[0]
    most_common_company, most_common_company_count = company_counts.most_common(1)[0]
    return {
        "n_distinct_signatures": len(sig_counts),
        "most_common_signature": most_common_sig,
        "most_common_signature_share": round(most_common_sig_count / n, 3),
        "most_common_company": most_common_company,
        "most_common_company_share": round(most_common_company_count / n, 3),
        "flagged_signatures": flagged_signatures,
        "flagged_companies": flagged_companies,
        # Echoed so that whoever renders the report can state the limits
        # that were actually applied rather than assuming the defaults.
        "max_signature_share": max_signature_share,
        "max_single_company_share": max_single_company_share,
        "pass": not flagged_signatures and not flagged_companies,
    }
def print_diversity_report(report: dict) -> None:
    """
    Print a human-readable summary of a top-100 diversity report.

    ``report`` is a dict shaped like the return value of
    check_top100_diversity. The warning headers state the share limit
    that was exceeded: it is read from the optional report keys
    "max_signature_share" / "max_single_company_share" and falls back to
    0.25 / 0.20 when a report does not carry them, so the text stays
    truthful when the check was run with non-default thresholds.
    """
    signature_limit = report.get("max_signature_share", 0.25)
    company_limit = report.get("max_single_company_share", 0.20)

    print("=" * 60)
    print("TOP-100 DIVERSITY CHECK")
    print("=" * 60)
    print(f"Distinct archetype signatures in top 100: {report['n_distinct_signatures']}")
    print(f"Most common signature: {report['most_common_signature']} "
          f"({report['most_common_signature_share']:.1%} of top 100)")
    print(f"Most common employer: {report['most_common_company']} "
          f"({report['most_common_company_share']:.1%} of top 100)")
    if report["flagged_signatures"]:
        # ':g' renders 0.25 as "25" and 0.125 as "12.5" (no trailing zeros).
        print(f"\n WARNING -- signature(s) exceeding {signature_limit * 100:g}% share:")
        for sig, count in report["flagged_signatures"].items():
            print(f" {sig}: {count} candidates")
    if report["flagged_companies"]:
        print(f"\n WARNING -- employer(s) exceeding {company_limit * 100:g}% share:")
        for company, count in report["flagged_companies"].items():
            print(f" {company}: {count} candidates")
    print(f"\n PASS: {report['pass']}")
# readiness gate
def readiness_gate(probe_ndcg10: float,
                   honeypot_result: dict,
                   diversity_result: dict,
                   ndcg10_threshold: float = 0.75) -> dict:
    """
    Single go/no-go decision taken right before spending one of the 3
    allowed submissions.

    Three checks are evaluated and all of them must hold:
    - the probe NDCG@10 is available (not None) and at least
      ``ndcg10_threshold``;
    - the honeypot result's "pass" entry;
    - the diversity result's "pass" entry.

    Returns {"checks": {<check name>: outcome}, "ready_to_submit": bool}.
    """
    if probe_ndcg10 is None:
        ndcg_ok = False
    else:
        ndcg_ok = probe_ndcg10 >= ndcg10_threshold

    checks = {}
    checks["probe_ndcg10_meets_threshold"] = ndcg_ok
    checks["zero_honeypot_leakage"] = honeypot_result["pass"]
    checks["top100_diversity_acceptable"] = diversity_result["pass"]

    verdict = all(checks.values())
    return {"checks": checks, "ready_to_submit": verdict}
def print_readiness_report(gate_result: dict) -> None:
    """
    Print the outcome of readiness_gate: a PASS/FAIL line for each entry
    in gate_result["checks"], then the overall verdict taken from
    gate_result["ready_to_submit"].
    """
    divider = "=" * 60
    print(divider)
    print("SUBMISSION READINESS GATE")
    print(divider)

    for check_name, ok in gate_result["checks"].items():
        print(f" [{'PASS' if ok else 'FAIL'}] {check_name}")
    print()

    verdict = (
        "READY TO SUBMIT."
        if gate_result["ready_to_submit"]
        else "NOT READY -- fix failing checks above before spending a submission."
    )
    print(verdict)
if __name__ == "__main__":
    # Running the file directly only prints the module docstring and a
    # usage hint; the checks themselves are driven from a test harness.
    print(
        __doc__,
        "This module is meant to be imported and driven by your own "
        "test harness once rank.py's pipeline function is finalized. "
        "See the four functions above: run_ablation, "
        "run_honeypot_injection_test, check_top100_diversity, and "
        "readiness_gate.",
        sep="\n",
    )