Upload folder using huggingface_hub

c754148 verified 1 day ago

11.8 kB

	"""
	validate_pipeline.py

	Local validation protocol for the Redrob candidate ranking system.
	Runs entirely offline, no network calls, designed to be executed
	before any of the 3 allowed competition submissions are spent.

	Checks performed:
	1. Probe-set NDCG@10 against hand-labeled reference candidates
	2. Ablation table (component on/off, confirm monotonic improvement)
	3. Honeypot injection test (synthetic violations of c1-c7, confirm suppression)
	4. Top-100 diversity / homogeneity check (NEW - this revision)
	5. Readiness gate (numeric threshold before spending a submission)
	"""

	import json
	import hashlib
	from collections import Counter
	from itertools import combinations


	# Hand-labeled probe set: candidate id -> integer relevance grade, consumed
	# by compute_probe_ndcg10 as the gain of each labeled candidate.
	# NOTE(review): grades look like a 0 (worst) .. 3 (best) scale -- confirm
	# with whoever produced the labels.
	PROBE_SET_LABELS = {
	    "CAND_0000001": 3,
	    "CAND_0000010": 3,
	    "CAND_0000021": 0,
	    "CAND_0000014": 2,
	    "CAND_0000011": 1,
	}


	def compute_probe_ndcg10(ranked_candidate_ids: list[str],
	                         labels: "dict[str, int] | None" = None) -> "float | None":
	    """
	    Score a ranking with NDCG@10 against hand-labeled probe candidates.

	    Only the first 10 entries of ``ranked_candidate_ids`` are inspected.
	    Unlabeled entries contribute no gain but still occupy their rank
	    position. The ideal DCG is built from the 10 highest grades found in
	    ``labels``, whether or not those candidates were ranked at all.

	    The score is only meaningful once the probe set has grown beyond a
	    handful of reference points.

	    Args:
	        ranked_candidate_ids: candidate ids, best first.
	        labels: candidate id -> relevance grade. ``None`` (the default)
	            selects the module-level ``PROBE_SET_LABELS``, resolved at
	            call time rather than frozen at definition time.

	    Returns:
	        ``None`` when no labeled candidate appears in the top 10 (the
	        score is undefined, which is different from a score of zero);
	        ``0.0`` when the ideal DCG is not positive; otherwise DCG / IDCG.
	    """
	    import math

	    if labels is None:
	        labels = PROBE_SET_LABELS

	    # (1-based rank, grade) for every labeled candidate inside the top 10.
	    hits = [
	        (rank, labels[cid])
	        for rank, cid in enumerate(ranked_candidate_ids[:10], start=1)
	        if cid in labels
	    ]
	    if not hits:
	        return None  # probe set didn't overlap with top 10 at all

	    dcg = sum(grade / math.log2(rank + 1) for rank, grade in hits)

	    # Best achievable ordering: highest grades first, truncated to 10.
	    ideal_grades = sorted(labels.values(), reverse=True)[:10]
	    idcg = sum(
	        grade / math.log2(rank + 1)
	        for rank, grade in enumerate(ideal_grades, start=1)
	    )

	    return dcg / idcg if idcg > 0 else 0.0


	# ablation table

	def run_ablation(pipeline_fn, candidates: list[dict], jd_config: dict) -> dict:
	    """
	    Run the pipeline once per ablation configuration and score each run.

	    ``pipeline_fn(candidates, jd_config, **toggles)`` must return the
	    ranked candidate ids. The first configuration passes no toggles (the
	    full pipeline); each of the others passes a single ``disable_*=True``
	    keyword. Removing a component should never raise probe-set NDCG@10;
	    if it does, that component is hurting ranking quality and is a bug,
	    not a feature.

	    Returns:
	        configuration name -> {"ndcg10": probe-set score for that run,
	        "top10_ids": first 10 ids of that run's ranking}.
	    """
	    toggle_sets = {
	        "full_pipeline": {},
	        "no_consistency_checks": {"disable_consistency": True},
	        "no_parameter_a": {"disable_param_a": True},
	        "bm25_only_no_features": {"disable_features": True},
	    }

	    report = {}
	    for label, switches in toggle_sets.items():
	        ordering = pipeline_fn(candidates, jd_config, **switches)
	        score = compute_probe_ndcg10(ordering)
	        report[label] = {"ndcg10": score, "top10_ids": ordering[:10]}
	    return report


	def print_ablation_report(results: dict) -> None:
	    """
	    Print one line per ablation configuration with its NDCG@10.

	    ``results`` maps configuration name -> dict with an "ndcg10" entry and
	    must contain "full_pipeline", which serves as the baseline. Any other
	    configuration whose score is strictly higher than the baseline gets a
	    warning suffix; rows where either score is None are never flagged.
	    """
	    rule = "=" * 60
	    print(rule)
	    print("ABLATION REPORT")
	    print(rule)

	    reference = results["full_pipeline"]["ndcg10"]
	    for variant, entry in results.items():
	        score = entry["ndcg10"]
	        beats_full = (
	            variant != "full_pipeline"
	            and reference is not None
	            and score is not None
	            and score > reference
	        )
	        if beats_full:
	            suffix = " <-- WARNING: removing this IMPROVED the score. Investigate."
	        else:
	            suffix = ""
	        print(f"{variant:30s} NDCG@10 = {score}{suffix}")

	# honeypot injection test

	def make_synthetic_honeypot(violation: str, base_candidate: dict) -> dict:
	    """
	    Clone a candidate and inject exactly one consistency-check violation.

	    Each synthetic record isolates a single check rather than confounding
	    several at once. ``base_candidate`` is never mutated: the clone is
	    made through a JSON round trip, so the base must be JSON-serializable.

	    Args:
	        violation: one of "timeline_impossibility", "signup_anomaly",
	            "salary_inversion", "assessment_contradiction",
	            "engagement_mismatch", "langchain_dabbler",
	            "cv_specialist_no_nlp".
	        base_candidate: the record to clone. The timeline and assessment
	            violations edit ``skills[0]``, so the base needs at least one
	            skill for those; violations that write under
	            ``redrob_signals`` need that key to exist.

	    Returns:
	        The modified clone, with ``candidate_id`` set to
	        ``SYNTH_<VIOLATION>`` (upper-cased violation name).

	    Raises:
	        ValueError: if ``violation`` is not a supported name. Returning an
	            unmodified clone here would plant a "honeypot" that violates
	            nothing and make the injection test report a false leak.
	    """
	    clone = json.loads(json.dumps(base_candidate))
	    clone["candidate_id"] = f"SYNTH_{violation.upper()}"

	    if violation == "timeline_impossibility":
	        # A skill held for longer than the candidate's whole career.
	        career_months = int(clone["profile"]["years_of_experience"] * 12)
	        clone["skills"][0]["duration_months"] = career_months + 50

	    elif violation == "signup_anomaly":
	        # Signed up long after the last recorded activity.
	        clone["redrob_signals"]["signup_date"] = "2099-01-01"
	        clone["redrob_signals"]["last_active_date"] = "2026-01-01"

	    elif violation == "salary_inversion":
	        # Expected-salary minimum above the maximum.
	        clone["redrob_signals"]["expected_salary_range_inr_lpa"] = {"min": 50.0, "max": 10.0}

	    elif violation == "assessment_contradiction":
	        # Claimed "advanced" proficiency next to a very low assessment score.
	        skill_name = clone["skills"][0]["name"]
	        clone["skills"][0]["proficiency"] = "advanced"
	        # The base may carry no assessment map at all; create it on demand.
	        scores = clone["redrob_signals"].setdefault("skill_assessment_scores", {})
	        scores[skill_name] = 12.0

	    elif violation == "engagement_mismatch":
	        clone["redrob_signals"]["connection_count"] = 0
	        clone["redrob_signals"]["search_appearance_30d"] = 0
	        clone["redrob_signals"]["endorsements_received"] = 0

	    elif violation == "langchain_dabbler":
	        # Two short-lived skills claimed as "advanced", with no assessments.
	        clone["skills"] = [
	            {"name": "LangChain", "proficiency": "advanced", "endorsements": 2, "duration_months": 6},
	            {"name": "Prompt Engineering", "proficiency": "advanced", "endorsements": 1, "duration_months": 4},
	        ]
	        clone["redrob_signals"]["skill_assessment_scores"] = {}

	    elif violation == "cv_specialist_no_nlp":
	        clone["skills"] = [
	            {"name": "OpenCV", "proficiency": "advanced", "endorsements": 30, "duration_months": 36},
	            {"name": "YOLO", "proficiency": "advanced", "endorsements": 20, "duration_months": 30},
	        ]

	    else:
	        raise ValueError(f"unknown honeypot violation type: {violation!r}")

	    return clone


	# Every violation name make_synthetic_honeypot knows how to inject; the
	# honeypot injection test plants one synthetic candidate per entry.
	VIOLATION_TYPES = [
	    "timeline_impossibility",
	    "signup_anomaly",
	    "salary_inversion",
	    "assessment_contradiction",
	    "engagement_mismatch",
	    "langchain_dabbler",
	    "cv_specialist_no_nlp",
	]


	def run_honeypot_injection_test(pipeline_fn, real_candidates: list[dict],
	                                jd_config: dict, top_n: int = 100) -> dict:
	    """
	    Plant synthetic violators in the pool and check none reach the top N.

	    One honeypot per entry of ``VIOLATION_TYPES`` is cloned from the first
	    real candidate, appended to the real pool, and the combined pool is
	    ranked with ``pipeline_fn(pool, jd_config)``.

	    Returns:
	        {"total_synthetic": number of distinct synthetic ids,
	         "leaked_into_top_n": set of synthetic ids found in the first
	             ``top_n`` ranked ids,
	         "pass": True only when that set is empty}.
	    """
	    template = real_candidates[0]
	    planted = []
	    for kind in VIOLATION_TYPES:
	        planted.append(make_synthetic_honeypot(kind, template))

	    ordering = pipeline_fn(real_candidates + planted, jd_config)
	    head_ids = set(ordering[:top_n])

	    planted_ids = set()
	    for fake in planted:
	        planted_ids.add(fake["candidate_id"])
	    escaped = planted_ids.intersection(head_ids)

	    outcome = {}
	    outcome["total_synthetic"] = len(planted_ids)
	    outcome["leaked_into_top_n"] = escaped
	    outcome["pass"] = len(escaped) == 0
	    return outcome


	# diversity check

	def candidate_archetype_signature(candidate: dict, feature_vector: dict) -> tuple:
	    """
	    Build a coarse, human-readable signature used to group similar profiles.

	    Deliberately simple (no embeddings, no clustering library) so it stays
	    fast and auditable: each candidate is bucketed into a small discrete
	    profile rather than compared by exact distance.

	    Args:
	        candidate: record with a "profile" dict holding
	            "years_of_experience", and optionally "skills",
	            "current_industry" and "current_company".
	        feature_vector: accepted for interface compatibility; it is not
	            read when building the signature.

	    Returns:
	        ``(yoe_bucket, top_skill, industry, company)`` where
	        ``yoe_bucket`` is "junior" (< 3 years), "mid" (< 7 years) or
	        "senior"; ``top_skill`` is the name of the skill with the largest
	        "duration_months" (first one wins ties), or "none" when the
	        candidate has no skills; ``industry`` and ``company`` fall back to
	        "unknown" when absent.
	    """
	    profile = candidate["profile"]
	    years = profile["years_of_experience"]
	    if years < 3:
	        yoe_bucket = "junior"
	    elif years < 7:
	        yoe_bucket = "mid"
	    else:
	        yoe_bucket = "senior"

	    # `or` covers a missing key AND an empty/None skills list; max() over
	    # an empty list would otherwise raise ValueError.
	    skills = candidate.get("skills") or [{"name": "none", "duration_months": 0}]
	    top_skill = max(skills, key=lambda s: s.get("duration_months", 0))["name"]

	    industry = profile.get("current_industry", "unknown")
	    company = profile.get("current_company", "unknown")

	    return (yoe_bucket, top_skill, industry, company)


	def check_top100_diversity(top_100_candidates: list[dict],
	                           feature_vectors: dict[str, dict],
	                           max_signature_share: float = 0.25,
	                           max_single_company_share: float = 0.20) -> dict:
	    """
	    Flag two specific homogeneity failure modes in the top of the ranking.

	    (a) One archetype signature dominating more than
	        ``max_signature_share`` of the list, e.g. 30 nearly-identical
	        profiles.
	    (b) One single employer accounting for more than
	        ``max_single_company_share`` of the list -- a narrower version of
	        (a) that is easy to misread as "we found the best company" rather
	        than "our company-size/industry feature is too dominant". 20% is
	        the default for a real ~100K-candidate dataset; loosen it for
	        small ad hoc test pools, where a handful of distinct employers
	        will trivially exceed it by chance.

	    Args:
	        top_100_candidates: the candidates to inspect (shares are computed
	            against the length of this list, whatever it is).
	        feature_vectors: candidate id -> feature dict. A missing id is
	            tolerated and treated as an empty vector.
	        max_signature_share: strict upper bound on any signature's share.
	        max_single_company_share: strict upper bound on any employer's
	            share. Candidates without "current_company" are counted under
	            "unknown", matching the archetype signature.

	    Returns:
	        Report dict with the distinct-signature count, the most common
	        signature/employer and their shares (rounded to 3 places), the
	        flagged signatures/employers (value -> count), the two thresholds
	        that were applied, and "pass". An empty input yields an empty
	        report with "pass" False: the gate fails closed instead of
	        treating "nothing to inspect" as "diverse".
	    """
	    n = len(top_100_candidates)
	    if n == 0:
	        return {
	            "n_distinct_signatures": 0,
	            "most_common_signature": None,
	            "most_common_signature_share": 0.0,
	            "most_common_company": None,
	            "most_common_company_share": 0.0,
	            "flagged_signatures": {},
	            "flagged_companies": {},
	            "max_signature_share": max_signature_share,
	            "max_single_company_share": max_single_company_share,
	            "pass": False,
	        }

	    sig_counts = Counter(
	        candidate_archetype_signature(c, feature_vectors.get(c["candidate_id"], {}))
	        for c in top_100_candidates
	    )
	    company_counts = Counter(
	        c["profile"].get("current_company", "unknown")
	        for c in top_100_candidates
	    )

	    flagged_signatures = {
	        sig: count for sig, count in sig_counts.items()
	        if count / n > max_signature_share
	    }
	    flagged_companies = {
	        company: count for company, count in company_counts.items()
	        if count / n > max_single_company_share
	    }

	    most_common_sig, most_common_sig_count = sig_counts.most_common(1)[0]
	    most_common_company, most_common_company_count = company_counts.most_common(1)[0]

	    return {
	        "n_distinct_signatures": len(sig_counts),
	        "most_common_signature": most_common_sig,
	        "most_common_signature_share": round(most_common_sig_count / n, 3),
	        "most_common_company": most_common_company,
	        "most_common_company_share": round(most_common_company_count / n, 3),
	        "flagged_signatures": flagged_signatures,
	        "flagged_companies": flagged_companies,
	        # Echo the thresholds so a report can be read without the call site.
	        "max_signature_share": max_signature_share,
	        "max_single_company_share": max_single_company_share,
	        "pass": not flagged_signatures and not flagged_companies,
	    }


	def print_diversity_report(report: dict) -> None:
	    """
	    Print the top-100 diversity report in a fixed, human-readable layout.

	    ``report`` must provide "n_distinct_signatures",
	    "most_common_signature", "most_common_signature_share",
	    "most_common_company", "most_common_company_share",
	    "flagged_signatures", "flagged_companies" and "pass".

	    The warning headers quote the thresholds the check was run with when
	    the report carries "max_signature_share" /
	    "max_single_company_share"; otherwise they fall back to 25% and 20%.
	    """
	    # Quote the real thresholds instead of assuming the defaults were used.
	    signature_limit = report.get("max_signature_share", 0.25)
	    company_limit = report.get("max_single_company_share", 0.20)

	    print("=" * 60)
	    print("TOP-100 DIVERSITY CHECK")
	    print("=" * 60)
	    print(f"Distinct archetype signatures in top 100: {report['n_distinct_signatures']}")
	    print(f"Most common signature: {report['most_common_signature']} "
	          f"({report['most_common_signature_share']:.1%} of top 100)")
	    print(f"Most common employer: {report['most_common_company']} "
	          f"({report['most_common_company_share']:.1%} of top 100)")
	    if report["flagged_signatures"]:
	        print(f"\n WARNING -- signature(s) exceeding {signature_limit * 100:g}% share:")
	        for sig, count in report["flagged_signatures"].items():
	            print(f" {sig}: {count} candidates")
	    if report["flagged_companies"]:
	        print(f"\n WARNING -- employer(s) exceeding {company_limit * 100:g}% share:")
	        for company, count in report["flagged_companies"].items():
	            print(f" {company}: {count} candidates")
	    print(f"\n PASS: {report['pass']}")

	# readiness gate

	def readiness_gate(probe_ndcg10: float,
	                   honeypot_result: dict,
	                   diversity_result: dict,
	                   ndcg10_threshold: float = 0.75) -> dict:
	    """
	    Single go/no-go decision made right before spending a submission.

	    Three checks are combined and every one of them has to hold:
	    the probe NDCG@10 reaches ``ndcg10_threshold`` (a ``None`` score
	    counts as a failure), ``honeypot_result["pass"]`` is truthy, and
	    ``diversity_result["pass"]`` is truthy.

	    Returns:
	        {"checks": check name -> outcome, "ready_to_submit": True only
	        when all outcomes are truthy}.
	    """
	    if probe_ndcg10 is None:
	        score_ok = False
	    else:
	        score_ok = probe_ndcg10 >= ndcg10_threshold

	    outcomes = {}
	    outcomes["probe_ndcg10_meets_threshold"] = score_ok
	    outcomes["zero_honeypot_leakage"] = honeypot_result["pass"]
	    outcomes["top100_diversity_acceptable"] = diversity_result["pass"]

	    verdict = all(outcomes.values())
	    return {"checks": outcomes, "ready_to_submit": verdict}


	def print_readiness_report(gate_result: dict) -> None:
	    """
	    Print the readiness-gate outcome: one PASS/FAIL line per check,
	    followed by the overall verdict.

	    ``gate_result`` needs a "checks" mapping (check name -> truthy or
	    falsy outcome) and a "ready_to_submit" flag.
	    """
	    banner = "=" * 60
	    print(banner)
	    print("SUBMISSION READINESS GATE")
	    print(banner)

	    for check_name, outcome in gate_result["checks"].items():
	        verdict = "FAIL"
	        if outcome:
	            verdict = "PASS"
	        print(f" [{verdict}] {check_name}")
	    print()

	    if not gate_result["ready_to_submit"]:
	        print("NOT READY -- fix failing checks above before spending a submission.")
	        return
	    print("READY TO SUBMIT.")



	if __name__ == "__main__":
	    # Running the file directly performs no validation: it prints the
	    # module docstring followed by a pointer to the entry-point functions.
	    usage_note = (
	        "This module is meant to be imported and driven by your own "
	        "test harness once rank.py's pipeline function is finalized. "
	        "See the four functions above: run_ablation, "
	        "run_honeypot_injection_test, check_top100_diversity, and "
	        "readiness_gate."
	    )
	    print(__doc__)
	    print(usage_note)