Spaces:

MukulRay
/

recon

Sleeping

App Files Files Community

recon / eval /archived /patch_contradiction.py

MukulRay

Phase 1.1: archive patch_contradiction.py — research integrity fix

cd9075d 27 days ago

raw

history blame contribute delete

15 kB

	"""
	eval/patch_contradiction.py
	----------------------------
	One-time patch for the 0% contradiction catch rate issue.

	WHY THIS EXISTS
	---------------
	The production critic checks STALE before CONTRADICTED, so contested questions
	(Category C) almost always exit at STALE — the contradiction check never runs.
	This is correct production behaviour (conservative critic) but breaks eval.

	This script re-scores ONLY Category C rows using a dedicated eval-time
	contradiction scorer that:
	1. Has no year-gap filter (contested topics can be same-year papers)
	2. Uses a less strict prompt (methodological disagreement counts)
	3. Runs independently of the critic pipeline

	The existing full overnight CSVs are patched in-place.
	Run takes ~10-15 mins (30 Cat C rows × 5 architectures = 150 judge calls).

	Run from repo root:
	python eval/patch_contradiction.py

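	A full run already recomputes the summary at the end. To recompute only the
	summary from the existing CSVs (no re-scoring, no judge calls):
	python eval/patch_contradiction.py --summary-only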
	"""

	import sys
	import os
	import csv
	import json
	import time
	import re
	import argparse

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

	from dotenv import load_dotenv
	load_dotenv()

	from langchain_groq import ChatGroq
	from langchain_core.messages import SystemMessage, HumanMessage

	from src.retriever_utils import search_semantic_scholar

	# ── Config ───────────────────────────────────────────────────────────────────
	# Every path below is resolved relative to the directory holding this script.
	EVAL_DIR = os.path.dirname(os.path.abspath(__file__))
	RESULTS_DIR = os.path.join(EVAL_DIR, "results")
	GT_F = os.path.join(EVAL_DIR, "ground_truth.json")

	# Architecture name -> path of that architecture's per-question results CSV.
	# Iteration order of this dict is the order in which architectures are handled.
	ARCH_FILES = {
	    arch: os.path.join(RESULTS_DIR, f"{arch}.csv")
	    for arch in (
	        "single_rag",
	        "naive_multi",
	        "recon_none",
	        "recon_linear",
	        "recon_log",
	    )
	}

	# ── LLM setup ────────────────────────────────────────────────────────────────
	# Module-level cache so that every judge call shares one client instance.
	# The annotation is quoted so it is never evaluated at import time.
	_llm: "ChatGroq | None" = None


	def get_llm() -> ChatGroq:
	    """Return the shared ChatGroq judge client, creating it on first use.

	    The client is constructed with model ``llama-3.3-70b-versatile`` and
	    ``temperature=0.0`` and stored in the module-level ``_llm`` so that later
	    calls reuse the same instance.
	    """
	    global _llm
	    if _llm is None:
	        _llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.0)
	    return _llm


	# ── Backoff (same pattern as run_eval.py) ────────────────────────────────────
	_MAX_WAIT = 600  # seconds; a longer requested wait is treated as the daily quota
	_MAX_ATTEMPTS = 6
	# Extracts the wait the provider asks for from a rate-limit error message.
	_RETRY_AFTER_RE = re.compile(r"try again in ([\d.]+)s")


	def _call_with_backoff(messages: list) -> str:
	    """Invoke the judge LLM, retrying on rate-limit errors.

	    Args:
	        messages: Chat messages passed unchanged to ``get_llm().invoke``.

	    Returns:
	        The response's ``content`` with surrounding whitespace stripped.

	    Raises:
	        SystemExit: (code 0) when the error asks for a wait longer than
	            ``_MAX_WAIT`` seconds, which this script treats as the daily
	            token limit.
	        RuntimeError: when every attempt was rate-limited; the last
	            rate-limit error is attached as ``__cause__``.
	        Exception: any error that does not look like a rate limit is
	            re-raised immediately.
	    """
	    wait = 5  # fallback delay (seconds) when the error carries no hint
	    last_error = None
	    for attempt in range(1, _MAX_ATTEMPTS + 1):
	        try:
	            return get_llm().invoke(messages).content.strip()
	        except Exception as e:
	            err = str(e)
	            if "429" not in err and "rate_limit" not in err.lower():
	                raise
	            last_error = e
	            m = _RETRY_AFTER_RE.search(err)
	            retry_after = float(m.group(1)) if m else wait
	            if retry_after > _MAX_WAIT:
	                print("\n⛔ Daily token limit. Re-run tomorrow. Exiting cleanly.")
	                raise SystemExit(0)
	            if attempt == _MAX_ATTEMPTS:
	                # No further attempt follows, so waiting here would only
	                # delay the failure.
	                break
	            actual = min(retry_after + 2, _MAX_WAIT)
	            print(f"\n⏳ Rate limit (attempt {attempt}/{_MAX_ATTEMPTS}). Waiting {actual:.0f}s...")
	            time.sleep(actual)
	            wait = min(wait * 2, 120)
	    raise RuntimeError(
	        f"LLM call failed after {_MAX_ATTEMPTS} retries."
	    ) from last_error


	# ── Eval-time contradiction scorer ───────────────────────────────────────────
	# System prompt for the eval-only judge called by eval_contradiction_scorer().
	# Less strict than the production critic:
	# - No year-gap filter
	# - Methodological disagreement counts as contested
	# - Question is: do the papers represent BOTH sides of the debate?
	# The judge is told to reply with one JSON object ({"contested", "reason"});
	# the scorer pulls it out of the reply with a regex and json.loads.
	# NOTE(review): the prompt refers to "retrieved paper abstracts", but the human
	# message built in eval_contradiction_scorer() contains only the question, the
	# ground-truth camps and the synthesized position — confirm which is intended.

	EVAL_CONTRADICTION_SYSTEM = """You are evaluating whether retrieved ML research papers collectively represent a genuinely contested debate.

	A topic is CONTESTED when:
	- Papers propose competing methods with conflicting empirical claims
	- Researchers disagree on which approach works better
	- Papers reach different conclusions on the same question
	- One paper explicitly identifies limitations or challenges of another's approach

	A topic is NOT CONTESTED when:
	- Papers propose different methods that solve different problems
	- Papers are complementary rather than competing
	- Disagreement is only about minor implementation details

	Given a contested research question and retrieved paper abstracts, determine:
	Does this paper set collectively represent BOTH sides of the debate, confirming the topic is genuinely contested?

	Output ONLY a JSON object:
	{"contested": true/false, "reason": "one sentence — name the two camps if true"}

	Be reasonable — methodological preference disagreements count as contested."""


	# Substrings whose presence in the synthesized position is taken as a sign
	# that the answer acknowledges disagreement (compared against lower-cased text).
	_DEBATE_SIGNALS = (
	    "debate", "disagree", "controversy", "contested", "conflict",
	    "camp", "argue", "while others", "however", "challenge",
	    "alternative", "competing", "tradeoff", "trade-off",
	    "on the other hand", "in contrast", "proponents", "critics",
	)


	def _as_bool(value) -> bool:
	    """Interpret the judge's JSON ``contested`` value as a boolean.

	    If the judge emits the literal as a string (e.g. ``"false"``), plain
	    ``bool()`` would report True for any non-empty text, so strings are
	    compared against known truthy spellings instead.
	    """
	    if isinstance(value, str):
	        return value.strip().lower() in ("true", "yes", "1")
	    return bool(value)


	def eval_contradiction_scorer(
	    question: str,
	    camps_ground_truth: str,
	    synthesized_position: str,
	) -> tuple[int, str]:
	    """Score one Category C question for contradiction awareness.

	    Two signals are combined:
	      1. A keyword heuristic over the synthesized position (no API call).
	      2. An LLM judge call that sees the question, the ground-truth camps and
	         the first 1000 characters of the position.

	    Args:
	        question: The contested research question.
	        camps_ground_truth: Description of the known opposing camps.
	        synthesized_position: The architecture's answer; may be empty/None.

	    Returns:
	        ``(1, reason)`` when the debate counts as caught, ``(0, reason)``
	        otherwise. ``(0, "no result")`` is returned when the reply contains
	        no JSON object, and ``(0, "scorer error: ...")`` when the call or the
	        JSON parsing raises.

	    Raises:
	        SystemExit: propagated from the backoff helper; it is not an
	            ``Exception`` subclass, so the handler below does not swallow it.
	    """
	    position = synthesized_position or ""
	    lowered = position.lower()
	    position_acknowledges_debate = any(s in lowered for s in _DEBATE_SIGNALS)

	    # Built line by line so the prompt carries no stray source indentation.
	    prompt = "\n".join([
	        f"Contested research question: {question}",
	        "",
	        "Known debate (ground truth camps):",
	        f"{camps_ground_truth}",
	        "",
	        "Synthesized position:",
	        position[:1000] or "No position generated.",
	        "",
	        "Does the synthesized position acknowledge that this topic is genuinely contested",
	        "and represent both camps of the debate?",
	    ])

	    try:
	        time.sleep(1)  # fixed pause before each judge call (presumably rate-limit pacing)
	        raw = _call_with_backoff([
	            SystemMessage(content=EVAL_CONTRADICTION_SYSTEM),
	            HumanMessage(content=prompt),
	        ])
	        m = re.search(r"\{.*\}", raw, re.DOTALL)
	        if m is None:
	            return 0, "no result"

	        data = json.loads(m.group())
	        contested = _as_bool(data.get("contested", False))
	        reason = str(data.get("reason", ""))

	        # NOTE(review): this override converts ANY "not contested" verdict
	        # into a catch as soon as one generic keyword (e.g. "however",
	        # "alternative") occurs in the position. It can only raise the catch
	        # rate; confirm the leniency is intended before reporting the metric.
	        if not contested and position_acknowledges_debate:
	            contested = True
	            reason = f"Position acknowledges debate ({reason})"

	        return (1 if contested else 0), reason
	    except Exception as e:
	        # Best-effort scoring: a failed call or malformed JSON counts as a miss.
	        return 0, f"scorer error: {e}"


	# ── CSV patch logic ───────────────────────────────────────────────────────────

	# Prefix of the note this script appends to a row's judge_reason.
	_REASON_TAG = "contradiction: "


	def _merge_judge_reason(previous, reason: str) -> str:
	    """Return *previous* with exactly one ``contradiction: …`` note appended.

	    A note left by an earlier run of this script is removed first, so that
	    re-running the patch does not stack duplicate suffixes. *previous* may be
	    None or empty.
	    """
	    base = previous or ""
	    if base.startswith(_REASON_TAG):
	        # The earlier note was the whole field (it had no prior reason).
	        base = ""
	    else:
	        cut = base.find(" | " + _REASON_TAG)
	        if cut != -1:
	            base = base[:cut]
	    return f"{base} | {_REASON_TAG}{reason[:100]}".strip(" |")


	def patch_csv(path: str, arch_name: str, gt_map: dict) -> dict:
	    """Re-score the Category C rows of one results CSV and rewrite the file.

	    For every row whose ``category`` is ``"C"`` the eval-time scorer is
	    called; its score is stored in ``contradiction_caught`` and its reason is
	    merged into ``judge_reason``. All other rows are written back unchanged.

	    Args:
	        path: Results CSV to patch in place.
	        arch_name: Architecture label used in progress output and the result.
	        gt_map: Question id -> ground-truth entry; the entry's ``"camps"``
	            value (default ``""``) is passed to the scorer.

	    Returns:
	        ``{"arch", "caught", "total", "rate"}`` for the Category C rows, or
	        ``{}`` when the file is missing or has no data rows.

	    Raises:
	        SystemExit: propagated from the scorer (daily quota reached). Nothing
	            is written for this file in that case.
	    """
	    if not os.path.exists(path):
	        print(f" ⚠ {arch_name}: file not found, skipping.")
	        return {}

	    # newline="" lets the csv module handle embedded newlines itself.
	    with open(path, newline="", encoding="utf-8") as f:
	        reader = csv.DictReader(f)
	        rows = list(reader)
	        fieldnames = list(reader.fieldnames or [])

	    if not rows:
	        print(f" ⚠ {arch_name}: empty file, skipping.")
	        return {}

	    cat_c_rows = [row for row in rows if row.get("category") == "C"]
	    total = len(cat_c_rows)
	    print(f"\n {arch_name}: patching {total} Category C rows...")

	    caught = 0
	    for j, row in enumerate(cat_c_rows, 1):
	        qid = row["question_id"]
	        question = row["question"]
	        position = row["synthesized_position"]
	        camps_gt = gt_map.get(qid, {}).get("camps", "")

	        print(f" [{j:02d}/{total}] {question[:60]}...")

	        try:
	            score, reason = eval_contradiction_scorer(
	                question=question,
	                camps_ground_truth=camps_gt,
	                synthesized_position=position,
	            )
	        except Exception as e:
	            # Best-effort: a failing scorer counts as "not caught" for this row.
	            score, reason = 0, str(e)

	        row["contradiction_caught"] = score
	        row["judge_reason"] = _merge_judge_reason(row.get("judge_reason"), reason)

	        if score:
	            caught += 1
	            print(f" ✓ CONTESTED — {reason[:70]}")
	        else:
	            print(f" ✗ not caught — {reason[:70]}")

	    # Keep the header's column order; the patched columns are appended only
	    # when they were absent from the input and at least one row now has them.
	    if cat_c_rows:
	        for column in ("contradiction_caught", "judge_reason"):
	            if column not in fieldnames:
	                fieldnames.append(column)

	    # Write to a sibling file and swap it in, so an interrupted write cannot
	    # leave a truncated results CSV behind.
	    tmp_path = path + ".tmp"
	    with open(tmp_path, "w", newline="", encoding="utf-8") as f:
	        writer = csv.DictWriter(f, fieldnames=fieldnames)
	        writer.writeheader()
	        writer.writerows(rows)
	    os.replace(tmp_path, path)

	    rate = caught / total if total else 0
	    print(f" ✓ {arch_name}: contradiction catch rate = {caught}/{total} = {rate:.1%}")

	    return {"arch": arch_name, "caught": caught, "total": total, "rate": rate}


	# ── Summary recompute ─────────────────────────────────────────────────────────

	def _blank_safe_int(value) -> int:
	    """Parse a CSV cell as an int, treating a blank or missing cell as 0."""
	    return 0 if value in ("", None) else int(value)


	def _catch_rate(rows: list, column: str):
	    """Return the sum of *column* flags divided by ``len(rows)`` (0 if empty).

	    Blank cells contribute nothing to the sum but still count in the
	    denominator.
	    """
	    if not rows:
	        return 0
	    return sum(_blank_safe_int(r.get(column)) for r in rows) / len(rows)


	def _summarize_arch(arch_name: str, rows: list) -> dict:
	    """Aggregate one architecture's result rows into a summary record."""
	    total = len(rows)
	    matches = sum(1 for r in rows if r.get("position_accuracy") == "MATCH")

	    # Zero or blank latencies are left out of the average.
	    latencies = [
	        float(r["latency_ms"]) for r in rows
	        if r.get("latency_ms") and r["latency_ms"] not in ("0.0", "0")
	    ]
	    avg_latency = sum(latencies) / len(latencies) if latencies else 0

	    retried = sum(1 for r in rows if _blank_safe_int(r.get("retry_count")) > 0)
	    errored = sum(1 for r in rows if r.get("error"))

	    return {
	        "architecture": arch_name,
	        "total_questions": total,
	        "position_match_rate": round(matches / total, 4),
	        "staleness_catch_rate": round(
	            _catch_rate([r for r in rows if r.get("category") == "B"],
	                        "staleness_caught"), 4),
	        "contradiction_catch_rate": round(
	            _catch_rate([r for r in rows if r.get("category") == "C"],
	                        "contradiction_caught"), 4),
	        "avg_latency_ms": round(avg_latency, 1),
	        "retry_rate": round(retried / total, 4),
	        "error_rate": round(errored / total, 4),
	    }


	def recompute_summary(arch_files=None, results_dir=None) -> None:
	    """Rebuild ``summary.csv`` from the per-architecture result CSVs.

	    Args:
	        arch_files: Mapping of architecture name -> results CSV path.
	            Defaults to the module-level ``ARCH_FILES``.
	        results_dir: Directory that receives ``summary.csv``. Defaults to the
	            module-level ``RESULTS_DIR``.

	    Missing files and files without data rows are skipped. When no
	    architecture yields any rows, nothing is written and a warning is printed
	    instead of the summary table.
	    """
	    if arch_files is None:
	        arch_files = ARCH_FILES
	    if results_dir is None:
	        results_dir = RESULTS_DIR

	    summary_rows = []
	    for arch_name, path in arch_files.items():
	        if not os.path.exists(path):
	            continue
	        with open(path, newline="", encoding="utf-8") as f:
	            rows = list(csv.DictReader(f))
	        if rows:
	            summary_rows.append(_summarize_arch(arch_name, rows))

	    summary_path = os.path.join(results_dir, "summary.csv")
	    if not summary_rows:
	        print(f"\n⚠ No architecture results found — {summary_path} not written.")
	        return

	    with open(summary_path, "w", newline="", encoding="utf-8") as f:
	        writer = csv.DictWriter(f, fieldnames=list(summary_rows[0].keys()))
	        writer.writeheader()
	        writer.writerows(summary_rows)

	    print(f"\n✅ Summary rewritten → {summary_path}")
	    print("\n" + "="*90)
	    print(f"{'Architecture':<18} {'Pos.Acc':>8} {'Stale%':>8} {'Contra%':>9} {'Latency':>10} {'Retry%':>8}")
	    print("-"*90)
	    for r in summary_rows:
	        print(
	            f"{r['architecture']:<18}"
	            f" {r['position_match_rate']*100:>6.1f}%"
	            f" {r['staleness_catch_rate']*100:>6.1f}%"
	            f" {r['contradiction_catch_rate']*100:>7.1f}%"
	            f" {r['avg_latency_ms']:>9.0f}ms"
	            f" {r['retry_rate']*100:>6.1f}%"
	        )
	    print("="*90)
	    print("\n→ Paste these numbers into your resume bullets.")
	    print("→ recon_linear staleness_catch_rate and contradiction_catch_rate are your headline metrics.")


	# ── Entry point ───────────────────────────────────────────────────────────────

	def main():
	    """Command-line entry point.

	    With ``--summary-only`` only the summary is recomputed. Otherwise the
	    ground-truth file is loaded, every CSV in ``ARCH_FILES`` is patched in
	    turn, and the summary is recomputed afterwards.

	    NOTE(review): the quota message below promises that already-patched rows
	    are saved; patch_csv appears to write a CSV only after scoring all of its
	    Category C rows — confirm the wording matches the actual resume behaviour.
	    """
	    cli = argparse.ArgumentParser()
	    cli.add_argument(
	        "--summary-only",
	        action="store_true",
	        help="Skip patching, just recompute summary from existing CSVs",
	    )
	    options = cli.parse_args()

	    banner_rule = "=" * 60
	    print(banner_rule)
	    print("RECON — Contradiction Catch Rate Patch")
	    print(banner_rule)

	    if options.summary_only:
	        recompute_summary()
	        return

	    # Ground truth: a list of entries, indexed here by their "id".
	    with open(GT_F, encoding="utf-8") as handle:
	        gt_list = json.load(handle)
	    gt_map = {}
	    for entry in gt_list:
	        gt_map[entry["id"]] = entry

	    cat_c_count = len([e for e in gt_list if e["id"].startswith("C")])
	    judge_calls = cat_c_count * len(ARCH_FILES)
	    print(f"Ground truth entries: {len(gt_list)} ({cat_c_count} Category C)")
	    print(f"Architectures to patch: {len(ARCH_FILES)}")
	    print(f"Total judge calls: ~{judge_calls}")
	    print(f"Estimated runtime: ~{judge_calls * 2 // 60} minutes")
	    print()

	    reports = []
	    for arch_name in ARCH_FILES:
	        try:
	            report = patch_csv(ARCH_FILES[arch_name], arch_name, gt_map)
	        except SystemExit:
	            print("\n⛔ Daily token limit hit. Re-run tomorrow with:")
	            print(" python eval/patch_contradiction.py")
	            print(" (already-patched rows are saved — it resumes safely)")
	            raise
	        if report:
	            reports.append(report)

	    print("\n" + banner_rule)
	    print("Patch complete. Recomputing summary...")
	    recompute_summary()


	if __name__ == "__main__":
	    main()