# recon/eval/debug_contradiction.py
# (page header residue: MukulRay's picture)
# Phase 1.3: eval results, test scripts, gap filter reverted — no improvement, changelog update
# Commit: 10f9a75
import sys, os, json, time
sys.path.insert(0, '.')
from dotenv import load_dotenv
load_dotenv()
from src.agents.critic import _detect_contradictions, _check_contradiction
from src.agents.retriever import retriever_node
from src.agents.planner import planner_node
from src.state import ResearchState
# Load the first category-C question from the eval set.
# The file is opened in a context manager (so the handle is closed) and decoded
# as UTF-8 explicitly rather than with the platform's default encoding.
with open('eval/questions.json', encoding='utf-8') as questions_file:
    questions = json.load(questions_file)

cat_c = [q for q in questions if q.get('category') == 'C']
if not cat_c:
    # Exit with a readable message instead of an IndexError on cat_c[0].
    sys.exit("No category 'C' questions found in eval/questions.json")

test_q = cat_c[0]
print(f"Testing: [{test_q['id']}] {test_q['question']}")
print("=" * 70)
# Minimal pipeline state for a fresh, single-turn session: only the query and
# a fixed debug session id carry real values; every other field starts at its
# empty/zero default. (Presumably mirrors the ResearchState fields -- confirm
# against src.state if the schema changes.)
state = dict(
    # --- query / session ---
    original_query=test_q["question"],
    session_id="debug-001",
    session_context=None,
    # --- planner / retriever outputs ---
    sub_questions=[],
    retrieved_papers=[],
    citation_graph={},
    web_results=[],
    # --- critic outputs ---
    critic_verdict="",
    critic_notes="",
    rewritten_questions=[],
    retry_count=0,
    # --- synthesis / export ---
    synthesized_position="",
    claim_confidences=[],
    session_update=None,
    export_md="",
    # --- scoring / metrics ---
    decay_config="linear",
    calibration_bin="",
    latency_ms=0.0,
    paper_reliability_scores={},
    reliability_dominant_signals={},
    # --- follow-up / multi-turn ---
    is_followup=False,
    previous_papers=[],
    previous_verdict="",
    previous_claims=[],
    turn_number=1,
    # --- explore mode ---
    explore_mode=False,
    explorer_findings=[],
)
# Steps 1-2: pass the state through the planner and then the retriever,
# echoing what each stage produced.
print("Step 1: Running planner...")
state = planner_node(state)
print(f"Sub-questions: {state.get('sub_questions', [])}")

print("\nStep 2: Running retriever...")
state = retriever_node(state)
papers = state.get("retrieved_papers", [])
print(f"Papers retrieved: {len(papers)}")

# Show the first six retrieved papers, numbered from 1.
print("\nTop 6 papers:")
for rank, paper in enumerate(papers[:6], start=1):
    print(
        f" [{rank}] {paper.year} | citations={paper.citation_count}"
        f" | score={paper.hybrid_score:.3f}"
    )
    print(f" {paper.title[:80]}")
# Step 3: replay the pairwise contradiction check by hand on the first four
# papers, printing every pair, whether it passes the >= 2 year gap filter, and
# the raw _check_contradiction result for each pair that does. Afterwards call
# _detect_contradictions on the full paper list for comparison.
# NOTE(review): the 2-year filter here presumably mirrors the one inside
# _detect_contradictions -- confirm against src.agents.critic.
print("\nStep 3: Running _detect_contradictions on top 4 papers...")
top4 = papers[:4]
print("Paper pairs being checked (must have >= 2 year gap):")
checked = 0
for i, pa in enumerate(top4):
    # start=i+1 keeps j equal to pb's real index in top4, so the label names
    # the actual pair (previously every pair was printed as (i, i+1)).
    for j, pb in enumerate(top4[i+1:], start=i+1):
        # A missing year is treated as 0 for the gap computation.
        gap = abs((pa.year or 0) - (pb.year or 0))
        decision = 'WILL CHECK' if gap >= 2 else 'SKIPPED (gap < 2)'
        print(f" Pair ({i},{j}): {pa.year} vs {pb.year} -- gap={gap}yrs -- {decision}")
        if gap < 2:
            continue

        checked += 1
        # gap >= 2 guarantees the two years differ, so sorting by year gives
        # an unambiguous (older, newer) ordering.
        older, newer = sorted((pa, pb), key=lambda paper: paper.year or 0)
        print(f" older: [{older.year}] {older.title[:60]}")
        print(f" newer: [{newer.year}] {newer.title[:60]}")
        print(f" older abstract: {(older.abstract or 'NONE')[:200]}")
        print(f" newer abstract: {(newer.abstract or 'NONE')[:200]}")
        result, reason = _check_contradiction(older, newer)
        print(f" LLM result: contradicts={result}, reason={reason}")
        print()

print(f"\nTotal pairs actually LLM-checked: {checked}")
contradictions = _detect_contradictions(papers)
print(f"Final _detect_contradictions output: {len(contradictions)} contradictions found")