"""Run the full ablation study using annotated ground truth. Checks that eval_set.json has annotated entries, runs retrieval eval, and optionally summarizes generation scores. Usage: python scripts/run_ablation.py python scripts/run_ablation.py --k 1 3 5 10 """ import argparse import json import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) from src.config import PROJECT_ROOT EVAL_SET_PATH = PROJECT_ROOT / "data" / "eval_set.json" def load_eval_set() -> list[dict]: if not EVAL_SET_PATH.exists(): print(f"No eval set found at {EVAL_SET_PATH}") print() print("To create ground truth annotations:") print(" 1. python scripts/write_questions.py # author questions") print(" 2. python scripts/annotate.py # annotate retrieval relevance") print(" 3. python scripts/annotate_generation.py # score generation quality") sys.exit(1) with open(EVAL_SET_PATH, encoding="utf-8") as f: return json.load(f) def summarize_generation_scores(entries: list[dict]) -> None: """Print aggregate generation quality scores.""" scored = [e for e in entries if e.get("generation_scores")] if not scored: print("\nNo generation scores yet. Run: python scripts/annotate_generation.py") return faithfulness = [e["generation_scores"]["faithfulness"] for e in scored] relevance = [e["generation_scores"]["relevance"] for e in scored] citations = [e["generation_scores"]["citation_accuracy"] for e in scored] n = len(scored) print(f"\n=== Generation Quality ({n} scored) ===\n") print(f" Faithfulness: {sum(faithfulness) / n:.1%} " f"({sum(faithfulness)}/{n} faithful)") print(f" Avg relevance: {sum(relevance) / n:.2f} / 5.0") print(f" Citation accuracy: {sum(citations) / n:.1%} " f"({sum(citations)}/{n} accurate)") # Per-entry breakdown print(f"\n {'ID':<10} {'Faith':>6} {'Relev':>6} {'Cite':>6}") print(f" {'-'*30}") for e in scored: gs = e["generation_scores"] f_str = "Y" if gs["faithfulness"] else "N" c_str = "Y" if gs["citation_accuracy"] else "N" print(f" {e['id']:<10} {f_str:>6} {gs['relevance']:>6} {c_str:>6}") def main(): parser = argparse.ArgumentParser(description="Full ablation study from annotations") parser.add_argument( "--k", type=int, nargs="+", default=[1, 3, 5, 10], help="k values for retrieval metrics", ) args = parser.parse_args() eval_set = load_eval_set() annotated = [e for e in eval_set if e.get("relevant_chunk_ids")] if not annotated: print("No retrieval annotations found in eval_set.json") print("Run: python scripts/annotate.py") sys.exit(1) print(f"\n{'='*60}") print(f" ResearchRadar Ablation Study") print(f" {len(annotated)} annotated questions") print(f"{'='*60}") # Import and run retrieval evaluation from scripts.retrieval_eval import evaluate_method, print_table print(f"\n=== Retrieval Ablation ===\n") method_keys = ["bm25_top10", "vector_top10", "hybrid_top10"] results = {} for mk in method_keys: label = mk.replace("_top10", "") results[label] = evaluate_method(annotated, mk, args.k) print_table(results, args.k) # Generation scores summarize_generation_scores(eval_set) print(f"\n{'='*60}") print(f" Study complete.") print(f"{'='*60}\n") if __name__ == "__main__": main()