"""Phase 2 entrypoint: run the eval suite and print a comparison table. Usage: python scripts/04_run_eval.py Loads the index + assistant, then reports: - retrieval recall@k / MRR - functional pass@1 on HumanEval (baseline vs RAG) """ import sys from pathlib import Path import pandas as pd sys.path.append(str(Path(__file__).resolve().parents[1])) from src.config import load_config from src.eval.functional_eval import evaluate from src.eval.retrieval_eval import evaluate_cross_modal from src.rag.generator import CodeAssistant def main(): cfg = load_config() print("=" * 60, "\nPHASE 2: EVALUATION\n", "=" * 60, sep="") assistant = CodeAssistant.from_config(cfg, with_index=True) # 1. Cross-modal retrieval quality on held-out test pairs. test = pd.read_parquet(Path(cfg.paths.processed_dir) / "test.parquet") pairs = test[["docstring", "code"]].dropna().sample( n=min(500, len(test)), random_state=42 ).reset_index(drop=True) print("\n[retrieval]", evaluate_cross_modal(assistant.index.embedder, pairs)) # 2. Functional pass@1: baseline vs RAG on HumanEval. LIMIT = 12 # raise for the real run base = evaluate(lambda i: assistant.generate(i, mode="baseline"), "humaneval", limit=LIMIT) rag = evaluate(lambda i: assistant.generate(i, mode="rag"), "humaneval", limit=LIMIT) table = pd.DataFrame([ {"system": "baseline", "pass@1": base.pass_at_1}, {"system": "rag", "pass@1": rag.pass_at_1}, ]) print(f"\n[functional] HumanEval pass@1 over {LIMIT} problems:") print(table.to_string(index=False)) if __name__ == "__main__": main()