Spaces:
Sleeping
Sleeping
| """Phase 2 entrypoint: run the eval suite and print a comparison table. | |
| Usage: python scripts/04_run_eval.py | |
| Loads the index + assistant, then reports: | |
| - retrieval recall@k / MRR | |
| - functional pass@1 on HumanEval (baseline vs RAG) | |
| """ | |
| import sys | |
| from pathlib import Path | |
| import pandas as pd | |
| sys.path.append(str(Path(__file__).resolve().parents[1])) | |
| from src.config import load_config | |
| from src.eval.functional_eval import evaluate | |
| from src.eval.retrieval_eval import evaluate_cross_modal | |
| from src.rag.generator import CodeAssistant | |
def main():
    """Run the Phase 2 evaluation and print the results.

    Steps, in order:

    1. Load the config and build the assistant together with its index.
    2. Read ``test.parquet`` from the configured processed-data directory,
       keep rows that have both a docstring and code, sample at most 500 of
       them with a fixed seed, and print the result of
       ``evaluate_cross_modal`` for the index embedder on those pairs.
    3. Run ``evaluate`` on "humaneval" twice -- once generating in
       "baseline" mode, once in "rag" mode -- and print both pass@1 values
       as a small table.
    """
    cfg = load_config()
    print("=" * 60, "\nPHASE 2: EVALUATION\n", "=" * 60, sep="")
    assistant = CodeAssistant.from_config(cfg, with_index=True)

    # 1. Cross-modal retrieval quality on held-out test pairs.
    max_pairs = 500
    test = pd.read_parquet(Path(cfg.paths.processed_dir) / "test.parquet")
    # Drop incomplete rows BEFORE sizing the sample: DataFrame.sample raises
    # ValueError when n exceeds the number of rows (replace=False), which
    # happened whenever dropna() shrank the frame below min(500, len(test)).
    complete = test[["docstring", "code"]].dropna()
    pairs = complete.sample(
        n=min(max_pairs, len(complete)), random_state=42
    ).reset_index(drop=True)
    print("\n[retrieval]", evaluate_cross_modal(assistant.index.embedder, pairs))

    # 2. Functional pass@1: baseline vs RAG on HumanEval.
    LIMIT = 12  # raise for the real run
    base = evaluate(
        lambda i: assistant.generate(i, mode="baseline"),
        "humaneval",
        limit=LIMIT,
    )
    rag = evaluate(
        lambda i: assistant.generate(i, mode="rag"),
        "humaneval",
        limit=LIMIT,
    )
    table = pd.DataFrame([
        {"system": "baseline", "pass@1": base.pass_at_1},
        {"system": "rag", "pass@1": rag.pass_at_1},
    ])
    print(f"\n[functional] HumanEval pass@1 over {LIMIT} problems:")
    print(table.to_string(index=False))


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()