code-gen-assistant / scripts /04_run_eval.py
Rushabh147's picture
Initial deploy to HF Spaces (clean history, LFS for all binaries)
b89e6d6
Raw
History Blame Contribute Delete
1.68 kB
"""Phase 2 entrypoint: run the eval suite and print a comparison table.
Usage: python scripts/04_run_eval.py
Loads the index + assistant, then reports:
- retrieval recall@k / MRR
- functional pass@1 on HumanEval (baseline vs RAG)
"""
import sys
from pathlib import Path
import pandas as pd
sys.path.append(str(Path(__file__).resolve().parents[1]))
from src.config import load_config
from src.eval.functional_eval import evaluate
from src.eval.retrieval_eval import evaluate_cross_modal
from src.rag.generator import CodeAssistant
def main():
    """Run the Phase 2 evaluation and print the results.

    Loads the config and builds a ``CodeAssistant`` with its index, then:

    1. Samples up to 500 complete (docstring, code) rows from
       ``test.parquet`` (fixed seed) and prints whatever
       ``evaluate_cross_modal`` returns for them.
    2. Calls ``evaluate`` on "humaneval" once with the assistant in
       "baseline" mode and once in "rag" mode, then prints a two-row
       pass@1 table.
    """
    cfg = load_config()
    print("=" * 60, "\nPHASE 2: EVALUATION\n", "=" * 60, sep="")
    assistant = CodeAssistant.from_config(cfg, with_index=True)

    # 1. Cross-modal retrieval quality on held-out test pairs.
    test = pd.read_parquet(Path(cfg.paths.processed_dir) / "test.parquet")
    complete = test[["docstring", "code"]].dropna()
    # Cap the sample size by the rows that survive dropna(), not by
    # len(test): DataFrame.sample (replace=False) raises ValueError when n
    # is larger than the frame being sampled.
    pairs = complete.sample(
        n=min(500, len(complete)), random_state=42
    ).reset_index(drop=True)
    print("\n[retrieval]", evaluate_cross_modal(assistant.index.embedder, pairs))

    # 2. Functional pass@1: baseline vs RAG on HumanEval.
    LIMIT = 12  # raise for the real run
    base = evaluate(
        lambda i: assistant.generate(i, mode="baseline"),
        "humaneval",
        limit=LIMIT,
    )
    rag = evaluate(
        lambda i: assistant.generate(i, mode="rag"),
        "humaneval",
        limit=LIMIT,
    )

    table = pd.DataFrame([
        {"system": "baseline", "pass@1": base.pass_at_1},
        {"system": "rag", "pass@1": rag.pass_at_1},
    ])
    print(f"\n[functional] HumanEval pass@1 over {LIMIT} problems:")
    print(table.to_string(index=False))
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()