File size: 1,679 Bytes
b89e6d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""Phase 2 entrypoint: run the eval suite and print a comparison table.

Usage:  python scripts/04_run_eval.py
Loads the index + assistant, then reports:
  - retrieval recall@k / MRR
  - functional pass@1 on HumanEval (baseline vs RAG)
"""
import sys
from pathlib import Path

import pandas as pd

sys.path.append(str(Path(__file__).resolve().parents[1]))
from src.config import load_config
from src.eval.functional_eval import evaluate
from src.eval.retrieval_eval import evaluate_cross_modal
from src.rag.generator import CodeAssistant


def main(limit: int = 12) -> None:
    """Run the Phase 2 evaluation and print the results to stdout.

    Builds the assistant (with its index) from the loaded config, then
    reports two things:

    1. Cross-modal retrieval metrics on a seeded random sample of at most
       500 complete docstring/code pairs read from ``test.parquet``.
    2. HumanEval pass@1 for the ``baseline`` and ``rag`` generation modes,
       printed side by side as a small table.

    Args:
        limit: Value forwarded as ``limit`` to each ``evaluate`` call (the
            number of HumanEval problems reported in the table header).
            The default of 12 keeps a trial run short; raise it for the
            real run.
    """
    cfg = load_config()
    print("=" * 60, "\nPHASE 2: EVALUATION\n", "=" * 60, sep="")

    assistant = CodeAssistant.from_config(cfg, with_index=True)

    # 1. Cross-modal retrieval quality on held-out test pairs.
    test = pd.read_parquet(Path(cfg.paths.processed_dir) / "test.parquet")
    complete = test[["docstring", "code"]].dropna()
    # Cap the sample by the row count *after* dropna(): sampling without
    # replacement raises ValueError when n exceeds the population, which
    # happened whenever dropping incomplete rows left fewer rows than
    # min(500, len(test)).
    pairs = complete.sample(
        n=min(500, len(complete)), random_state=42
    ).reset_index(drop=True)
    print("\n[retrieval]", evaluate_cross_modal(assistant.index.embedder, pairs))

    # 2. Functional pass@1: baseline vs RAG on HumanEval.
    base = evaluate(lambda i: assistant.generate(i, mode="baseline"),
                    "humaneval", limit=limit)
    rag = evaluate(lambda i: assistant.generate(i, mode="rag"),
                   "humaneval", limit=limit)

    table = pd.DataFrame([
        {"system": "baseline", "pass@1": base.pass_at_1},
        {"system": "rag", "pass@1": rag.pass_at_1},
    ])
    print(f"\n[functional] HumanEval pass@1 over {limit} problems:")
    print(table.to_string(index=False))


# Run the evaluation only when this file is executed as a script, so that
# importing the module does not trigger the evaluation.
if __name__ == "__main__":
    main()