"""Run the evaluation harness.

Usage:
    python scripts/evaluate.py --mode deterministic
    python scripts/evaluate.py --mode full
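    python scripts/evaluate.py --mode full --corpus fastapi
    python scripts/evaluate.py --mode deterministic --output .cache/eval_results.json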
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
from pathlib import Path

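# Make the repo root importable so the script runs from a source checkout
# without installing the agent_bench package.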
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent_bench.agents.orchestrator import Orchestrator
from agent_bench.core.config import load_config, load_task_config
from agent_bench.core.prompts import format_system_prompt
from agent_bench.core.provider import create_provider
from agent_bench.evaluation.harness import run_evaluation
from agent_bench.rag.embedder import Embedder
from agent_bench.rag.retriever import Retriever
from agent_bench.rag.store import HybridStore
from agent_bench.tools.calculator import CalculatorTool
from agent_bench.tools.registry import ToolRegistry
from agent_bench.tools.search import SearchTool


async def main_async(args: argparse.Namespace) -> None:
    config = load_config(Path(args.config) if args.config else None)

    # Resolve corpus-specific settings (--corpus) vs legacy single-store path.
    # Explicit --corpus routes through config.corpora[name]; without it we keep
    # the pre-multi-corpus behavior for backward compat with `make evaluate-fast`.
    if args.corpus:
        if args.corpus not in config.corpora:
            print(
                f"Error: corpus '{args.corpus}' not in config.corpora "
                f"(available: {sorted(config.corpora.keys())})",
                file=sys.stderr,
            )
            sys.exit(1)
        corpus_cfg = config.corpora[args.corpus]
        if not corpus_cfg.available:
            print(
                f"Error: corpus '{args.corpus}' has available=false. "
                "Flip it to true once the data is curated and the store is built.",
                file=sys.stderr,
            )
            sys.exit(1)
        if corpus_cfg.golden_dataset is None:
            print(
                f"Error: corpus '{args.corpus}' has no golden_dataset configured. "
                f"Set corpora.{args.corpus}.golden_dataset in the config.",
                file=sys.stderr,
            )
            sys.exit(1)
        store_path = corpus_cfg.store_path
        refusal_threshold = corpus_cfg.refusal_threshold
        golden_path: str = corpus_cfg.golden_dataset
        system_prompt = format_system_prompt(corpus_cfg.label)
        corpus_label = corpus_cfg.label
    else:
        task = load_task_config("tech_docs")
        store_path = config.rag.store_path
        refusal_threshold = config.rag.refusal_threshold
        golden_path = config.evaluation.golden_dataset
        system_prompt = task.system_prompt
        corpus_label = "(legacy single-store)"

    # Build the RAG pipeline
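    # HybridStore merges dense and sparse rankings; rrf_k is presumably the
    # reciprocal-rank-fusion constant used when fusing the two result lists.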
    store = HybridStore.load(store_path, rrf_k=config.rag.retrieval.rrf_k)
    embedder = Embedder(model_name=config.embedding.model, cache_dir=config.embedding.cache_dir)
    # Optional reranker
    reranker = None
    if config.rag.reranker.enabled:
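        # Imported lazily so the cross-encoder dependency is only loaded
        # when reranking is enabled.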
        from agent_bench.rag.reranker import CrossEncoderReranker

        reranker = CrossEncoderReranker(model_name=config.rag.reranker.model_name)

    retriever = Retriever(
        embedder=embedder,
        store=store,
        default_strategy=config.rag.retrieval.strategy,
        candidates_per_system=config.rag.retrieval.candidates_per_system,
        reranker=reranker,
        reranker_top_k=config.rag.reranker.top_k,
    )

    # Build tools + orchestrator
    registry = ToolRegistry()
    registry.register(
        SearchTool(
            retriever=retriever,
            default_top_k=config.rag.retrieval.top_k,
            refusal_threshold=refusal_threshold,
        )
    )
    registry.register(CalculatorTool())

    provider = create_provider(config)
    orchestrator = Orchestrator(
        provider=provider,
        registry=registry,
        max_iterations=config.agent.max_iterations,
        temperature=config.agent.temperature,
    )

    # Judge provider for full mode: uses the configured judge_provider.
    judge = None
    if args.mode == "full":
        from agent_bench.core.config import AppConfig, ProviderConfig

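        # Build a minimal AppConfig so create_provider() returns the judge
        # model while reusing the main provider's model table.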
        judge_config = AppConfig(
            provider=ProviderConfig(
                default=config.evaluation.judge_provider,
                models=config.provider.models,
            )
        )
        judge = create_provider(judge_config)

    # Run evaluation
    print(f"Running evaluation in '{args.mode}' mode...")
    print(f"Corpus: {corpus_label}")
    print(f"Golden dataset: {golden_path}")
    print(f"Store: {store.stats().total_chunks} chunks")
    print()

    results = await run_evaluation(
        orchestrator=orchestrator,
        system_prompt=system_prompt,
        golden_path=golden_path,
        judge_provider=judge,
    )

    # Save results as JSON
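    # model_dump() yields plain dicts; default=str covers any remaining
    # non-JSON-serializable values (e.g. datetimes or Paths).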
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_data = [r.model_dump() for r in results]
    output_path.write_text(json.dumps(results_data, indent=2, default=str))
    print(f"Results saved to {output_path}")

    # Print summary
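    # Retrieval metrics only make sense for in-scope questions (out_of_scope
    # items presumably have no gold chunks to retrieve), so the averages skip
    # that category; max(len(positive), 1) guards against division by zero.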
    positive = [r for r in results if r.category != "out_of_scope"]
    avg_p5 = sum(r.retrieval_precision for r in positive) / max(len(positive), 1)
    avg_r5 = sum(r.retrieval_recall for r in positive) / max(len(positive), 1)
    avg_khr = sum(r.keyword_hit_rate for r in positive) / max(len(positive), 1)
    print(f"\nSummary ({len(results)} questions):")
    print(f"  Avg P@5:  {avg_p5:.2f}")
    print(f"  Avg R@5:  {avg_r5:.2f}")
    print(f"  Avg KHR:  {avg_khr:.2f}")


def main() -> None:
    parser = argparse.ArgumentParser(description="Run evaluation harness")
    parser.add_argument("--config", default=None, help="Config YAML path")
    parser.add_argument(
        "--corpus",
        default=None,
        help="Corpus name from config.corpora (e.g. 'fastapi', 'k8s'). "
        "If omitted, uses legacy rag.store_path + evaluation.golden_dataset.",
    )
    parser.add_argument(
        "--mode",
        choices=["deterministic", "full"],
        default="deterministic",
        help="'full' adds LLM-judge scoring; 'deterministic' skips the judge.",
    )
    parser.add_argument("--output", default=".cache/eval_results.json")
    args = parser.parse_args()
    asyncio.run(main_async(args))


if __name__ == "__main__":
    main()