| """Run the evaluation harness. | |
| Usage: | |
| python scripts/evaluate.py --mode deterministic | |
| python scripts/evaluate.py --mode full | |
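    python scripts/evaluate.py --corpus fastapi --mode full

('fastapi' above is just an example; corpus names come from config.corpora.)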
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import asyncio | |
| import json | |
| import sys | |
| from pathlib import Path | |
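
# Make the repo root importable so `agent_bench` resolves when this file is run directly.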
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent_bench.agents.orchestrator import Orchestrator
from agent_bench.core.config import load_config, load_task_config
from agent_bench.core.prompts import format_system_prompt
from agent_bench.core.provider import create_provider
from agent_bench.evaluation.harness import run_evaluation
from agent_bench.rag.embedder import Embedder
from agent_bench.rag.retriever import Retriever
from agent_bench.rag.store import HybridStore
from agent_bench.tools.calculator import CalculatorTool
from agent_bench.tools.registry import ToolRegistry
from agent_bench.tools.search import SearchTool


async def main_async(args: argparse.Namespace) -> None:
    config = load_config(Path(args.config) if args.config else None)

    # Resolve corpus-specific settings (--corpus) vs legacy single-store path.
    # Explicit --corpus routes through config.corpora[name]; without it we keep
    # the pre-multi-corpus behavior for backward compat with `make evaluate-fast`.
    if args.corpus:
        if args.corpus not in config.corpora:
            print(
                f"Error: corpus '{args.corpus}' not in config.corpora "
                f"(available: {sorted(config.corpora.keys())})"
            )
            sys.exit(1)
        corpus_cfg = config.corpora[args.corpus]
        if not corpus_cfg.available:
            print(
                f"Error: corpus '{args.corpus}' has available=false. "
                "Flip to true after data is curated and store is built."
            )
            sys.exit(1)
        if corpus_cfg.golden_dataset is None:
            print(
                f"Error: corpus '{args.corpus}' has no golden_dataset configured. "
                f"Set corpora.{args.corpus}.golden_dataset in the config."
            )
            sys.exit(1)
        store_path = corpus_cfg.store_path
        refusal_threshold = corpus_cfg.refusal_threshold
        golden_path: str = corpus_cfg.golden_dataset
        system_prompt = format_system_prompt(corpus_cfg.label)
        corpus_label = corpus_cfg.label
    else:
        task = load_task_config("tech_docs")
        store_path = config.rag.store_path
        refusal_threshold = config.rag.refusal_threshold
        golden_path = config.evaluation.golden_dataset
        system_prompt = task.system_prompt
        corpus_label = "(legacy single-store)"

    # Build the RAG pipeline
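    # rrf_k: constant for reciprocal rank fusion when the hybrid store merges
    # its lexical and vector rankings (inferred from the parameter name).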
    store = HybridStore.load(store_path, rrf_k=config.rag.retrieval.rrf_k)
    embedder = Embedder(model_name=config.embedding.model, cache_dir=config.embedding.cache_dir)

    # Optional reranker
    reranker = None
    if config.rag.reranker.enabled:
        from agent_bench.rag.reranker import CrossEncoderReranker

        reranker = CrossEncoderReranker(model_name=config.rag.reranker.model_name)

    retriever = Retriever(
        embedder=embedder,
        store=store,
        default_strategy=config.rag.retrieval.strategy,
        candidates_per_system=config.rag.retrieval.candidates_per_system,
        reranker=reranker,
        reranker_top_k=config.rag.reranker.top_k,
    )

    # Build tools + orchestrator
    registry = ToolRegistry()
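    # refusal_threshold: minimum retrieval confidence below which the search
    # tool declines to answer (inferred from the name and the out_of_scope
    # handling in the summary below).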
    registry.register(
        SearchTool(
            retriever=retriever,
            default_top_k=config.rag.retrieval.top_k,
            refusal_threshold=refusal_threshold,
        )
    )
    registry.register(CalculatorTool())

    provider = create_provider(config)
    orchestrator = Orchestrator(
        provider=provider,
        registry=registry,
        max_iterations=config.agent.max_iterations,
        temperature=config.agent.temperature,
    )

    # Judge provider for full mode: uses the configured judge_provider.
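    # Built only in full mode; deterministic runs pass judge_provider=None to run_evaluation.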
    judge = None
    if args.mode == "full":
        from agent_bench.core.config import AppConfig, ProviderConfig

        judge_config = AppConfig(
            provider=ProviderConfig(
                default=config.evaluation.judge_provider,
                models=config.provider.models,
            )
        )
        judge = create_provider(judge_config)

    # Run evaluation
    print(f"Running evaluation in '{args.mode}' mode...")
    print(f"Corpus: {corpus_label}")
    print(f"Golden dataset: {golden_path}")
    print(f"Store: {store.stats().total_chunks} chunks")
    print()

    results = await run_evaluation(
        orchestrator=orchestrator,
        system_prompt=system_prompt,
        golden_path=golden_path,
        judge_provider=judge,
    )

    # Save results as JSON
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_data = [r.model_dump() for r in results]
    output_path.write_text(json.dumps(results_data, indent=2, default=str))
    print(f"Results saved to {output_path}")

    # Print summary
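    # Out-of-scope questions exercise refusal behavior and have no relevant
    # chunks to retrieve, so they are excluded from the retrieval averages.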
    positive = [r for r in results if r.category != "out_of_scope"]
    avg_p5 = sum(r.retrieval_precision for r in positive) / max(len(positive), 1)
    avg_r5 = sum(r.retrieval_recall for r in positive) / max(len(positive), 1)
    avg_khr = sum(r.keyword_hit_rate for r in positive) / max(len(positive), 1)
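    # P@5 / R@5: retrieval precision/recall over the top five chunks; KHR: keyword hit rate.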
| print(f"\nSummary ({len(results)} questions):") | |
| print(f" Avg P@5: {avg_p5:.2f}") | |
| print(f" Avg R@5: {avg_r5:.2f}") | |
| print(f" Avg KHR: {avg_khr:.2f}") | |
| def main() -> None: | |
| parser = argparse.ArgumentParser(description="Run evaluation harness") | |
| parser.add_argument("--config", default=None, help="Config YAML path") | |
| parser.add_argument( | |
| "--corpus", | |
| default=None, | |
| help="Corpus name from config.corpora (e.g. 'fastapi', 'k8s'). " | |
| "If omitted, uses legacy rag.store_path + evaluation.golden_dataset.", | |
| ) | |
| parser.add_argument( | |
| "--mode", | |
| choices=["deterministic", "full"], | |
| default="deterministic", | |
| ) | |
| parser.add_argument("--output", default=".cache/eval_results.json") | |
| args = parser.parse_args() | |
| asyncio.run(main_async(args)) | |
| if __name__ == "__main__": | |
| main() | |