"""Run the evaluation harness.
Usage:
python scripts/evaluate.py --mode deterministic
python scripts/evaluate.py --mode full
"""
from __future__ import annotations
import argparse
import asyncio
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from agent_bench.agents.orchestrator import Orchestrator
from agent_bench.core.config import load_config, load_task_config
from agent_bench.core.prompts import format_system_prompt
from agent_bench.core.provider import MockProvider, create_provider
from agent_bench.evaluation.harness import run_evaluation
from agent_bench.rag.embedder import Embedder
from agent_bench.rag.retriever import Retriever
from agent_bench.rag.store import HybridStore
from agent_bench.tools.calculator import CalculatorTool
from agent_bench.tools.registry import ToolRegistry
from agent_bench.tools.search import SearchTool
def _fail(message: str) -> None:
    """Print *message* to stderr and terminate with exit status 1.

    Never returns: ``sys.exit`` raises ``SystemExit``. Errors go to stderr so
    that redirecting stdout (e.g. to capture results) still shows them.
    """
    print(message, file=sys.stderr)
    sys.exit(1)


async def main_async(args: argparse.Namespace) -> None:
    """Build the RAG pipeline and agent from config, then run the evaluation.

    Steps:
      1. Resolve corpus settings (``--corpus`` vs. legacy single-store path).
      2. Assemble store, embedder, optional reranker, retriever, tools,
         provider and orchestrator.
      3. Run the harness, write JSON results to ``--output`` and print a
         summary of the deterministic retrieval metrics.

    Args:
        args: Parsed CLI namespace with ``config``, ``corpus``, ``mode``
            and ``output`` attributes.

    Raises:
        SystemExit: On invalid/unavailable corpus configuration (status 1).
    """
    config = load_config(Path(args.config) if args.config else None)

    # Resolve corpus-specific settings (--corpus) vs legacy single-store path.
    # Explicit --corpus routes through config.corpora[name]; without it we keep
    # the pre-multi-corpus behavior for backward compat with `make evaluate-fast`.
    if args.corpus:
        if args.corpus not in config.corpora:
            _fail(
                f"Error: corpus '{args.corpus}' not in config.corpora "
                f"(available: {sorted(config.corpora.keys())})"
            )
        corpus_cfg = config.corpora[args.corpus]
        if not corpus_cfg.available:
            _fail(
                f"Error: corpus '{args.corpus}' has available=false. "
                "Flip to true after data is curated and store is built."
            )
        if corpus_cfg.golden_dataset is None:
            _fail(
                f"Error: corpus '{args.corpus}' has no golden_dataset configured. "
                f"Set corpora.{args.corpus}.golden_dataset in the config."
            )
        store_path = corpus_cfg.store_path
        refusal_threshold = corpus_cfg.refusal_threshold
        golden_path: str = corpus_cfg.golden_dataset
        system_prompt = format_system_prompt(corpus_cfg.label)
        corpus_label = corpus_cfg.label
    else:
        # Legacy path: fixed "tech_docs" task against the single global store.
        task = load_task_config("tech_docs")
        store_path = config.rag.store_path
        refusal_threshold = config.rag.refusal_threshold
        golden_path = config.evaluation.golden_dataset
        system_prompt = task.system_prompt
        corpus_label = "(legacy single-store)"

    # Build the RAG pipeline.
    store = HybridStore.load(store_path, rrf_k=config.rag.retrieval.rrf_k)
    embedder = Embedder(model_name=config.embedding.model, cache_dir=config.embedding.cache_dir)

    # Optional cross-encoder reranker — imported lazily so the (heavy) model
    # dependency is only paid when enabled in config.
    reranker = None
    if config.rag.reranker.enabled:
        from agent_bench.rag.reranker import CrossEncoderReranker
        reranker = CrossEncoderReranker(model_name=config.rag.reranker.model_name)
    retriever = Retriever(
        embedder=embedder,
        store=store,
        default_strategy=config.rag.retrieval.strategy,
        candidates_per_system=config.rag.retrieval.candidates_per_system,
        reranker=reranker,
        reranker_top_k=config.rag.reranker.top_k,
    )

    # Build tools + orchestrator.
    registry = ToolRegistry()
    registry.register(
        SearchTool(
            retriever=retriever,
            default_top_k=config.rag.retrieval.top_k,
            refusal_threshold=refusal_threshold,
        )
    )
    registry.register(CalculatorTool())
    provider = create_provider(config)
    orchestrator = Orchestrator(
        provider=provider,
        registry=registry,
        max_iterations=config.agent.max_iterations,
        temperature=config.agent.temperature,
    )

    # Judge provider for full mode — uses configured judge_provider while
    # reusing the main provider's model table.
    judge = None
    if args.mode == "full":
        from agent_bench.core.config import AppConfig, ProviderConfig
        judge_config = AppConfig(
            provider=ProviderConfig(
                default=config.evaluation.judge_provider,
                models=config.provider.models,
            )
        )
        judge = create_provider(judge_config)

    # Run evaluation.
    print(f"Running evaluation in '{args.mode}' mode...")
    print(f"Corpus: {corpus_label}")
    print(f"Golden dataset: {golden_path}")
    print(f"Store: {store.stats().total_chunks} chunks")
    print()
    results = await run_evaluation(
        orchestrator=orchestrator,
        system_prompt=system_prompt,
        golden_path=golden_path,
        judge_provider=judge,
    )

    # Save results as JSON. default=str covers non-JSON-native values
    # (e.g. datetimes) emitted by the result models.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_data = [r.model_dump() for r in results]
    output_path.write_text(json.dumps(results_data, indent=2, default=str))
    print(f"Results saved to {output_path}")

    # Print summary of deterministic retrieval metrics over in-scope questions
    # only; max(len, 1) guards against division by zero on an empty set.
    positive = [r for r in results if r.category != "out_of_scope"]
    avg_p5 = sum(r.retrieval_precision for r in positive) / max(len(positive), 1)
    avg_r5 = sum(r.retrieval_recall for r in positive) / max(len(positive), 1)
    avg_khr = sum(r.keyword_hit_rate for r in positive) / max(len(positive), 1)
    print(f"\nSummary ({len(results)} questions):")
    print(f"  Avg P@5: {avg_p5:.2f}")
    print(f"  Avg R@5: {avg_r5:.2f}")
    print(f"  Avg KHR: {avg_khr:.2f}")
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the evaluation script."""
    parser = argparse.ArgumentParser(description="Run evaluation harness")
    parser.add_argument("--config", default=None, help="Config YAML path")
    parser.add_argument(
        "--corpus",
        default=None,
        help=(
            "Corpus name from config.corpora (e.g. 'fastapi', 'k8s'). "
            "If omitted, uses legacy rag.store_path + evaluation.golden_dataset."
        ),
    )
    parser.add_argument(
        "--mode",
        choices=["deterministic", "full"],
        default="deterministic",
    )
    parser.add_argument("--output", default=".cache/eval_results.json")
    return parser


def main() -> None:
    """Synchronous entry point: parse CLI args and drive the async harness."""
    cli_args = _build_parser().parse_args()
    asyncio.run(main_async(cli_args))


if __name__ == "__main__":
    main()