"""Run LangChain baseline evaluation against the golden dataset.

Usage:
    python scripts/run_langchain_eval.py --provider openai
    python scripts/run_langchain_eval.py --provider anthropic
    python scripts/run_langchain_eval.py --provider openai --max-questions 3
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
from pathlib import Path
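
# Make the repo root importable so `agent_bench` resolves when run as a script.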
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))


async def main_async(args: argparse.Namespace) -> None:
    # Heavy imports deferred so --help works without loading torch/OpenMP/etc.
    from agent_bench.core.config import load_config, load_task_config
    from agent_bench.evaluation.report import generate_report, save_report
    from agent_bench.langchain_baseline.agent import create_langchain_agent
    from agent_bench.langchain_baseline.retriever import AgentBenchRetriever
    from agent_bench.langchain_baseline.runner import run_langchain_evaluation
    from agent_bench.langchain_baseline.tools import LangChainSearchTool, create_calculator_tool
    from agent_bench.rag.embedder import Embedder
    from agent_bench.rag.retriever import Retriever
    from agent_bench.rag.store import HybridStore

    config = load_config(Path(args.config) if args.config else None)
    task = load_task_config("tech_docs")

    # Build existing RAG pipeline (same as scripts/evaluate.py)
    store = HybridStore.load(config.rag.store_path, rrf_k=config.rag.retrieval.rrf_k)
    embedder = Embedder(model_name=config.embedding.model, cache_dir=config.embedding.cache_dir)
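
    # Reranking is optional; the cross-encoder import stays lazy like the heavy imports above.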
    reranker = None
    if config.rag.reranker.enabled:
        from agent_bench.rag.reranker import CrossEncoderReranker

        reranker = CrossEncoderReranker(model_name=config.rag.reranker.model_name)

    retriever = Retriever(
        embedder=embedder,
        store=store,
        default_strategy=config.rag.retrieval.strategy,
        candidates_per_system=config.rag.retrieval.candidates_per_system,
        reranker=reranker,
        reranker_top_k=config.rag.reranker.top_k,
    )

    # Wrap in LangChain components
    lc_retriever = AgentBenchRetriever(retriever=retriever, top_k=config.rag.retrieval.top_k)
    search_tool = LangChainSearchTool(lc_retriever)
    calc_tool = create_calculator_tool()
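
    # max_iterations bounds the agent's tool-calling loop.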
    agent_executor = create_langchain_agent(
        tools=[search_tool.as_tool(), calc_tool],
        provider=args.provider,
        system_prompt=task.system_prompt,
        max_iterations=config.agent.max_iterations,
    )

    # Resolve model name and pricing for token cost tracking
    model_defaults = {"openai": "gpt-4o-mini", "anthropic": "claude-haiku-4-5-20251001"}
    model_name = model_defaults[args.provider]
    pricing = config.provider.models.get(model_name)
    input_cost = pricing.input_cost_per_mtok if pricing else 0.0
    output_cost = pricing.output_cost_per_mtok if pricing else 0.0
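    # Models missing from the config pricing table fall back to zero cost rather than failing.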

    # Run evaluation
    golden_path = config.evaluation.golden_dataset
    print("Running LangChain baseline evaluation...")
    print(f" Provider: {args.provider}")
    print(f" Model: {model_name}")
    print(f" Store: {store.stats().total_chunks} chunks")
    print(f" Golden: {golden_path}")
    if args.max_questions:
        print(f" Limit: {args.max_questions} questions")
    print()

    results = await run_langchain_evaluation(
        agent_executor=agent_executor,
        search_tool_state=search_tool,
        golden_path=golden_path,
        provider_name=args.provider,
        max_questions=args.max_questions,
        input_cost_per_mtok=input_cost,
        output_cost_per_mtok=output_cost,
    )

    # Save raw results JSON
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_data = [r.model_dump() for r in results]
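    # default=str lets json.dumps handle non-native types (e.g. Path, datetime) in the dump.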
    output_path.write_text(json.dumps(results_data, indent=2, default=str))
    print(f"Results JSON: {output_path}")

    # Generate markdown report (reuses existing report generator)
    report = generate_report(
        results,
        provider_name=f"langchain-{args.provider}",
        corpus_size=store.stats().unique_sources,
    )
    report_path = Path(f"docs/langchain_benchmark_{args.provider}.md")
    save_report(report, report_path)
    print(f"Report: {report_path}")

    # Print summary
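    # Retrieval metrics are averaged over in-scope questions only; latency covers all questions.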
    positive = [r for r in results if r.category != "out_of_scope"]
    errors = [r for r in results if r.answer.startswith("ERROR")]
    avg_p5 = sum(r.retrieval_precision for r in positive) / max(len(positive), 1)
    avg_r5 = sum(r.retrieval_recall for r in positive) / max(len(positive), 1)
    avg_khr = sum(r.keyword_hit_rate for r in positive) / max(len(positive), 1)
    avg_lat = sum(r.latency_ms for r in results) / max(len(results), 1)

    print(f"\nSummary ({len(results)} questions, {len(errors)} errors):")
    print(f" Avg P@5: {avg_p5:.2f}")
    print(f" Avg R@5: {avg_r5:.2f}")
    print(f" Avg KHR: {avg_khr:.2f}")
    print(f" Avg latency: {avg_lat:,.0f} ms")


def main() -> None:
    parser = argparse.ArgumentParser(description="Run LangChain baseline evaluation")
    parser.add_argument(
        "--provider",
        choices=["openai", "anthropic"],
        default="openai",
    )
parser.add_argument("--config", default=None, help="Config YAML path")
parser.add_argument("--output", default=".cache/langchain_eval_results.json")
parser.add_argument(
"--max-questions",
type=int,
default=None,
help="Limit number of questions (for testing)",
)
args = parser.parse_args()
asyncio.run(main_async(args))


if __name__ == "__main__":
    main()