"""Run LangChain baseline evaluation against the golden dataset. Usage: python scripts/run_langchain_eval.py --provider openai python scripts/run_langchain_eval.py --provider anthropic python scripts/run_langchain_eval.py --provider openai --max-questions 3 """ from __future__ import annotations import argparse import asyncio import json import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) async def main_async(args: argparse.Namespace) -> None: # Heavy imports deferred so --help works without loading torch/OpenMP/etc. from agent_bench.core.config import load_config, load_task_config from agent_bench.evaluation.report import generate_report, save_report from agent_bench.langchain_baseline.agent import create_langchain_agent from agent_bench.langchain_baseline.retriever import AgentBenchRetriever from agent_bench.langchain_baseline.runner import run_langchain_evaluation from agent_bench.langchain_baseline.tools import LangChainSearchTool, create_calculator_tool from agent_bench.rag.embedder import Embedder from agent_bench.rag.retriever import Retriever from agent_bench.rag.store import HybridStore config = load_config(Path(args.config) if args.config else None) task = load_task_config("tech_docs") # Build existing RAG pipeline (same as scripts/evaluate.py) store = HybridStore.load(config.rag.store_path, rrf_k=config.rag.retrieval.rrf_k) embedder = Embedder(model_name=config.embedding.model, cache_dir=config.embedding.cache_dir) reranker = None if config.rag.reranker.enabled: from agent_bench.rag.reranker import CrossEncoderReranker reranker = CrossEncoderReranker(model_name=config.rag.reranker.model_name) retriever = Retriever( embedder=embedder, store=store, default_strategy=config.rag.retrieval.strategy, candidates_per_system=config.rag.retrieval.candidates_per_system, reranker=reranker, reranker_top_k=config.rag.reranker.top_k, ) # Wrap in LangChain components lc_retriever = AgentBenchRetriever(retriever=retriever, top_k=config.rag.retrieval.top_k) search_tool = LangChainSearchTool(lc_retriever) calc_tool = create_calculator_tool() agent_executor = create_langchain_agent( tools=[search_tool.as_tool(), calc_tool], provider=args.provider, system_prompt=task.system_prompt, max_iterations=config.agent.max_iterations, ) # Resolve model name and pricing for token cost tracking model_defaults = {"openai": "gpt-4o-mini", "anthropic": "claude-haiku-4-5-20251001"} model_name = model_defaults[args.provider] pricing = config.provider.models.get(model_name) input_cost = pricing.input_cost_per_mtok if pricing else 0.0 output_cost = pricing.output_cost_per_mtok if pricing else 0.0 # Run evaluation golden_path = config.evaluation.golden_dataset print("Running LangChain baseline evaluation...") print(f" Provider: {args.provider}") print(f" Model: {model_name}") print(f" Store: {store.stats().total_chunks} chunks") print(f" Golden: {golden_path}") if args.max_questions: print(f" Limit: {args.max_questions} questions") print() results = await run_langchain_evaluation( agent_executor=agent_executor, search_tool_state=search_tool, golden_path=golden_path, provider_name=args.provider, max_questions=args.max_questions, input_cost_per_mtok=input_cost, output_cost_per_mtok=output_cost, ) # Save raw results JSON output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) results_data = [r.model_dump() for r in results] output_path.write_text(json.dumps(results_data, indent=2, default=str)) print(f"Results JSON: {output_path}") # Generate 
markdown report (reuses existing report generator) report = generate_report( results, provider_name=f"langchain-{args.provider}", corpus_size=store.stats().unique_sources, ) report_path = Path(f"docs/langchain_benchmark_{args.provider}.md") save_report(report, report_path) print(f"Report: {report_path}") # Print summary positive = [r for r in results if r.category != "out_of_scope"] errors = [r for r in results if r.answer.startswith("ERROR")] avg_p5 = sum(r.retrieval_precision for r in positive) / max(len(positive), 1) avg_r5 = sum(r.retrieval_recall for r in positive) / max(len(positive), 1) avg_khr = sum(r.keyword_hit_rate for r in positive) / max(len(positive), 1) avg_lat = sum(r.latency_ms for r in results) / max(len(results), 1) print(f"\nSummary ({len(results)} questions, {len(errors)} errors):") print(f" Avg P@5: {avg_p5:.2f}") print(f" Avg R@5: {avg_r5:.2f}") print(f" Avg KHR: {avg_khr:.2f}") print(f" Avg latency: {avg_lat:,.0f} ms") def main() -> None: parser = argparse.ArgumentParser(description="Run LangChain baseline evaluation") parser.add_argument( "--provider", choices=["openai", "anthropic"], default="openai", ) parser.add_argument("--config", default=None, help="Config YAML path") parser.add_argument("--output", default=".cache/langchain_eval_results.json") parser.add_argument( "--max-questions", type=int, default=None, help="Limit number of questions (for testing)", ) args = parser.parse_args() asyncio.run(main_async(args)) if __name__ == "__main__": main()