"""Run LangChain baseline evaluation against the golden dataset.

Usage:
    python scripts/run_langchain_eval.py --provider openai
    python scripts/run_langchain_eval.py --provider anthropic
    python scripts/run_langchain_eval.py --provider openai --max-questions 3
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
from pathlib import Path

# Make the repo root importable so `agent_bench` resolves when run as a plain script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))


async def main_async(args: argparse.Namespace) -> None:
    # Heavy imports deferred so --help works without loading torch/OpenMP/etc.
    from agent_bench.core.config import load_config, load_task_config
    from agent_bench.evaluation.report import generate_report, save_report
    from agent_bench.langchain_baseline.agent import create_langchain_agent
    from agent_bench.langchain_baseline.retriever import AgentBenchRetriever
    from agent_bench.langchain_baseline.runner import run_langchain_evaluation
    from agent_bench.langchain_baseline.tools import LangChainSearchTool, create_calculator_tool
    from agent_bench.rag.embedder import Embedder
    from agent_bench.rag.retriever import Retriever
    from agent_bench.rag.store import HybridStore

    config = load_config(Path(args.config) if args.config else None)
    task = load_task_config("tech_docs")

    # Build existing RAG pipeline (same as scripts/evaluate.py)
    store = HybridStore.load(config.rag.store_path, rrf_k=config.rag.retrieval.rrf_k)
    embedder = Embedder(model_name=config.embedding.model, cache_dir=config.embedding.cache_dir)

    reranker = None
    if config.rag.reranker.enabled:
        from agent_bench.rag.reranker import CrossEncoderReranker

        reranker = CrossEncoderReranker(model_name=config.rag.reranker.model_name)

    retriever = Retriever(
        embedder=embedder,
        store=store,
        default_strategy=config.rag.retrieval.strategy,
        candidates_per_system=config.rag.retrieval.candidates_per_system,
        reranker=reranker,
        reranker_top_k=config.rag.reranker.top_k,
    )

    # Wrap in LangChain components
    lc_retriever = AgentBenchRetriever(retriever=retriever, top_k=config.rag.retrieval.top_k)
    search_tool = LangChainSearchTool(lc_retriever)
    calc_tool = create_calculator_tool()

    agent_executor = create_langchain_agent(
        tools=[search_tool.as_tool(), calc_tool],
        provider=args.provider,
        system_prompt=task.system_prompt,
        max_iterations=config.agent.max_iterations,
    )

    # Resolve model name and pricing for token cost tracking
    model_defaults = {"openai": "gpt-4o-mini", "anthropic": "claude-haiku-4-5-20251001"}
    model_name = model_defaults[args.provider]
    pricing = config.provider.models.get(model_name)
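    # Fall back to zero cost if the model has no pricing entry in the config.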
    input_cost = pricing.input_cost_per_mtok if pricing else 0.0
    output_cost = pricing.output_cost_per_mtok if pricing else 0.0

    # Run evaluation
    golden_path = config.evaluation.golden_dataset
    print("Running LangChain baseline evaluation...")
    print(f"  Provider:  {args.provider}")
    print(f"  Model:     {model_name}")
    print(f"  Store:     {store.stats().total_chunks} chunks")
    print(f"  Golden:    {golden_path}")
    if args.max_questions:
        print(f"  Limit:     {args.max_questions} questions")
    print()

    results = await run_langchain_evaluation(
        agent_executor=agent_executor,
        search_tool_state=search_tool,
        golden_path=golden_path,
        provider_name=args.provider,
        max_questions=args.max_questions,
        input_cost_per_mtok=input_cost,
        output_cost_per_mtok=output_cost,
    )

    # Save raw results JSON
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_data = [r.model_dump() for r in results]
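    # default=str stringifies datetimes and other non-JSON-serializable fields.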
    output_path.write_text(json.dumps(results_data, indent=2, default=str))
    print(f"Results JSON: {output_path}")

    # Generate markdown report (reuses existing report generator)
    report = generate_report(
        results,
        provider_name=f"langchain-{args.provider}",
        corpus_size=store.stats().unique_sources,
    )
    report_path = Path(f"docs/langchain_benchmark_{args.provider}.md")
    save_report(report, report_path)
    print(f"Report:      {report_path}")

    # Print summary
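    # Retrieval metrics are averaged over in-scope questions only.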
    positive = [r for r in results if r.category != "out_of_scope"]
    errors = [r for r in results if r.answer.startswith("ERROR")]
    avg_p5 = sum(r.retrieval_precision for r in positive) / max(len(positive), 1)
    avg_r5 = sum(r.retrieval_recall for r in positive) / max(len(positive), 1)
    avg_khr = sum(r.keyword_hit_rate for r in positive) / max(len(positive), 1)
    avg_lat = sum(r.latency_ms for r in results) / max(len(results), 1)

    print(f"\nSummary ({len(results)} questions, {len(errors)} errors):")
    print(f"  Avg P@5:     {avg_p5:.2f}")
    print(f"  Avg R@5:     {avg_r5:.2f}")
    print(f"  Avg KHR:     {avg_khr:.2f}")
    print(f"  Avg latency: {avg_lat:,.0f} ms")


def main() -> None:
    parser = argparse.ArgumentParser(description="Run LangChain baseline evaluation")
    parser.add_argument(
        "--provider",
        choices=["openai", "anthropic"],
        default="openai",
    )
    parser.add_argument("--config", default=None, help="Config YAML path")
    parser.add_argument("--output", default=".cache/langchain_eval_results.json")
    parser.add_argument(
        "--max-questions",
        type=int,
        default=None,
        help="Limit number of questions (for testing)",
    )
    args = parser.parse_args()
    asyncio.run(main_async(args))


if __name__ == "__main__":
    main()