Nomearod Claude Opus 4.6 (1M context) committed on
Commit
ef5d585
·
1 Parent(s): a69ad63

feat: Day 1 — repo scaffolding, provider abstraction, config, tests

Browse files

- pyproject.toml with setuptools.build_meta (fixed from spec)
- Makefile + GitHub Actions CI
- Core types: Message, ToolCall, TokenUsage, CompletionResponse
- Config: Pydantic models + YAML loading (Path.cwd based)
- OpenAIProvider (full), MockProvider (deterministic), AnthropicProvider (stub)
- format_tools_openai/format_messages_openai as pure functions (testable without API key)
- 19 tests, all deterministic, lint + mypy clean

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

.github/workflows/ci.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI
2
+ on: [push, pull_request]
3
+ jobs:
4
+ test:
5
+ runs-on: ubuntu-latest
6
+ steps:
7
+ - uses: actions/checkout@v4
8
+ - uses: actions/setup-python@v5
9
+ with:
10
+ python-version: "3.11"
11
+ - run: pip install -e ".[dev]"
12
+ - run: make lint
13
+ - run: make test
.gitignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .eggs/
7
+ *.egg
8
+ .cache/
9
+ .mypy_cache/
10
+ .pytest_cache/
11
+ .ruff_cache/
12
+ *.faiss
13
+ *.pkl
14
+ .env
15
+ .venv/
16
+ venv/
Makefile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: install test lint serve ingest evaluate-fast evaluate-full benchmark docker
2
+
3
+ install:
4
+ pip install -e ".[dev]"
5
+
6
+ test:
7
+ pytest tests/ -v --tb=short
8
+
9
+ lint:
10
+ ruff check agent_bench/ tests/
11
+ ruff format --check agent_bench/ tests/
12
+ mypy agent_bench/ --ignore-missing-imports
13
+
14
+ serve:
15
+ uvicorn agent_bench.serving.app:create_app --factory --reload --port 8000
16
+
17
+ ingest:
18
+ python scripts/ingest.py --config configs/tasks/tech_docs.yaml
19
+
20
+ evaluate-fast:
21
+ python scripts/evaluate.py --config configs/default.yaml --mode deterministic
22
+
23
+ evaluate-full:
24
+ python scripts/evaluate.py --config configs/default.yaml --mode full
25
+
26
+ benchmark:
27
+ python scripts/benchmark.py --output docs/benchmark_report.md
28
+
29
+ docker:
30
+ docker-compose -f docker/docker-compose.yaml up --build
agent_bench/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Evaluation-first agentic RAG system built from API primitives."""
agent_bench/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Core types, configuration, and provider abstraction."""
agent_bench/core/config.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Configuration loading from YAML files via Pydantic models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import yaml
9
+ from pydantic import BaseModel
10
+
11
+ # --- Nested config models ---
12
+
13
+
14
class AgentConfig(BaseModel):
    """Agent settings; every field has a default, so the section may be omitted."""

    # presumably an upper bound on agent loop iterations — confirm against the agent loop
    max_iterations: int = 3
    temperature: float = 0.0
17
+
18
+
19
class ModelPricing(BaseModel):
    """Token prices for one model; both fields are required (no defaults)."""

    # "mtok" = per million tokens: OpenAIProvider divides the weighted token sum by
    # 1_000_000 to obtain estimated_cost_usd.
    input_cost_per_mtok: float
    output_cost_per_mtok: float
22
+
23
+
24
class ProviderConfig(BaseModel):
    """Provider selection plus a pricing table keyed by model name."""

    # Matched by create_provider against "openai", "anthropic" and "mock".
    default: str = "openai"
    # Model name -> pricing; OpenAIProvider looks up "gpt-4o-mini" here.
    models: dict[str, ModelPricing] = {}
27
+
28
+
29
class ChunkingConfig(BaseModel):
    """Document chunking settings."""

    strategy: str = "recursive"
    # NOTE(review): units of chunk_size / chunk_overlap (characters vs. tokens) are not
    # visible in this module — confirm against the chunker implementation.
    chunk_size: int = 512
    chunk_overlap: int = 64
33
+
34
+
35
class RetrievalConfig(BaseModel):
    """Retrieval settings."""

    strategy: str = "hybrid"
    # presumably the constant used in reciprocal rank fusion — verify against the retriever
    rrf_k: int = 60
    candidates_per_system: int = 10
    top_k: int = 5
40
+
41
+
42
class RerankerConfig(BaseModel):
    """Reranker toggle; disabled by default."""

    enabled: bool = False
44
+
45
+
46
class RAGConfig(BaseModel):
    """Groups the chunking, retrieval and reranker sections plus the store location."""

    chunking: ChunkingConfig = ChunkingConfig()
    retrieval: RetrievalConfig = RetrievalConfig()
    reranker: RerankerConfig = RerankerConfig()
    # Relative path; NOTE(review): what it is resolved against is not shown here.
    store_path: str = ".cache/store"
51
+
52
+
53
class EmbeddingConfig(BaseModel):
    """Embedding model name and cache directory."""

    model: str = "all-MiniLM-L6-v2"
    cache_dir: str = ".cache/embeddings"
56
+
57
+
58
class ServingConfig(BaseModel):
    """HTTP serving settings."""

    # NOTE(review): "0.0.0.0" conventionally binds every interface — confirm this is
    # intended outside containerised deployments.
    host: str = "0.0.0.0"
    port: int = 8000
    request_timeout_seconds: int = 30
62
+
63
+
64
class EvaluationConfig(BaseModel):
    """Evaluation settings: judge provider name and golden dataset path."""

    judge_provider: str = "openai"
    golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
67
+
68
+
69
class AppConfig(BaseModel):
    """Root application config, as validated by load_config.

    Every section defaults to its model's defaults, so ``AppConfig()`` is valid
    without any input.
    """

    agent: AgentConfig = AgentConfig()
    provider: ProviderConfig = ProviderConfig()
    rag: RAGConfig = RAGConfig()
    embedding: EmbeddingConfig = EmbeddingConfig()
    serving: ServingConfig = ServingConfig()
    evaluation: EvaluationConfig = EvaluationConfig()
76
+
77
+
78
+ # --- Task config ---
79
+
80
+
81
class TaskConfig(BaseModel):
    """A single task definition; name, description and system_prompt are required."""

    name: str
    description: str
    system_prompt: str
    document_dir: str = "data/tech_docs/"
86
+
87
+
88
class TaskFileConfig(BaseModel):
    """Shape of a task YAML file: a single top-level ``task`` mapping."""

    task: TaskConfig
90
+
91
+
92
+ # --- Loaders ---
93
+
94
+
95
def _resolve_config_dir() -> Path:
    """Return the ``configs`` directory under the current working directory."""
    working_dir = Path.cwd()
    return working_dir.joinpath("configs")
98
+
99
+
100
def load_config(path: Path | None = None) -> AppConfig:
    """Load the application config from a YAML file.

    Args:
        path: YAML file to read. When ``None``, ``default.yaml`` inside the
            directory returned by ``_resolve_config_dir()`` is used.

    Returns:
        The validated ``AppConfig``. An empty YAML document yields an
        ``AppConfig`` built entirely from field defaults.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the content does not match ``AppConfig``.
    """
    if path is None:
        path = _resolve_config_dir() / "default.yaml"
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        # safe_load returns None for an empty document; fall back to {} so the
        # model defaults apply instead of validation failing on None.
        data: dict[str, Any] = yaml.safe_load(f) or {}
    return AppConfig.model_validate(data)
107
+
108
+
109
def load_task_config(task_name: str, path: Path | None = None) -> TaskConfig:
    """Load a task-specific config from YAML.

    Args:
        task_name: Task identifier; used only to build the default path
            ``tasks/<task_name>.yaml`` under ``_resolve_config_dir()``.
        path: Explicit YAML file to read. When given, ``task_name`` is not used.

    Returns:
        The ``task`` section of the file as a ``TaskConfig``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the ``task`` mapping is missing or invalid
            (this includes an empty file).
    """
    if path is None:
        path = _resolve_config_dir() / "tasks" / f"{task_name}.yaml"
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        # An empty document loads as None; use {} so validation reports the
        # missing "task" field rather than a type error on None.
        data: dict[str, Any] = yaml.safe_load(f) or {}
    return TaskFileConfig.model_validate(data).task
agent_bench/core/provider.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM provider abstraction with OpenAI, Mock, and Anthropic (stub) implementations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import time
7
+ from abc import ABC, abstractmethod
8
+
9
+ from agent_bench.core.config import AppConfig, load_config
10
+ from agent_bench.core.types import (
11
+ CompletionResponse,
12
+ Message,
13
+ Role,
14
+ TokenUsage,
15
+ ToolCall,
16
+ ToolDefinition,
17
+ )
18
+
19
+
20
class ProviderTimeoutError(Exception):
    """Raised when the LLM provider times out.

    OpenAIProvider.complete raises this in place of the SDK's APITimeoutError,
    chaining the original exception as the cause.
    """
22
+
23
+
24
+ # --- Pure formatting functions (used by providers and tests directly) ---
25
+
26
+
27
def format_tools_openai(tools: list[ToolDefinition]) -> list[dict]:
    """Convert tool definitions into OpenAI function-calling entries.

    Each tool becomes ``{"type": "function", "function": {...}}`` carrying the
    tool's name, description and parameters; input order is preserved.
    """
    schemas: list[dict] = []
    for tool in tools:
        function_spec = {
            "name": tool.name,
            "description": tool.description,
            "parameters": tool.parameters,
        }
        schemas.append({"type": "function", "function": function_spec})
    return schemas
40
+
41
+
42
def format_messages_openai(messages: list[Message]) -> list[dict]:
    """Convert internal messages into OpenAI chat message dicts.

    Every entry carries ``role`` and ``content``. ``tool_call_id`` and
    ``tool_calls`` are added only when the message has a truthy value for them;
    tool call arguments are serialized to a JSON string.
    """
    payload = []
    for message in messages:
        entry: dict = {"role": message.role.value, "content": message.content}
        if message.tool_call_id:
            entry["tool_call_id"] = message.tool_call_id
        if message.tool_calls:
            serialized_calls = []
            for call in message.tool_calls:
                function_part = {
                    "name": call.name,
                    "arguments": json.dumps(call.arguments),
                }
                serialized_calls.append(
                    {"id": call.id, "type": "function", "function": function_part}
                )
            entry["tool_calls"] = serialized_calls
        payload.append(entry)
    return payload
63
+
64
+
65
+ # --- Provider interface ---
66
+
67
+
68
class LLMProvider(ABC):
    """Async LLM provider interface.

    Concrete providers in this module (MockProvider, OpenAIProvider,
    AnthropicProvider) implement both abstract methods.
    """

    @abstractmethod
    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        """Produce a completion for ``messages``, optionally offering ``tools``."""
        ...

    @abstractmethod
    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        """Convert tool definitions into the provider's tool schema dicts."""
        ...
82
+
83
+
84
+ # --- Implementations ---
85
+
86
+
87
class MockProvider(LLMProvider):
    """Deterministic stand-in provider for tests.

    Each ``complete`` call increments ``call_count`` and then answers one of two ways:
    - tools were offered and no ``Role.TOOL`` message is present -> a single tool
      call targeting the first offered tool;
    - otherwise (no tools, or a tool result already present) -> a fixed text answer.
    """

    def __init__(self) -> None:
        self.call_count = 0

    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        """Return a canned response; ``temperature`` and ``max_tokens`` are not used."""
        self.call_count += 1

        tool_result_seen = False
        for message in messages:
            if message.role == Role.TOOL:
                tool_result_seen = True
                break

        if not tools or tool_result_seen:
            # Final-answer path: nothing to call, or the tool already ran.
            answer_usage = TokenUsage(
                input_tokens=200,
                output_tokens=50,
                estimated_cost_usd=0.0002,
            )
            return CompletionResponse(
                content="Based on the documentation, path parameters in FastAPI are defined "
                "using curly braces in the path string. [source: fastapi_path_params.md]",
                tool_calls=[],
                usage=answer_usage,
                provider="mock",
                model="mock-1",
                latency_ms=2.0,
            )

        # Tool-request path: always target the first offered tool.
        requested_call = ToolCall(
            id=f"call_mock_{self.call_count}",
            name=tools[0].name,
            arguments={"query": "mock search query"},
        )
        request_usage = TokenUsage(
            input_tokens=150,
            output_tokens=25,
            estimated_cost_usd=0.0001,
        )
        return CompletionResponse(
            content="",
            tool_calls=[requested_call],
            usage=request_usage,
            provider="mock",
            model="mock-1",
            latency_ms=1.0,
        )

    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        """Format tools using the OpenAI function-calling schema."""
        return format_tools_openai(tools)
144
+
145
+
146
class OpenAIProvider(LLMProvider):
    """OpenAI API provider using gpt-4o-mini.

    Cost estimates use the per-million-token prices configured for the model in
    ``config.provider.models``, falling back to 0.15 (input) / 0.60 (output)
    when the model has no entry.
    """

    def __init__(self, config: AppConfig | None = None) -> None:
        """Create the async client and resolve pricing.

        Args:
            config: Application config; loaded via ``load_config()`` when omitted.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        try:
            from openai import AsyncOpenAI
        except ImportError as e:
            raise ImportError("openai package required: pip install openai") from e

        self.config = config or load_config()
        self.client = AsyncOpenAI()
        self.model = "gpt-4o-mini"
        model_pricing = self.config.provider.models.get(self.model)
        self._input_cost = model_pricing.input_cost_per_mtok if model_pricing else 0.15
        self._output_cost = model_pricing.output_cost_per_mtok if model_pricing else 0.60

    @staticmethod
    def _parse_tool_arguments(raw: str) -> dict:
        """Decode a tool call's JSON arguments, returning ``{}`` when unusable.

        ``ToolCall.arguments`` must be a dict, so malformed JSON and valid JSON
        that is not an object (``null``, a list, a bare string) both map to ``{}``
        rather than crashing the whole completion.
        """
        try:
            parsed = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return {}
        return parsed if isinstance(parsed, dict) else {}

    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        """Send a chat completion request and normalize the first choice.

        Args:
            messages: Conversation so far.
            tools: Tools to offer; when non-empty, ``tool_choice`` is ``"auto"``.
            temperature: Sampling temperature forwarded to the API.
            max_tokens: Completion token limit forwarded to the API.

        Returns:
            A ``CompletionResponse`` with content, parsed tool calls, token usage,
            estimated cost and measured latency in milliseconds.

        Raises:
            ProviderTimeoutError: If the SDK raises ``APITimeoutError``.
        """
        from openai import APITimeoutError

        formatted_messages = format_messages_openai(messages)
        kwargs: dict = {
            "model": self.model,
            "messages": formatted_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        if tools:
            kwargs["tools"] = self.format_tools(tools)
            kwargs["tool_choice"] = "auto"

        start = time.perf_counter()
        try:
            response = await self.client.chat.completions.create(**kwargs)
        except APITimeoutError as e:
            raise ProviderTimeoutError(f"OpenAI timed out: {e}") from e
        latency_ms = (time.perf_counter() - start) * 1000

        choice = response.choices[0]
        content = choice.message.content or ""
        tool_calls: list[ToolCall] = [
            ToolCall(
                id=tc.id,
                name=tc.function.name,
                arguments=self._parse_tool_arguments(tc.function.arguments),
            )
            for tc in (choice.message.tool_calls or [])
        ]

        # Usage may be absent on the response; report zero tokens in that case.
        usage = response.usage
        input_tokens = usage.prompt_tokens if usage else 0
        output_tokens = usage.completion_tokens if usage else 0
        cost = (input_tokens * self._input_cost + output_tokens * self._output_cost) / 1_000_000

        return CompletionResponse(
            content=content,
            tool_calls=tool_calls,
            usage=TokenUsage(
                input_tokens=input_tokens,
                output_tokens=output_tokens,
                estimated_cost_usd=cost,
            ),
            provider="openai",
            model=self.model,
            latency_ms=latency_ms,
        )

    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        """Format tools using the OpenAI function-calling schema."""
        return format_tools_openai(tools)
221
+
222
+
223
class AnthropicProvider(LLMProvider):
    """Anthropic Claude provider -- stub for V2.

    Both interface methods raise ``NotImplementedError`` with the same message.
    """

    _PLANNED_MESSAGE = "Anthropic provider planned for V2"

    async def complete(
        self,
        messages: list[Message],
        tools: list[ToolDefinition] | None = None,
        temperature: float = 0.0,
        max_tokens: int = 1024,
    ) -> CompletionResponse:
        """Always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PLANNED_MESSAGE)

    def format_tools(self, tools: list[ToolDefinition]) -> list[dict]:
        """Always raises ``NotImplementedError``."""
        raise NotImplementedError(self._PLANNED_MESSAGE)
237
+
238
+
239
+ def create_provider(config: AppConfig | None = None) -> LLMProvider:
240
+ """Factory: create provider based on config."""
241
+ if config is None:
242
+ config = load_config()
243
+ name = config.provider.default
244
+ if name == "openai":
245
+ return OpenAIProvider(config)
246
+ elif name == "anthropic":
247
+ return AnthropicProvider()
248
+ elif name == "mock":
249
+ return MockProvider()
250
+ else:
251
+ raise ValueError(f"Unknown provider: {name}")
agent_bench/core/types.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared type definitions used across agent-bench."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+
7
+ from pydantic import BaseModel, Field
8
+
9
+
10
class Role(str, Enum):
    """Chat message roles; members are also ``str`` instances.

    format_messages_openai sends ``.value`` as the message ``role``.
    """

    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"
    TOOL = "tool"
15
+
16
+
17
class ToolCall(BaseModel):
    """A tool invocation: call id, tool name and decoded arguments."""

    id: str
    name: str
    # Decoded arguments; format_messages_openai re-serializes them with json.dumps.
    arguments: dict
21
+
22
+
23
class Message(BaseModel):
    """One conversation message; both tool-related fields default to ``None``."""

    role: Role
    content: str
    # Set on the Role.TOOL messages built in the tests, referencing the ToolCall id.
    tool_call_id: str | None = None
    # Set on the Role.ASSISTANT messages built in the tests that request tool runs.
    tool_calls: list[ToolCall] | None = None
28
+
29
+
30
class ToolDefinition(BaseModel):
    """Describes a tool offered to the model: name, description and parameter schema."""

    name: str
    description: str
    parameters: dict  # JSON Schema
34
+
35
+
36
class TokenUsage(BaseModel):
    """Token counts and estimated cost for a single completion."""

    input_tokens: int
    output_tokens: int
    estimated_cost_usd: float
40
+
41
+
42
class CompletionResponse(BaseModel):
    """Provider-independent result of a completion call."""

    content: str
    # default_factory gives each response its own empty list.
    tool_calls: list[ToolCall] = Field(default_factory=list)
    usage: TokenUsage
    provider: str
    model: str
    latency_ms: float
configs/default.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ agent:
2
+ max_iterations: 3
3
+ temperature: 0.0
4
+
5
+ provider:
6
+ default: openai
7
+ models:
8
+ gpt-4o-mini:
9
+ input_cost_per_mtok: 0.15
10
+ output_cost_per_mtok: 0.60
11
+ claude-sonnet-4-20250514:
12
+ input_cost_per_mtok: 3.0
13
+ output_cost_per_mtok: 15.0
14
+
15
+ rag:
16
+ chunking:
17
+ strategy: recursive
18
+ chunk_size: 512
19
+ chunk_overlap: 64
20
+ retrieval:
21
+ strategy: hybrid
22
+ rrf_k: 60
23
+ candidates_per_system: 10
24
+ top_k: 5
25
+ reranker:
26
+ enabled: false
27
+ store_path: .cache/store
28
+
29
+ embedding:
30
+ model: all-MiniLM-L6-v2
31
+ cache_dir: .cache/embeddings
32
+
33
+ serving:
34
+ host: 0.0.0.0
35
+ port: 8000
36
+ request_timeout_seconds: 30
37
+
38
+ evaluation:
39
+ judge_provider: openai
40
+ golden_dataset: agent_bench/evaluation/datasets/tech_docs_golden.json
configs/tasks/tech_docs.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task:
2
+ name: tech_docs
3
+ description: "Q&A over technical documentation"
4
+ system_prompt: |
5
+ You are a technical documentation assistant. You have access to tools
6
+ that let you search a documentation corpus and perform calculations.
7
+
8
+ Rules:
9
+ - Use search_documents to find relevant information before answering.
10
+ - Base your answer ONLY on the retrieved documents.
11
+ - Cite sources inline as [source: filename.md] for each claim.
12
+ - If the documents don't contain the answer, respond with:
13
+ "The documentation does not contain information about this topic."
14
+ - Use calculator for any numerical computations.
15
+ - Be concise and precise.
16
+ document_dir: data/tech_docs/
pyproject.toml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "agent-bench"
3
+ version = "0.1.0"
4
+ description = "Evaluation-first agentic RAG system built from API primitives"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "anthropic>=0.40.0",
8
+ "openai>=1.50.0",
9
+ "fastapi>=0.115.0",
10
+ "uvicorn[standard]>=0.30.0",
11
+ "pydantic>=2.9.0",
12
+ "pydantic-settings>=2.5.0",
13
+ "pyyaml>=6.0",
14
+ "sentence-transformers>=3.0.0",
15
+ "faiss-cpu>=1.8.0",
16
+ "rank-bm25>=0.2.2",
17
+ "structlog>=24.0.0",
18
+ "httpx>=0.27.0",
19
+ "simpleeval>=1.0.0",
20
+ "numpy>=1.26.0",
21
+ ]
22
+
23
+ [project.optional-dependencies]
24
+ dev = [
25
+ "pytest>=8.0.0",
26
+ "pytest-asyncio>=0.24.0",
27
+ "ruff>=0.6.0",
28
+ "mypy>=1.11.0",
29
+ "respx>=0.21.0",
30
+ "types-PyYAML>=2024.0.0",
31
+ ]
32
+
33
+ [build-system]
34
+ requires = ["setuptools>=69.0"]
35
+ build-backend = "setuptools.build_meta"
36
+
37
+ [tool.pytest.ini_options]
38
+ asyncio_mode = "auto"
39
+ testpaths = ["tests"]
40
+
41
+ [tool.ruff]
42
+ target-version = "py311"
43
+ line-length = 100
44
+
45
+ [tool.ruff.lint]
46
+ select = ["E", "F", "I", "N", "W"]
47
+
48
+ [tool.mypy]
49
+ python_version = "3.11"
50
+ warn_return_any = true
51
+ warn_unused_configs = true
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared test fixtures."""
2
+
3
+ import pytest
4
+
5
+ from agent_bench.core.provider import MockProvider
6
+
7
+
8
@pytest.fixture
def mock_provider() -> MockProvider:
    """MockProvider instance for deterministic testing.

    Builds a new MockProvider each time the fixture runs, so ``call_count``
    starts at 0.
    """
    return MockProvider()
tests/test_provider.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for core types, config, and provider abstraction."""
2
+
3
+ import pytest
4
+
5
+ from agent_bench.core.config import AppConfig, ProviderConfig, load_config, load_task_config
6
+ from agent_bench.core.provider import (
7
+ AnthropicProvider,
8
+ MockProvider,
9
+ create_provider,
10
+ format_messages_openai,
11
+ format_tools_openai,
12
+ )
13
+ from agent_bench.core.types import (
14
+ CompletionResponse,
15
+ Message,
16
+ Role,
17
+ TokenUsage,
18
+ ToolCall,
19
+ ToolDefinition,
20
+ )
21
+
22
+ # --- Core types ---
23
+
24
+
25
class TestCoreTypes:
    """Construction and default-value checks for the shared pydantic types."""

    def test_message_creation(self):
        """A plain user message carries no tool metadata."""
        message = Message(role=Role.USER, content="hello")
        assert message.role == Role.USER
        assert message.content == "hello"
        assert message.tool_call_id is None
        assert message.tool_calls is None

    def test_tool_call_creation(self):
        """ToolCall stores id, name and arguments as given."""
        call = ToolCall(id="call_123", name="search", arguments={"query": "test"})
        assert call.id == "call_123"
        assert call.name == "search"
        assert call.arguments == {"query": "test"}

    def test_token_usage_creation(self):
        """TokenUsage stores the counts and the estimated cost."""
        token_usage = TokenUsage(input_tokens=100, output_tokens=50, estimated_cost_usd=0.001)
        assert token_usage.input_tokens == 100
        assert token_usage.output_tokens == 50
        assert token_usage.estimated_cost_usd == pytest.approx(0.001)

    def test_completion_response_defaults(self):
        """tool_calls defaults to an empty list when omitted."""
        zero_cost_usage = TokenUsage(input_tokens=10, output_tokens=5, estimated_cost_usd=0.0)
        response = CompletionResponse(
            content="answer",
            usage=zero_cost_usage,
            provider="mock",
            model="mock-1",
            latency_ms=50.0,
        )
        assert response.tool_calls == []
        assert response.content == "answer"

    def test_tool_definition_schema(self):
        """ToolDefinition keeps the JSON Schema dict intact."""
        schema = {
            "type": "object",
            "properties": {"expression": {"type": "string"}},
            "required": ["expression"],
        }
        definition = ToolDefinition(
            name="calculator",
            description="Evaluate math",
            parameters=schema,
        )
        assert definition.name == "calculator"
        assert "expression" in definition.parameters["properties"]
68
+
69
+
70
+ # --- Config ---
71
+
72
+
73
class TestConfig:
    """Config loading checks.

    The loaders are called without a path, so they read the ``configs``
    directory under the current working directory.
    """

    def test_load_default_config(self):
        """default.yaml exposes the expected provider, agent and RAG values."""
        app_config = load_config()
        assert app_config.provider.default == "openai"
        assert app_config.agent.max_iterations == 3
        assert app_config.agent.temperature == 0.0
        chunking = app_config.rag.chunking
        assert chunking.strategy == "recursive"
        assert chunking.chunk_size == 512
        retrieval = app_config.rag.retrieval
        assert retrieval.rrf_k == 60
        assert retrieval.top_k == 5

    def test_model_pricing_available(self):
        """Pricing for gpt-4o-mini is present in the loaded config."""
        pricing_table = load_config().provider.models
        assert "gpt-4o-mini" in pricing_table
        assert pricing_table["gpt-4o-mini"].input_cost_per_mtok == 0.15
        assert pricing_table["gpt-4o-mini"].output_cost_per_mtok == 0.60

    def test_cost_calculation(self):
        """Cost from configured prices matches the hand-computed value."""
        pricing = load_config().provider.models["gpt-4o-mini"]
        input_tokens = 1000
        output_tokens = 500
        expected_cost = (1000 * 0.15 + 500 * 0.60) / 1_000_000
        weighted_total = (
            input_tokens * pricing.input_cost_per_mtok
            + output_tokens * pricing.output_cost_per_mtok
        )
        assert weighted_total / 1_000_000 == pytest.approx(expected_cost)

    def test_load_task_config(self):
        """The tech_docs task loads and its prompt mentions search and citations."""
        task_config = load_task_config("tech_docs")
        assert task_config.name == "tech_docs"
        assert "search_documents" in task_config.system_prompt
        assert "[source:" in task_config.system_prompt
108
+
109
+
110
+ # --- MockProvider ---
111
+
112
+
113
class TestMockProvider:
    """Behavioural checks for MockProvider's two response modes."""

    @pytest.mark.asyncio
    async def test_returns_tool_calls_on_first_call(self, mock_provider):
        """With tools offered and no tool result yet, a tool call is returned."""
        conversation = [
            Message(role=Role.SYSTEM, content="You are helpful."),
            Message(role=Role.USER, content="Search for FastAPI path params"),
        ]
        search_tool = ToolDefinition(
            name="search_documents",
            description="Search docs",
            parameters={"type": "object", "properties": {"query": {"type": "string"}}},
        )
        reply = await mock_provider.complete(conversation, tools=[search_tool])
        assert len(reply.tool_calls) > 0
        assert reply.tool_calls[0].name == "search_documents"
        assert reply.provider == "mock"
        assert reply.usage.input_tokens > 0

    @pytest.mark.asyncio
    async def test_returns_final_answer_when_tool_results_present(self, mock_provider):
        """Once a Role.TOOL message exists, a text answer is returned."""
        prior_call = ToolCall(
            id="call_1", name="search_documents", arguments={"query": "path params"}
        )
        conversation = [
            Message(role=Role.SYSTEM, content="You are helpful."),
            Message(role=Role.USER, content="Search for FastAPI path params"),
            Message(role=Role.ASSISTANT, content="", tool_calls=[prior_call]),
            Message(role=Role.TOOL, content="Path params use curly braces.", tool_call_id="call_1"),
        ]
        reply = await mock_provider.complete(conversation)
        assert reply.tool_calls == []
        assert len(reply.content) > 0
        assert reply.usage.input_tokens > 0

    @pytest.mark.asyncio
    async def test_returns_answer_without_tools(self, mock_provider):
        """Without tools, a text answer is returned directly."""
        conversation = [
            Message(role=Role.SYSTEM, content="You are helpful."),
            Message(role=Role.USER, content="Hello"),
        ]
        reply = await mock_provider.complete(conversation, tools=None)
        assert reply.tool_calls == []
        assert len(reply.content) > 0

    def test_format_tools_returns_list(self, mock_provider):
        """format_tools yields one entry per tool definition."""
        calc_tool = ToolDefinition(
            name="calc",
            description="Calculate",
            parameters={"type": "object", "properties": {}},
        )
        schemas = mock_provider.format_tools([calc_tool])
        assert isinstance(schemas, list)
        assert len(schemas) == 1
175
+
176
+
177
+ # --- OpenAI format functions (tested as pure functions, no API key needed) ---
178
+
179
+
180
class TestOpenAIFormat:
    """Checks for the pure OpenAI formatting helpers (no API key required)."""

    def test_format_tools_produces_openai_schema(self):
        """A tool definition becomes a single ``function`` entry."""
        search_tool = ToolDefinition(
            name="search_documents",
            description="Search the documentation corpus",
            parameters={
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Search query"},
                    "top_k": {"type": "integer", "description": "Number of results"},
                },
                "required": ["query"],
            },
        )
        schemas = format_tools_openai([search_tool])
        assert len(schemas) == 1
        assert schemas[0]["type"] == "function"
        function_spec = schemas[0]["function"]
        assert function_spec["name"] == "search_documents"
        assert function_spec["description"] == "Search the documentation corpus"
        assert function_spec["parameters"]["required"] == ["query"]

    def test_format_messages_maps_roles(self):
        """Roles, tool calls and tool_call_id survive the conversion."""
        assistant_call = ToolCall(id="call_1", name="search", arguments={"q": "test"})
        conversation = [
            Message(role=Role.SYSTEM, content="system prompt"),
            Message(role=Role.USER, content="user question"),
            Message(role=Role.ASSISTANT, content="", tool_calls=[assistant_call]),
            Message(role=Role.TOOL, content="tool result", tool_call_id="call_1"),
        ]
        system_msg, user_msg, assistant_msg, tool_msg = format_messages_openai(conversation)
        assert system_msg["role"] == "system"
        assert user_msg["role"] == "user"
        assert assistant_msg["role"] == "assistant"
        assert assistant_msg["tool_calls"][0]["id"] == "call_1"
        assert assistant_msg["tool_calls"][0]["function"]["name"] == "search"
        assert tool_msg["role"] == "tool"
        assert tool_msg["tool_call_id"] == "call_1"
223
+
224
+
225
+ # --- Anthropic stub ---
226
+
227
+
228
class TestAnthropicProvider:
    """The Anthropic stub must refuse both interface methods."""

    @pytest.mark.asyncio
    async def test_complete_raises_not_implemented(self):
        """complete() raises NotImplementedError mentioning V2."""
        stub = AnthropicProvider()
        user_message = Message(role=Role.USER, content="test")
        with pytest.raises(NotImplementedError, match="planned for V2"):
            await stub.complete([user_message])

    def test_format_tools_raises_not_implemented(self):
        """format_tools() raises NotImplementedError mentioning V2."""
        stub = AnthropicProvider()
        with pytest.raises(NotImplementedError, match="planned for V2"):
            stub.format_tools([])
239
+
240
+
241
+ # --- Provider factory ---
242
+
243
+
244
class TestProviderFactory:
    """create_provider dispatches on ``config.provider.default``."""

    def test_create_mock_provider(self):
        """The name "mock" yields a MockProvider."""
        mock_config = AppConfig(provider=ProviderConfig(default="mock"))
        assert isinstance(create_provider(mock_config), MockProvider)

    def test_create_unknown_provider_raises(self):
        """An unrecognised name raises ValueError."""
        bad_config = AppConfig(provider=ProviderConfig(default="unknown"))
        with pytest.raises(ValueError, match="Unknown provider"):
            create_provider(bad_config)