agentbench / agent_bench /core /config.py
Nomearod's picture
feat(config): add evaluation.judge_dimensions field
12cb8b7
"""Configuration loading from YAML files via Pydantic models."""
from __future__ import annotations
from pathlib import Path
from typing import Any, Literal
import yaml
from pydantic import BaseModel, model_validator
# --- Nested config models ---
class AgentConfig(BaseModel):
    """Settings for the ``agent`` section of the application config."""

    # Iteration limit for the agent; 3 unless overridden.
    max_iterations: int = 3
    # Temperature value handed to the agent; 0.0 unless overridden.
    temperature: float = 0.0
class ModelPricing(BaseModel):
    """Pricing entry for one model; both fields are required (no defaults)."""

    # NOTE(review): "mtok" presumably means "million tokens" — confirm the
    # unit and currency against the code that consumes these values.
    input_cost_per_mtok: float
    output_cost_per_mtok: float
class SelfHostedConfig(BaseModel):
    """Settings stored under ``provider.selfhosted``."""

    # Empty by default; must be supplied by the YAML config to be useful.
    base_url: str = ""
    model_name: str = "mistralai/Mistral-7B-Instruct-v0.3"
    # Empty by default.
    api_key: str = ""
    # Timeout expressed in seconds (per the field name).
    timeout_seconds: float = 120.0
class ProviderConfig(BaseModel):
    """Settings for the ``provider`` section of the application config."""

    # Name of the provider used when none is specified.
    default: str = "openai"
    # Pricing entries keyed by a string (presumably the model name — verify
    # against the YAML files).
    models: dict[str, ModelPricing] = {}
    # Nested settings for the self-hosted provider.
    selfhosted: SelfHostedConfig = SelfHostedConfig()
class ChunkingConfig(BaseModel):
    """Settings stored under ``rag.chunking``."""

    strategy: str = "recursive"
    # NOTE(review): the unit of size/overlap (characters vs. tokens) is not
    # visible in this file — confirm with the chunker implementation.
    chunk_size: int = 512
    chunk_overlap: int = 64
class RetrievalConfig(BaseModel):
    """Settings stored under ``rag.retrieval``."""

    strategy: str = "hybrid"
    # NOTE(review): presumably the reciprocal-rank-fusion constant — confirm.
    rrf_k: int = 60
    candidates_per_system: int = 10
    top_k: int = 5
class RerankerConfig(BaseModel):
    """Settings stored under ``rag.reranker``."""

    enabled: bool = True
    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    # Not tied to retrieval.top_k; the two values are configured separately.
    top_k: int = 5
class RAGConfig(BaseModel):
    """Settings for the ``rag`` section: chunking, retrieval, reranking, store."""

    chunking: ChunkingConfig = ChunkingConfig()
    retrieval: RetrievalConfig = RetrievalConfig()
    reranker: RerankerConfig = RerankerConfig()
    store_path: str = ".cache/store"
    # A value of 0.0 means the refusal check is disabled (V1 behavior).
    refusal_threshold: float = 0.0
class RetryConfig(BaseModel):
    """Settings for the ``retry`` section of the application config."""

    max_retries: int = 3
    # Delay in seconds.
    base_delay: float = 1.0
    # Upper bound applied to the exponential backoff delay.
    max_delay: float = 8.0
class EmbeddingConfig(BaseModel):
    """Settings for the ``embedding`` section of the application config."""

    # Identifier of the embedding model.
    model: str = "all-MiniLM-L6-v2"
    # Directory path used for the embedding cache.
    cache_dir: str = ".cache/embeddings"
class ServingConfig(BaseModel):
    """Settings for the ``serving`` section of the application config."""

    host: str = "0.0.0.0"
    port: int = 8000
    # Timeout expressed in seconds (per the field name).
    request_timeout_seconds: int = 30
    # Allowed requests per minute, counted per client IP.
    rate_limit_rpm: int = 10
class MemoryConfig(BaseModel):
    """Settings for the ``memory`` section of the application config."""

    enabled: bool = True
    # Path of the conversations database file.
    db_path: str = "data/conversations.db"
    max_turns: int = 10
class EvaluationConfig(BaseModel):
    """Settings for the ``evaluation`` section of the application config."""

    judge_provider: str = "openai"
    golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
    # Added with judge-layer v1: the dimensions scored by the L2 LLM judges.
    # citation_faithfulness is opt-in in v1 (planned default-on in v1.1), so
    # it is not part of the default list.
    judge_dimensions: list[str] = [
        "groundedness",
        "relevance",
        "completeness",
    ]
# Closed set of tier names accepted in ``InjectionConfig.tiers``. Frozen so
# the module-level constant cannot be mutated by accident.
_VALID_TIERS = frozenset({"heuristic", "classifier"})


class InjectionConfig(BaseModel):
    """Settings stored under ``security.injection``.

    ``tiers`` may only contain names from ``_VALID_TIERS``; anything else is
    rejected at validation time. Selecting the ``classifier`` tier without a
    ``classifier_url`` is accepted, but a warning is logged.
    """

    enabled: bool = True
    action: Literal["block", "warn", "flag"] = "block"
    tiers: list[str] = ["heuristic", "classifier"]
    classifier_url: str = ""

    @model_validator(mode="after")
    def _validate_tiers(self) -> "InjectionConfig":
        """Reject unknown tier names and warn about a missing classifier URL.

        Returns:
            The validated instance, unchanged.

        Raises:
            ValueError: If ``tiers`` contains a name outside ``_VALID_TIERS``.
        """
        invalid = set(self.tiers) - _VALID_TIERS
        if invalid:
            # Sort both collections so the message is deterministic; raw set
            # ordering varies between interpreter runs.
            raise ValueError(
                f"Invalid injection tier(s): {sorted(invalid)}. "
                f"Allowed: {sorted(_VALID_TIERS)}"
            )
        if "classifier" in self.tiers and not self.classifier_url:
            # Imported lazily, so structlog is only loaded when this warning
            # actually fires.
            import structlog

            structlog.get_logger().warning(
                "injection_classifier_no_url",
                msg="Tier 'classifier' configured but classifier_url is empty; "
                "classifier tier will be skipped at runtime.",
            )
        return self
class PIIConfig(BaseModel):
    """Settings stored under ``security.pii``."""

    enabled: bool = True
    # One of "redact", "detect_only" or "passthrough".
    mode: Literal["redact", "detect_only", "passthrough"] = "redact"
    # Names of the pattern categories selected by default.
    redact_patterns: list[str] = [
        "EMAIL",
        "PHONE",
        "SSN",
        "CREDIT_CARD",
        "IP_ADDRESS",
    ]
    # NER is off by default; ner_entities lists the entity labels configured
    # for it.
    use_ner: bool = False
    ner_entities: list[str] = ["PERSON"]
class OutputConfig(BaseModel):
    """Settings stored under ``security.output``."""

    enabled: bool = True
    # Individual checks; each one is on by default.
    pii_check: bool = True
    url_check: bool = True
    secret_check: bool = True
    # Empty by default.
    blocklist: list[str] = []
class AuditConfig(BaseModel):
    """Settings stored under ``security.audit``."""

    enabled: bool = True
    # Path of the audit log file (JSON Lines, per the extension).
    path: str = "logs/audit.jsonl"
    # Size limit in megabytes (per the field name).
    max_size_mb: int = 100
    rotate: bool = True
class SecurityConfig(BaseModel):
    """Settings for the ``security`` section; groups the four sub-sections."""

    injection: InjectionConfig = InjectionConfig()
    pii: PIIConfig = PIIConfig()
    output: OutputConfig = OutputConfig()
    audit: AuditConfig = AuditConfig()
class CorpusConfig(BaseModel):
    """Settings for a single corpus: paths, thresholds and iteration limit.

    ``label``, ``store_path`` and ``data_path`` are required; every other
    field has a default.
    """

    label: str
    store_path: str
    data_path: str
    refusal_threshold: float = 0.0
    top_k: int = 5
    max_iterations: int = 3

    # Path to this corpus's golden dataset JSON, if it has one. None is
    # legitimate (e.g. a corpus still being brought up without a golden
    # set). Rather than making the field mandatory, the evaluation CLI
    # reports a clear error when --corpus names a corpus whose
    # golden_dataset is None.
    golden_dataset: str | None = None

    # False keeps the corpus in YAML (so the schema stays visible) without
    # wiring it into corpus_map at startup. The dashboard can show its
    # toggle as disabled, and /ask requests for it return 400. Intended
    # for corpora whose docs/store have not been curated yet.
    available: bool = True
class AppConfig(BaseModel):
    """Root configuration model; one attribute per top-level section.

    Every section has a default, so an empty mapping validates. When
    ``corpora`` is non-empty, ``default_corpus`` must name one of its
    entries and that entry must be available.
    """

    agent: AgentConfig = AgentConfig()
    provider: ProviderConfig = ProviderConfig()
    rag: RAGConfig = RAGConfig()
    retry: RetryConfig = RetryConfig()
    memory: MemoryConfig = MemoryConfig()
    embedding: EmbeddingConfig = EmbeddingConfig()
    serving: ServingConfig = ServingConfig()
    evaluation: EvaluationConfig = EvaluationConfig()
    security: SecurityConfig = SecurityConfig()

    # Multi-corpus support
    corpora: dict[str, CorpusConfig] = {}
    default_corpus: str = "fastapi"

    @model_validator(mode="after")
    def _validate_default_corpus(self) -> "AppConfig":
        """Check that ``default_corpus`` is configured and available.

        Nothing is checked when no corpora are configured.

        Raises:
            ValueError: If the default corpus is missing from ``corpora`` or
                is marked ``available=False``.
        """
        if self.corpora:
            chosen = self.corpora.get(self.default_corpus)
            if chosen is None:
                raise ValueError(
                    f"default_corpus={self.default_corpus!r} is not in corpora "
                    f"{sorted(self.corpora.keys())!r}. Configured corpora must "
                    "include the default."
                )
            # An unavailable default would leave the app booting without a
            # reachable default orchestrator.
            if not chosen.available:
                raise ValueError(
                    f"default_corpus={self.default_corpus!r} has available=False. "
                    "The default corpus must be ready to serve; set available=true "
                    "or point default_corpus at a ready corpus."
                )
        return self
# --- Task config ---
class TaskConfig(BaseModel):
    """Definition of a single task as read from a task YAML file.

    ``name``, ``description`` and ``system_prompt`` are required.
    """

    name: str
    description: str
    system_prompt: str
    # Directory holding the task's documents.
    document_dir: str = "data/tech_docs/"
class TaskFileConfig(BaseModel):
    """Top-level shape of a task file: a single required ``task`` entry."""

    task: TaskConfig
# --- Loaders ---
def _resolve_config_dir() -> Path:
    """Locate the ``configs`` directory.

    The current working directory is tried first. If it has no ``configs``
    directory, the location three levels above this file is tried (this
    covers installed packages).

    Returns:
        The first candidate that is an existing directory. If neither
        exists, the cwd-based path is returned anyway so that the caller
        gets a clear ``FileNotFoundError`` when opening a file under it.
    """
    local_dir = Path.cwd() / "configs"
    if not local_dir.is_dir():
        # Only consult the package-relative location when the cwd one is absent.
        packaged_dir = Path(__file__).resolve().parent.parent.parent / "configs"
        if packaged_dir.is_dir():
            return packaged_dir
    return local_dir
def load_config(path: Path | None = None) -> AppConfig:
    """Load application config from YAML.

    If *path* is omitted and AGENT_BENCH_ENV is set (e.g. 'production'),
    loads configs/{env}.yaml if it exists; otherwise falls back to
    default.yaml in the resolved configs directory.

    Args:
        path: Explicit YAML file to load. When given, AGENT_BENCH_ENV and the
            configs-directory lookup are ignored.

    Returns:
        The validated ``AppConfig``. An empty YAML document yields a config
        made up entirely of defaults.

    Raises:
        FileNotFoundError: If the selected file does not exist.
        pydantic.ValidationError: If the content does not match the schema.
    """
    if path is None:
        import os

        env = os.environ.get("AGENT_BENCH_ENV", "")
        config_dir = _resolve_config_dir()
        env_path = config_dir / f"{env}.yaml"
        path = env_path if env and env_path.exists() else config_dir / "default.yaml"

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the same file loads identically everywhere.
    with open(path, encoding="utf-8") as f:
        loaded: Any = yaml.safe_load(f)

    # safe_load returns None for an empty document. Every AppConfig field has
    # a default, so treat that as "no overrides". Other non-mapping content
    # is passed through and rejected by validation.
    data: dict[str, Any] = {} if loaded is None else loaded
    return AppConfig.model_validate(data)
def load_task_config(task_name: str, path: Path | None = None) -> TaskConfig:
    """Load a task-specific config from YAML.

    Args:
        task_name: Task identifier; used to build the default file name
            ``tasks/{task_name}.yaml`` under the resolved configs directory.
        path: Explicit YAML file to load. When given, *task_name* is not used
            to locate the file.

    Returns:
        The ``task`` entry of the file as a validated ``TaskConfig``.

    Raises:
        FileNotFoundError: If the selected file does not exist.
        pydantic.ValidationError: If the content does not match the schema.
    """
    if path is None:
        path = _resolve_config_dir() / "tasks" / f"{task_name}.yaml"

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so non-ASCII prompt text is read the same way everywhere.
    with open(path, encoding="utf-8") as f:
        data: dict[str, Any] = yaml.safe_load(f)
    return TaskFileConfig.model_validate(data).task