Spaces:

Nomearod
/

agentbench

Sleeping

App Files Files Community

agentbench / agent_bench /core /config.py

Nomearod

feat(config): add evaluation.judge_dimensions field

12cb8b7 about 1 month ago

raw

history blame contribute delete

8.14 kB

	"""Configuration loading from YAML files via Pydantic models."""

	from __future__ import annotations

	from pathlib import Path
	from typing import Any, Literal

	import yaml
	from pydantic import BaseModel, model_validator

	# --- Nested config models ---


	class AgentConfig(BaseModel):
	max_iterations: int = 3
	temperature: float = 0.0


	class ModelPricing(BaseModel):
	input_cost_per_mtok: float
	output_cost_per_mtok: float


	class SelfHostedConfig(BaseModel):
	base_url: str = ""
	model_name: str = "mistralai/Mistral-7B-Instruct-v0.3"
	api_key: str = ""
	timeout_seconds: float = 120.0


	class ProviderConfig(BaseModel):
	default: str = "openai"
	models: dict[str, ModelPricing] = {}
	selfhosted: SelfHostedConfig = SelfHostedConfig()


	class ChunkingConfig(BaseModel):
	strategy: str = "recursive"
	chunk_size: int = 512
	chunk_overlap: int = 64


	class RetrievalConfig(BaseModel):
	strategy: str = "hybrid"
	rrf_k: int = 60
	candidates_per_system: int = 10
	top_k: int = 5


	class RerankerConfig(BaseModel):
	enabled: bool = True
	model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
	top_k: int = 5 # independent of retrieval.top_k


	class RAGConfig(BaseModel):
	chunking: ChunkingConfig = ChunkingConfig()
	retrieval: RetrievalConfig = RetrievalConfig()
	reranker: RerankerConfig = RerankerConfig()
	store_path: str = ".cache/store"
	refusal_threshold: float = 0.0 # 0.0 = disabled (V1 behavior)


	class RetryConfig(BaseModel):
	max_retries: int = 3
	base_delay: float = 1.0 # seconds
	max_delay: float = 8.0 # cap for exponential backoff


	class EmbeddingConfig(BaseModel):
	model: str = "all-MiniLM-L6-v2"
	cache_dir: str = ".cache/embeddings"


	class ServingConfig(BaseModel):
	host: str = "0.0.0.0"
	port: int = 8000
	request_timeout_seconds: int = 30
	rate_limit_rpm: int = 10 # requests per minute per IP


	class MemoryConfig(BaseModel):
	enabled: bool = True
	db_path: str = "data/conversations.db"
	max_turns: int = 10


	class EvaluationConfig(BaseModel):
	judge_provider: str = "openai"
	golden_dataset: str = "agent_bench/evaluation/datasets/tech_docs_golden.json"
	# New in judge-layer v1: which dimensions to score with L2 LLM judges.
	# citation_faithfulness is opt-in v1 (default-on v1.1).
	judge_dimensions: list[str] = ["groundedness", "relevance", "completeness"]


	_VALID_TIERS = {"heuristic", "classifier"}


	class InjectionConfig(BaseModel):
	enabled: bool = True
	action: Literal["block", "warn", "flag"] = "block"
	tiers: list[str] = ["heuristic", "classifier"]
	classifier_url: str = ""

	@model_validator(mode="after")
	def _validate_tiers(self) -> "InjectionConfig":
	invalid = set(self.tiers) - _VALID_TIERS
	if invalid:
	raise ValueError(
	f"Invalid injection tier(s): {invalid}. Allowed: {_VALID_TIERS}"
	)
	if "classifier" in self.tiers and not self.classifier_url:
	import structlog
	structlog.get_logger().warning(
	"injection_classifier_no_url",
	msg="Tier 'classifier' configured but classifier_url is empty; "
	"classifier tier will be skipped at runtime.",
	)
	return self


	class PIIConfig(BaseModel):
	enabled: bool = True
	mode: Literal["redact", "detect_only", "passthrough"] = "redact"
	redact_patterns: list[str] = [
	"EMAIL", "PHONE", "SSN", "CREDIT_CARD", "IP_ADDRESS",
	]
	use_ner: bool = False
	ner_entities: list[str] = ["PERSON"]


	class OutputConfig(BaseModel):
	enabled: bool = True
	pii_check: bool = True
	url_check: bool = True
	secret_check: bool = True
	blocklist: list[str] = []


	class AuditConfig(BaseModel):
	enabled: bool = True
	path: str = "logs/audit.jsonl"
	max_size_mb: int = 100
	rotate: bool = True


	class SecurityConfig(BaseModel):
	injection: InjectionConfig = InjectionConfig()
	pii: PIIConfig = PIIConfig()
	output: OutputConfig = OutputConfig()
	audit: AuditConfig = AuditConfig()


	class CorpusConfig(BaseModel):
	"""Per-corpus configuration: store path, thresholds, iteration limits."""

	label: str
	store_path: str
	data_path: str
	refusal_threshold: float = 0.0
	top_k: int = 5
	max_iterations: int = 3
	# Optional: path to the golden dataset JSON for this corpus. None is
	# a valid state (corpus has no golden set yet during bring-up). The
	# evaluation CLI errors clearly if --corpus targets a corpus with
	# golden_dataset=None rather than requiring the field upfront.
	golden_dataset: str \| None = None
	# When False, the corpus is kept in YAML for schema visibility but is
	# not wired into corpus_map at startup. Dashboard can render the
	# toggle as disabled; /ask requests for the corpus return 400.
	# Use this for corpora whose docs/store are not yet curated.
	available: bool = True


	class AppConfig(BaseModel):
	agent: AgentConfig = AgentConfig()
	provider: ProviderConfig = ProviderConfig()
	rag: RAGConfig = RAGConfig()
	retry: RetryConfig = RetryConfig()
	memory: MemoryConfig = MemoryConfig()
	embedding: EmbeddingConfig = EmbeddingConfig()
	serving: ServingConfig = ServingConfig()
	evaluation: EvaluationConfig = EvaluationConfig()
	security: SecurityConfig = SecurityConfig()
	# Multi-corpus support
	corpora: dict[str, CorpusConfig] = {}
	default_corpus: str = "fastapi"

	@model_validator(mode="after")
	def _validate_default_corpus(self) -> "AppConfig":
	if not self.corpora:
	return self
	if self.default_corpus not in self.corpora:
	raise ValueError(
	f"default_corpus={self.default_corpus!r} is not in corpora "
	f"{sorted(self.corpora.keys())!r}. Configured corpora must "
	"include the default.",
	)
	# The default corpus must also be available — otherwise the app
	# would boot with no reachable default orchestrator.
	if not self.corpora[self.default_corpus].available:
	raise ValueError(
	f"default_corpus={self.default_corpus!r} has available=False. "
	"The default corpus must be ready to serve; set available=true "
	"or point default_corpus at a ready corpus.",
	)
	return self


	# --- Task config ---


	class TaskConfig(BaseModel):
	name: str
	description: str
	system_prompt: str
	document_dir: str = "data/tech_docs/"


	class TaskFileConfig(BaseModel):
	task: TaskConfig


	# --- Loaders ---


	def _resolve_config_dir() -> Path:
	"""Resolve configs directory: cwd first, then package-relative fallback."""
	cwd_configs = Path.cwd() / "configs"
	if cwd_configs.is_dir():
	return cwd_configs
	# Fallback: relative to package location (works for installed packages)
	pkg_configs = Path(__file__).resolve().parent.parent.parent / "configs"
	if pkg_configs.is_dir():
	return pkg_configs
	return cwd_configs # Let the caller get a clear FileNotFoundError


	def load_config(path: Path \| None = None) -> AppConfig:
	"""Load application config from YAML.

	If AGENT_BENCH_ENV is set (e.g. 'production'), loads configs/{env}.yaml
	if it exists, otherwise falls back to default.yaml.
	"""
	if path is None:
	import os

	env = os.environ.get("AGENT_BENCH_ENV", "")
	config_dir = _resolve_config_dir()
	env_path = config_dir / f"{env}.yaml"
	path = env_path if env and env_path.exists() else config_dir / "default.yaml"
	with open(path) as f:
	data: dict[str, Any] = yaml.safe_load(f)
	return AppConfig.model_validate(data)


	def load_task_config(task_name: str, path: Path \| None = None) -> TaskConfig:
	"""Load a task-specific config from YAML."""
	if path is None:
	path = _resolve_config_dir() / "tasks" / f"{task_name}.yaml"
	with open(path) as f:
	data: dict[str, Any] = yaml.safe_load(f)
	return TaskFileConfig.model_validate(data).task