"""Typed configuration objects used across the framework."""

from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, Literal, Union


@dataclass
class LoggingConfig:
    """Logging configuration (rotating file + console).

    Plain value holder; nothing is validated or created on construction.
    """

    log_dir: Path = Path("logs")
    level: str = "INFO"  # DEBUG | INFO | WARNING | ERROR | CRITICAL
    max_mb: int = 5  # per-file size before rotation
    backups: int = 5  # number of rotated files to keep


@dataclass
class CrossEncoderConfig:
    """Configuration for an optional cross-encoder re-ranker.

    The re-ranker is disabled by default (``enable=False``).
    """

    enable: bool = False  # master switch
    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    device: str = "cpu"
    max_length: int = 512  # truncation length
    first_stage_k: int = 50  # how many docs to pass to re-ranker
    final_k: Optional[int] = None  # override PipelineConfig.retriever.top_k


@dataclass
class RetrieverConfig:
    """Configuration for a retriever back-end.

    ``name`` selects one of the ``"bm25"``, ``"dense"`` or ``"hybrid"``
    back-ends; the remaining fields are grouped below by the back-end they
    are intended for.

    ``index_path`` is a legacy alias for ``bm25_idx`` kept for backward
    compatibility: when it is set and ``bm25_idx`` was not given, its value
    is copied into ``bm25_idx`` after initialisation.
    """

    name: Literal["bm25", "dense", "hybrid"] = "bm25"
    top_k: int = 5

    # For backward compatibility with tests: allow index_path alias for sparse
    index_path: Optional[Union[str, Path]] = None  # legacy alias for bm25_idx

    # Specific to BM25
    bm25_idx: Optional[Union[str, Path]] = None
    doc_store: Optional[Union[str, Path]] = None

    # For dense-only
    faiss_index: Optional[Union[str, Path]] = None
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedder_cache: Optional[Union[str, Path]] = None
    device: str = "cpu"

    # For hybrid only
    alpha: float = 0.5  # sparse ↔ dense weight

    def __post_init__(self) -> None:
        """Resolve the legacy ``index_path`` alias into ``bm25_idx``.

        The alias is only a fallback: it is copied into ``bm25_idx`` when it
        is set (truthy) and no explicit ``bm25_idx`` was supplied. An
        explicit ``bm25_idx`` is never overwritten by the legacy value, which
        also keeps ``dataclasses.replace(cfg, bm25_idx=...)`` effective on a
        config that was originally built through the alias.
        """
        if self.index_path and self.bm25_idx is None:
            self.bm25_idx = self.index_path


@dataclass
class GeneratorConfig:
    """Configuration for the text generator."""

    model_name: str = "google/flan-t5-base"
    device: str = "cpu"
    max_new_tokens: int = 256
    temperature: float = 0.0


@dataclass
class StatsConfig:
    """Configuration for statistical tests & robustness analyses."""

    # Correlation (RQ1 & RQ2)
    correlation_method: Literal["spearman", "kendall"] = "spearman"
    n_boot: int = 1000  # bootstrap replicates for CIs
    ci: float = 0.95  # confidence level (e.g. 0.95 = 95 %)

    # Significance tests (RQ2)
    wilcoxon_alternative: Literal["two-sided", "greater", "less"] = "two-sided"
    multiple_correction: Literal["holm-bonferroni", "none"] = "holm-bonferroni"
    alpha: float = 0.05  # family-wise error rate

    # Robustness / sensitivity (RQ3 & RQ4)
    compute_effect_size: bool = True
    n_permutations: int = 1000
    failure_threshold: float = 0.0


@dataclass
class PipelineConfig:
    """Top-level pipeline configuration.

    Aggregates one instance of every section config. Each section is built
    through ``default_factory``, so every ``PipelineConfig`` gets its own
    independent sub-config objects rather than sharing defaults.
    """

    logging: LoggingConfig = field(default_factory=LoggingConfig)
    reranker: CrossEncoderConfig = field(default_factory=CrossEncoderConfig)
    retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    stats: StatsConfig = field(default_factory=StatsConfig)