Spaces:
Sleeping
Sleeping
| """Typed configuration objects used across the framework.""" | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional, Literal, Union | |
@dataclass
class LoggingConfig:
    """Logging configuration (rotating file + console).

    Declared as a dataclass so callers can override any subset of fields by
    keyword, e.g. ``LoggingConfig(level="DEBUG")``.
    """

    # Directory for log files; defaults to the relative path "logs".
    log_dir: Path = Path("logs")
    # Level name: DEBUG | INFO | WARNING | ERROR | CRITICAL
    level: str = "INFO"
    # Per-file size (in MB) before rotation.
    max_mb: int = 5
    # Number of rotated files to keep.
    backups: int = 5
@dataclass
class CrossEncoderConfig:
    """Configuration for an optional cross-encoder re-ranker.

    Declared as a dataclass so every field can be overridden by keyword at
    construction time.
    """

    # Master switch; the re-ranker is disabled by default.
    enable: bool = False
    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    device: str = "cpu"
    # Truncation length.
    max_length: int = 512
    # How many docs to pass to the re-ranker.
    first_stage_k: int = 50
    # When set, overrides PipelineConfig.retriever.top_k.
    final_k: Optional[int] = None
@dataclass
class RetrieverConfig:
    """Configuration for a retriever back-end.

    ``name`` selects one of the "bm25", "dense" or "hybrid" back-ends; the
    remaining fields are grouped by the back-end they belong to.

    Must be a dataclass: the generated ``__init__`` is what invokes
    ``__post_init__``, which maps the legacy ``index_path`` keyword onto
    ``bm25_idx``.
    """

    name: Literal["bm25", "dense", "hybrid"] = "bm25"
    top_k: int = 5

    # Backward compatibility with tests: legacy alias for ``bm25_idx``.
    index_path: Optional[Union[str, Path]] = None

    # Specific to BM25
    bm25_idx: Optional[Union[str, Path]] = None
    doc_store: Optional[Union[str, Path]] = None

    # For dense-only
    faiss_index: Optional[Union[str, Path]] = None
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedder_cache: Optional[Union[str, Path]] = None
    device: str = "cpu"

    # For hybrid only
    alpha: float = 0.5  # sparse <-> dense weight

    def __post_init__(self) -> None:
        """Copy a truthy legacy ``index_path`` into ``bm25_idx``.

        The alias wins even when ``bm25_idx`` was also passed explicitly; a
        ``None`` or empty-string ``index_path`` leaves ``bm25_idx`` untouched.
        """
        if self.index_path:
            self.bm25_idx = self.index_path
@dataclass
class GeneratorConfig:
    """Configuration for the text generator.

    Declared as a dataclass so every field can be overridden by keyword at
    construction time.
    """

    model_name: str = "google/flan-t5-base"
    device: str = "cpu"
    max_new_tokens: int = 256
    temperature: float = 0.0
@dataclass
class StatsConfig:
    """Configuration for statistical tests & robustness analyses.

    Declared as a dataclass so every field can be overridden by keyword at
    construction time.
    """

    # Correlation (RQ1 & RQ2)
    correlation_method: Literal["spearman", "kendall"] = "spearman"
    n_boot: int = 1000  # bootstrap replicates for CIs
    ci: float = 0.95  # confidence level (e.g. 0.95 = 95 %)

    # Significance tests (RQ2)
    wilcoxon_alternative: Literal["two-sided", "greater", "less"] = "two-sided"
    multiple_correction: Literal["holm-bonferroni", "none"] = "holm-bonferroni"
    alpha: float = 0.05  # family-wise error rate

    # Robustness / sensitivity (RQ3 & RQ4)
    compute_effect_size: bool = True
    n_permutations: int = 1000
    failure_threshold: float = 0.0
@dataclass
class PipelineConfig:
    """Top-level pipeline configuration.

    Aggregates one config object per section. Each section uses
    ``field(default_factory=...)``, so every ``PipelineConfig`` instance gets
    its own freshly built sub-config rather than sharing one default object.
    The ``@dataclass`` decorator is required for those factories to take
    effect; without it the attributes would remain raw ``Field`` objects.
    """

    logging: LoggingConfig = field(default_factory=LoggingConfig)
    reranker: CrossEncoderConfig = field(default_factory=CrossEncoderConfig)
    retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    stats: StatsConfig = field(default_factory=StatsConfig)