Spaces:
Sleeping
Sleeping
| """ | |
| server/fault_math.py | |
| -------------------- | |
| Pure fault-injection math: apply_faults(S_true, ...) -> S_faulted. | |
| Extracted from RAGDebugEnvironment._recompute_S_faulted so that both | |
| the server environment and corpora/stages/verify.py can use identical | |
| transformations without instantiating the full environment class. | |
| All transformations are deterministic given the same noise/dupe_ids arrays. | |
| Noise is pre-generated by the caller (at episode reset, or fresh per-test in | |
| verify.py) and scaled here. | |
| """ | |
from __future__ import annotations

from typing import Dict, Optional, Set, Tuple

import numpy as np
from scipy.ndimage import uniform_filter1d

from models import FaultType
def apply_faults(
    S: np.ndarray,
    fault_types: Set[FaultType],
    config_chunk_size: int,
    config_context_limit: int,
    config_use_reranking: bool,
    noise: Dict[FaultType, np.ndarray],
    dupe_ids: np.ndarray,
    rewrite_boosts: Optional[np.ndarray] = None,
    config_chunk_overlap: int = 0,
) -> np.ndarray:
    """
    Apply fault math transformations to a similarity matrix.

    Parameters
    ----------
    S : np.ndarray, shape (n_queries, n_chunks), float32
        The S_true matrix for the active embedding model. Copied internally;
        the caller's array is never mutated.
    fault_types : set of FaultType
        Which faults to apply. WRONG_EMBEDDING_MODEL is not handled here --
        it is implicit in which S matrix the caller passes in.
    config_chunk_size : int
        Current PipelineConfig.chunk_size. Modulates CHUNK_TOO_LARGE and
        CHUNK_TOO_SMALL severity.
    config_context_limit : int
        Current PipelineConfig.context_window_limit. Modulates CONTEXT_OVERFLOW.
    config_use_reranking : bool
        Current PipelineConfig.use_reranking. Modulates DUPLICATE_FLOODING,
        suppresses NO_RERANKING when True, and partially restores score signal
        under TOP_K_TOO_SMALL (cross-encoder reranking recovers rank order).
    noise : dict mapping FaultType -> np.ndarray (n_queries, n_chunks), float32
        Pre-generated unit-normal arrays. Caller generates these once per
        episode (for determinism) or fresh per test (for verify.py).
        Required keys: CHUNK_TOO_SMALL, THRESHOLD_TOO_LOW, NO_RERANKING.
        Each key is only read when its fault is actually active.
    dupe_ids : np.ndarray of int
        Column indices of chunks designated as "duplicates" for this episode.
        May be empty, in which case DUPLICATE_FLOODING is a no-op.
    rewrite_boosts : np.ndarray (n_queries, n_chunks) float32, optional
        Persistent boost overlay from REWRITE_QUERY actions. Pass None or
        zeros if not applicable.
    config_chunk_overlap : int
        Current PipelineConfig.chunk_overlap. Higher overlap stabilises
        boundary embeddings, reducing score noise under CHUNK_TOO_SMALL.

    Returns
    -------
    np.ndarray, same shape as S, dtype float32, values clipped to [0, 1].
    """
    # astype(copy=True) already returns a fresh array; the previous
    # S.copy().astype(np.float32) allocated twice for no benefit.
    S = S.astype(np.float32, copy=True)
    # Preserve pre-fault scores for the cross-encoder reranking blend below.
    S_clean = S.copy()
    # Only the chunk-axis length is needed; the query count was unused.
    n_c = S.shape[1]

    # --- CHUNK_TOO_LARGE ---
    # Box filter along chunk axis smears scores across neighboring chunks.
    # filter_size scales with chunk_size: larger chunks -> more smearing.
    if FaultType.CHUNK_TOO_LARGE in fault_types:
        filter_size = max(1, round(4 * config_chunk_size / 512))
        if filter_size > 1:
            S = uniform_filter1d(S, size=filter_size, axis=1, mode="nearest").astype(np.float32)

    # --- CHUNK_TOO_SMALL ---
    # Adds Gaussian noise: smaller chunks -> noisier embeddings.
    # Higher overlap stabilises content at chunk boundaries, partially
    # mitigating the noise (overlap up to 500 can halve the sigma).
    if FaultType.CHUNK_TOO_SMALL in fault_types:
        overlap_reduction = min(0.5, config_chunk_overlap / 1000.0)
        sigma = 0.15 * min(1.0, 512.0 / max(config_chunk_size, 64)) * (1.0 - overlap_reduction)
        S = S + sigma * noise[FaultType.CHUNK_TOO_SMALL]

    # --- THRESHOLD_TOO_LOW ---
    # Adds Gaussian noise: irrelevant chunks score high by chance, cluttering retrieval.
    if FaultType.THRESHOLD_TOO_LOW in fault_types:
        S = S + 0.10 * noise[FaultType.THRESHOLD_TOO_LOW]

    # --- THRESHOLD_TOO_HIGH ---
    # Deflates all scores: relevant chunks fall below any reasonable threshold.
    if FaultType.THRESHOLD_TOO_HIGH in fault_types:
        S = S * 0.55

    # --- TOP_K_TOO_SMALL ---
    # Compresses score range toward 0.5: rankings become unreliable, tanking
    # precision when top_k is large relative to the number of relevant chunks.
    # Enabling reranking (cross-encoder) partially restores score signal by
    # re-scoring the top_k candidates with a stronger model, so compression
    # is less severe when use_reranking=True.
    if FaultType.TOP_K_TOO_SMALL in fault_types:
        compress = 0.24 if not config_use_reranking else 0.65
        S = 0.5 + (S - 0.5) * compress

    # --- DUPLICATE_FLOODING ---
    # Boosts the caller-designated duplicate columns. Reranking sharply cuts
    # the boost. np.minimum keeps boosted columns within [0, 1] immediately
    # so later faults see in-range values.
    if FaultType.DUPLICATE_FLOODING in fault_types:
        boost = 0.08 if config_use_reranking else 0.20
        S[:, dupe_ids] = np.minimum(S[:, dupe_ids] + boost, 1.0)

    # --- CONTEXT_OVERFLOW ---
    # Zeros out chunks beyond the context window cutoff (at least one chunk
    # always survives).
    if FaultType.CONTEXT_OVERFLOW in fault_types:
        cutoff = max(1, int(n_c * config_context_limit / 16384))
        if cutoff < n_c:
            S[:, cutoff:] = 0.0

    # --- NO_RERANKING ---
    # Adds mild Gaussian noise. Skipped entirely when reranking is enabled.
    if FaultType.NO_RERANKING in fault_types and not config_use_reranking:
        S = S + 0.10 * noise[FaultType.NO_RERANKING]

    # --- CROSS-ENCODER RERANKING ---
    # A cross-encoder re-scores candidates using a stronger model that sees
    # the full query-document pair, partially recovering the true relevance
    # signal corrupted by faults. We simulate this by blending faulted
    # scores back toward the original (pre-fault) scores.
    # This is non-monotonic for noise-based faults (undoes random
    # perturbations) and restores score spread for compression faults
    # (enabling effective threshold-based filtering).
    if config_use_reranking:
        rerank_alpha = 0.35
        S = (1.0 - rerank_alpha) * S + rerank_alpha * S_clean

    # Clip to valid range and apply persistent query-rewrite boosts.
    S = np.clip(S, 0.0, 1.0)
    if rewrite_boosts is not None:
        S = np.clip(S + rewrite_boosts, 0.0, 1.0)
    return S
def make_noise(rng: np.random.Generator, shape: Tuple[int, int]) -> Dict[FaultType, np.ndarray]:
    """
    Generate the standard noise dict expected by apply_faults.

    Parameters
    ----------
    rng : np.random.Generator
    shape : (n_queries, n_chunks)

    Returns
    -------
    dict mapping the three noise-dependent FaultTypes to unit-normal arrays.
    """
    def _unit_normal() -> np.ndarray:
        # One float32 unit-normal array per noise-dependent fault.
        return rng.standard_normal(shape).astype(np.float32)

    # Order matters for RNG-stream determinism: draws happen in the same
    # sequence as the original dict literal.
    noisy_faults = (
        FaultType.CHUNK_TOO_SMALL,
        FaultType.THRESHOLD_TOO_LOW,
        FaultType.NO_RERANKING,
    )
    return {fault: _unit_normal() for fault in noisy_faults}