""" Integration test: the core math must actually DETECT drift on a degrading series — and detect it *before* a blunt static threshold (what a human watching a dashboard would use) would fire. This is the smallest honest version of the product's headline metric, Time-to-Organizational-Awareness: we assert positive lead time. When Stage 2 builds the full labelled evaluation harness, this test is its seed case. """ import random from core.drift import drift_score, severity_from_score from core.signals import ( anomaly_xi, change_delta, latency_gamma, stability_psi, ) WARMUP = 14 EXPECTED_RESPONSE = 30.0 # SLA target used by latency_gamma CHANGE_SCALE = 25.0 # normalisation scale for change_delta HUMAN_THRESHOLD = 32.0 # blunt static line a dashboard-watcher would eyeball DRIFT_THRESHOLD = 0.35 # core: "medium" severity and above def _degrading_series(stable_days=20, ramp_days=25, seed=42): """Flat ~20 for stable_days, then a linear ramp upward with mild noise.""" rng = random.Random(seed) series = [] for i in range(stable_days): series.append(20.0 + rng.uniform(-1.5, 1.5)) for j in range(ramp_days): series.append(20.0 + j * 0.9 + rng.uniform(-1.5, 1.5)) return series def _score_day(series, i): """Mirror the per-day signal composition the pipeline uses, for one metric.""" value = series[i] history = series[:i] baseline_window = history[: max(1, i - 7)] baseline = sum(baseline_window) / len(baseline_window) recent = series[max(0, i - WARMUP): i + 1] delta = change_delta(value, baseline, CHANGE_SCALE) psi = stability_psi(recent) xi = anomaly_xi(value, history) gamma = latency_gamma(value, EXPECTED_RESPONSE) kappa = 1.0 # no contradiction source in this minimal fixture return drift_score({"delta": delta, "psi": psi, "xi": xi, "gamma": gamma, "kappa": kappa}) def test_stable_period_stays_below_drift_threshold(): series = _degrading_series() scores = [_score_day(series, i) for i in range(WARMUP, 20)] assert max(scores) < DRIFT_THRESHOLD, f"false positive during stable period: {scores}" def test_drift_is_detected_on_a_degrading_series(): series = _degrading_series() scores = [_score_day(series, i) for i in range(WARMUP, len(series))] assert max(scores) >= 0.55, "core failed to escalate a clearly degrading series" assert severity_from_score(scores[-1]) in {"medium", "high", "critical"} def test_core_detects_drift_before_a_blunt_static_threshold(): """Positive lead time: drift_score crosses 0.35 strictly before the raw metric crosses the human eyeball threshold.""" series = _degrading_series() day_drift = next( (i for i in range(WARMUP, len(series)) if _score_day(series, i) >= DRIFT_THRESHOLD), None, ) day_human = next( (i for i in range(WARMUP, len(series)) if series[i] > HUMAN_THRESHOLD), None, ) assert day_drift is not None, "core never flagged the degrading series" assert day_human is not None, "fixture never crossed the human threshold" lead_time = day_human - day_drift assert lead_time > 0, ( f"no early warning: core flagged day {day_drift}, " f"human threshold crossed day {day_human}" )