"""
Integration test: the core math must actually DETECT drift on a degrading
series — and detect it *before* a blunt static threshold (what a human
watching a dashboard would use) would fire.

This is the smallest honest version of the product's headline metric,
Time-to-Organizational-Awareness: we assert positive lead time.

When Stage 2 builds the full labelled evaluation harness, this test is its
seed case.
"""
import random

from core.drift import drift_score, severity_from_score
from core.signals import (
    anomaly_xi,
    change_delta,
    latency_gamma,
    stability_psi,
)
|
|
# First day index the tests score, and the look-back length used to slice the
# "recent" window inside _score_day.
WARMUP = 14
# Reference value handed to latency_gamma together with the day's value.
EXPECTED_RESPONSE = 30.0
# Scale handed to change_delta together with the day's value and its baseline.
CHANGE_SCALE = 25.0
# Raw-metric level standing in for the blunt static threshold a human watching
# a dashboard would use; a day counts as "crossed" when its value exceeds it.
HUMAN_THRESHOLD = 32.0
# drift_score level at or above which the core counts as having flagged drift.
DRIFT_THRESHOLD = 0.35
|
|
|
|
def _degrading_series(stable_days=20, ramp_days=25, seed=42):
    """Build a deterministic synthetic metric that is flat, then degrades.

    The first ``stable_days`` points sit at 20.0 plus uniform noise in
    [-1.5, 1.5]. The following ``ramp_days`` points climb linearly from 20.0
    by 0.9 per day, with the same noise added.

    Args:
        stable_days: Number of flat points at the start of the series.
        ramp_days: Number of ramping points that follow the flat segment.
        seed: Seed for the private ``random.Random`` instance, so the same
            arguments always yield the same series.

    Returns:
        A list of ``stable_days + ramp_days`` floats.
    """
    rng = random.Random(seed)

    def noise():
        return rng.uniform(-1.5, 1.5)

    # The flat segment is fully materialised before the ramp so that noise is
    # drawn from the generator in series order.
    flat = [20.0 + noise() for _ in range(stable_days)]
    ramp = [20.0 + step * 0.9 + noise() for step in range(ramp_days)]
    return flat + ramp
|
|
|
|
def _score_day(series, i):
    """Mirror the per-day signal composition the pipeline uses, for one metric.

    Args:
        series: Sequence of metric values, one per day.
        i: Index of the day to score. Must be at least 1 so that some history
            exists to form a baseline.

    Returns:
        Whatever ``drift_score`` returns for the five signals of day ``i``.

    Raises:
        ValueError: If ``i`` is less than 1. Without this guard, ``i == 0``
            fails with a bare ZeroDivisionError on the empty baseline window
            and a negative ``i`` silently indexes from the end of the series.
        IndexError: If ``i`` is past the end of ``series``.
    """
    if i < 1:
        raise ValueError(f"day index must be >= 1 to have history, got {i}")

    value = series[i]
    history = series[:i]

    # The baseline drops the 7 most recent history points, but always keeps at
    # least the first one so the mean is defined.
    baseline_window = history[: max(1, i - 7)]
    baseline = sum(baseline_window) / len(baseline_window)

    # Up to WARMUP prior points plus the current day.
    recent = series[max(0, i - WARMUP): i + 1]

    signals = {
        "delta": change_delta(value, baseline, CHANGE_SCALE),
        "psi": stability_psi(recent),
        "xi": anomaly_xi(value, history),
        "gamma": latency_gamma(value, EXPECTED_RESPONSE),
        "kappa": 1.0,
    }
    return drift_score(signals)
|
|
|
|
def test_stable_period_stays_below_drift_threshold():
    """No scored day inside the flat segment may reach DRIFT_THRESHOLD."""
    # One name drives both the fixture and the scored range, so the two cannot
    # drift apart if the flat segment length is ever changed.
    stable_days = 20
    series = _degrading_series(stable_days=stable_days)
    scores = [_score_day(series, day) for day in range(WARMUP, stable_days)]
    assert max(scores) < DRIFT_THRESHOLD, f"false positive during stable period: {scores}"
|
|
|
|
def test_drift_is_detected_on_a_degrading_series():
    """The peak score must reach 0.55 and the final day must not rate below medium."""
    series = _degrading_series()
    scores = [_score_day(series, day) for day in range(WARMUP, len(series))]
    assert max(scores) >= 0.55, "core failed to escalate a clearly degrading series"

    final_severity = severity_from_score(scores[-1])
    assert final_severity in {"medium", "high", "critical"}, (
        f"unexpected severity on the final day: {final_severity!r}"
    )
|
|
|
|
def test_core_detects_drift_before_a_blunt_static_threshold():
    """Positive lead time: drift_score reaches DRIFT_THRESHOLD strictly before
    the raw metric exceeds the human eyeball threshold."""
    series = _degrading_series()
    scored_days = range(WARMUP, len(series))

    # First scored day on which the core flags drift (None if it never does).
    day_drift = next(
        (day for day in scored_days if _score_day(series, day) >= DRIFT_THRESHOLD),
        None,
    )
    # First scored day on which the raw value alone exceeds the static threshold.
    day_human = next(
        (day for day in scored_days if series[day] > HUMAN_THRESHOLD),
        None,
    )

    # Check both days exist before subtracting, so a missing crossing fails
    # with a clear message rather than a TypeError.
    assert day_drift is not None, "core never flagged the degrading series"
    assert day_human is not None, "fixture never crossed the human threshold"

    lead_time = day_human - day_drift
    assert lead_time > 0, (
        f"no early warning: core flagged day {day_drift}, "
        f"human threshold crossed day {day_human}"
    )
|
|