Spaces:

Legal-i
/

orgstate

Running

File size: 3,305 Bytes

d2d1903

"""
Integration test: the core math must actually DETECT drift on a degrading
series — and detect it *before* a blunt static threshold (what a human
watching a dashboard would use) would fire.

This is the smallest honest version of the product's headline metric,
Time-to-Organizational-Awareness: we assert positive lead time.

When Stage 2 builds the full labelled evaluation harness, this test is its
seed case.
"""
import random

from core.drift import drift_score, severity_from_score
from core.signals import (
    anomaly_xi,
    change_delta,
    latency_gamma,
    stability_psi,
)

# First index the tests score, and the look-back length of the "recent"
# window built in _score_day.
WARMUP: int = 14
# SLA target used by latency_gamma.
EXPECTED_RESPONSE: float = 30.0
# Normalisation scale for change_delta.
CHANGE_SCALE: float = 25.0
# Blunt static line a dashboard-watcher would eyeball.
HUMAN_THRESHOLD: float = 32.0
# core: "medium" severity and above.
DRIFT_THRESHOLD: float = 0.35


def _degrading_series(stable_days=20, ramp_days=25, seed=42):
    """Build a deterministic synthetic metric: a flat stretch, then a climb.

    The first ``stable_days`` points sit at 20.0 and the following
    ``ramp_days`` points rise by 0.9 per step starting from 20.0. Every point
    carries uniform noise drawn with ``uniform(-1.5, 1.5)`` from a private
    ``random.Random(seed)`` generator, so equal arguments give equal output.

    Returns a list of ``stable_days + ramp_days`` floats.
    """
    noise_source = random.Random(seed)

    def jitter():
        return noise_source.uniform(-1.5, 1.5)

    # The plateau is generated before the climb so the noise draws are
    # consumed in the same order as the points appear in the series.
    plateau = [20.0 + jitter() for _ in range(stable_days)]
    climb = [20.0 + step * 0.9 + jitter() for step in range(ramp_days)]
    return plateau + climb


def _score_day(series, i):
    """Return the drift score for day ``i`` of a single metric series.

    Mirrors the per-day signal composition the pipeline uses. Only
    observations up to and including day ``i`` are read.

    ``i`` must be at least 1: with ``i == 0`` the baseline window is empty
    and the mean below divides by zero. The tests in this file only pass
    ``i >= WARMUP``.
    """
    today = series[i]
    past = series[:i]

    # Baseline: mean of the older history, leaving out the 7 most recent
    # past days while always keeping at least the first observation.
    cutoff = max(1, i - 7)
    older = past[:cutoff]
    baseline_mean = sum(older) / len(older)

    # Trailing window: up to WARMUP previous days plus today itself.
    window_start = max(0, i - WARMUP)
    trailing = series[window_start: i + 1]

    signals = {
        "delta": change_delta(today, baseline_mean, CHANGE_SCALE),
        "psi": stability_psi(trailing),
        "xi": anomaly_xi(today, past),
        "gamma": latency_gamma(today, EXPECTED_RESPONSE),
        # No contradiction source in this minimal fixture.
        "kappa": 1.0,
    }
    return drift_score(signals)


def test_stable_period_stays_below_drift_threshold():
    """No score computed inside the flat stretch may reach DRIFT_THRESHOLD.

    Scores are taken from day WARMUP up to (not including) the first ramp
    day; any value at or above the threshold there is a false positive.
    """
    # Name the plateau length once and hand it to the fixture explicitly, so
    # the scored range cannot silently diverge from the fixture's default.
    stable_days = 20
    series = _degrading_series(stable_days=stable_days)
    scores = [_score_day(series, i) for i in range(WARMUP, stable_days)]
    assert max(scores) < DRIFT_THRESHOLD, f"false positive during stable period: {scores}"


def test_drift_is_detected_on_a_degrading_series():
    """The score must escalate over the full series and end at an alerting level.

    Checks that the peak score from day WARMUP onward reaches 0.55 and that
    the last day's severity label is "medium", "high" or "critical".
    """
    series = _degrading_series()
    scored_days = range(WARMUP, len(series))
    daily_scores = [_score_day(series, day) for day in scored_days]

    peak = max(daily_scores)
    assert peak >= 0.55, "core failed to escalate a clearly degrading series"

    alerting_levels = {"medium", "high", "critical"}
    final_severity = severity_from_score(daily_scores[-1])
    assert final_severity in alerting_levels


def test_core_detects_drift_before_a_blunt_static_threshold():
    """Positive lead time: the drift score reaches DRIFT_THRESHOLD on a
    strictly earlier day than the raw metric exceeds HUMAN_THRESHOLD."""
    series = _degrading_series()
    last_day = len(series)

    # First day (from WARMUP on) whose drift score reaches the threshold.
    # Scoring stops at the first hit, so later days are never evaluated.
    day_drift = None
    for day in range(WARMUP, last_day):
        if _score_day(series, day) >= DRIFT_THRESHOLD:
            day_drift = day
            break

    # First day (from WARMUP on) whose raw value exceeds the static line.
    day_human = None
    for day in range(WARMUP, last_day):
        if series[day] > HUMAN_THRESHOLD:
            day_human = day
            break

    assert day_drift is not None, "core never flagged the degrading series"
    assert day_human is not None, "fixture never crossed the human threshold"

    lead_time = day_human - day_drift
    assert lead_time > 0, (
        f"no early warning: core flagged day {day_drift}, "
        f"human threshold crossed day {day_human}"
    )