# File: orgstate/tests/test_core_integration.py
# Last commit: d2d1903 (verified), by Legal-i
# Commit message: Initial OrgState deploy via Stage 150 free-tier stack
"""
Integration test: the core math must actually DETECT drift on a degrading
series — and detect it *before* a blunt static threshold (what a human
watching a dashboard would use) would fire.
This is the smallest honest version of the product's headline metric,
Time-to-Organizational-Awareness: we assert positive lead time.
When Stage 2 builds the full labelled evaluation harness, this test is its
seed case.
"""
import random
from core.drift import drift_score, severity_from_score
from core.signals import (
anomaly_xi,
change_delta,
latency_gamma,
stability_psi,
)
# Tunables shared by the synthetic fixture, the per-day scorer and the tests.

# First day index that gets scored by the tests, and how many prior days the
# "recent" window in _score_day reaches back from the current day.
WARMUP = 14
EXPECTED_RESPONSE = 30.0 # SLA target used by latency_gamma
CHANGE_SCALE = 25.0 # normalisation scale for change_delta
# Compared directly against the raw series value (strictly greater than).
HUMAN_THRESHOLD = 32.0 # blunt static line a dashboard-watcher would eyeball
# Compared against the drift_score output (greater than or equal).
DRIFT_THRESHOLD = 0.35 # core: "medium" severity and above
def _degrading_series(stable_days=20, ramp_days=25, seed=42):
    """Build a deterministic synthetic metric that is flat, then degrades.

    The first ``stable_days`` points hover around 20.0. The next ``ramp_days``
    points start from that same level and climb by 0.9 per day. Every point
    carries uniform noise in [-1.5, 1.5] drawn from a private
    ``random.Random(seed)``, so equal arguments always produce the same list.

    Returns a list of ``stable_days + ramp_days`` floats.
    """
    noise_source = random.Random(seed)

    def jitter():
        return noise_source.uniform(-1.5, 1.5)

    # The flat segment is materialised first so the noise draws are consumed
    # in day order: all stable days, then all ramp days.
    flat_part = [20.0 + jitter() for _ in range(stable_days)]
    rising_part = [20.0 + step * 0.9 + jitter() for step in range(ramp_days)]
    return flat_part + rising_part
def _score_day(series, i):
    """Return the drift score for day ``i`` of a single metric series.

    Intended to mirror the per-day signal composition the pipeline uses:
    four signals are computed from the series and combined, together with a
    fixed ``kappa``, by ``drift_score``.

    ``i`` must be at least 1. With ``i == 0`` there is no history, so the
    baseline mean would divide by zero. The tests only call this with
    ``i >= WARMUP``.
    """
    today = series[i]
    past = series[:i]

    # Baseline is the mean of the history with its most recent 7 days held
    # out; for short histories it falls back to just the first point.
    cutoff = max(1, i - 7)
    reference = past[:cutoff]
    baseline = sum(reference) / len(reference)

    # Trailing window: up to WARMUP prior days plus the current day.
    window_start = max(0, i - WARMUP)
    trailing = series[window_start: i + 1]

    signals = {
        "delta": change_delta(today, baseline, CHANGE_SCALE),
        "psi": stability_psi(trailing),
        "xi": anomaly_xi(today, past),
        "gamma": latency_gamma(today, EXPECTED_RESPONSE),
        "kappa": 1.0,  # no contradiction source in this minimal fixture
    }
    return drift_score(signals)
def test_stable_period_stays_below_drift_threshold():
    """No scored day inside the flat segment may reach DRIFT_THRESHOLD."""
    # Only days before the ramp begins are scored, so the upper bound of the
    # range is the length of the flat segment (the fixture's default of 20).
    stable_days = 20
    series = _degrading_series(stable_days=stable_days)
    scores = [_score_day(series, day) for day in range(WARMUP, stable_days)]
    peak = max(scores)
    assert peak < DRIFT_THRESHOLD, f"false positive during stable period: {scores}"
def test_drift_is_detected_on_a_degrading_series():
    """The score must escalate on the ramp and still be elevated at the end.

    Two checks: the highest score over all scored days reaches 0.55, and the
    severity label of the final day's score is "medium" or worse.
    """
    series = _degrading_series()
    scored_days = range(WARMUP, len(series))
    scores = [_score_day(series, day) for day in scored_days]

    peak_score = max(scores)
    assert peak_score >= 0.55, "core failed to escalate a clearly degrading series"

    final_severity = severity_from_score(scores[-1])
    assert final_severity in {"medium", "high", "critical"}
def test_core_detects_drift_before_a_blunt_static_threshold():
    """Positive lead time over a static threshold.

    The first day whose drift score reaches DRIFT_THRESHOLD must come strictly
    before the first day whose raw value exceeds HUMAN_THRESHOLD.
    """
    series = _degrading_series()

    def first_day(triggered):
        # Earliest scored day for which the predicate holds, else None.
        for day in range(WARMUP, len(series)):
            if triggered(day):
                return day
        return None

    day_drift = first_day(lambda day: _score_day(series, day) >= DRIFT_THRESHOLD)
    day_human = first_day(lambda day: series[day] > HUMAN_THRESHOLD)

    # Both events must occur, otherwise the lead time is undefined.
    assert day_drift is not None, "core never flagged the degrading series"
    assert day_human is not None, "fixture never crossed the human threshold"

    lead_time = day_human - day_drift
    assert lead_time > 0, (
        f"no early warning: core flagged day {day_drift}, "
        f"human threshold crossed day {day_human}"
    )