Spaces:

Legal-i
/

orgstate

Running

App Files Files Community

orgstate / tests /test_core_integration.py

Legal-i

Initial OrgState deploy via Stage 150 free-tier stack

d2d1903 verified 16 days ago

raw

history blame contribute delete

3.31 kB

	"""
	Integration test: the core math must actually DETECT drift on a degrading
	series — and detect it before a blunt static threshold (what a human
	watching a dashboard would use) would fire.

	This is the smallest honest version of the product's headline metric,
	Time-to-Organizational-Awareness: we assert positive lead time.

	When Stage 2 builds the full labelled evaluation harness, this test is its
	seed case.
	"""
	import random

	from core.drift import drift_score, severity_from_score
	from core.signals import (
	anomaly_xi,
	change_delta,
	latency_gamma,
	stability_psi,
	)

	# Index of the first day the tests score (earlier days are skipped), and how
	# many prior days the trailing window built in _score_day reaches back.
	WARMUP = 14
	EXPECTED_RESPONSE = 30.0 # SLA target used by latency_gamma
	CHANGE_SCALE = 25.0 # normalisation scale for change_delta
	HUMAN_THRESHOLD = 32.0 # blunt static line a dashboard-watcher would eyeball
	DRIFT_THRESHOLD = 0.35 # core: "medium" severity and above


	def _degrading_series(stable_days=20, ramp_days=25, seed=42):
	    """Build a deterministic synthetic metric that is flat, then climbs.

	    The first ``stable_days`` points sit at 20.0; the following ``ramp_days``
	    points start at 20.0 and rise by 0.9 per day. Every point carries uniform
	    jitter between -1.5 and 1.5 drawn from ``random.Random(seed)``, so equal
	    arguments always produce the same list.

	    Returns a list of ``stable_days + ramp_days`` floats.
	    """
	    noise_source = random.Random(seed)

	    def jitter():
	        return noise_source.uniform(-1.5, 1.5)

	    # Draw order matters for reproducibility: the flat segment consumes the
	    # generator first, then the ramp.
	    flat_part = [20.0 + jitter() for _ in range(stable_days)]
	    ramp_part = [20.0 + step * 0.9 + jitter() for step in range(ramp_days)]
	    return flat_part + ramp_part


	def _score_day(series, i):
	    """Return the drift score of day ``i`` of ``series`` for a single metric.

	    Meant to mirror the per-day signal composition the pipeline uses: four
	    signals are computed from the series and handed, together with a fixed
	    ``kappa``, to ``drift_score``.
	    """
	    today = series[i]
	    past = series[:i]

	    # Baseline: mean of the oldest part of the history, leaving out the 7
	    # days right before day i; the slice bound never drops below 1.
	    cutoff = max(1, i - 7)
	    reference = past[:cutoff]
	    baseline = sum(reference) / len(reference)

	    # Trailing window: up to WARMUP earlier days plus day i itself.
	    window_start = max(0, i - WARMUP)
	    trailing = series[window_start: i + 1]

	    signals = {
	        "delta": change_delta(today, baseline, CHANGE_SCALE),
	        "psi": stability_psi(trailing),
	        "xi": anomaly_xi(today, past),
	        "gamma": latency_gamma(today, EXPECTED_RESPONSE),
	        # No contradiction source in this minimal fixture.
	        "kappa": 1.0,
	    }
	    return drift_score(signals)


	def test_stable_period_stays_below_drift_threshold():
	    """No scored day of the flat segment may reach DRIFT_THRESHOLD."""
	    flat_days = 20  # length of the flat segment; also where scoring stops
	    metric = _degrading_series(stable_days=flat_days)

	    scores = []
	    for day in range(WARMUP, flat_days):
	        scores.append(_score_day(metric, day))

	    peak = max(scores)
	    assert peak < DRIFT_THRESHOLD, f"false positive during stable period: {scores}"


	def test_drift_is_detected_on_a_degrading_series():
	    """The score must peak at 0.55 or more over the whole series, and the
	    last day must be rated medium, high or critical."""
	    metric = _degrading_series()

	    scores = []
	    for day in range(WARMUP, len(metric)):
	        scores.append(_score_day(metric, day))

	    peak = max(scores)
	    assert peak >= 0.55, "core failed to escalate a clearly degrading series"

	    final_severity = severity_from_score(scores[-1])
	    assert final_severity in {"medium", "high", "critical"}


	def test_core_detects_drift_before_a_blunt_static_threshold():
	    """Positive lead time: the first day whose drift score reaches
	    DRIFT_THRESHOLD must come strictly before the first day whose raw value
	    exceeds HUMAN_THRESHOLD."""
	    metric = _degrading_series()
	    scored_days = range(WARMUP, len(metric))

	    # First scored day on which the core raises the alarm.
	    day_drift = None
	    for day in scored_days:
	        if _score_day(metric, day) >= DRIFT_THRESHOLD:
	            day_drift = day
	            break

	    # First scored day on which the raw metric passes the static line.
	    day_human = None
	    for day in scored_days:
	        if metric[day] > HUMAN_THRESHOLD:
	            day_human = day
	            break

	    assert day_drift is not None, "core never flagged the degrading series"
	    assert day_human is not None, "fixture never crossed the human threshold"

	    assert day_human - day_drift > 0, (
	        f"no early warning: core flagged day {day_drift}, "
	        f"human threshold crossed day {day_human}"
	    )