Spaces:

Legal-i
/

orgstate

Running

App Files Files Community

orgstate / tests /test_delivery_backtest.py

Legal-i

Initial OrgState deploy via Stage 150 free-tier stack

d2d1903 verified 16 days ago

raw

history blame contribute delete

20.1 kB

	"""
	Tests for Stage 16 — rolling backtest.

	The salesforce sample data (50 days) is shorter than the
	salesforce config's baseline_window + lag + recent_window (51), so
	the backtest needs its own synthetic series that's long enough for
	the engine to actually have a window to evaluate. We build one with
	core.MetricConfig + plain Observations rather than going through
	a vertical config, so the test stays focused on the backtest logic.
	"""
	from typing import List

	import pytest

	from core import (
	HIGHER_IS_WORSE,
	EntityTypeConfig,
	MetricConfig,
	Observation,
	)
	from delivery.backtest import (
	BacktestPoint,
	BacktestResult,
	render_html,
	run_backtest,
	)

	# --- fixtures --------------------------------------------------------------

	def _short_window_config():
	    """Return an EntityTypeConfig with deliberately tiny windows.

	    baseline_window=7, baseline_lag=2 and recent_window=5 let a short
	    synthetic series still produce multiple backtest steps.
	    """
	    only_metric = MetricConfig(
	        "m",
	        HIGHER_IS_WORSE,
	        weight=1.0,
	        feeds_anomaly=True,
	        feeds_stability=True,
	    )
	    return EntityTypeConfig(
	        entity_type="thing",
	        metrics=[only_metric],
	        baseline_window=7,
	        baseline_lag=2,
	        recent_window=5,
	    )


	def _series(entity_id: str, values: List[float]) -> List[Observation]:
	    """Build a daily series starting 2026-01-01 with the given values.

	    Observation ``i`` is dated ``i`` days after 2026-01-01 (ISO
	    YYYY-MM-DD) and carries ``{"m": float(values[i])}``. Dates are
	    computed with ``timedelta``, so series longer than one month roll
	    over into the following months correctly.
	    """
	    # Import once per call instead of re-running it on every iteration.
	    from datetime import date, timedelta
	    start = date(2026, 1, 1)
	    return [
	        Observation(entity_id,
	                    (start + timedelta(days=offset)).isoformat(),
	                    {"m": float(value)})
	        for offset, value in enumerate(values)
	    ]


	def _healthy(entity_id: str, n_days: int) -> List[Observation]:
	    """Return ``n_days`` of flat observations (every value 100.0)."""
	    flat_values = [100.0 for _ in range(n_days)]
	    return _series(entity_id, flat_values)


	def _drifting_after(entity_id: str, n_days: int, onset: int,
	                    delta: float = 8.0) -> List[Observation]:
	    """Return a series that stays at 100 and then climbs (worse) from ``onset``.

	    Day indices below ``onset`` hold 100.0; from ``onset`` onward each
	    day adds another ``delta`` on top of 100.0.
	    """
	    def _value_on(day: int) -> float:
	        if day < onset:
	            return 100.0
	        return 100.0 + (day - onset + 1) * delta
	    return _series(entity_id, [_value_on(day) for day in range(n_days)])


	# --- empty / degenerate inputs ------------------------------------------

	def test_empty_observations_returns_empty_result():
	    """No observations in -> a BacktestResult with no points and zero steps."""
	    window_cfg = _short_window_config()
	    outcome = run_backtest([], window_cfg, step_days=7)
	    assert isinstance(outcome, BacktestResult)
	    assert outcome.points == []
	    assert outcome.n_steps == 0


	def test_too_short_for_one_window():
	    """A series shorter than baseline + lag + recent yields zero steps.

	    The backtest cannot compute a single drift score causally.
	    """
	    # 5 days of data, while 7 + 2 + 5 = 14 days are needed.
	    too_short = _healthy("e", 5)
	    outcome = run_backtest(too_short, _short_window_config(), step_days=7)
	    assert outcome.n_steps == 0


	# --- happy path: rolling steps ------------------------------------------

	def test_steps_advance_by_step_days():
	    """Successive cursors of a 7-day-step backtest are exactly 7 days apart."""
	    from datetime import date
	    # 30 days leaves plenty of room for several cursors.
	    result = run_backtest(_healthy("e", 30), _short_window_config(),
	                          step_days=7)
	    # at least: floor((30 - 14 + 1) / 7) + 1 cursors
	    assert result.n_steps >= 2
	    cursor_dates = [date.fromisoformat(point.cursor)
	                    for point in result.points]
	    for earlier, later in zip(cursor_dates, cursor_dates[1:]):
	        assert (later - earlier).days == 7


	def test_engine_does_not_see_the_future_at_any_cursor():
	    """Causality test: at cursor C, the engine only sees observations
	    with day <= C. We can prove this indirectly: a drifting series whose
	    drift starts AFTER a cursor should produce zero issues at that
	    cursor. Every cursor before the onset date is checked, not just one."""
	    cfg = _short_window_config()
	    # 30 days; drift starts on day index 28, i.e. 2026-01-29 (near the end)
	    obs = _drifting_after("e", 30, onset=28, delta=20.0)
	    # Step large so cursors land at day indices 13 and 23 — both BEFORE onset
	    result = run_backtest(obs, cfg, step_days=10)
	    cursor_days = {p.cursor for p in result.points}
	    assert "2026-01-24" in cursor_days  # day index 23 (0-based)
	    # ISO dates compare chronologically as strings, so a plain "<" selects
	    # the cursors that precede the onset date.
	    pre_onset = [p for p in result.points if p.cursor < "2026-01-29"]
	    leaked = [p.cursor for p in pre_onset if p.n_issues != 0]
	    assert leaked == [], f"issues flagged before drift onset at {leaked}"


	def test_drift_appears_only_after_onset():
	    """The complementary causality test: as cursor advances past the
	    drift onset, the engine starts to flag the entity."""
	    cfg = _short_window_config()
	    obs = _drifting_after("e", 30, onset=15, delta=20.0) + _healthy("h", 30)
	    result = run_backtest(obs, cfg, step_days=2)
	    # at the first cursor (day 13, before onset 15) 'e' should NOT be flagged
	    first = result.points[0]
	    assert "e" not in {i["entity_id"] for i in first.issues}
	    # at SOME later cursor, 'e' should be flagged
	    flagged_at = [p for p in result.points
	                  if any(i["entity_id"] == "e" for i in p.issues)]
	    assert flagged_at, "drift should surface at some cursor past onset"
	    # and 'h' is never flagged (no drift to detect)
	    for p in result.points:
	        assert all(i["entity_id"] != "h" for i in p.issues)


	def test_rollups_match_summed_points():
	    """Result-level totals equal the sums taken over the individual points."""
	    result = run_backtest(
	        _drifting_after("e", 30, onset=10, delta=15.0),
	        _short_window_config(),
	        step_days=3,
	    )
	    assert result.n_issues_total == sum(p.n_issues for p in result.points)
	    # Compare only the non-zero severity buckets on both sides.
	    expected_nonzero = {}
	    for level in ("critical", "high", "medium", "low"):
	        level_total = sum(p.severity_counts.get(level, 0)
	                          for p in result.points)
	        if level_total:
	            expected_nonzero[level] = level_total
	    actual_nonzero = {level: count
	                      for level, count in result.per_severity_total.items()
	                      if count}
	    assert actual_nonzero == expected_nonzero


	def test_unique_entities_ever_flagged_dedupes():
	    """An entity flagged in several windows counts once in the unique total."""
	    # One entity drifting hard for many days: flagged in many windows.
	    drifting = _drifting_after("e", 30, onset=10, delta=20.0)
	    result = run_backtest(drifting, _short_window_config(), step_days=2)
	    n_flagged_windows = sum(1 for point in result.points
	                            if point.n_issues > 0)
	    assert n_flagged_windows >= 2
	    assert result.n_unique_entities_ever_flagged == 1


	# --- HTML render ---------------------------------------------------------

	def test_render_html_returns_self_contained_doc():
	    """render_html emits a full HTML document that names the drifting entity."""
	    drifting = _drifting_after("e", 30, onset=10, delta=20.0)
	    page = render_html(run_backtest(drifting, _short_window_config(),
	                                    step_days=3))
	    assert page.startswith("<!doctype html>")
	    assert page.rstrip().endswith("</html>")
	    # The title text, then the drifted entity in the per-entity table.
	    for fragment in ("OrgState backtest", "<code>e</code>"):
	        assert fragment in page


	def test_render_html_empty_result_says_so():
	    """A backtest with no steps renders the "No steps produced" message."""
	    empty_result = run_backtest(_healthy("e", 5), _short_window_config(),
	                                step_days=7)
	    page = render_html(empty_result)
	    assert "No steps produced" in page


	def test_render_html_severity_bars_when_issues_present():
	    """With issues present, at least one stacked-severity bar is rendered."""
	    drifting = _drifting_after("e", 30, onset=10, delta=20.0)
	    result = run_backtest(drifting, _short_window_config(), step_days=3)
	    page = render_html(result)
	    assert 'class="bar bar-' in page


	# --- recurrence detection (Stage 17) -----------------------------------

	from delivery.backtest import ( # noqa: E402 (after fixtures intentionally)
	RecurringEntity,
	find_recurring_entities,
	)


	def _fake_result(point_specs):
	    """Build a BacktestResult from a list of (cursor, [(entity_id, severity, score)]).

	    Each spec becomes one BacktestPoint with n_states fixed at 10; the
	    result uses tenant "t", entity type "thing" and step_days=7.
	    """
	    def _point(cursor, issues):
	        # Start every known severity at zero, then tally the given issues.
	        tally = dict.fromkeys(("critical", "high", "medium", "low"), 0)
	        payload = []
	        for entity_id, severity, score in issues:
	            tally[severity] = tally.get(severity, 0) + 1
	            payload.append({
	                "entity_id": entity_id,
	                "severity": severity,
	                "score": score,
	                "title": f"d {entity_id}",
	            })
	        return BacktestPoint(
	            cursor=cursor,
	            n_states=10,
	            n_issues=len(issues),
	            severity_counts=tally,
	            issues=payload,
	        )
	    built = [_point(cursor, issues) for cursor, issues in point_specs]
	    return BacktestResult(
	        tenant_id="t",
	        entity_type="thing",
	        step_days=7,
	        n_steps=len(built),
	        points=built,
	    )


	def test_find_recurring_below_threshold_returns_empty():
	    """One-off events do not surface as recurring.

	    Three different entities flagged once each stay below min_events=3.
	    """
	    one_offs = [
	        (cursor, [(entity_id, "medium", 0.4)])
	        for cursor, entity_id in (
	            ("2026-01-01", "e1"),
	            ("2026-01-08", "e2"),
	            ("2026-01-15", "e3"),
	        )
	    ]
	    recurring = find_recurring_entities(_fake_result(one_offs), min_events=3)
	    assert recurring == []


	def test_find_recurring_at_threshold():
	    """An entity flagged exactly ``min_events`` times surfaces."""
	    cursors = ["2026-01-01", "2026-01-08", "2026-01-15"]
	    result = _fake_result(
	        [(cursor, [("e1", "medium", 0.4)]) for cursor in cursors]
	    )
	    # Unpacking also checks that exactly one entity is returned.
	    (rec,) = find_recurring_entities(result, min_events=3)
	    assert isinstance(rec, RecurringEntity)
	    assert rec.entity_id == "e1"
	    assert rec.n_events == 3
	    assert rec.first_cursor == "2026-01-01"
	    assert rec.last_cursor == "2026-01-15"
	    assert rec.cursors == cursors


	def test_find_recurring_top_severity_is_worst_ever():
	    """``top_severity`` is the worst severity seen across all windows.

	    It is neither the most recent nor the most common one.
	    """
	    history = [
	        ("2026-01-01", [("e1", "low", 0.3)]),
	        ("2026-01-08", [("e1", "critical", 0.9)]),  # the worst window
	        ("2026-01-15", [("e1", "medium", 0.4)]),
	    ]
	    (rec,) = find_recurring_entities(_fake_result(history), min_events=3)
	    assert rec.top_severity == "critical"


	def test_find_recurring_ordering_most_recurring_first():
	    """Sort key is (-n_events, recency-desc, entity_id).

	    Most events come first; ties are broken by who was flagged most
	    recently.
	    """
	    def _flagged(*entity_ids):
	        # Every issue in this test is "medium" with score 0.4.
	        return [(entity_id, "medium", 0.4) for entity_id in entity_ids]
	    result = _fake_result([
	        ("2026-01-01", _flagged("a", "b")),
	        ("2026-01-08", _flagged("a", "b")),
	        ("2026-01-15", _flagged("a", "b", "c")),
	        ("2026-01-22", _flagged("a")),       # a reaches 4 events
	        ("2026-01-29", _flagged("b", "c")),  # b: 4, c: 2
	        ("2026-02-05", _flagged("b", "c")),  # b: 5, c: 3
	    ])
	    ordered_ids = [
	        rec.entity_id
	        for rec in find_recurring_entities(result, min_events=3)
	    ]
	    # b has the most events (5), then a (4), then c (3)
	    assert ordered_ids == ["b", "a", "c"]


	def test_recurring_callout_appears_in_rendered_html():
	    """The recurrence section is rendered when there is something to surface."""
	    specs = [
	        ("2026-01-01", [("hot_entity", "medium", 0.4)]),
	        ("2026-01-08", [("hot_entity", "medium", 0.4)]),
	        ("2026-01-15", [("hot_entity", "high", 0.6)]),
	    ]
	    page = render_html(_fake_result(specs), min_recurring_events=3)
	    expected_fragments = (
	        "Recurring drift",
	        "<code>hot_entity</code>",
	        'class="sev sev-high"',  # badge for the worst severity
	    )
	    for fragment in expected_fragments:
	        assert fragment in page


	def test_render_html_no_recurring_section_when_none():
	    """With nothing recurring, the callout block is suppressed.

	    The page must not show an empty "Recurring drift" section.
	    """
	    specs = [
	        ("2026-01-01", [("e1", "medium", 0.4)]),
	        ("2026-01-08", [("e2", "medium", 0.4)]),
	    ]
	    page = render_html(_fake_result(specs), min_recurring_events=3)
	    assert "Recurring drift" not in page


	def test_min_events_flag_is_configurable():
	    """The classifier threshold ``min_events`` is tunable.

	    Two events for one entity are not recurring at min_events=3 but are
	    at min_events=2.
	    """
	    result = _fake_result(
	        [(cursor, [("e1", "medium", 0.4)])
	         for cursor in ("2026-01-01", "2026-01-08")]
	    )
	    assert find_recurring_entities(result, min_events=3) == []
	    (rec,) = find_recurring_entities(result, min_events=2)
	    assert rec.entity_id == "e1"


	# --- per-entity drill-down (Stage 18) ----------------------------------

	from delivery.backtest import ( # noqa: E402
	_slug,
	render_entity_detail_html,
	)


	def test_entity_detail_renders_summary_and_sparkline():
	    """The drill-down page shows a summary plus a sparkline.

	    Only the key story elements are checked, not a full HTML snapshot.
	    """
	    timeline = [
	        ("2026-01-01", [("hot", "medium", 0.42)]),
	        ("2026-01-08", []),                        # quiet window
	        ("2026-01-15", [("hot", "high", 0.6)]),    # worst severity
	        ("2026-01-22", [("hot", "medium", 0.38)]),
	    ]
	    page = render_entity_detail_html(_fake_result(timeline), "hot")
	    assert page.startswith("<!doctype html>")
	    assert "<h1>hot</h1>" in page
	    # Summary numbers.
	    assert "<strong>Events:</strong> 3" in page
	    # The worst severity is reported, not the most recent one.
	    assert "sev-high" in page
	    # One sparkline cell per cursor, bars only for the 3 flagged cursors.
	    assert page.count('class="spark-cell"') == 4
	    assert page.count("spark-bar spark-") == 3
	    # The event table lists flagged cursors only.
	    events_section = page.split('<h2>Flagged events</h2>')[-1]
	    assert "2026-01-08" not in events_section


	def test_entity_detail_for_non_flagged_entity_says_so():
	    """An entity absent from every issue list renders a cleanly-empty page.

	    It must not raise.
	    """
	    specs = [("2026-01-01", [("other", "medium", 0.4)])]
	    page = render_entity_detail_html(_fake_result(specs), "ghost")
	    for fragment in ("<!doctype html>",
	                     "not flagged in any backtest window"):
	        assert fragment in page


	def test_entity_detail_back_link_to_backtest_page():
	    """The drill-down page links back to ``../backtest.html``.

	    The link is relative because the page lives at entities/X.html.
	    """
	    single_event = [("2026-01-01", [("x", "medium", 0.4)])]
	    page = render_entity_detail_html(_fake_result(single_event), "x")
	    assert 'href="../backtest.html"' in page


	def test_main_html_links_entity_ids_when_link_entities_set():
	    """With link_entities=True, entity ids in the main report become links.

	    With link_entities=False no drill-down links are emitted.
	    """
	    result = _fake_result(
	        [(cursor, [("e1", "medium", 0.4)])
	         for cursor in ("2026-01-01", "2026-01-08", "2026-01-15")]
	    )
	    with_links = render_html(result, link_entities=True)
	    without_links = render_html(result, link_entities=False)
	    # Link mode renders <a href="entities/...">.
	    assert 'href="entities/e1.html"' in with_links
	    # Bare mode contains no drill-down links at all.
	    assert 'href="entities/' not in without_links


	def test_link_entities_can_be_a_subset():
	    """Passing a set to link_entities links only the listed entities.

	    This avoids broken links when only some entities have drill-down
	    pages rendered.
	    """
	    # e_rec recurs 3 times (linked); e_oneoff appears once (not linked).
	    specs = [
	        (cursor, [("e_rec", "medium", 0.4)])
	        for cursor in ("2026-01-01", "2026-01-08", "2026-01-15")
	    ]
	    specs.append(("2026-01-22", [("e_oneoff", "medium", 0.4)]))
	    page = render_html(_fake_result(specs), link_entities={"e_rec"})
	    assert 'href="entities/e_rec.html"' in page
	    # The one-off entity stays plain: no link to a missing page.
	    assert 'href="entities/e_oneoff.html"' not in page
	    assert "<code>e_oneoff</code>" in page


	# --- "what changed" details (Stage 19) ------------------------------------

	from delivery.backtest import ( # noqa: E402
	_compute_event_details,
	_fmt_delta_pct,
	_fmt_num,
	)


	def _cfg_for_details_test():
	    """Return an EntityTypeConfig for the event-details tests.

	    Uses baseline_window=5, baseline_lag=2 and recent_window=5 with the
	    single metric "m".
	    """
	    metric = MetricConfig(
	        "m",
	        HIGHER_IS_WORSE,
	        weight=1.0,
	        feeds_anomaly=True,
	        feeds_stability=True,
	    )
	    return EntityTypeConfig(
	        entity_type="thing",
	        metrics=[metric],
	        baseline_window=5,
	        baseline_lag=2,
	        recent_window=5,
	    )


	def test_compute_event_details_baseline_vs_recent():
	    """Baseline and recent means come from their own windows.

	    The series has two obviously different halves: the test expects a
	    baseline mean of 100, a recent mean of 200 and a delta of +100%.
	    """
	    from datetime import date, timedelta
	    start = date(2026, 1, 1)
	    # 12 days: baseline = days 0..4 -> 100; gap days 5..6;
	    # recent = days 7..11 -> 200 (drift up).
	    obs = [
	        Observation("e",
	                    (start + timedelta(days=day)).isoformat(),
	                    {"m": 100.0 if day < 7 else 200.0})
	        for day in range(12)
	    ]
	    details = _compute_event_details(obs, "e", "2026-01-12",
	                                     _cfg_for_details_test())
	    assert "m" in details
	    stats = details["m"]
	    assert stats["baseline_mean"] == pytest.approx(100.0)
	    assert stats["recent_mean"] == pytest.approx(200.0)
	    assert stats["delta_pct"] == pytest.approx(100.0)


	def test_compute_event_details_handles_zero_baseline():
	    """A zero baseline mean gives ``delta_pct`` of None.

	    A percentage would need a division by zero.
	    """
	    from datetime import date, timedelta
	    start = date(2026, 1, 1)
	    # Days 0..6 are 0.0, days 7..11 are 5.0.
	    obs = [
	        Observation("e",
	                    (start + timedelta(days=day)).isoformat(),
	                    {"m": 5.0 if day >= 7 else 0.0})
	        for day in range(12)
	    ]
	    details = _compute_event_details(obs, "e", "2026-01-12",
	                                     _cfg_for_details_test())
	    stats = details["m"]
	    assert stats["baseline_mean"] == 0.0
	    assert stats["recent_mean"] == pytest.approx(5.0)
	    assert stats["delta_pct"] is None


	def test_compute_event_details_other_entities_excluded():
	    """An entity's details must not pull in another entity's numbers."""
	    from datetime import date, timedelta
	    start = date(2026, 1, 1)
	    per_entity_value = (("target", 50.0), ("noise", 9999.0))
	    # Interleave both entities day by day.
	    obs = [
	        Observation(entity_id,
	                    (start + timedelta(days=day)).isoformat(),
	                    {"m": value})
	        for day in range(12)
	        for entity_id, value in per_entity_value
	    ]
	    details = _compute_event_details(obs, "target", "2026-01-12",
	                                     _cfg_for_details_test())
	    # Only "target"'s 50.0 values count, never "noise"'s 9999.
	    metric_stats = details["m"]
	    assert metric_stats["baseline_mean"] == pytest.approx(50.0)
	    assert metric_stats["recent_mean"] == pytest.approx(50.0)


	def test_what_changed_appears_in_entity_detail_html():
	    """A real backtest's drill-down page renders the 'What changed' section."""
	    drifting = _drifting_after("e", 30, onset=10, delta=20.0)
	    result = run_backtest(drifting, _short_window_config(), step_days=3)
	    page = render_entity_detail_html(result, "e")
	    expected_markers = (
	        "What changed",
	        'class="changed-block"',
	        "Baseline mean",
	        "Recent mean",
	    )
	    for marker in expected_markers:
	        assert marker in page


	def test_what_changed_section_absent_when_no_events():
	    """A flat series renders a drill-down page without 'What changed'."""
	    result = run_backtest(_healthy("e", 30), _short_window_config(),
	                          step_days=3)
	    page = render_entity_detail_html(result, "e")
	    assert "What changed" not in page


	def test_fmt_helpers():
	    """Check the number and delta-percent formatters, including None."""
	    number_cases = [(None, "—"), (1234.5, "1234.5"), (0.123, "0.123")]
	    for raw, shown in number_cases:
	        assert _fmt_num(raw) == shown
	    delta_cases = [(None, "—"), (5.0, "+5.0%"), (-3.2, "-3.2%")]
	    for raw, shown in delta_cases:
	        assert _fmt_delta_pct(raw) == shown


	def test_slug_makes_unsafe_entity_ids_filesystem_safe():
	    """Entity ids with '/', '..' or spaces are mapped to safe slugs.

	    Such ids must not become writable paths or enable path traversal.
	    """
	    expected_slugs = {
	        "acc/123": "acc_123",
	        "../etc/passwd": ".._etc_passwd",
	        "with spaces": "with_spaces",
	        # alnum, dot, underscore and dash are preserved
	        "a-b_c.d99": "a-b_c.d99",
	    }
	    for raw_id, slug in expected_slugs.items():
	        assert _slug(raw_id) == slug