orgstate / tests /test_delivery_backtest.py
Legal-i's picture
Initial OrgState deploy via Stage 150 free-tier stack
d2d1903 verified
"""
Tests for Stage 16 — rolling backtest.
The salesforce sample data (50 days) is shorter than the
salesforce config's baseline_window + lag + recent_window (51), so
the backtest needs its own synthetic series that's long enough for
the engine to actually have a window to evaluate. We build one with
core.MetricConfig + plain Observations rather than going through
a vertical config, so the test stays focused on the backtest logic.
"""
from typing import List
import pytest
from core import (
HIGHER_IS_WORSE,
EntityTypeConfig,
MetricConfig,
Observation,
)
from delivery.backtest import (
BacktestPoint,
BacktestResult,
render_html,
run_backtest,
)
# --- fixtures --------------------------------------------------------------
def _short_window_config():
    """Return an ``EntityTypeConfig`` with deliberately tiny windows.

    The windows add up to 7 + 2 + 5 = 14 days, so a short synthetic
    series is enough for the backtest to take several steps.
    """
    metric = MetricConfig(
        "m",
        HIGHER_IS_WORSE,
        weight=1.0,
        feeds_anomaly=True,
        feeds_stability=True,
    )
    return EntityTypeConfig(
        entity_type="thing",
        metrics=[metric],
        baseline_window=7,
        baseline_lag=2,
        recent_window=5,
    )
def _series(entity_id: str, values: List[float]) -> List[Observation]:
    """Build a daily series starting 2026-01-01 with the given values.

    Args:
        entity_id: id stamped on every observation.
        values: one value per consecutive day; each is coerced to
            ``float`` and stored under the metric key ``"m"``.

    Returns:
        One ``Observation`` per value, dated with ISO ``YYYY-MM-DD``
        strings in day order.
    """
    # Imported once per call instead of once per loop iteration.
    from datetime import date, timedelta

    start = date(2026, 1, 1)
    # timedelta arithmetic rolls over month boundaries, so the series
    # may be longer than the 31 days of January.
    return [
        Observation(entity_id,
                    (start + timedelta(days=offset)).isoformat(),
                    {"m": float(value)})
        for offset, value in enumerate(values)
    ]
def _healthy(entity_id: str, n_days: int) -> List[Observation]:
    """Return ``n_days`` of perfectly flat observations at 100.0."""
    flat_values = [100.0 for _ in range(n_days)]
    return _series(entity_id, flat_values)
def _drifting_after(entity_id: str, n_days: int, onset: int,
                    delta: float = 8.0) -> List[Observation]:
    """Return a series that sits at 100 and then climbs from ``onset``.

    Days before ``onset`` are exactly 100.0; from ``onset`` onward each
    day adds another ``delta`` (the first drifting day is 100 + delta).
    """
    def value_on(day: int) -> float:
        if day < onset:
            return 100.0
        return 100.0 + (day - onset + 1) * delta

    return _series(entity_id, [value_on(day) for day in range(n_days)])
# --- empty / degenerate inputs ------------------------------------------
def test_empty_observations_returns_empty_result():
    """With no observations the backtest yields an empty result object."""
    cfg = _short_window_config()
    outcome = run_backtest([], cfg, step_days=7)
    assert isinstance(outcome, BacktestResult)
    assert outcome.points == []
    assert outcome.n_steps == 0
def test_too_short_for_one_window():
    """A series shorter than baseline + lag + recent yields zero steps.

    The short-window config needs 7 + 2 + 5 = 14 days; only 5 are
    supplied here.
    """
    five_days = _healthy("e", 5)
    outcome = run_backtest(five_days, _short_window_config(), step_days=7)
    assert outcome.n_steps == 0
# --- happy path: rolling steps ------------------------------------------
def test_steps_advance_by_step_days():
    """Successive cursors are exactly ``step_days`` apart."""
    from datetime import date

    cfg = _short_window_config()
    # 30 days against a 14-day minimum leaves room for several steps.
    result = run_backtest(_healthy("e", 30), cfg, step_days=7)
    assert result.n_steps >= 2

    cursor_dates = [date.fromisoformat(p.cursor) for p in result.points]
    for earlier, later in zip(cursor_dates, cursor_dates[1:]):
        assert (later - earlier).days == 7
def test_engine_does_not_see_the_future_at_any_cursor():
    """Causality test: at cursor C the engine only sees days <= C.

    Proven indirectly: the series is flat until day index 28
    (2026-01-29), so every cursor that falls before that date looks at
    purely flat data and must report zero issues. Any issue there would
    mean the engine peeked at observations after the cursor.
    """
    cfg = _short_window_config()
    onset_date = "2026-01-29"  # day index 28 (0-based) of the series
    obs = _drifting_after("e", 30, onset=28, delta=20.0)
    # A large step keeps the cursors sparse; day index 23 is expected
    # to be one of them.
    result = run_backtest(obs, cfg, step_days=10)
    cursor_days = {p.cursor for p in result.points}
    assert "2026-01-24" in cursor_days  # day index 23 (0-based)
    # ISO dates order lexicographically, so a plain string comparison
    # selects every cursor that precedes the onset - not just one.
    pre_onset = [p for p in result.points if p.cursor < onset_date]
    for point in pre_onset:
        assert point.n_issues == 0, (
            f"issues leaked into pre-onset cursor {point.cursor}")
def test_drift_appears_only_after_onset():
    """Complementary causality test: drift surfaces only past its onset.

    Entity ``e`` drifts from day index 15; entity ``h`` stays flat. The
    first cursor must not flag ``e``, some later cursor must, and ``h``
    must never be flagged.
    """
    cfg = _short_window_config()
    obs = _drifting_after("e", 30, onset=15, delta=20.0) + _healthy("h", 30)
    result = run_backtest(obs, cfg, step_days=2)
    # The first cursor is expected to fall before the onset, so 'e'
    # must not be in its issue list.
    first = result.points[0]
    assert "e" not in {i["entity_id"] for i in first.issues}
    flagged_at = [p for p in result.points
                  if any(i["entity_id"] == "e" for i in p.issues)]
    assert flagged_at, "drift should surface at some cursor past onset"
    # 'h' has no drift to detect, so it must never appear.
    for p in result.points:
        assert all(i["entity_id"] != "h" for i in p.issues)
def test_rollups_match_summed_points():
    """Result-level totals equal the sums over the individual points.

    Severity levels with a zero count are dropped from both sides
    before comparing.
    """
    cfg = _short_window_config()
    obs = _drifting_after("e", 30, onset=10, delta=15.0)
    result = run_backtest(obs, cfg, step_days=3)

    assert result.n_issues_total == sum(p.n_issues for p in result.points)

    expected = {}
    for level in ("critical", "high", "medium", "low"):
        count = sum(p.severity_counts.get(level, 0) for p in result.points)
        if count:
            expected[level] = count
    reported = {level: count
                for level, count in result.per_severity_total.items()
                if count}
    assert reported == expected
def test_unique_entities_ever_flagged_dedupes():
    """One entity flagged in several windows counts once in the unique total."""
    hard_drift = _drifting_after("e", 30, onset=10, delta=20.0)
    result = run_backtest(hard_drift, _short_window_config(), step_days=2)

    n_flagged_windows = sum(1 for p in result.points if p.n_issues > 0)
    assert n_flagged_windows >= 2
    assert result.n_unique_entities_ever_flagged == 1
# --- HTML render ---------------------------------------------------------
def test_render_html_returns_self_contained_doc():
    """The rendered report is a complete HTML document naming the entity."""
    drifting = _drifting_after("e", 30, onset=10, delta=20.0)
    report = run_backtest(drifting, _short_window_config(), step_days=3)
    page = render_html(report)

    assert page.startswith("<!doctype html>")
    assert page.rstrip().endswith("</html>")
    assert "OrgState backtest" in page
    # The drifted entity is expected to be listed by id.
    assert "<code>e</code>" in page
def test_render_html_empty_result_says_so():
    """A backtest with too little data renders an explicit empty notice."""
    too_short = _healthy("e", 5)
    report = run_backtest(too_short, _short_window_config(), step_days=7)
    assert "No steps produced" in render_html(report)
def test_render_html_severity_bars_when_issues_present():
    """A drifting series produces at least one severity bar in the page."""
    drifting = _drifting_after("e", 30, onset=10, delta=20.0)
    report = run_backtest(drifting, _short_window_config(), step_days=3)
    page = render_html(report)
    assert 'class="bar bar-' in page
# --- recurrence detection (Stage 17) -----------------------------------
from delivery.backtest import ( # noqa: E402 (after fixtures intentionally)
RecurringEntity,
find_recurring_entities,
)
def _fake_result(point_specs):
    """Assemble a ``BacktestResult`` without running the engine.

    ``point_specs`` is a list of ``(cursor, issues)`` pairs, where
    ``issues`` is a list of ``(entity_id, severity, score)`` tuples.
    Every point gets ``n_states=10``; the result uses tenant ``"t"``,
    entity type ``"thing"`` and ``step_days=7``.
    """
    def build_point(cursor, issues):
        # Start from the four known levels; any other severity string
        # simply gains its own counter.
        tally = {"critical": 0, "high": 0, "medium": 0, "low": 0}
        for _, sev, _ in issues:
            tally[sev] = tally.get(sev, 0) + 1
        payload = [
            {"entity_id": eid, "severity": sev,
             "score": score, "title": f"d {eid}"}
            for eid, sev, score in issues
        ]
        return BacktestPoint(
            cursor=cursor,
            n_states=10,
            n_issues=len(issues),
            severity_counts=tally,
            issues=payload,
        )

    points = [build_point(cursor, issues) for cursor, issues in point_specs]
    return BacktestResult(
        tenant_id="t",
        entity_type="thing",
        step_days=7,
        n_steps=len(points),
        points=points,
    )
def test_find_recurring_below_threshold_returns_empty():
    """Three different entities flagged once each are not recurring."""
    one_offs = zip(("2026-01-01", "2026-01-08", "2026-01-15"),
                   ("e1", "e2", "e3"))
    result = _fake_result([
        (cursor, [(entity, "medium", 0.4)]) for cursor, entity in one_offs
    ])
    assert find_recurring_entities(result, min_events=3) == []
def test_find_recurring_at_threshold():
    """An entity flagged exactly ``min_events`` times is reported."""
    weekly = ["2026-01-01", "2026-01-08", "2026-01-15"]
    result = _fake_result([
        (cursor, [("e1", "medium", 0.4)]) for cursor in weekly
    ])

    (rec,) = find_recurring_entities(result, min_events=3)

    assert isinstance(rec, RecurringEntity)
    assert rec.entity_id == "e1"
    assert rec.n_events == 3
    assert rec.first_cursor == "2026-01-01"
    assert rec.last_cursor == "2026-01-15"
    assert rec.cursors == ["2026-01-01", "2026-01-08", "2026-01-15"]
def test_find_recurring_top_severity_is_worst_ever():
    """``top_severity`` is the worst level seen across all windows.

    The critical hit sits in the middle window, so it is neither the
    most recent nor the most common severity.
    """
    history = [
        ("2026-01-01", "low", 0.3),
        ("2026-01-08", "critical", 0.9),  # the worst one
        ("2026-01-15", "medium", 0.4),
    ]
    result = _fake_result([
        (cursor, [("e1", severity, score)])
        for cursor, severity, score in history
    ])
    (rec,) = find_recurring_entities(result, min_events=3)
    assert rec.top_severity == "critical"
def test_find_recurring_ordering_most_recurring_first():
    """Entities come back ordered by event count, highest first.

    In this fixture b is flagged 5 times, a 4 times and c 3 times, so
    no tie-break is exercised.
    """
    def flagged(*entity_ids):
        # Every issue uses the same severity and score; only the
        # counts per entity matter here.
        return [(eid, "medium", 0.4) for eid in entity_ids]

    result = _fake_result([
        ("2026-01-01", flagged("a", "b")),
        ("2026-01-08", flagged("a", "b")),
        ("2026-01-15", flagged("a", "b", "c")),
        ("2026-01-22", flagged("a")),
        ("2026-01-29", flagged("b", "c")),
        ("2026-02-05", flagged("b", "c")),
    ])
    ordered_ids = [r.entity_id
                   for r in find_recurring_entities(result, min_events=3)]
    assert ordered_ids == ["b", "a", "c"]
def test_recurring_callout_appears_in_rendered_html():
    """A recurring entity gets a callout with its worst severity badge."""
    history = [
        ("2026-01-01", "medium", 0.4),
        ("2026-01-08", "medium", 0.4),
        ("2026-01-15", "high", 0.6),
    ]
    result = _fake_result([
        (cursor, [("hot_entity", severity, score)])
        for cursor, severity, score in history
    ])
    page = render_html(result, min_recurring_events=3)

    assert "Recurring drift" in page
    assert "<code>hot_entity</code>" in page
    # "high" is the worst severity in the fixture.
    assert 'class="sev sev-high"' in page
def test_render_html_no_recurring_section_when_none():
    """With nothing recurring, the callout heading is absent from the page."""
    specs = [
        ("2026-01-01", [("e1", "medium", 0.4)]),
        ("2026-01-08", [("e2", "medium", 0.4)]),
    ]
    page = render_html(_fake_result(specs), min_recurring_events=3)
    assert "Recurring drift" not in page
def test_min_events_flag_is_configurable():
    """The same two-event history is recurring at 2 but not at 3."""
    result = _fake_result([
        (cursor, [("e1", "medium", 0.4)])
        for cursor in ("2026-01-01", "2026-01-08")
    ])

    strict = find_recurring_entities(result, min_events=3)
    assert strict == []

    (rec,) = find_recurring_entities(result, min_events=2)
    assert rec.entity_id == "e1"
# --- per-entity drill-down (Stage 18) ----------------------------------
from delivery.backtest import ( # noqa: E402
_slug,
render_entity_detail_html,
)
def test_entity_detail_renders_summary_and_sparkline():
    """The drill-down page carries the summary, sparkline and event table.

    Only key markers are checked rather than the full HTML. The fixture
    has four cursors, three of which flag the entity "hot".
    """
    specs = [
        ("2026-01-01", [("hot", "medium", 0.42)]),
        ("2026-01-08", []),                       # quiet window
        ("2026-01-15", [("hot", "high", 0.6)]),   # worst severity
        ("2026-01-22", [("hot", "medium", 0.38)]),
    ]
    page = render_entity_detail_html(_fake_result(specs), "hot")

    assert page.startswith("<!doctype html>")
    assert "<h1>hot</h1>" in page
    assert "<strong>Events:</strong> 3" in page
    # The worst severity is shown, not the most recent one.
    assert "sev-high" in page
    # One sparkline cell per cursor, one bar per flagged cursor.
    assert page.count('class="spark-cell"') == 4
    assert page.count("spark-bar spark-") == 3
    # Whatever follows the last "Flagged events" heading must not
    # mention the quiet cursor.
    events_section = page.rpartition('<h2>Flagged events</h2>')[2]
    assert "2026-01-08" not in events_section
def test_entity_detail_for_non_flagged_entity_says_so():
    """An entity absent from every issue list still gets a valid page."""
    only_other = _fake_result([("2026-01-01", [("other", "medium", 0.4)])])
    page = render_entity_detail_html(only_other, "ghost")
    assert "<!doctype html>" in page
    assert "not flagged in any backtest window" in page
def test_entity_detail_back_link_to_backtest_page():
    """The drill-down page links back with the relative path ../backtest.html."""
    single_event = [("2026-01-01", [("x", "medium", 0.4)])]
    page = render_entity_detail_html(_fake_result(single_event), "x")
    assert 'href="../backtest.html"' in page
def test_main_html_links_entity_ids_when_link_entities_set():
    """``link_entities=True`` turns entity ids into drill-down links."""
    result = _fake_result([
        (cursor, [("e1", "medium", 0.4)])
        for cursor in ("2026-01-01", "2026-01-08", "2026-01-15")
    ])

    with_links = render_html(result, link_entities=True)
    without_links = render_html(result, link_entities=False)

    # Link mode points at the per-entity page ...
    assert 'href="entities/e1.html"' in with_links
    # ... while bare mode emits no entity links at all.
    assert 'href="entities/' not in without_links
def test_link_entities_can_be_a_subset():
    """Passing a set of ids links only those entities.

    ``e_rec`` is in the set and gets a link; ``e_oneoff`` is not and
    stays as plain ``<code>`` text.
    """
    specs = [(cursor, [("e_rec", "medium", 0.4)])
             for cursor in ("2026-01-01", "2026-01-08", "2026-01-15")]
    specs.append(("2026-01-22", [("e_oneoff", "medium", 0.4)]))

    page = render_html(_fake_result(specs), link_entities={"e_rec"})

    assert 'href="entities/e_rec.html"' in page
    assert 'href="entities/e_oneoff.html"' not in page
    assert "<code>e_oneoff</code>" in page
# --- "what changed" details (Stage 19) ------------------------------------
from delivery.backtest import ( # noqa: E402
_compute_event_details,
_fmt_delta_pct,
_fmt_num,
)
def _cfg_for_details_test():
    """Return an ``EntityTypeConfig`` for the event-details tests.

    Uses one metric "m" with baseline 5, lag 2 and recent 5, so a
    12-day series covers the windows exactly.
    """
    metric = MetricConfig(
        "m",
        HIGHER_IS_WORSE,
        weight=1.0,
        feeds_anomaly=True,
        feeds_stability=True,
    )
    return EntityTypeConfig(
        entity_type="thing",
        metrics=[metric],
        baseline_window=5,
        baseline_lag=2,
        recent_window=5,
    )
def test_compute_event_details_baseline_vs_recent():
    """Baseline and recent means come from their own windows.

    The 12-day series is 100 for day indices 0..6 and 200 for 7..11.
    With baseline 5 / lag 2 / recent 5 at cursor 2026-01-12, the
    expected means are 100 and 200, i.e. a +100% delta.
    """
    from datetime import date, timedelta

    cfg = _cfg_for_details_test()
    start = date(2026, 1, 1)
    obs = [
        Observation("e",
                    (start + timedelta(days=day)).isoformat(),
                    {"m": 200.0 if day >= 7 else 100.0})
        for day in range(12)
    ]

    details = _compute_event_details(obs, "e", "2026-01-12", cfg)

    assert "m" in details
    metric = details["m"]
    assert metric["baseline_mean"] == pytest.approx(100.0)
    assert metric["recent_mean"] == pytest.approx(200.0)
    assert metric["delta_pct"] == pytest.approx(100.0)
def test_compute_event_details_handles_zero_baseline():
    """A zero baseline mean gives ``delta_pct`` of None, not a division error.

    The series is 0 for day indices 0..6 and 5 for 7..11.
    """
    from datetime import date, timedelta

    cfg = _cfg_for_details_test()
    start = date(2026, 1, 1)
    obs = [
        Observation("e",
                    (start + timedelta(days=day)).isoformat(),
                    {"m": 0.0 if day < 7 else 5.0})
        for day in range(12)
    ]

    metric = _compute_event_details(obs, "e", "2026-01-12", cfg)["m"]

    assert metric["baseline_mean"] == 0.0
    assert metric["recent_mean"] == pytest.approx(5.0)
    assert metric["delta_pct"] is None
def test_compute_event_details_other_entities_excluded():
    """Details for one entity ignore another entity's observations.

    "target" is constant at 50 while "noise" is constant at 9999 on
    the same days; both means for "target" must stay at 50.
    """
    from datetime import date, timedelta

    cfg = _cfg_for_details_test()
    start = date(2026, 1, 1)
    per_entity_value = (("target", 50.0), ("noise", 9999.0))
    obs = [
        Observation(entity_id,
                    (start + timedelta(days=day)).isoformat(),
                    {"m": value})
        for day in range(12)
        for entity_id, value in per_entity_value
    ]

    metric = _compute_event_details(obs, "target", "2026-01-12", cfg)["m"]

    assert metric["baseline_mean"] == pytest.approx(50.0)
    assert metric["recent_mean"] == pytest.approx(50.0)
def test_what_changed_appears_in_entity_detail_html():
    """A real backtest of a drifting entity renders the 'What changed' block."""
    drifting = _drifting_after("e", 30, onset=10, delta=20.0)
    report = run_backtest(drifting, _short_window_config(), step_days=3)
    page = render_entity_detail_html(report, "e")

    expected_markers = (
        "What changed",
        'class="changed-block"',
        "Baseline mean",
        "Recent mean",
    )
    for marker in expected_markers:
        assert marker in page
def test_what_changed_section_absent_when_no_events():
    """A flat series gives a drill-down page without 'What changed'."""
    flat = _healthy("e", 30)
    report = run_backtest(flat, _short_window_config(), step_days=3)
    page = render_entity_detail_html(report, "e")
    assert "What changed" not in page
def test_fmt_helpers():
    """Check the number and delta-percent formatters, including None."""
    num_cases = ((None, "—"), (1234.5, "1234.5"), (0.123, "0.123"))
    for value, expected in num_cases:
        assert _fmt_num(value) == expected

    delta_cases = ((None, "—"), (5.0, "+5.0%"), (-3.2, "-3.2%"))
    for value, expected in delta_cases:
        assert _fmt_delta_pct(value) == expected
def test_slug_makes_unsafe_entity_ids_filesystem_safe():
    """Slashes and spaces in entity ids are replaced with underscores.

    Ids containing '/', '..' or spaces must not turn into writable
    paths or path traversal.
    """
    cases = {
        "acc/123": "acc_123",
        "../etc/passwd": ".._etc_passwd",
        "with spaces": "with_spaces",
        # Alphanumerics, dot, underscore and dash pass through unchanged.
        "a-b_c.d99": "a-b_c.d99",
    }
    for raw, expected in cases.items():
        assert _slug(raw) == expected