"""
Tests for Stage 16 — rolling backtest.

The salesforce sample data (50 days) is shorter than the salesforce
config's baseline_window + lag + recent_window (51), so the backtest
needs its own synthetic series that's long enough for the engine to
actually have a window to evaluate. We build one with core.MetricConfig
+ plain Observations rather than going through a vertical config, so
the test stays focused on the backtest logic.
"""

from datetime import date, timedelta
from typing import List

import pytest

from core import (
    HIGHER_IS_WORSE,
    EntityTypeConfig,
    MetricConfig,
    Observation,
)
from delivery.backtest import (
    BacktestPoint,
    BacktestResult,
    render_html,
    run_backtest,
)


# --- fixtures --------------------------------------------------------------

def _short_window_config():
    """Tiny windows so a short synthetic series produces multiple steps."""
    return EntityTypeConfig(
        entity_type="thing",
        metrics=[
            MetricConfig(
                "m",
                HIGHER_IS_WORSE,
                weight=1.0,
                feeds_anomaly=True,
                feeds_stability=True,
            ),
        ],
        baseline_window=7,
        baseline_lag=2,
        recent_window=5,
    )


def _series(entity_id: str, values: List[float]) -> List[Observation]:
    """Build a daily series starting 2026-01-01 with the given values.

    Each value becomes one Observation for metric ``"m"``, dated one day
    after the previous one.
    """
    start = date(2026, 1, 1)
    # ISO YYYY-MM-DD strings; timedelta arithmetic rolls over month
    # boundaries, so series longer than 31 days are fine.
    return [
        Observation(
            entity_id,
            (start + timedelta(days=offset)).isoformat(),
            {"m": float(value)},
        )
        for offset, value in enumerate(values)
    ]


def _healthy(entity_id: str, n_days: int) -> List[Observation]:
    """A perfectly flat series: ``n_days`` observations of 100.0."""
    return _series(entity_id, [100.0] * n_days)


def _drifting_after(entity_id: str, n_days: int, onset: int,
                    delta: float = 8.0) -> List[Observation]:
    """A series that is flat at 100 then climbs (worse) from ``onset``.

    From index ``onset`` onwards each day adds another ``delta`` on top
    of the flat level, so the first drifting day is ``100 + delta``.
    """
    vals = [
        100.0 if i < onset else 100.0 + (i - onset + 1) * delta
        for i in range(n_days)
    ]
    return _series(entity_id, vals)


# --- empty / degenerate inputs ------------------------------------------

def test_empty_observations_returns_empty_result():
    """No observations in -> a BacktestResult with no points and no steps."""
    result = run_backtest([], _short_window_config(), step_days=7)
    assert isinstance(result, BacktestResult)
    assert result.points == []
    assert result.n_steps == 0


def test_too_short_for_one_window():
    """Series shorter than baseline + lag + recent produces zero steps —
    backtest can't compute a single drift score causally."""
    cfg = _short_window_config()
    obs = _healthy("e", 5)  # 5 days, need >= 14
    result = run_backtest(obs, cfg, step_days=7)
    assert result.n_steps == 0


# --- happy path: rolling steps ------------------------------------------

def test_steps_advance_by_step_days():
    """Successive cursors are spaced exactly ``step_days`` apart."""
    cfg = _short_window_config()
    obs = _healthy("e", 30)  # 30 days, plenty of room
    result = run_backtest(obs, cfg, step_days=7)
    # at least: floor((30 - 14 + 1) / 7) + 1 cursors
    assert result.n_steps >= 2
    cursors = [p.cursor for p in result.points]
    # successive cursors are exactly 7 days apart
    diffs = [
        (date.fromisoformat(later) - date.fromisoformat(earlier)).days
        for earlier, later in zip(cursors, cursors[1:])
    ]
    assert all(d == 7 for d in diffs)


def test_engine_does_not_see_the_future_at_any_cursor():
    """Causality test: at cursor C, the engine only sees observations
    with day <= C. We can prove this indirectly: a drifting series whose
    drift starts AFTER all our cursors should produce zero issues at
    every cursor."""
    cfg = _short_window_config()
    # 30 days; drift starts on day 28 (very near the end)
    obs = _drifting_after("e", 30, onset=28, delta=20.0)
    # Step large so cursors land at days 13, 23 — both BEFORE onset
    result = run_backtest(obs, cfg, step_days=10)
    # the cursors are at days 13 and 23 (indices); neither sees the drift
    cursor_days = {p.cursor for p in result.points}
    assert "2026-01-24" in cursor_days  # day index 23 (0-based)
    # ... and no issues at that cursor
    early = [p for p in result.points if p.cursor == "2026-01-24"][0]
    assert early.n_issues == 0


def test_drift_appears_only_after_onset():
    """The complementary causality test: as cursor advances past the
    drift onset, the engine starts to flag the entity."""
    cfg = _short_window_config()
    obs = _drifting_after("e", 30, onset=15, delta=20.0) + _healthy("h", 30)
    result = run_backtest(obs, cfg, step_days=2)
    # at SOME later cursor, 'e' should be flagged; at the first cursor
    # (day 13, before onset 15) it should NOT be flagged.
    first = result.points[0]
    assert "e" not in {i["entity_id"] for i in first.issues}
    flagged_at = [
        p for p in result.points
        if any(i["entity_id"] == "e" for i in p.issues)
    ]
    assert flagged_at, "drift should surface at some cursor past onset"
    # and 'h' is never flagged (no drift to detect)
    for p in result.points:
        assert all(i["entity_id"] != "h" for i in p.issues)


def test_rollups_match_summed_points():
    """Result-level totals equal the sums over the individual points."""
    cfg = _short_window_config()
    obs = _drifting_after("e", 30, onset=10, delta=15.0)
    result = run_backtest(obs, cfg, step_days=3)
    summed = sum(p.n_issues for p in result.points)
    assert result.n_issues_total == summed
    sev_summed = {
        s: sum(p.severity_counts.get(s, 0) for p in result.points)
        for s in ("critical", "high", "medium", "low")
    }
    # Compare only non-zero buckets so a missing key and an explicit 0
    # are treated as the same thing.
    nonzero_reported = {
        k: v for k, v in result.per_severity_total.items() if v
    }
    nonzero_summed = {k: v for k, v in sev_summed.items() if v}
    assert nonzero_reported == nonzero_summed


def test_unique_entities_ever_flagged_dedupes():
    """An entity flagged in several windows counts once in the unique total."""
    cfg = _short_window_config()
    # one entity, drifting hard for many days — should be flagged in
    # many windows but counted once in the unique total.
    obs = _drifting_after("e", 30, onset=10, delta=20.0)
    result = run_backtest(obs, cfg, step_days=2)
    flagged_windows = [p for p in result.points if p.n_issues > 0]
    assert len(flagged_windows) >= 2
    assert result.n_unique_entities_ever_flagged == 1


# --- HTML render ---------------------------------------------------------

def test_render_html_returns_self_contained_doc():
    """The rendered report carries the backtest title and the entity."""
    cfg = _short_window_config()
    obs = _drifting_after("e", 30, onset=10, delta=20.0)
    result = run_backtest(obs, cfg, step_days=3)
    html = render_html(result)
    # NOTE(review): the two literals below are empty, which makes both
    # assertions vacuous — they look like they lost their markup (an
    # opening doctype/tag and a closing tag, presumably). Restore the
    # exact strings the renderer emits.
    assert html.startswith("")
    assert html.rstrip().endswith("")
    assert "OrgState backtest" in html
    # the entity that drifted should appear in the per-entity table
    # NOTE(review): a bare "e" is already matched by the title above;
    # this probably wants the entity id wrapped in its table markup.
    assert "e" in html


def test_render_html_empty_result_says_so():
    """A result with zero steps renders an explicit 'no steps' message."""
    cfg = _short_window_config()
    result = run_backtest(_healthy("e", 5), cfg, step_days=7)
    html = render_html(result)
    assert "No steps produced" in html


def test_render_html_severity_bars_when_issues_present():
    """When issues exist the report contains severity bar elements."""
    cfg = _short_window_config()
    obs = _drifting_after("e", 30, onset=10, delta=20.0)
    result = run_backtest(obs, cfg, step_days=3)
    html = render_html(result)
    # at least one stacked-severity bar appears
    assert 'class="bar bar-' in html


# --- recurrence detection (Stage 17) -----------------------------------

from delivery.backtest import (  # noqa: E402 (after fixtures intentionally)
    RecurringEntity,
    find_recurring_entities,
)


def _fake_result(point_specs):
    """Build a BacktestResult from a list of
    (cursor, [(entity_id, severity, score)])."""
    points = []
    for cursor, issues in point_specs:
        sev_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0}
        issue_dicts = []
        for eid, sev, score in issues:
            sev_counts[sev] = sev_counts.get(sev, 0) + 1
            issue_dicts.append({
                "entity_id": eid,
                "severity": sev,
                "score": score,
                "title": f"d {eid}",
            })
        points.append(BacktestPoint(
            cursor=cursor,
            n_states=10,
            n_issues=len(issues),
            severity_counts=sev_counts,
            issues=issue_dicts,
        ))
    return BacktestResult(
        tenant_id="t",
        entity_type="thing",
        step_days=7,
        n_steps=len(points),
        points=points,
    )


def test_find_recurring_below_threshold_returns_empty():
    """One-off events do not surface as recurring — that's the whole
    point of the classifier."""
    result = _fake_result([
        ("2026-01-01", [("e1", "medium", 0.4)]),
        ("2026-01-08", [("e2", "medium", 0.4)]),
        ("2026-01-15", [("e3", "medium", 0.4)]),
    ])
    assert find_recurring_entities(result, min_events=3) == []


def test_find_recurring_at_threshold():
    """An entity flagged exactly ``min_events`` times surfaces."""
    result = _fake_result([
        ("2026-01-01", [("e1", "medium", 0.4)]),
        ("2026-01-08", [("e1", "medium", 0.4)]),
        ("2026-01-15", [("e1", "medium", 0.4)]),
    ])
    [rec] = find_recurring_entities(result, min_events=3)
    assert isinstance(rec, RecurringEntity)
    assert rec.entity_id == "e1"
    assert rec.n_events == 3
    assert rec.first_cursor == "2026-01-01"
    assert rec.last_cursor == "2026-01-15"
    assert rec.cursors == ["2026-01-01", "2026-01-08", "2026-01-15"]


def test_find_recurring_top_severity_is_worst_ever():
    """``top_severity`` is the worst the entity ever hit across windows,
    not the most recent or the most common."""
    result = _fake_result([
        ("2026-01-01", [("e1", "low", 0.3)]),
        ("2026-01-08", [("e1", "critical", 0.9)]),  # worst
        ("2026-01-15", [("e1", "medium", 0.4)]),
    ])
    [rec] = find_recurring_entities(result, min_events=3)
    assert rec.top_severity == "critical"


def test_find_recurring_ordering_most_recurring_first():
    """Sort key is (-n_events, recency-desc, entity_id) — most events
    first, ties broken by who was flagged most recently."""
    result = _fake_result([
        ("2026-01-01", [("a", "medium", 0.4), ("b", "medium", 0.4)]),
        ("2026-01-08", [("a", "medium", 0.4), ("b", "medium", 0.4)]),
        ("2026-01-15", [("a", "medium", 0.4), ("b", "medium", 0.4),
                        ("c", "medium", 0.4)]),
        ("2026-01-22", [("a", "medium", 0.4)]),  # a gets 4 events
        ("2026-01-29", [("b", "medium", 0.4),
                        ("c", "medium", 0.4)]),  # b: 4, c: 2
        ("2026-02-05", [("b", "medium", 0.4),
                        ("c", "medium", 0.4)]),  # b: 5, c: 3
    ])
    recurring = find_recurring_entities(result, min_events=3)
    # b has the most events (5), then a (4), then c (3)
    assert [r.entity_id for r in recurring] == ["b", "a", "c"]


def test_recurring_callout_appears_in_rendered_html():
    """The recurrence section is visible above the timeline when there
    is anything to surface."""
    result = _fake_result([
        ("2026-01-01", [("hot_entity", "medium", 0.4)]),
        ("2026-01-08", [("hot_entity", "medium", 0.4)]),
        ("2026-01-15", [("hot_entity", "high", 0.6)]),
    ])
    html = render_html(result, min_recurring_events=3)
    assert "Recurring drift" in html
    assert "hot_entity" in html
    # the worst severity badge is present
    assert 'class="sev sev-high"' in html


def test_render_html_no_recurring_section_when_none():
    """If nothing is recurring the callout block is suppressed so the
    page does not show an empty section. We still see the per-entity
    table."""
    result = _fake_result([
        ("2026-01-01", [("e1", "medium", 0.4)]),
        ("2026-01-08", [("e2", "medium", 0.4)]),
    ])
    html = render_html(result, min_recurring_events=3)
    assert "Recurring drift" not in html


def test_min_events_flag_is_configurable():
    """Operators tune the threshold via min_recurring_events on render
    and min_events on the classifier — they're independently usable."""
    result = _fake_result([
        ("2026-01-01", [("e1", "medium", 0.4)]),
        ("2026-01-08", [("e1", "medium", 0.4)]),
    ])
    assert find_recurring_entities(result, min_events=3) == []
    [rec] = find_recurring_entities(result, min_events=2)
    assert rec.entity_id == "e1"


# --- per-entity drill-down (Stage 18) ----------------------------------

from delivery.backtest import (  # noqa: E402
    _slug,
    render_entity_detail_html,
)


def test_entity_detail_renders_summary_and_sparkline():
    """The drill-down shows summary + a CSS-only sparkline. We do not
    snapshot the whole HTML; we check the key story elements are there."""
    result = _fake_result([
        ("2026-01-01", [("hot", "medium", 0.42)]),
        ("2026-01-08", []),  # quiet
        ("2026-01-15", [("hot", "high", 0.6)]),  # worst severity
        ("2026-01-22", [("hot", "medium", 0.38)]),
    ])
    html = render_entity_detail_html(result, "hot")
    # NOTE(review): empty literal — vacuous; the expected document
    # prefix appears to have been lost. Restore the exact string.
    assert html.startswith("")
    # NOTE(review): this literal originally wrapped the entity id in
    # heading markup that has been lost; matching the bare text is the
    # weakest check that is still guaranteed to hold. Restore the tags.
    assert "hot" in html
    # summary numbers
    assert "Events: 3" in html
    # worst severity is reported (not the most recent)
    assert "sev-high" in html
    # sparkline present — one cell per cursor
    assert html.count('class="spark-cell"') == 4
    # bars only for flagged cursors (3 of 4)
    assert html.count("spark-bar spark-") == 3
    # event table lists the flagged cursors only
    # NOTE(review): the split delimiter originally included the section
    # heading's markup; splitting on the bare heading text yields a tail
    # that starts no earlier, so the check is not stricter than before.
    assert "2026-01-08" not in html.split('Flagged events')[-1]


def test_entity_detail_for_non_flagged_entity_says_so():
    """An entity that's never in any issue list should produce a
    cleanly-empty page, not a stack trace."""
    result = _fake_result([("2026-01-01", [("other", "medium", 0.4)])])
    html = render_entity_detail_html(result, "ghost")
    # NOTE(review): empty literal — vacuous; the expected markup appears
    # to have been lost. Restore the exact string.
    assert "" in html
    assert "not flagged in any backtest window" in html


def test_entity_detail_back_link_to_backtest_page():
    """The drill-down lives at entities/X.html so the back link is
    ../backtest.html — relative, no JS needed."""
    result = _fake_result([("2026-01-01", [("x", "medium", 0.4)])])
    html = render_entity_detail_html(result, "x")
    assert 'href="../backtest.html"' in html


def test_main_html_links_entity_ids_when_link_entities_set():
    """When the CLI renders drill-down pages, the entity ids in the main
    report become links to those pages."""
    result = _fake_result([
        ("2026-01-01", [("e1", "medium", 0.4)]),
        ("2026-01-08", [("e1", "medium", 0.4)]),
        ("2026-01-15", [("e1", "medium", 0.4)]),
    ])
    linked = render_html(result, link_entities=True)
    bare = render_html(result, link_entities=False)
    # link-mode renders <a href="entities/...">
    assert 'href="entities/e1.html"' in linked
    # bare-mode keeps the entity id as plain code
    assert 'href="entities/' not in bare


def test_link_entities_can_be_a_subset():
    """Passing a set links only those entities — prevents 404s when only
    the recurring entities have drill-down pages rendered."""
    result = _fake_result([
        # e_rec recurs 3 times (rendered), e_oneoff once (not rendered)
        ("2026-01-01", [("e_rec", "medium", 0.4)]),
        ("2026-01-08", [("e_rec", "medium", 0.4)]),
        ("2026-01-15", [("e_rec", "medium", 0.4)]),
        ("2026-01-22", [("e_oneoff", "medium", 0.4)]),
    ])
    html = render_html(result, link_entities={"e_rec"})
    assert 'href="entities/e_rec.html"' in html
    # one-off entity stays plain — no broken link to a missing page
    assert 'href="entities/e_oneoff.html"' not in html
    assert "e_oneoff" in html


# --- "what changed" details (Stage 19) ------------------------------------

from delivery.backtest import (  # noqa: E402
    _compute_event_details,
    _fmt_delta_pct,
    _fmt_num,
)


def _cfg_for_details_test():
    """Config with 5-day baseline, 2-day lag and 5-day recent window, so
    a 12-day series covers exactly one baseline + gap + recent span."""
    return EntityTypeConfig(
        entity_type="thing",
        metrics=[
            MetricConfig(
                "m",
                HIGHER_IS_WORSE,
                weight=1.0,
                feeds_anomaly=True,
                feeds_stability=True,
            ),
        ],
        baseline_window=5,
        baseline_lag=2,
        recent_window=5,
    )


def test_compute_event_details_baseline_vs_recent():
    """The recent mean reflects the recent_window before the cursor, the
    baseline mean reflects the baseline_window before that (with a gap
    of baseline_lag days). Verified by constructing a series where the
    two halves are obviously different."""
    cfg = _cfg_for_details_test()
    # 12 days: baseline mean = 100, recent mean = 200 (drift up)
    # baseline = days 0..4 -> values 100; gap days 5..6;
    # recent = days 7..11 -> 200
    obs = _series("e", [200.0 if i >= 7 else 100.0 for i in range(12)])
    details = _compute_event_details(obs, "e", "2026-01-12", cfg)
    assert "m" in details
    d = details["m"]
    assert d["baseline_mean"] == pytest.approx(100.0)
    assert d["recent_mean"] == pytest.approx(200.0)
    assert d["delta_pct"] == pytest.approx(100.0)


def test_compute_event_details_handles_zero_baseline():
    """If baseline mean is zero, delta% is None (would divide by 0). The
    renderer formats that as an em-dash."""
    cfg = _cfg_for_details_test()
    obs = _series("e", [0.0 if i < 7 else 5.0 for i in range(12)])
    details = _compute_event_details(obs, "e", "2026-01-12", cfg)
    d = details["m"]
    assert d["baseline_mean"] == 0.0
    assert d["recent_mean"] == pytest.approx(5.0)
    assert d["delta_pct"] is None


def test_compute_event_details_other_entities_excluded():
    """An entity's details must not pull in another entity's numbers."""
    cfg = _cfg_for_details_test()
    start = date(2026, 1, 1)
    # Interleave the two entities day by day (target, noise, target, ...)
    # so the input is genuinely mixed rather than two contiguous runs.
    obs = [
        Observation(
            entity_id,
            (start + timedelta(days=i)).isoformat(),
            {"m": v},
        )
        for i in range(12)
        for entity_id, v in (("target", 50.0), ("noise", 9999.0))
    ]
    details = _compute_event_details(obs, "target", "2026-01-12", cfg)
    # only "target"'s 50.0 values, never "noise"'s 9999
    assert details["m"]["baseline_mean"] == pytest.approx(50.0)
    assert details["m"]["recent_mean"] == pytest.approx(50.0)


def test_what_changed_appears_in_entity_detail_html():
    """Run a real backtest with the details path active, then verify the
    entity drill-down renders the 'What changed' section."""
    cfg = _short_window_config()
    obs = _drifting_after("e", 30, onset=10, delta=20.0)
    result = run_backtest(obs, cfg, step_days=3)
    html = render_entity_detail_html(result, "e")
    assert "What changed" in html
    assert 'class="changed-block"' in html
    assert "Baseline mean" in html
    assert "Recent mean" in html


def test_what_changed_section_absent_when_no_events():
    """A never-flagged entity's drill-down omits the 'What changed' block."""
    cfg = _short_window_config()
    obs = _healthy("e", 30)
    result = run_backtest(obs, cfg, step_days=3)
    html = render_entity_detail_html(result, "e")
    assert "What changed" not in html


def test_fmt_helpers():
    """None renders as an em-dash; numbers and signed percentages format
    as shown."""
    assert _fmt_num(None) == "—"
    assert _fmt_num(1234.5) == "1234.5"
    assert _fmt_num(0.123) == "0.123"
    assert _fmt_delta_pct(None) == "—"
    assert _fmt_delta_pct(5.0) == "+5.0%"
    assert _fmt_delta_pct(-3.2) == "-3.2%"


def test_slug_makes_unsafe_entity_ids_filesystem_safe():
    """A real customer might use ids containing '/', '..', or spaces; we
    must not let those become writable paths or path-traversal."""
    assert _slug("acc/123") == "acc_123"
    assert _slug("../etc/passwd") == ".._etc_passwd"
    assert _slug("with spaces") == "with_spaces"
    # alnum, dot, underscore, dash are preserved
    assert _slug("a-b_c.d99") == "a-b_c.d99"