Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Benchmark script for the major update detection heuristic. | |
| Evaluates UpdateDetectionService._is_update_related, _collect_update_candidates, | |
| and _is_major_update against a curated set of Steam games. | |
| Three modes: | |
| --discover Fetch news for all games (count=20 by default, matches | |
| production) and display all items with classification | |
| details. Use this to identify ground truth. | |
| --evaluate Item-level evaluation: for each ItemCase, find the item | |
| by gid and check if _is_update_related / _is_major_update | |
| match expectations. | |
| --evaluate-service Service-level evaluation: for each ServiceCase, run the | |
| full selection pipeline and compare the outcome. | |
| Both --evaluate and --evaluate-service run by default when no mode is specified. | |
| Examples: | |
| python scripts/benchmark_major_update.py --discover | |
| python scripts/benchmark_major_update.py --discover --count 50 | |
| python scripts/benchmark_major_update.py --evaluate | |
| python scripts/benchmark_major_update.py --evaluate-service | |
| python scripts/benchmark_major_update.py # runs both evaluate modes | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
| from typing import Literal | |
| import httpx | |
| # ── import project service ──────────────────────────────────────────────────── | |
| sys.path.insert(0, str(Path(__file__).parent.parent / "backend")) | |
| from app.services.update_detection_service import UpdateDetectionService # noqa: E402 | |
# Endpoint queried by _fetch_news for each app's news items.
STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"

# ── benchmark games ───────────────────────────────────────────────────────────
# (display name, Steam appid) pairs; --discover walks them in this order.
GAMES: list[tuple[str, str]] = [
    ("Going Medieval", "1029780"),
    ("Timberborn", "1062090"),
    ("Hades II", "1145350"),
    ("Against the Storm", "1336490"),
    ("Valheim", "892970"),
    ("Manor Lords", "1363080"),
    ("Project Zomboid", "108600"),
    ("Dwarf Fortress", "975370"),
    ("Helldivers 2", "553850"),
    ("Deep Rock Galactic", "548430"),
    ("Lethal Company", "1966720"),
    ("Factorio", "427520"),
    ("Satisfactory", "526870"),
]
| # ── ground truth structures ─────────────────────────────────────────────────── | |
@dataclass
class ItemCase:
    """Per-item ground truth: is this specific event major?

    Instances are built with keyword arguments, so the class must be a
    dataclass (a bare class with annotations has no generated ``__init__``).
    The item-level evaluation locates the news item through ``appid`` and
    ``gid``; ``title`` and ``reasoning`` are only used in the printed report.
    """

    game_name: str
    appid: str
    gid: str
    title: str  # for display
    # "ambiguous" cases are counted separately and never scored as pass/fail.
    expected: Literal["major", "not_major", "ambiguous"]
    reasoning: str
@dataclass
class ServiceCase:
    """Per-game ground truth: what should the production code do?

    Instances are built with keyword arguments, so the class must be a
    dataclass (a bare class with annotations has no generated ``__init__``).
    """

    game_name: str
    appid: str
    expected_major: bool | None  # True / False / None = ambiguous
    reasoning: str
| # ── item-level ground truth ─────────────────────────────────────────────────── | |
| # Populated from --discover run on 2026-03-19. | |
# One entry per hand-labelled news item, grouped by game. The item-level
# evaluation finds each item by (appid, gid) in the freshly fetched news.
ITEM_CASES: list[ItemCase] = [
    # ── Going Medieval ────────────────────────────────────────────────────────
    ItemCase(
        game_name="Going Medieval",
        appid="1029780",
        gid="1826992588604105",
        title="Going Medieval is out now in 1.0!",
        expected="major",
        reasoning=(
            "1.0 full release out of Early Access — unambiguously major. "
            "Phase 1: RELEASE_PHRASE_RE matches 'is out now' → update-related. "
            "ONE_ZERO_RE matches '1.0' → major."
        ),
    ),
    ItemCase(
        game_name="Going Medieval",
        appid="1029780",
        gid="1827626365751261",
        title="Experimental Branch Patch (1.0.48)",
        expected="not_major",
        reasoning=(
            "Experimental branch incremental patch. Three-segment version (1.0.48) "
            "excluded by VERSION_RE. BRANCH_RE blocks major classification."
        ),
    ),
    ItemCase(
        game_name="Going Medieval",
        appid="1029780",
        gid="1827626365750723",
        title="Patch Notes (1.0.47)",
        expected="not_major",
        reasoning=(
            "Incremental stable patch, three-segment version. not_major is correct."
        ),
    ),
    # ── Timberborn ────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Timberborn",
        appid="1062090",
        gid="1826992588592887",
        title="Timberborn 1.0 is live!",
        expected="major",
        reasoning=(
            "1.0 full release out of Early Access — unambiguously major. "
            "Phase 1: RELEASE_PHRASE_RE matches 'is live' → update-related. "
            "ONE_ZERO_RE matches '1.0' → major."
        ),
    ),
    ItemCase(
        game_name="Timberborn",
        appid="1062090",
        gid="1826992588603124",
        title="Patch notes 2026-03-17 (experimental)",
        expected="not_major",
        reasoning=(
            "Experimental branch date-based patch notes. No version number. "
            "not_major is correct."
        ),
    ),
    # ── Hades II ──────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Hades II",
        appid="1145350",
        gid="1816215235360707",
        title="Hades II v1.0 Hotfix 3",
        expected="not_major",
        reasoning=(
            "A bugfix hotfix on top of the v1.0 launch — not a content update. "
            "Phase 1: HOTFIX_RE blocks major classification. Correct: not_major."
        ),
    ),
    ItemCase(
        game_name="Hades II",
        appid="1145350",
        gid="1811772772516846",
        title="Hades II v1.0 Hotfix 2",
        expected="not_major",
        reasoning=(
            "Same pattern: HOTFIX_RE blocks 'v1.0 Hotfix N' from being classified as major."
        ),
    ),
    ItemCase(
        game_name="Hades II",
        appid="1145350",
        gid="1811772772248738",
        title="Hades II v1.0 Is Now Available!",
        expected="major",
        reasoning=(
            "v1.0 full launch — unambiguously major. "
            "Phase 1: RELEASE_PHRASE_RE matches 'Is Now Available' → update-related. "
            "No hotfix/branch blocker. VERSION_RE matches 'v1.0' → major."
        ),
    ),
    # ── Against the Storm ─────────────────────────────────────────────────────
    ItemCase(
        game_name="Against the Storm",
        appid="1336490",
        gid="1818752592135840",
        title="Demo Update 1.9.6",
        expected="not_major",
        reasoning=(
            "Demo game update, three-segment version 1.9.6. "
            "Service correctly classifies as not_major."
        ),
    ),
    ItemCase(
        game_name="Against the Storm",
        appid="1336490",
        gid="1816849002010836",
        title="Brineworks Update (1.9) available!",
        expected="major",
        reasoning=(
            "Named major content update with version 1.9. "
            "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
            "→ update-related. VERSION_RE → major."
        ),
    ),
    # ── Valheim ───────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Valheim",
        appid="892970",
        gid="1825093633184197",
        title="Patch 0.221.12",
        expected="not_major",
        reasoning=(
            "Three-segment maintenance patch. Correctly classified as not_major."
        ),
    ),
    ItemCase(
        game_name="Valheim",
        appid="892970",
        gid="1809869179994587",
        title="Patch 0.221.4 (Public Test)",
        expected="not_major",
        reasoning=(
            "Public test branch three-segment patch. Correctly classified as not_major."
        ),
    ),
    # ── Manor Lords ───────────────────────────────────────────────────────────
    ItemCase(
        game_name="Manor Lords",
        appid="1363080",
        gid="1827626365750540",
        title="Major Update #6: Battlefield Changes, New Map, and Family Based Progression",
        expected="major",
        reasoning=(
            "Developer-declared major content drop. "
            "Phase 1: CONTENT_UPDATE_RE matches 'Major Update' → update-related and major."
        ),
    ),
    ItemCase(
        game_name="Manor Lords",
        appid="1363080",
        gid="1826992588603500",
        title="New BETA version is available for testing (0.8.065)",
        expected="not_major",
        reasoning=(
            "Beta/testing build announcement, not a production major update. "
            "Current heuristic misses it entirely, which is acceptable for this benchmark case."
        ),
    ),
    # ── Project Zomboid ───────────────────────────────────────────────────────
    ItemCase(
        game_name="Project Zomboid",
        appid="108600",
        gid="1826992588590120",
        title="42.15.2 UNSTABLE HOTFIX Released",
        expected="not_major",
        reasoning=(
            "Unstable-branch hotfix. patchnotes tag makes it update-related, "
            "but HOTFIX_RE correctly blocks major classification."
        ),
    ),
    ItemCase(
        game_name="Project Zomboid",
        appid="108600",
        gid="1826362059930323",
        title="Build 42.15.0 Unstable Released",
        expected="not_major",
        reasoning=(
            "Unstable build release, not a production major update. "
            "Current heuristic does not classify it as update-related because the three-segment "
            "build number fails VERSION_RE."
        ),
    ),
    # ── Dwarf Fortress ────────────────────────────────────────────────────────
    ItemCase(
        game_name="Dwarf Fortress",
        appid="975370",
        gid="1826362059918689",
        title="Food fixes, AMA, community spotlight and more! Dwarf Fortress Patch 53.11",
        expected="not_major",
        reasoning=(
            "Maintenance patch with Dwarf Fortress' two-segment numbering scheme. "
            "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fixes' "
            "→ maintenance blocker fires before VERSION_RE → not_major."
        ),
    ),
    ItemCase(
        game_name="Dwarf Fortress",
        appid="975370",
        gid="1821288646585998",
        title="Aquatic portraits, Naked dwarf fix and more Dwarf Fortress Patch 53.10",
        expected="not_major",
        reasoning=(
            "Another maintenance patch under the same numbering scheme. "
            "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fix' "
            "→ maintenance blocker fires → not_major."
        ),
    ),
    # ── Helldivers 2 ──────────────────────────────────────────────────────────
    ItemCase(
        game_name="Helldivers 2",
        appid="553850",
        gid="1826992588603352",
        title="Machinery of Oppression: 6.1.0",
        expected="major",
        reasoning=(
            "Named content drop with new missions/enemies. This should count as a major update. "
            "Useful to test whether named major drops with three-segment versions are still found."
        ),
    ),
    ItemCase(
        game_name="Helldivers 2",
        appid="553850",
        gid="1826992588603981",
        title="Revealing our Machinery of Oppression Content Roadmap!",
        expected="not_major",
        reasoning=(
            "Roadmap/announcement post, not the update itself. Should not be treated as major."
        ),
    ),
    # ── Deep Rock Galactic ────────────────────────────────────────────────────
    ItemCase(
        game_name="Deep Rock Galactic",
        appid="548430",
        gid="1825727806720055",
        title="'Eight Years in Orbit' Anniversary Event is live now!",
        expected="not_major",
        reasoning=(
            "Live event announcement, not a game patch. "
            "Phase 2: EVENT_FESTIVAL_RE matches 'anniversary event'; no 'update'/'patch' in title "
            "→ UPDATE_OR_PATCH_RE guard fails → event blocker fires → not_major."
        ),
    ),
    ItemCase(
        game_name="Deep Rock Galactic",
        appid="548430",
        gid="1824644522847377",
        title="Lunar Festival 2026 is now live!",
        expected="not_major",
        reasoning=(
            "Seasonal event announcement, not a major patch/update. "
            "Phase 2: EVENT_FESTIVAL_RE matches 'festival'; no 'update'/'patch' → event blocker fires → not_major."
        ),
    ),
    # ── Lethal Company ────────────────────────────────────────────────────────
    ItemCase(
        game_name="Lethal Company",
        appid="1966720",
        gid="1800991756395986",
        title="V70 - The Incubating Update",
        expected="major",
        reasoning=(
            "Named major content update. "
            "Phase 2: NAMED_VERSION_RE matches 'V70'; UPDATE_WORD_RE matches 'Update' "
            "→ condition F makes it update-related; named version positive signal → major."
        ),
    ),
    ItemCase(
        game_name="Lethal Company",
        appid="1966720",
        gid="1801617199407807",
        title="V72 Bug fix patch",
        expected="not_major",
        reasoning=(
            "Small bug-fix patch. patchnotes tag makes it update-related. "
            "Phase 2: PATCH_WORD_RE matches 'patch'; MAINT_LANGUAGE_RE matches 'bug fix' "
            "→ maintenance blocker fires → not_major."
        ),
    ),
    # ── Factorio ──────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Factorio",
        appid="427520",
        gid="1827626365752749",
        title="Version 2.0.76 released as stable",
        expected="not_major",
        reasoning=(
            "Stable maintenance patch under a three-segment versioning scheme. "
            "Useful as a clean true negative."
        ),
    ),
    # ── Satisfactory ──────────────────────────────────────────────────────────
    ItemCase(
        game_name="Satisfactory",
        appid="526870",
        gid="1826992588604352",
        title="Update 1.2 is out now on Experimental!",
        expected="not_major",
        reasoning=(
            "Experimental-branch release, not a production major update. "
            "Phase 2: extended BRANCH_RE matches 'on Experimental' → branch blocker fires → not_major."
        ),
    ),
    ItemCase(
        game_name="Satisfactory",
        appid="526870",
        gid="1825093633185794",
        title="Experimental Hotfix v1.1.3.1",
        expected="not_major",
        reasoning=(
            "Experimental hotfix on a three-segment version. Correct behavior is not_major."
        ),
    ),
]
| # ── service-level ground truth ──────────────────────────────────────────────── | |
| # What SHOULD the production code do for this game given the current news window? | |
| # Populated from --discover run on 2026-03-19. | |
| # Phase 1 semantics: verdict based on is_major (major_date is not None), not on selected item title. | |
# One entry per game: should the service report a major update (major_date set)
# for the news window that was current when the ground truth was recorded?
SERVICE_CASES: list[ServiceCase] = [
    ServiceCase(
        game_name="Going Medieval",
        appid="1029780",
        expected_major=True,
        reasoning=(
            "Game released 1.0 on 2026-03-17. Phase 1: 'is out now in 1.0!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Timberborn",
        appid="1062090",
        expected_major=True,
        reasoning=(
            "Game reached 1.0 on 2026-03-12. Phase 1: '1.0 is live!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Hades II",
        appid="1145350",
        expected_major=True,
        reasoning=(
            "Game launched v1.0 on 2025-09-25. Phase 1: 'v1.0 Is Now Available!' matches "
            "RELEASE_PHRASE_RE → update-related (developer feed). VERSION_RE matches 'v1.0' → major. "
            "Subsequent hotfixes (v1.0 Hotfix 2, 3) are correctly blocked by HOTFIX_RE. "
            "major_date = v1.0 launch date, latest_update_date = most recent hotfix date. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Against the Storm",
        appid="1336490",
        expected_major=True,
        reasoning=(
            "'Brineworks Update (1.9) available!' is a named major content update. "
            "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
            "→ update-related (developer feed). VERSION_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Valheim",
        appid="892970",
        expected_major=False,
        reasoning=(
            "Top items are three-segment maintenance patches. "
            "Correctly classified as not_major. TN."
        ),
    ),
    ServiceCase(
        game_name="Manor Lords",
        appid="1363080",
        expected_major=True,
        reasoning=(
            "Current window contains a clearly labeled 'Major Update #6' post. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Project Zomboid",
        appid="108600",
        expected_major=False,
        reasoning=(
            "Current window is dominated by unstable builds and hotfixes. "
            "These should update activity, but should not count as major releases."
        ),
    ),
    ServiceCase(
        game_name="Dwarf Fortress",
        appid="975370",
        expected_major=False,
        reasoning=(
            "Current window contains only maintenance patches (53.11/53.10/53.09 plus hotfixes). "
            "Phase 2: maintenance blocker (patch + fix language) correctly blocks all of them → no major_date."
        ),
    ),
    ServiceCase(
        game_name="Helldivers 2",
        appid="553850",
        expected_major=True,
        reasoning=(
            "Current window contains 'Machinery of Oppression: 6.1.0', a named content update. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Lethal Company",
        appid="1966720",
        expected_major=True,
        reasoning=(
            "Current window contains 'V70 - The Incubating Update', a named major content drop, "
            "plus newer bug-fix patches. Phase 2: NAMED_VERSION_RE + UPDATE_WORD_RE detects V70 → major_date set."
        ),
    ),
    ServiceCase(
        game_name="Factorio",
        appid="427520",
        expected_major=False,
        reasoning=(
            "Current window contains only three-segment stable maintenance releases (2.0.x). "
            "Expected: not_major."
        ),
    ),
    ServiceCase(
        game_name="Satisfactory",
        appid="526870",
        expected_major=False,
        reasoning=(
            "Current window contains an experimental 1.2 rollout and experimental hotfixes. "
            "Phase 2: extended BRANCH_RE ('on Experimental') blocks the 1.2 rollout → no major_date."
        ),
    ),
]
| # ── helpers ─────────────────────────────────────────────────────────────────── | |
def _fmt_ts(ts: int | None) -> str:
    """Format a Unix timestamp as a ``YYYY-MM-DD`` date in UTC.

    Args:
        ts: Seconds since the epoch, or a falsy value (``None`` / ``0``)
            when the date is unknown.

    Returns:
        The formatted date, or ``"—"`` when ``ts`` is falsy or cannot be
        converted.
    """
    if not ts:
        return "—"
    try:
        return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
    except (OverflowError, OSError, ValueError):
        # fromtimestamp raises OverflowError (not ValueError) when the value
        # does not fit the platform's time_t, so it has to be caught as well.
        return "—"
def _fmt_dt(dt: datetime | None) -> str:
    """Return ``dt`` formatted as ``YYYY-MM-DD``, or ``"—"`` when it is ``None``."""
    return "—" if dt is None else dt.strftime("%Y-%m-%d")
def _trunc(s: str, n: int) -> str:
    """Shorten ``s`` to at most ``n`` characters, ending with ``…`` if cut.

    The ellipsis counts towards the limit, so the result always fits a
    column of width ``n`` (the report tables pad cells to exactly that width).

    Args:
        s: Text to shorten.
        n: Maximum length of the returned string.

    Returns:
        ``s`` unchanged when it already fits, otherwise its first ``n - 1``
        characters followed by ``…``; an empty string when ``n`` leaves no
        room for any character.
    """
    if len(s) <= n:
        return s
    if n <= 0:
        return ""
    return s[: n - 1] + "…"
def _fetch_news(client: httpx.Client, appid: str, count: int) -> list[dict]:
    """Fetch up to ``count`` news items for ``appid``; best-effort.

    Any non-200 response or raised exception is reported on stderr and
    turned into an empty list, so callers never see an error from here.

    Args:
        client: HTTP client used for the request.
        appid: Steam application id to query.
        count: Number of news items to request.

    Returns:
        The ``appnews.newsitems`` list of the JSON response, or ``[]``.
    """
    query = {"appid": appid, "count": count, "maxlength": 0}
    try:
        response = client.get(STEAM_NEWS_API_URL, params=query)
        if response.status_code == 200:
            payload = response.json()
            # A missing or null "newsitems" collapses to an empty list.
            return payload.get("appnews", {}).get("newsitems", []) or []
        print(f" [WARN] HTTP {response.status_code} for appid {appid}", file=sys.stderr)
        return []
    except Exception as exc:
        print(f" [WARN] Request failed for appid {appid}: {exc}", file=sys.stderr)
        return []
| # ── Mode 1: discover ────────────────────────────────────────────────────────── | |
def run_discover(count: int) -> None:
    """Print every fetched news item of every benchmark game with its classification.

    For each game in ``GAMES`` a table is printed with one row per news item
    (gid, date, title, feed label, tags and the results of
    ``_is_update_related`` / ``_is_major_update``), followed by the dates
    returned by ``_collect_update_candidates`` and the resulting verdict.

    Args:
        count: Number of news items to request per game.
    """
    if count != 20:
        print(f"NOTE: count={count} — beyond production window (prod uses count=20)\n")

    # Column heading → column width, in display order.
    widths = {
        "#": 4,
        "gid": 20,
        "date": 10,
        "title": 40,
        "feedlabel": 16,
        "tags": 24,
        "upd_rel?": 9,
        "major?": 7,
    }

    def render(cells) -> str:
        # Left-align each cell in its column; columns are separated by one space.
        return " ".join(
            f"{cell:<{width}}" for cell, width in zip(cells, widths.values())
        )

    header = render(widths)
    banner = "=" * len(header)
    rule = "-" * len(header)

    with httpx.Client(timeout=30.0) as client:
        for game_name, appid in GAMES:
            print(f"\n{banner}")
            print(f" {game_name} (appid={appid})")
            print(banner)
            print(header)
            print(rule)

            news = _fetch_news(client, appid, count)
            if not news:
                print(" (no items returned)")
                continue

            for position, entry in enumerate(news, start=1):
                gid_cell = str(entry.get("gid") or "")[: widths["gid"]]
                date_cell = _fmt_ts(entry.get("date"))
                title_cell = _trunc(entry.get("title", ""), widths["title"])
                feed_cell = _trunc(entry.get("feedlabel") or "", widths["feedlabel"])
                tags_cell = _trunc(str(entry.get("tags") or ""), widths["tags"])
                update_related = UpdateDetectionService._is_update_related(entry)
                major = UpdateDetectionService._is_major_update(entry)
                print(
                    render(
                        (
                            position,
                            gid_cell,
                            date_cell,
                            title_cell,
                            feed_cell,
                            tags_cell,
                            "Yes" if update_related else "No",
                            "Yes" if major else "No",
                        )
                    )
                )

            # Service-level outcome for the whole window of this game.
            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(news)
            print(f"\n >> latest_update_date: {_fmt_dt(latest_update_date)} | major_date: {_fmt_dt(major_date)}")
            outcome = "not_major" if major_date is None else "MAJOR"
            print(f" >> Service result: {outcome}")
| # ── Mode 2: evaluate (item-level) ───────────────────────────────────────────── | |
def run_evaluate() -> None:
    """Score the item-level heuristics against ``ITEM_CASES`` and print Report A.

    News is fetched once per distinct appid (20 items each). Every case is
    looked up by gid; an item counts as predicted "major" only when both
    ``_is_update_related`` and ``_is_major_update`` are true for it.
    Cases whose gid is not in the fetched window are reported as NOT FOUND
    and "ambiguous" cases are counted separately; neither enters the metrics.
    """
    if not ITEM_CASES:
        print("[evaluate] No item-level ground truth defined yet.")
        print(" Run --discover first, then populate ITEM_CASES in this script.")
        return

    # Build lookup: appid → {gid → item}. dict.fromkeys de-duplicates the
    # appids while keeping a deterministic (first-seen) fetch order.
    gid_index: dict[str, dict[str, dict]] = {}
    with httpx.Client(timeout=30.0) as client:
        for appid in dict.fromkeys(case.appid for case in ITEM_CASES):
            items = _fetch_news(client, appid, count=20)
            gid_index[appid] = {str(item.get("gid") or ""): item for item in items}

    tp = tn = fp = fn = amb = not_found = 0
    # Every row has the same nine columns, so printing needs a single format.
    rows: list[tuple[str, ...]] = []
    # Verdict per case, kept aligned with ITEM_CASES for the failure listing.
    verdicts: list[str] = []

    for case in ITEM_CASES:
        item = gid_index.get(case.appid, {}).get(case.gid)
        if item is None:
            not_found += 1
            verdicts.append("NOT FOUND")
            rows.append((
                case.game_name,
                _trunc(case.title, 30),
                "—",
                "—",
                "—",
                case.expected,
                "—",
                "—",
                "NOT FOUND",
            ))
            continue

        is_ur = UpdateDetectionService._is_update_related(item)
        is_maj = UpdateDetectionService._is_major_update(item)
        predicted = "major" if (is_ur and is_maj) else "not_major"
        expected = case.expected

        if expected == "ambiguous":
            verdict = "ambiguous"
            amb += 1
        elif predicted == expected:
            verdict = "PASS"
            if expected == "major":
                tp += 1
            else:
                tn += 1
        elif predicted == "major":
            verdict = "FAIL (FP)"
            fp += 1
        else:
            verdict = "FAIL (FN)"
            fn += 1

        verdicts.append(verdict)
        rows.append((
            case.game_name,
            _trunc(case.title, 30),
            _fmt_ts(item.get("date")),
            # `or ""` guards against null fields, as the discover table does.
            str(item.get("tags") or "")[:20],
            str(item.get("feedlabel") or "")[:16],
            expected,
            "Yes" if is_ur else "No",
            "Yes" if is_maj else "No",
            verdict,
        ))

    # Print report
    print("\n" + "=" * 110)
    print("REPORT A — Item-level classification")
    print("=" * 110)
    hdr = f"{'Game':<18} {'Title':<30} {'Date':<10} {'Tags':<20} {'FeedLabel':<16} {'Expected':<10} {'UpdRel?':<8} {'Major?':<7} Verdict"
    print(hdr)
    print("-" * 110)
    for row in rows:
        print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {row[3]:<20} {row[4]:<16} {row[5]:<10} {row[6]:<8} {row[7]:<7} {row[8]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total cases : {len(ITEM_CASES)} | not found: {not_found} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        # Precision/recall are undefined (nan) when their denominator is zero.
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # Pair each case with its own verdict instead of searching the rendered
    # rows, so a title containing "FAIL (FP)" or a duplicated case cannot
    # be attributed to the wrong list.
    fps = [c for c, v in zip(ITEM_CASES, verdicts) if v == "FAIL (FP)"]
    fns = [c for c, v in zip(ITEM_CASES, verdicts) if v == "FAIL (FN)"]
    if fps:
        print("\nFalse Positives:")
        for c in fps:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
    if fns:
        print("\nFalse Negatives:")
        for c in fns:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
| # ── Mode 3: evaluate-service (end-to-end) ───────────────────────────────────── | |
def run_evaluate_service() -> None:
    """Score the end-to-end selection against ``SERVICE_CASES`` and print Report B.

    For each game 20 news items are fetched and passed to
    ``_collect_update_candidates``; the game is predicted "major" when the
    returned ``major_date`` is not ``None``. Games for which no news items
    came back (failed request or empty feed) are reported as NO DATA and
    left out of the metrics, because an empty window says nothing about the
    heuristic and would otherwise be scored as a "not major" prediction.
    Cases with ``expected_major=None`` are counted as ambiguous.
    """
    if not SERVICE_CASES:
        print("[evaluate-service] No service-level ground truth defined yet.")
        print(" Run --discover first, then populate SERVICE_CASES in this script.")
        return

    tp = tn = fp = fn = amb = no_data = 0
    rows: list[tuple[str, str, str, str, str, str]] = []

    with httpx.Client(timeout=30.0) as client:
        for case in SERVICE_CASES:
            # "True" / "False" / "None" exactly as the expectation is declared.
            expected_label = str(case.expected_major)

            items = _fetch_news(client, case.appid, count=20)
            if not items:
                no_data += 1
                rows.append((case.game_name, "—", "—", "—", expected_label, "NO DATA"))
                continue

            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items)
            is_maj = major_date is not None

            if case.expected_major is None:
                verdict = "ambiguous"
                amb += 1
            elif is_maj == case.expected_major:
                verdict = "PASS"
                if case.expected_major:
                    tp += 1
                else:
                    tn += 1
            elif is_maj:
                verdict = "FAIL (FP)"
                fp += 1
            else:
                verdict = "FAIL (FN)"
                fn += 1

            rows.append((
                case.game_name,
                _fmt_dt(latest_update_date),
                _fmt_dt(major_date),
                "Yes" if is_maj else "No",
                expected_label,
                verdict,
            ))

    print("\n" + "=" * 100)
    print("REPORT B — Service-level (end-to-end)")
    print("=" * 100)
    hdr = f"{'Game':<18} {'LatestUpdate':<13} {'MajorDate':<11} {'Major?':<7} {'Expected':<9} Verdict"
    print(hdr)
    print("-" * 100)
    for row in rows:
        print(f"{row[0]:<18} {row[1]:<13} {row[2]:<11} {row[3]:<7} {row[4]:<9} {row[5]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total games : {len(SERVICE_CASES)} | no data: {no_data} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        # Precision/recall are undefined (nan) when their denominator is zero.
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # rows holds exactly one entry per case, in the same order.
    for case, row in zip(SERVICE_CASES, rows):
        verdict = row[5]
        if verdict.startswith("FAIL"):
            print(f"\n [{case.game_name}] {verdict} — {case.reasoning}")
| # ── main ────────────────────────────────────────────────────────────────────── | |
def _parse_args() -> argparse.Namespace:
    """Parse the command line.

    Returns:
        A namespace with the boolean mode flags ``discover``, ``evaluate``
        and ``evaluate_service`` plus the integer ``count`` (default 20).
    """
    parser = argparse.ArgumentParser(
        description="Benchmark the major update detection heuristic against real Steam games."
    )

    # The three mode switches differ only in name and help text; argparse
    # derives the attribute name from the flag ("--evaluate-service" →
    # "evaluate_service").
    mode_flags = (
        ("--discover", "Fetch news for all games and display per-item classification details."),
        ("--evaluate", "Run item-level evaluation against ITEM_CASES ground truth."),
        (
            "--evaluate-service",
            "Run service-level end-to-end evaluation against SERVICE_CASES ground truth.",
        ),
    )
    for flag, help_text in mode_flags:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument(
        "--count",
        type=int,
        default=20,
        help=(
            "Number of news items to fetch (default: 20, matches production). "
            "Values > 20 are beyond the production window."
        ),
    )
    return parser.parse_args()
def main() -> int:
    """Run the modes selected on the command line.

    When no mode flag is given, both evaluation modes run. Modes always
    execute in the order discover, evaluate, evaluate-service.

    Returns:
        Always ``0``.
    """
    args = _parse_args()

    # Default: run both evaluate modes when nothing is specified
    nothing_selected = not (args.discover or args.evaluate or args.evaluate_service)
    want_items = args.evaluate or nothing_selected
    want_service = args.evaluate_service or nothing_selected

    if args.discover:
        run_discover(count=args.count)
    if want_items:
        run_evaluate()
    if want_service:
        run_evaluate_service()
    return 0
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())