GitHub Action committed on
Commit
8ff1b66
·
0 Parent(s):

deploy: worker release from GitHub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +23 -0
  2. Dockerfile +47 -0
  3. README.md +14 -0
  4. backend/.env.example +38 -0
  5. backend/app/__init__.py +8 -0
  6. backend/app/core/__init__.py +1 -0
  7. backend/app/core/config.py +174 -0
  8. backend/app/core/freshness.py +71 -0
  9. backend/app/core/jieba_userdict.txt +14 -0
  10. backend/app/core/keywords.py +273 -0
  11. backend/app/core/rate_limit.py +6 -0
  12. backend/app/core/sampling.py +135 -0
  13. backend/app/core/stopwords_zh.py +39 -0
  14. backend/app/core/ttl_tiers.py +19 -0
  15. backend/app/core/worker_logging.py +316 -0
  16. backend/app/db/__init__.py +1 -0
  17. backend/app/db/mongodb.py +1152 -0
  18. backend/app/main.py +159 -0
  19. backend/app/models/__init__.py +19 -0
  20. backend/app/models/schemas.py +210 -0
  21. backend/app/routers/__init__.py +5 -0
  22. backend/app/routers/analyze.py +597 -0
  23. backend/app/routers/games.py +68 -0
  24. backend/app/services/__init__.py +6 -0
  25. backend/app/services/analysis_runner.py +643 -0
  26. backend/app/services/analysis_utils.py +259 -0
  27. backend/app/services/game_sync_service.py +290 -0
  28. backend/app/services/highlights_service.py +202 -0
  29. backend/app/services/nlp_service.py +524 -0
  30. backend/app/services/precache_service.py +199 -0
  31. backend/app/services/priority_refresh_service.py +387 -0
  32. backend/app/services/steam_errors.py +22 -0
  33. backend/app/services/steam_service.py +499 -0
  34. backend/app/services/update_detection_service.py +453 -0
  35. backend/pytest.ini +6 -0
  36. backend/requirements.txt +42 -0
  37. backend/scripts/smoke_news_cursor.py +264 -0
  38. backend/scripts/smoke_test.py +185 -0
  39. backend/worker_main.py +244 -0
  40. scripts/benchmark_major_update.py +848 -0
  41. scripts/check_db_stats.py +47 -0
  42. scripts/expand_keywords/__init__.py +8 -0
  43. scripts/expand_keywords/__main__.py +6 -0
  44. scripts/expand_keywords/config.py +106 -0
  45. scripts/expand_keywords/expander.py +350 -0
  46. scripts/expand_keywords/fetcher.py +355 -0
  47. scripts/expand_keywords/keywords_base.py +324 -0
  48. scripts/expand_keywords/main.py +447 -0
  49. scripts/expand_keywords/preprocessor.py +282 -0
  50. scripts/expand_keywords/trainer.py +185 -0
.dockerignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore everything by default
2
+ *
3
+
4
+ # Allow only what is needed for Docker build
5
+ !backend/
6
+ !frontend/
7
+ !scripts/
8
+ !Dockerfile
9
+ !README.md
10
+ !requirements.txt
11
+ !.gitignore
12
+
13
+ # Exclude unnecessary subfolders
14
+ backend/tests/
15
+ backend/__pycache__/
16
+ backend/.pytest_cache/
17
+ frontend/node_modules/
18
+ frontend/dist/
19
+
20
+ # Exclude specific files
21
+ *.pdf
22
+ .env
23
+ *.log
Dockerfile ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------
2
+ # Stage 1: Quantize NLP model (torch needed ONLY here for PyTorch -> ONNX export)
3
+ # ------------------------------------------------------------------------------
4
+ FROM python:3.11-slim AS model-quantizer
5
+
6
+ WORKDIR /app
7
+
8
+ RUN pip install --no-cache-dir \
9
+ --extra-index-url https://download.pytorch.org/whl/cpu \
10
+ "torch==2.2.0" \
11
+ "optimum[onnxruntime]==1.16.2" \
12
+ "transformers==4.37.2" \
13
+ "huggingface-hub==0.20.3" \
14
+ "numpy<2.0.0"
15
+
16
+ COPY scripts/quantize_model.py scripts/quantize_model.py
17
+ RUN python3 scripts/quantize_model.py
18
+
19
+
20
+ # ------------------------------------------------------------------------------
21
+ # Stage 2: Runtime (Python FastAPI Worker — no torch, no frontend)
22
+ # ------------------------------------------------------------------------------
23
+ FROM python:3.11-slim
24
+
25
+ WORKDIR /app
26
+
27
+ # Create non-root user for security
28
+ RUN useradd -m -u 1000 user
29
+ USER user
30
+ ENV HOME=/home/user \
31
+ PATH=/home/user/.local/bin:$PATH
32
+
33
+ # Install Python dependencies (no torch — ~700MB RAM saved)
34
+ COPY --chown=user:user backend/requirements.txt backend/requirements.txt
35
+ RUN pip install --no-cache-dir --upgrade -r backend/requirements.txt
36
+
37
+ # Copy Backend code
38
+ COPY --chown=user:user backend backend
39
+
40
+ # Copy pre-quantized ONNX model from Stage 1
41
+ COPY --chown=user:user --from=model-quantizer /app/backend/models/quantized backend/models/quantized
42
+
43
+ WORKDIR /app/backend
44
+
45
+ EXPOSE 7860
46
+
47
+ CMD ["uvicorn", "worker_main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SentimentStream Worker
3
+ emoji: ⚙️
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: agpl-3.0
10
+ ---
11
+
12
+ # SentimentStream Worker
13
+
14
+ Background worker for SentimentStream. Syncs games from SteamSpy, detects updates via Steam News API, and pre-caches sentiment analyses.
backend/.env.example ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MongoDB
2
+ MONGODB_URL=mongodb://admin:password@localhost:27017
3
+ MONGODB_DB_NAME=sentimentSummary
4
+
5
+ # App Settings
6
+ DEBUG=true
7
+ CORS_ORIGINS=http://localhost:5173,http://localhost:3000
8
+
9
+ # Cache Settings
10
+ CACHE_TTL_HOURS=24
11
+
12
+ # Steam API Settings
13
+ REVIEW_BATCH_SIZE=500
14
+ STEAM_REVIEW_LANGUAGE=schinese
15
+ STEAM_REGION=CN
16
+
17
+ # Steam API Retry
18
+ STEAM_RETRY_MAX_ATTEMPTS=3
19
+ STEAM_RETRY_BASE_DELAY=1.0
20
+ STEAM_RETRY_MAX_DELAY=10.0
21
+
22
+ # Sampling Settings - Statistical sampling parameters
23
+ SAMPLE_TOP_HELPFUL=50
24
+ SAMPLE_CONFIDENCE_LEVEL=0.95
25
+ SAMPLE_MARGIN_OF_ERROR=0.01
26
+ SAMPLE_MAX_REVIEWS=3000
27
+
28
+ # NLP Settings - Hugging Face Models
29
+ HF_SENTIMENT_MODEL=uer/roberta-base-finetuned-jd-binary-chinese
30
+
31
+ # NLP Settings - Analysis Parameters
32
+ TEXT_MAX_LENGTH=512
33
+ SENTIMENT_POSITIVE_THRESHOLD=0.1
34
+ SENTIMENT_NEGATIVE_THRESHOLD=-0.1
35
+ TOPIC_MIN_MENTIONS=5
36
+
37
+ # Deduplication Cache
38
+ DEDUP_CACHE_MAXSIZE=10000
backend/app/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SentimentStream Backend Application.
3
+
4
+ Narzędzie do analizy sentymentu i modelowania tematów
5
+ w recenzjach gier Steam w czasie rzeczywistym.
6
+ """
7
+
8
+ __version__ = "0.1.0"
backend/app/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Moduł konfiguracji aplikacji."""
backend/app/core/config.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Konfiguracja aplikacji.
3
+
4
+ Wykorzystuje Pydantic Settings do zarządzania zmiennymi środowiskowymi.
5
+ """
6
+
7
+ from functools import lru_cache
8
+
9
+ from pydantic_settings import BaseSettings, SettingsConfigDict
10
+
11
+
12
class Settings(BaseSettings):
    """
    Application settings loaded from environment variables.

    Values come from the process environment or from the first readable
    ``.env`` file; unknown variables are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file=(".env", "backend/.env"),
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    # MongoDB
    mongodb_url: str = ""
    mongodb_db_name: str = "sentimentSummary"

    # App Mode
    app_mode: str = "full"  # "full" = monolith, "api" = API-only (no frontend)

    # App Settings
    debug: bool = False
    cors_origins: str = "http://localhost:5173,http://localhost:3000"

    # Cache Settings
    cache_ttl_hours: int = 24
    cache_ttl_short_hours: int = 12  # frequently updated games
    cache_ttl_long_hours: int = 168  # stable games (7 days)
    cache_ttl_worker_managed_hours: int = 1440  # 60 days to preserve stale fallback results
    cache_ttl_on_demand_hours: int = 1440  # 60 days to preserve stale fallback results

    # Incremental Analysis
    incremental_enabled: bool = True
    incremental_max_stored_ids: int = 5000
    incremental_max_gap_days: int = 90  # fall back to full analysis after this many days without reviews
    recent_sample_limit: int = 1000
    niche_cache_max_age_days: int = 60
    analysis_freshness_max_age_days: int = 60
    patch_context_max_age_days: int = 90
    dlc_min_reviews_for_analysis: int = 50
    dlc_visible_in_search: bool = False  # Temporary policy: hide DLC from autocomplete/suggestions
    dlc_worker_analysis_enabled: bool = False  # Temporary policy: exclude DLC from worker-managed analysis

    # Steam API Settings
    review_batch_size: int = 100
    steam_review_language: str = "schinese"  # Review fetch scope; product analyzes Simplified Chinese Steam reviews.
    steam_region: str = "CN"  # CN, US, etc.

    # Steam API Retry
    steam_retry_max_attempts: int = 3
    steam_retry_base_delay: float = 1.0  # doubles each retry
    steam_retry_max_delay: float = 10.0  # cap

    # Steam API Error Cache TTL (seconds)
    steam_error_cache_ttl_404: int = 3600  # 1h
    steam_error_cache_ttl_429: int = 300  # 5min

    # Sampling Settings - Statistical sampling parameters
    sample_top_helpful: int = 50
    sample_confidence_level: float = 0.95
    sample_margin_of_error: float = 0.02
    sample_max_reviews: int = 3000
    sample_minority_min: int = 100

    # NLP Settings - Analysis Parameters
    text_max_length: int = 512
    sentiment_positive_threshold: float = 0.1
    sentiment_negative_threshold: float = -0.1
    topic_min_mentions: int = 5

    # NLP Settings - Deduplication Cache
    dedup_cache_maxsize: int = 10000

    # NLP Settings - Performance & Logic
    nlp_onnx_intra_threads: int = 2
    nlp_onnx_inter_threads: int = 2
    nlp_negation_window: int = 3

    # Prediction Settings
    prediction_retention_threshold_pos: float = 0.2
    prediction_retention_threshold_neg: float = -0.2

    # Community Highlights
    highlights_ngram_min: int = 2
    highlights_ngram_max: int = 5
    highlights_min_mentions: int = 3
    highlights_max_doc_freq_ratio: float = 0.4
    highlights_top_n_general: int = 15
    highlights_top_n_per_topic: int = 5

    # Worker — Pre-cache
    worker_trigger_token: str = ""
    precache_enabled: bool = False
    precache_top_n_games: int = 500
    precache_batch_delay_seconds: int = 10
    precache_checkpoints_hours: str = "6,12,24,72,168,336"
    precache_max_analyses_per_cycle: int = 50

    # Worker — Priority Games
    steam_priority_categories: str = "top_sellers,new_releases,specials"
    steam_priority_regions: str = "CN,US"
    steam_priority_grace_days: int = 3
    steam_priority_categories_url: str = "https://store.steampowered.com/api/featuredcategories"
    steam_bootstrap_max_per_cycle: int = 20
    steam_bootstrap_delay: float = 1.5

    # Worker — News Scan
    news_refresh_window_hours: int = 6
    news_initial_count: int = 20
    news_incremental_count: int = 5

    # Worker — Game Sync
    game_sync_enabled: bool = False
    game_sync_steamspy_delay: float = 61.0
    game_sync_details_delay: float = 1.1
    game_sync_top_n_details: int = 500
    game_sync_cn_enrichment_delay: float = 1.5
    game_sync_cn_enrichment_limit: int = 200
    game_sync_app_type_enrichment_delay: float = 1.5
    game_sync_app_type_enrichment_limit: int = 200

    # Logging (both Live API and Worker)
    worker_log_dir: str = "/data/worker_logs"
    worker_log_fallback_dir: str = "/tmp/worker_logs"
    worker_log_max_bytes: int = 5_000_000  # 5 MB per file
    worker_log_backup_count: int = 3  # 3 rotated files = 20 MB max
    nlp_verbose_logging: bool = False  # re-enable NLP debug logs to stdout
    nlp_debug_log_max_bytes: int = 2_000_000  # 2 MB per file
    errors_log_max_bytes: int = 2_000_000  # 2 MB per file

    # Rate Limiting
    rate_limit_analyze: str = "10/minute"
    rate_limit_default: str = "30/minute"

    # NLP Settings - Hugging Face Models
    # Using specialized Chinese model (RoBERTa-JD) - 90% accuracy on product reviews
    hf_sentiment_model: str = "uer/roberta-base-finetuned-jd-binary-chinese"

    @property
    def cors_origins_list(self) -> list[str]:
        """Allowed CORS origins; blank segments (e.g. a trailing comma) are dropped."""
        return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()]

    @property
    def precache_checkpoints_list(self) -> list[int]:
        """Parse checkpoint hours from the comma-separated string, sorted ascending.

        Blank segments are skipped so a stray trailing comma in the env var
        does not raise ``ValueError`` from ``int("")``.
        """
        return sorted(
            int(h.strip()) for h in self.precache_checkpoints_hours.split(",") if h.strip()
        )

    @property
    def steam_priority_categories_list(self) -> list[str]:
        """Featured-store categories scanned by the priority worker."""
        return [c.strip() for c in self.steam_priority_categories.split(",") if c.strip()]

    @property
    def steam_priority_regions_list(self) -> list[str]:
        """Store regions used for priority game discovery."""
        return [r.strip() for r in self.steam_priority_regions.split(",") if r.strip()]
166
+
167
+
168
@lru_cache
def get_settings() -> Settings:
    """Return the process-wide singleton ``Settings`` instance (cached)."""
    return Settings()


# Shared settings object imported throughout the application.
settings = get_settings()
backend/app/core/freshness.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Product-level analysis freshness rules.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import datetime, timezone
8
+ from enum import Enum
9
+ from typing import Any, cast
10
+
11
+ from app.core.config import settings
12
+
13
+
14
+ class FreshnessStatus(str, Enum):
15
+ """Product freshness state for an existing analysis."""
16
+
17
+ FRESH = "fresh"
18
+ STALE_BY_AGE = "stale_by_age"
19
+ STALE_BY_PATCH = "stale_by_patch"
20
+
21
+
22
+ def _as_utc_datetime(value: Any) -> datetime | None:
23
+ if value is None:
24
+ return None
25
+ if isinstance(value, datetime):
26
+ return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
27
+ if isinstance(value, str):
28
+ parsed = datetime.fromisoformat(value)
29
+ return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
30
+ return None
31
+
32
+
33
+ def get_analysis_reference_at(document: dict[str, Any]) -> datetime | None:
34
+ """Return the best available execution timestamp for freshness checks."""
35
+ raw = document.get("results")
36
+ results: dict[str, Any] = cast(dict[str, Any], raw) if isinstance(raw, dict) else {}
37
+ return (
38
+ _as_utc_datetime(results.get("analysis_date"))
39
+ or _as_utc_datetime(document.get("analyzed_at"))
40
+ or _as_utc_datetime(document.get("cached_at"))
41
+ )
42
+
43
+
44
+ def evaluate_freshness(
45
+ document: dict[str, Any],
46
+ current_patch_at: datetime | None,
47
+ ) -> FreshnessStatus:
48
+ """
49
+ Evaluate analysis freshness using product rules:
50
+ patch recency first, then max age.
51
+ """
52
+ analysis_at = get_analysis_reference_at(document)
53
+ if analysis_at is None:
54
+ return FreshnessStatus.STALE_BY_AGE
55
+
56
+ if current_patch_at is not None and analysis_at < current_patch_at:
57
+ return FreshnessStatus.STALE_BY_PATCH
58
+
59
+ age_days = (datetime.now(timezone.utc) - analysis_at).days
60
+ if age_days > settings.analysis_freshness_max_age_days:
61
+ return FreshnessStatus.STALE_BY_AGE
62
+
63
+ return FreshnessStatus.FRESH
64
+
65
+
66
+ def get_staleness_reason(status: FreshnessStatus) -> str | None:
67
+ if status == FreshnessStatus.STALE_BY_AGE:
68
+ return "STALE_REASON_AGE"
69
+ if status == FreshnessStatus.STALE_BY_PATCH:
70
+ return "STALE_REASON_PATCH"
71
+ return None
backend/app/core/jieba_userdict.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ boss战 5 n
2
+ 开放世界 5 n
3
+ 大逃杀 5 n
4
+ 战斗通行证 5 n
5
+ 皮肤系统 5 n
6
+ 氪金 10 v
7
+ 开箱 5 v
8
+ 人机对战 5 n
9
+ 帧数不稳 5 n
10
+ 内存泄漏 5 n
11
+ 手感好 5 a
12
+ 手感差 5 a
13
+ 上手简单 5 a
14
+ 劝退新手 5 v
backend/app/core/keywords.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chinese keywords for game review topic detection.
3
+ Used in hybrid approach (Keywords + ML Sentiment).
4
+
5
+ Categories based on common topics in Steam game reviews.
6
+ Seed keywords will be expanded using the expand_keywords pipeline.
7
+
8
+ Structure: topic -> {single_char, compound, phrase}
9
+ - single_char: standalone Chinese characters (1 char, prone to false positives)
10
+ - compound: multi-char Chinese words or short English words
11
+ - phrase: multi-word phrases (EN or ZH)
12
+ """
13
+
14
+ TOPIC_KEYWORDS: dict[str, dict[str, list[str]]] = {
15
+ # =========================================================================
16
+ # CORE GAMEPLAY - 核心玩法
17
+ # =========================================================================
18
+ "Gameplay": {
19
+ "single_char": ["刷", "肝"],
20
+ "compound": [
21
+ "玩法", "游戏性", "机制", "战斗", "任务", "关卡",
22
+ "探索", "技能", "装备", "gameplay",
23
+ ],
24
+ "phrase": ["战斗系统"],
25
+ },
26
+
27
+ "Fun": {
28
+ "single_char": ["爽", "烂"],
29
+ "compound": [
30
+ # Positive
31
+ "好玩", "有趣", "上瘾", "神作", "佳作", "精品",
32
+ "沉浸", "过瘾", "带感", "回血", "爽游",
33
+ "解压", "杀时间",
34
+ # Negative
35
+ "无聊", "枯燥", "乏味", "垃圾", "辣鸡", "粪作",
36
+ "失望", "无趣",
37
+ ],
38
+ "phrase": [
39
+ "电子伟哥", "治好了", "精神时光屋", "时光屋",
40
+ "电子阳痿", "电子ed",
41
+ ],
42
+ },
43
+
44
+ "Difficulty": {
45
+ "single_char": [],
46
+ "compound": [
47
+ "难度", "简单", "困难", "硬核",
48
+ "劝退", "手残", "新手", "上手",
49
+ "souls", "魂类",
50
+ ],
51
+ "phrase": ["太难", "太简单"],
52
+ },
53
+
54
+ # =========================================================================
55
+ # TECHNICAL - 技术
56
+ # =========================================================================
57
+ "Performance": {
58
+ "single_char": ["卡"],
59
+ "compound": [
60
+ "优化", "卡顿", "帧率", "帧数", "流畅", "掉帧",
61
+ "丝滑", "显卡", "显存", "延迟",
62
+ "fps", "cpu", "gpu",
63
+ ],
64
+ "phrase": [
65
+ "稳60", "锁60", "解锁帧率", "吃配置", "带不动",
66
+ "PPT效果", "幻灯片", "帧生成", "输入延迟", "帧数不稳",
67
+ ],
68
+ },
69
+
70
+ "Bugs": {
71
+ "single_char": [],
72
+ "compound": [
73
+ "闪退", "崩溃", "卡死", "报错", "存档",
74
+ "黑屏", "进不去", "打不开", "未响应", "无响应",
75
+ "弹窗", "坏档", "掉线",
76
+ "bug", "bugs",
77
+ ],
78
+ "phrase": [
79
+ "存档损坏", "无法保存", "卡加载",
80
+ "加载失败", "连不上",
81
+ ],
82
+ },
83
+
84
+ # =========================================================================
85
+ # AUDIO-VISUAL - 视听
86
+ # =========================================================================
87
+ "Graphics": {
88
+ "single_char": [],
89
+ "compound": [
90
+ "画面", "画质", "特效", "建模", "贴图",
91
+ "美术", "风格", "场景", "光影",
92
+ "4k", "hdr",
93
+ ],
94
+ "phrase": [],
95
+ },
96
+
97
+ "Sound": {
98
+ "single_char": [],
99
+ "compound": [
100
+ "音乐", "音效", "配音", "配乐", "声音",
101
+ "原声",
102
+ "bgm", "ost",
103
+ ],
104
+ "phrase": ["中文配音"],
105
+ },
106
+
107
+ # =========================================================================
108
+ # CONTENT & VALUE - 内容与价值
109
+ # =========================================================================
110
+ "Content": {
111
+ "single_char": [],
112
+ "compound": [
113
+ "内容", "时长", "流程", "耐玩", "通关",
114
+ "主线", "支线", "收集", "小时", "体量",
115
+ "注水", "重复", "换皮", "多周目",
116
+ "dlc",
117
+ ],
118
+ "phrase": [
119
+ "素材复用", "拖时长", "强行延长", "通关后",
120
+ ],
121
+ },
122
+
123
+ "Monetization": {
124
+ "single_char": [],
125
+ "compound": [
126
+ # ex-Price
127
+ "价格", "定价", "值得", "不值", "贵", "便宜",
128
+ "打折", "史低", "入手", "白嫖", "性价比",
129
+ # ex-Microtransactions
130
+ "氪金", "内购", "充值", "抽卡", "648",
131
+ "课金", "首充", "月卡", "战令", "季票",
132
+ "开箱", "箱子", "钥匙", "保底", "抽奖",
133
+ "p2w",
134
+ ],
135
+ "phrase": [
136
+ "通行证", "pay to win",
137
+ ],
138
+ },
139
+
140
+ # =========================================================================
141
+ # MULTIPLAYER & COMMUNITY - 多人与社区
142
+ # =========================================================================
143
+ "Multiplayer": {
144
+ "single_char": [],
145
+ "compound": [
146
+ "联机", "多人", "匹��", "服务器", "延迟",
147
+ "掉线", "开黑", "组队", "单机", "野排", "车队",
148
+ "单排", "组排", "路人", "挂机",
149
+ "pvp", "pve", "coop",
150
+ ],
151
+ "phrase": [
152
+ "坑比", "猪队友", "送人头",
153
+ ],
154
+ },
155
+
156
+ "Community": {
157
+ "single_char": [],
158
+ "compound": [
159
+ "社区", "玩家", "汉化",
160
+ "官方", "民间",
161
+ "mod", "mods",
162
+ ],
163
+ "phrase": ["创意工坊"],
164
+ },
165
+
166
+ # =========================================================================
167
+ # CONTROLS & UI - 操控与界面
168
+ # =========================================================================
169
+ "Controls": {
170
+ "single_char": [],
171
+ "compound": [
172
+ "操作", "手感", "手柄", "键鼠", "键盘",
173
+ "摇杆", "触发", "键位", "改键",
174
+ "死区", "陀螺仪", "扳机", "震动",
175
+ ],
176
+ "phrase": [
177
+ "自定义键位", "辅助瞄准", "触觉反馈", "自适应扳机",
178
+ ],
179
+ },
180
+
181
+ "UI": {
182
+ "single_char": [],
183
+ "compound": [
184
+ "界面", "菜单", "字幕", "字体",
185
+ "中文", "汉化",
186
+ "ui", "hud",
187
+ ],
188
+ "phrase": [],
189
+ },
190
+
191
+ # =========================================================================
192
+ # STORY & NARRATIVE - 剧情
193
+ # =========================================================================
194
+ "Story": {
195
+ "single_char": [],
196
+ "compound": [
197
+ "剧情", "故事", "人物", "角色", "结局",
198
+ "剧本", "叙事", "世界观", "背景", "喂屎",
199
+ "烂尾", "降智", "工具人", "脸谱化",
200
+ "剧情杀", "都合主义",
201
+ "npc",
202
+ ],
203
+ "phrase": ["逻辑硬伤"],
204
+ },
205
+
206
+ # =========================================================================
207
+ # DEVELOPER SUPPORT - 开发支持
208
+ # =========================================================================
209
+ "Support": {
210
+ "single_char": [],
211
+ "compound": [
212
+ "更新", "修复", "维护", "开发商", "官方",
213
+ "补丁", "版本",
214
+ ],
215
+ "phrase": [],
216
+ },
217
+
218
+ "Localization": {
219
+ "single_char": [],
220
+ "compound": [
221
+ "本地化", "汉化", "翻译", "机翻", "缺字", "乱码",
222
+ "繁体", "简体",
223
+ ],
224
+ "phrase": [
225
+ "语言支持", "中文支持", "无中文", "不支援中文",
226
+ "文本质量", "字幕翻译", "界面翻译",
227
+ ],
228
+ },
229
+
230
+ # =========================================================================
231
+ # REFINEMENT - 打磨
232
+ # =========================================================================
233
+ "Polish": {
234
+ "single_char": [],
235
+ "compound": [
236
+ "打磨", "精致", "粗糙", "用心", "敷衍", "细节",
237
+ "诚意", "偷懒", "不用心", "精良", "精美",
238
+ ],
239
+ "phrase": ["粗制滥造"],
240
+ },
241
+
242
+ # =========================================================================
243
+ # RETENTION - 留存
244
+ # =========================================================================
245
+ "Retention": {
246
+ "single_char": [],
247
+ "compound": [
248
+ # Positive (High Retention)
249
+ "推荐", "安利", "入正", "入坑", "必玩",
250
+ "神作", "年度", "满分",
251
+ # Negative (Churn)
252
+ "退款", "卸载", "弃坑", "劝退", "不推荐",
253
+ "避雷", "踩雷", "退坑",
254
+ "回坑", "出坑", "已弃",
255
+ ],
256
+ "phrase": [
257
+ "坚持玩", "每天玩", "停不下来", "刷了",
258
+ "已退", "退款了",
259
+ ],
260
+ },
261
+ }
262
+
263
+ # =============================================================================
264
+ # EXCLUSIONS (Context-aware filtering)
265
+ # =============================================================================
266
+ # Words to exclude when they appear in certain contexts.
267
+ # Format: "keyword": ["context_word1", "context_word2"]
268
+
269
# Maps an ambiguous keyword to context words that disqualify a match.
EXCLUSIONS: dict[str, list[str]] = {
    # "fps" as genre (FPS shooter) vs performance (60 fps)
    "fps": ["射击", "枪战", "第一人称"],
    # Empty for now - will be expanded based on false positives
}
backend/app/core/rate_limit.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Shared rate limiter instance for the application."""
2
+
3
+ from slowapi import Limiter
4
+ from slowapi.util import get_remote_address
5
+
6
+ limiter = Limiter(key_func=get_remote_address)
backend/app/core/sampling.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Moduł do obliczania statystycznej wielkości próbki.
3
+
4
+ Implementuje wzory statystyczne dla próbkowania populacji.
5
+ """
6
+
7
+ import math
8
+ from dataclasses import dataclass
9
+
10
+ from app.core.config import settings
11
+
12
+
13
+ # Wartości Z dla poziomów ufności
14
+ Z_SCORES = {
15
+ 0.90: 1.645,
16
+ 0.95: 1.96,
17
+ 0.99: 2.576,
18
+ }
19
+
20
+
21
+ @dataclass
22
+ class SamplePlan:
23
+ """
24
+ Plan próbkowania dla gry.
25
+
26
+ Attributes:
27
+ top_helpful: Liczba najprzydatniejszych recenzji.
28
+ statistical_sample: Wielkość próbki statystycznej.
29
+ positive_count: Ile pobrać pozytywnych (stratified).
30
+ negative_count: Ile pobrać negatywnych (stratified).
31
+ total: Łączna liczba recenzji do pobrania.
32
+ """
33
+
34
+ top_helpful: int
35
+ statistical_sample: int
36
+ positive_count: int
37
+ negative_count: int
38
+ total: int
39
+
40
+
41
+ def calculate_sample_size(
42
+ population: int,
43
+ confidence_level: float | None = None,
44
+ margin_of_error: float | None = None,
45
+ ) -> int:
46
+ """
47
+ Oblicza minimalną wielkość próbki dla danej populacji.
48
+ Wykorzystuje wzór Cochrana z korektą dla populacji skończonej.
49
+ """
50
+ if confidence_level is None:
51
+ confidence_level = settings.sample_confidence_level
52
+ if margin_of_error is None:
53
+ margin_of_error = settings.sample_margin_of_error
54
+
55
+ # 1. Pobieramy Z-score (np. 1.96 dla 95% ufności).
56
+ # Mówi on, jak bardzo wynik może odbiegać od średniej w jednostkach odchylenia standardowego.
57
+ z = Z_SCORES.get(confidence_level, 1.96)
58
+
59
+ # 2. Zakładamy p=0.5 (maksymalna zmienność).
60
+ # To daje nam najbezpieczniejszą (największą) wielkość próbki.
61
+ p = 0.5
62
+
63
+ # 3. Wzór Cochrana dla nieskończonej populacji:
64
+ # n0 = (Z^2 * p * (1-p)) / e^2
65
+ # Wyjaśnienie: Z kwadrat razy zmienność, podzielone przez kwadrat błędu.
66
+ n_0 = (z ** 2 * p * (1 - p)) / (margin_of_error ** 2)
67
+
68
+ # 4. Korekta dla populacji skończonej (Steam ma policzalną liczbę recenzji):
69
+ # n = n0 / (1 + (n0 - 1) / N)
70
+ # Wyjaśnienie: Zmniejszamy próbkę, bo wiemy dokładnie, ile osób (recenzji) jest w "całym świecie" tej gry.
71
+ n = n_0 / (1 + (n_0 - 1) / population)
72
+
73
+ # Zaokrąglamy w górę do pełnej recenzji
74
+ return math.ceil(n)
75
+
76
+
77
def create_sample_plan(
    total_reviews: int,
    positive_reviews: int,
    negative_reviews: int,
) -> SamplePlan:
    """
    Build a sampling plan combining two approaches: a fixed top-helpful
    bucket plus a stratified statistical sample.

    Args:
        total_reviews: Total review count for the game.
        positive_reviews: Number of positive reviews available.
        negative_reviews: Number of negative reviews available.

    Returns:
        SamplePlan with per-bucket fetch targets capped by availability.
    """
    top_helpful = settings.sample_top_helpful
    max_reviews = settings.sample_max_reviews

    # How many reviews we need for a statistically reliable result.
    statistical_sample = calculate_sample_size(total_reviews)

    # Stay within the configured hard limit (e.g. 3000 minus top-helpful).
    statistical_sample = min(statistical_sample, max_reviews - top_helpful)

    # Share of positives / negatives in the whole population.
    if total_reviews > 0:
        pos_ratio = positive_reviews / total_reviews
        neg_ratio = negative_reviews / total_reviews
    else:
        pos_ratio = 0.5
        neg_ratio = 0.5

    # Split the sample proportionally to those shares (stratified sampling).
    pos_target = math.ceil(statistical_sample * pos_ratio)
    neg_target = math.ceil(statistical_sample * neg_ratio)

    # Minority protection: boost the smaller group to minority_min if possible
    minority_min = settings.sample_minority_min

    if pos_target < minority_min and positive_reviews > pos_target:
        pos_target = min(minority_min, positive_reviews)

    if neg_target < minority_min and negative_reviews > neg_target:
        neg_target = min(minority_min, negative_reviews)

    # Final adjustment to stay within statistical_sample limit
    # NOTE(review): the shrunken side is floored at minority_min, so the
    # pair can still slightly exceed statistical_sample after the boost —
    # presumably intentional (minority protection wins); confirm.
    if pos_target + neg_target > statistical_sample:
        if pos_target > neg_target:
            pos_target = max(pos_target - (pos_target + neg_target - statistical_sample), minority_min)
        else:
            neg_target = max(neg_target - (pos_target + neg_target - statistical_sample), minority_min)

    # Final cap by actual availability
    positive_count = min(pos_target, positive_reviews)
    negative_count = min(neg_target, negative_reviews)

    # Grand total to fetch (top helpful + statistical sample).
    total = top_helpful + positive_count + negative_count

    return SamplePlan(
        top_helpful=top_helpful,
        statistical_sample=statistical_sample,
        positive_count=positive_count,
        negative_count=negative_count,
        total=total,
    )
backend/app/core/stopwords_zh.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chinskie stop words dla NLP pipeline.
3
+ Uzywane przez Community Highlights (n-gram extraction) i potencjalnie inne moduly.
4
+ """
5
+
6
+ # Jednooznakowe tokeny do odfiltrowania (WYJATKI ponizej)
7
+ SINGLE_CHAR_EXCEPTIONS = {"卡", "肝", "爽", "氪", "菜", "毒"}
8
+
9
+ # Stop words — czeste slowa bez wartosci informacyjnej
10
+ STOPWORDS_ZH = {
11
+ # Zaimki
12
+ "我", "你", "他", "她", "它", "我们", "你们", "他们",
13
+ # Czastki i spojniki
14
+ "的", "了", "是", "在", "不", "有", "和", "就",
15
+ "都", "也", "很", "要", "会", "可以", "这", "那",
16
+ "还", "没", "着", "被", "把", "让", "给", "从",
17
+ "到", "对", "但", "而", "或", "与",
18
+ # Czastki modalne
19
+ "吗", "呢", "啊", "吧", "呀", "嘛", "哦", "哈",
20
+ # Przysliwki
21
+ "比较", "非常", "真的", "确实", "其实", "可能",
22
+ "已经", "一直", "马上", "刚刚",
23
+ # Czasowniki ogolne
24
+ "觉得", "感觉", "知道", "看到", "说",
25
+ # Liczebniki i okreslniki
26
+ "一个", "一些", "这个", "那个", "什么", "怎么",
27
+ "多少", "几个",
28
+ # Filler w recenzjach gier
29
+ "这游戏", "这个游戏", "游戏", "玩家",
30
+ }
31
+
32
+
33
+ def is_stopword(token: str) -> bool:
34
+ """Sprawdza czy token jest stop wordem lub jednooznakowym tokenem bez wartosci."""
35
+ if token in STOPWORDS_ZH:
36
+ return True
37
+ if len(token) == 1 and token not in SINGLE_CHAR_EXCEPTIONS:
38
+ return True
39
+ return False
backend/app/core/ttl_tiers.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tiered TTL configuration for game cache expiry.
3
+
4
+ Popular games (worker-managed top N) get longer cache,
5
+ niche games (on-demand) get shorter cache.
6
+ """
7
+
8
+ from app.core.config import settings
9
+
10
+
11
async def get_ttl_hours(app_id: str) -> int:
    """Return TTL in hours based on whether the game is a priority game."""
    # Deferred import: the DB layer is only needed at call time.
    from app.db.mongodb import mongodb

    priority_ids = await mongodb.get_priority_game_ids_for_analysis()
    is_priority = app_id in priority_ids
    # Both tiers currently default to 1440h (60 days) in Settings.
    return (
        settings.cache_ttl_worker_managed_hours
        if is_priority
        else settings.cache_ttl_on_demand_hours
    )
backend/app/core/worker_logging.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Structured logging infrastructure for Worker and Live API.
3
+
4
+ Provides JSON-line file logging with rotation, timing context managers,
5
+ and module-level accessors for use across the codebase.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import logging.handlers
11
+ import os
12
+ import time
13
+ from typing import Any
14
+
15
+ from app.core.config import settings
16
+
17
+ # Module-level state
18
+ _structured_logger: logging.Logger | None = None
19
+ _cycle_id: str | None = None
20
+ _app_logging_initialized: bool = False
21
+
22
+ # Per-process log file whitelists (key → filename)
23
+ LIVE_LOG_WHITELIST: dict[str, str] = {
24
+ "live": "live.jsonl",
25
+ "errors": "errors.log",
26
+ "nlp_debug": "nlp_debug.log",
27
+ }
28
+ WORKER_LOG_WHITELIST: dict[str, str] = {
29
+ "worker": "worker.jsonl",
30
+ "errors": "errors.log",
31
+ "nlp_debug": "nlp_debug.log",
32
+ }
33
+
34
+
35
class DebugOnlyFilter(logging.Filter):
    """Pass only DEBUG-level records (blocks INFO and above)."""

    def filter(self, record: logging.LogRecord) -> bool:
        # Exact level match — anything other than DEBUG is rejected.
        is_debug = record.levelno == logging.DEBUG
        return is_debug
40
+
41
+
42
def _get_writable_log_dir() -> str:
    """Return the first writable log directory (primary or fallback)."""
    primary = settings.worker_log_dir
    try:
        # Probe writability by actually creating and removing a marker file.
        os.makedirs(primary, exist_ok=True)
        probe = os.path.join(primary, ".write_test")
        with open(probe, "w") as fh:
            fh.write("ok")
        os.remove(probe)
        return primary
    except (OSError, PermissionError):
        # Primary is not writable — switch to the fallback location.
        fallback = settings.worker_log_fallback_dir
        os.makedirs(fallback, exist_ok=True)
        return fallback
55
+
56
+
57
class JsonLineFormatter(logging.Formatter):
    """Formats log records as single-line JSON (JSONL)."""

    # Optional structured attributes copied from the record when present.
    _OPTIONAL_FIELDS = ("detail", "elapsed_s", "breakdown", "app_id",
                        "game_name", "source", "reviews_processed",
                        "topics_found", "analysis_type", "cycle_id", "error")

    def format(self, record: logging.LogRecord) -> str:
        entry: dict[str, Any] = {
            "ts": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            "event": getattr(record, "event", record.getMessage()),
        }

        for key in self._OPTIONAL_FIELDS:
            value = getattr(record, key, None)
            if value is not None:
                entry[key] = value

        # Fall back to the module-level cycle id when the record carries none.
        if entry.get("cycle_id") is None:
            cid = get_cycle_id()
            if cid:
                entry["cycle_id"] = cid

        # Strip any remaining None values before serializing.
        cleaned = {k: v for k, v in entry.items() if v is not None}
        return json.dumps(cleaned, default=str, ensure_ascii=False)
85
+
86
+
87
def setup_structured_logger(name: str) -> logging.Logger:
    """
    Create a rotating JSON-line file logger.

    Tries settings.worker_log_dir first, falls back to
    settings.worker_log_fallback_dir if the primary is not writable.

    Args:
        name: Logger name and file prefix (e.g. "worker" or "live").

    Returns:
        Configured logger instance.
    """
    logger = logging.getLogger(f"structured.{name}")
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        # First initialization only — avoid duplicate handlers on re-init.
        target = os.path.join(_get_writable_log_dir(), f"{name}.jsonl")
        file_handler = logging.handlers.RotatingFileHandler(
            target,
            maxBytes=settings.worker_log_max_bytes,
            backupCount=settings.worker_log_backup_count,
            encoding="utf-8",
        )
        file_handler.setFormatter(JsonLineFormatter())
        logger.addHandler(file_handler)

        # Publish as the module-level default for log_structured().
        set_structured_logger(logger)

    return logger
122
+
123
+
124
class TimingContext:
    """Measure the wall-clock duration of a ``with`` body via time.monotonic()."""

    def __init__(self) -> None:
        # Duration in seconds, rounded to milliseconds once the block exits.
        self.elapsed_s: float = 0.0
        self._start: float = 0.0

    def __enter__(self) -> "TimingContext":
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc: Any) -> None:
        delta = time.monotonic() - self._start
        self.elapsed_s = round(delta, 3)
137
+
138
+
139
class AsyncTimingContext:
    """Measure the wall-clock duration of an ``async with`` body via time.monotonic()."""

    def __init__(self) -> None:
        # Duration in seconds, rounded to milliseconds once the block exits.
        self.elapsed_s: float = 0.0
        self._start: float = 0.0

    async def __aenter__(self) -> "AsyncTimingContext":
        self._start = time.monotonic()
        return self

    async def __aexit__(self, *exc: Any) -> None:
        delta = time.monotonic() - self._start
        self.elapsed_s = round(delta, 3)
152
+
153
+
154
+ def read_log_tail(
155
+ path: str,
156
+ lines: int = 100,
157
+ level: str | None = None,
158
+ event: str | None = None,
159
+ ) -> list[dict[str, Any]]:
160
+ """
161
+ Read last N JSON lines from a log file, with optional filtering.
162
+
163
+ Args:
164
+ path: Path to .jsonl log file.
165
+ lines: Max number of lines to return.
166
+ level: Filter by log level (e.g. "ERROR").
167
+ event: Filter by event name substring.
168
+
169
+ Returns:
170
+ List of parsed JSON dicts, newest last.
171
+ """
172
+ if not os.path.exists(path):
173
+ return []
174
+
175
+ # Read all lines, take last N (simple approach for small-ish files)
176
+ with open(path, "r", encoding="utf-8") as f:
177
+ all_lines = f.readlines()
178
+
179
+ # Parse from the end, collect up to `lines` matching entries
180
+ results: list[dict[str, Any]] = []
181
+ for raw in reversed(all_lines):
182
+ raw = raw.strip()
183
+ if not raw:
184
+ continue
185
+ try:
186
+ entry = json.loads(raw)
187
+ except json.JSONDecodeError:
188
+ continue
189
+
190
+ if level and entry.get("level") != level:
191
+ continue
192
+ if event and event not in entry.get("event", ""):
193
+ continue
194
+
195
+ results.append(entry)
196
+ if len(results) >= lines:
197
+ break
198
+
199
+ results.reverse() # Restore chronological order
200
+ return results
201
+
202
+
203
def resolve_log_path(file_key: str, whitelist: dict[str, str]) -> str | None:
    """
    Resolve a whitelisted log file key to its absolute path.

    Returns the expected path if the key is in the whitelist, None otherwise.
    The file may not exist yet (read_log_tail handles that gracefully).

    Args:
        file_key: Logical name for the log file (e.g. "live", "errors").
        whitelist: Mapping of allowed keys to filenames for this process.

    Returns:
        Absolute path to the log file, or None if key is not whitelisted.
    """
    filename = whitelist.get(file_key)
    if not filename:
        return None

    # Prefer the primary directory whenever it exists on disk.
    base = (
        settings.worker_log_dir
        if os.path.isdir(settings.worker_log_dir)
        else settings.worker_log_fallback_dir
    )
    return os.path.join(base, filename)
226
+
227
+
228
def setup_app_logging() -> None:
    """
    Set up application-wide file logging handlers. Idempotent.

    Creates:
        - errors.log: WARNING+ from all loggers (attached to root logger)
        - nlp_debug.log: DEBUG-only NLP trace from app.services.nlp_service

    Call once during app lifespan startup, after setup_structured_logger().
    """
    global _app_logging_initialized
    if _app_logging_initialized:
        return
    # NOTE(review): the flag is set before the handlers are attached, so a
    # failure below leaves logging permanently half-configured for this
    # process — confirm this is acceptable for the startup path.
    _app_logging_initialized = True

    log_dir = _get_writable_log_dir()

    # 1. errors.log — WARNING+ from root (catches all loggers via propagation)
    errors_handler = logging.handlers.RotatingFileHandler(
        os.path.join(log_dir, "errors.log"),
        maxBytes=settings.errors_log_max_bytes,
        backupCount=settings.worker_log_backup_count,
        encoding="utf-8",
    )
    errors_handler.setLevel(logging.WARNING)
    errors_handler.setFormatter(JsonLineFormatter())
    logging.getLogger().addHandler(errors_handler)

    # 2. nlp_debug.log — DEBUG-only NLP trace (Dedup/Cache messages)
    nlp_handler = logging.handlers.RotatingFileHandler(
        os.path.join(log_dir, "nlp_debug.log"),
        maxBytes=settings.nlp_debug_log_max_bytes,
        backupCount=settings.worker_log_backup_count,
        encoding="utf-8",
    )
    nlp_handler.setLevel(logging.DEBUG)
    nlp_handler.addFilter(DebugOnlyFilter())  # keep INFO+ out of the debug file
    nlp_handler.setFormatter(JsonLineFormatter())

    nlp_logger = logging.getLogger("app.services.nlp_service")
    nlp_logger.setLevel(logging.DEBUG)
    nlp_logger.addHandler(nlp_handler)

    # 3. Optional: re-enable NLP debug to stdout
    if settings.nlp_verbose_logging:
        verbose_handler = logging.StreamHandler()
        verbose_handler.setLevel(logging.DEBUG)
        verbose_handler.setFormatter(logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        ))
        nlp_logger.addHandler(verbose_handler)
280
+
281
+ def get_structured_logger() -> logging.Logger | None:
282
+ """Get the module-level structured logger (if initialized)."""
283
+ return _structured_logger
284
+
285
+
286
+ def set_structured_logger(logger: logging.Logger) -> None:
287
+ """Set the module-level structured logger."""
288
+ global _structured_logger
289
+ _structured_logger = logger
290
+
291
+
292
+ def get_cycle_id() -> str | None:
293
+ """Get the current worker cycle ID."""
294
+ return _cycle_id
295
+
296
+
297
+ def set_cycle_id(cycle_id: str | None) -> None:
298
+ """Set the current worker cycle ID."""
299
+ global _cycle_id
300
+ _cycle_id = cycle_id
301
+
302
+
303
+ def log_structured(
304
+ event: str,
305
+ level: int = logging.INFO,
306
+ **kwargs: Any,
307
+ ) -> None:
308
+ """
309
+ Emit a structured log entry via the module-level logger.
310
+
311
+ No-op if no structured logger has been initialized (e.g. in tests).
312
+ """
313
+ slog = get_structured_logger()
314
+ if not slog:
315
+ return
316
+ slog.log(level, event, extra={"event": event, **kwargs})
backend/app/db/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Moduł bazy danych."""
backend/app/db/mongodb.py ADDED
@@ -0,0 +1,1152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Moduł połączenia z bazą danych MongoDB.
3
+
4
+ Wykorzystuje Motor (async driver) do asynchronicznej komunikacji z MongoDB.
5
+ Implementuje cache wyników analizy z TTL 24h.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import re
11
+ from datetime import datetime, timedelta, timezone
12
+ from typing import Any
13
+
14
+ from bson.codec_options import CodecOptions
15
+ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
16
+ from pymongo import ASCENDING, DESCENDING, UpdateOne
17
+ from pymongo.errors import (
18
+ BulkWriteError,
19
+ ConnectionFailure,
20
+ OperationFailure,
21
+ PyMongoError,
22
+ )
23
+
24
+ from app.core.config import settings
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class MongoDB:
    """
    Manages the MongoDB connection.

    Implements the Singleton pattern via a global instance.
    Handles the analysis-results cache with automatic TTL validation.
    Stores the Steam games list for autocompletion.

    Attributes:
        client: MongoDB client (Motor).
        db: Database reference.
    """

    # Collection names used throughout this class
    COLLECTION_ANALYSES = "analyses"
    COLLECTION_GAMES = "games"
    COLLECTION_STEAM_ERRORS = "steam_errors"
    COLLECTION_REFRESH_SCHEDULES = "refresh_schedules"
46
+
47
    def __init__(self) -> None:
        """Initialize the instance without an active connection."""
        # Both are populated by connect() and reset by disconnect().
        self.client: AsyncIOMotorClient | None = None  # type: ignore
        self.db: AsyncIOMotorDatabase | None = None  # type: ignore
51
+
52
+ async def connect(self, max_retries: int = 3) -> None:
53
+ """
54
+ Nawiązuje połączenie z MongoDB z exponential backoff.
55
+
56
+ Tworzy indeksy dla optymalnej wydajności zapytań.
57
+
58
+ Args:
59
+ max_retries: Maksymalna liczba prób połączenia.
60
+
61
+ Raises:
62
+ ConnectionError: Gdy nie można połączyć się z bazą po wszystkich próbach.
63
+ """
64
+ for attempt in range(1, max_retries + 1):
65
+ try:
66
+ self.client = AsyncIOMotorClient(settings.mongodb_url, tz_aware=True)
67
+ codec_options: CodecOptions = CodecOptions(tz_aware=True)
68
+ self.db = self.client.get_database(
69
+ settings.mongodb_db_name, codec_options=codec_options
70
+ )
71
+
72
+ # Weryfikacja połączenia
73
+ await self.client.admin.command("ping")
74
+ logger.info(f"Połączono z MongoDB: {settings.mongodb_db_name}")
75
+
76
+ # Utwórz indeksy
77
+ await self._create_indexes()
78
+ return
79
+
80
+ except (ConnectionFailure, PyMongoError) as e:
81
+ if attempt < max_retries:
82
+ delay = 2 ** (attempt - 1) # 1s, 2s, 4s
83
+ logger.warning(
84
+ f"MongoDB connection attempt {attempt}/{max_retries} failed: {e}. "
85
+ f"Retrying in {delay}s..."
86
+ )
87
+ await asyncio.sleep(delay)
88
+ else:
89
+ logger.error(f"MongoDB connection failed after {max_retries} attempts: {e}")
90
+ raise ConnectionError(
91
+ f"Nie można połączyć się z MongoDB po {max_retries} próbach: {e}"
92
+ )
93
+
94
    async def _create_indexes(self) -> None:
        """Create indexes for all collections (safe to call repeatedly)."""
        if self.db is None:
            return

        # Indexes for analyses
        analyses = self.db[self.COLLECTION_ANALYSES]
        await analyses.create_index("game_id", unique=True)

        # Migrate from old global TTL index (cached_at) to per-document TTL (expires_at)
        try:
            existing_indexes = await analyses.index_information()
            for idx_name, idx_info in existing_indexes.items():
                if idx_info.get("expireAfterSeconds") is not None and "cached_at" in str(idx_info.get("key")):
                    await analyses.drop_index(idx_name)
                    logger.info(f"Dropped old TTL index: {idx_name}")
                    break
        except OperationFailure:
            pass  # Old index may not exist

        # expireAfterSeconds=0 → expiry instant comes from each document's expires_at
        await analyses.create_index("expires_at", expireAfterSeconds=0)

        # Indexes for the games list
        games = self.db[self.COLLECTION_GAMES]
        await games.create_index("appid", unique=True)
        # Index for regex search (case-insensitive)
        await games.create_index("name_lower")
        await games.create_index("name_cn")
        # Sparse index for the check flag (saves space, speeds up the query)
        await games.create_index("cn_name_checked", sparse=True)
        await games.create_index("parent_appid", sparse=True)

        # Compound index for sorting games by review count (worker game sync)
        await games.create_index(
            [("positive", DESCENDING), ("negative", DESCENDING)],
            sparse=True,
        )
        await games.create_index(
            [
                ("name_lower", ASCENDING),
                ("app_type", ASCENDING),
                ("positive", DESCENDING),
                ("negative", DESCENDING),
            ]
        )
        await games.create_index("is_priority", sparse=True)

        # Indexes for the Steam API error cache
        steam_errors = self.db[self.COLLECTION_STEAM_ERRORS]
        await steam_errors.create_index("app_id", unique=True)
        await steam_errors.create_index("expires_at", expireAfterSeconds=0)

        # Indexes for refresh schedules (worker pre-cache)
        schedules = self.db[self.COLLECTION_REFRESH_SCHEDULES]
        await schedules.create_index("app_id", unique=True)
        await schedules.create_index("status")

        logger.debug("Utworzono indeksy MongoDB")
152
+
153
+ async def disconnect(self) -> None:
154
+ """Zamyka połączenie z MongoDB."""
155
+ if self.client:
156
+ self.client.close()
157
+ logger.info("Rozłączono z MongoDB")
158
+
159
+ def _is_document_expired(self, document: dict[str, Any]) -> bool:
160
+ """Check if a cache document is expired using expires_at or cached_at fallback.
161
+
162
+ With tz_aware=True on the Motor client, all datetimes from MongoDB are
163
+ already timezone-aware, so no manual .replace(tzinfo=...) is needed.
164
+ """
165
+ now = datetime.now(timezone.utc)
166
+
167
+ # New-format: per-document expires_at
168
+ expires_at = document.get("expires_at")
169
+ if expires_at:
170
+ if isinstance(expires_at, str):
171
+ expires_at = datetime.fromisoformat(expires_at)
172
+ return now >= expires_at
173
+
174
+ # Old-format fallback: cached_at + default TTL
175
+ cached_at = document.get("cached_at")
176
+ if cached_at:
177
+ if isinstance(cached_at, str):
178
+ cached_at = datetime.fromisoformat(cached_at)
179
+ ttl_hours = document.get("ttl_hours", settings.cache_ttl_hours)
180
+ return now - cached_at > timedelta(hours=ttl_hours)
181
+
182
+ return True # No timestamp info = treat as expired
183
+
184
+ async def get_cached_analysis_full(self, game_id: str) -> dict[str, Any] | None:
185
+ """
186
+ Returns full cache document (with review IDs, TTL info) or None if expired/missing.
187
+ """
188
+ if self.db is None:
189
+ return None
190
+
191
+ collection = self.db[self.COLLECTION_ANALYSES]
192
+
193
+ try:
194
+ document = await collection.find_one({"game_id": game_id})
195
+ if not document:
196
+ return None
197
+
198
+ if self._is_document_expired(document):
199
+ logger.info(f"Cache expired for game {game_id}")
200
+ return None
201
+
202
+ document.pop("_id", None)
203
+ return document
204
+
205
+ except PyMongoError as e:
206
+ logger.error(f"Error reading cache: {e}")
207
+ return None
208
+
209
+ async def get_stale_analysis(self, game_id: str) -> dict[str, Any] | None:
210
+ """
211
+ Returns cache document even if expired. Used by incremental path
212
+ to retrieve old review IDs. Returns None only if no document exists.
213
+ """
214
+ return await self.get_analysis(game_id)
215
+
216
+ async def get_analysis(self, game_id: str) -> dict[str, Any] | None:
217
+ """
218
+ Returns an analysis document regardless of TTL.
219
+
220
+ Product freshness is evaluated outside MongoDB, so this method is the
221
+ canonical read path for "show stale result + refresh" behavior.
222
+ """
223
+ if self.db is None:
224
+ return None
225
+
226
+ collection = self.db[self.COLLECTION_ANALYSES]
227
+
228
+ try:
229
+ document = await collection.find_one({"game_id": game_id})
230
+ if not document:
231
+ return None
232
+
233
+ document.pop("_id", None)
234
+ return document
235
+
236
+ except PyMongoError as e:
237
+ logger.error(f"Error reading stale cache: {e}")
238
+ return None
239
+
240
+ async def get_cached_analysis(self, game_id: str) -> dict[str, Any] | None:
241
+ """
242
+ Returns cached analysis results or None if expired/missing.
243
+ Backward-compatible wrapper around get_cached_analysis_full.
244
+ """
245
+ doc = await self.get_cached_analysis_full(game_id)
246
+ if doc is None:
247
+ return None
248
+ results = doc.get("results")
249
+ if isinstance(results, dict) and results.get("cached_at") is None and doc.get("cached_at") is not None:
250
+ results = {**results, "cached_at": doc["cached_at"]}
251
+ return results
252
+
253
    async def save_analysis(
        self,
        game_id: str,
        results: dict[str, Any],
        analyzed_review_ids: list[str] | None = None,
        latest_review_timestamp: int = 0,
        ttl_hours: int | None = None,
        analyzed_at: datetime | None = None,
    ) -> None:
        """
        Save analysis results to the cache with a per-document TTL.

        Purges review IDs to keep only the most recent ones (space efficiency).

        Args:
            game_id: Steam game identifier.
            results: Analysis payload to cache.
            analyzed_review_ids: IDs of reviews covered by this analysis;
                trimmed to the most recent ``incremental_max_stored_ids``.
            latest_review_timestamp: Unix timestamp of the newest review seen.
            ttl_hours: Cache lifetime override; None (or 0 — falsy) falls back
                to ``settings.cache_ttl_hours``.
            analyzed_at: Explicit analysis timestamp; when None it is derived
                from ``results`` or defaults to now.
        """
        if self.db is None:
            logger.warning("Brak połączenia z MongoDB - nie zapisano cache")
            return

        collection = self.db[self.COLLECTION_ANALYSES]

        effective_ttl = ttl_hours or settings.cache_ttl_hours
        now = datetime.now(timezone.utc)
        analysis_date = analyzed_at
        if analysis_date is None:
            # Derive the analysis date from the results payload when possible.
            raw_value = results.get("analysis_date") or results.get("cached_at")
            if isinstance(raw_value, str):
                analysis_date = datetime.fromisoformat(raw_value)
            elif isinstance(raw_value, datetime):
                analysis_date = raw_value
        if analysis_date is None:
            analysis_date = now

        if results.get("analysis_date") is None:
            # Copy instead of mutating the caller's dict.
            results = {**results, "analysis_date": analysis_date}

        # Purge old IDs — keep only the most recent N
        if analyzed_review_ids:
            analyzed_review_ids = analyzed_review_ids[-settings.incremental_max_stored_ids:]

        document: dict[str, Any] = {
            "game_id": game_id,
            "results": results,
            "analyzed_review_ids": analyzed_review_ids or [],
            "latest_review_timestamp": latest_review_timestamp,
            "cached_at": now,
            "analyzed_at": analysis_date,
            "ttl_hours": effective_ttl,
            "expires_at": now + timedelta(hours=effective_ttl),
        }

        try:
            await collection.update_one(
                {"game_id": game_id},
                {"$set": document},
                upsert=True,
            )
            logger.info(f"Saved cache for game {game_id} (TTL: {effective_ttl}h)")

        except PyMongoError as e:
            logger.error(f"Error saving cache: {e}")
312
+
313
+ async def delete_cached_analysis(self, game_id: str) -> bool:
314
+ """
315
+ Usuwa cache dla danej gry.
316
+
317
+ Args:
318
+ game_id: Identyfikator gry Steam.
319
+
320
+ Returns:
321
+ True jeśli usunięto, False w przeciwnym razie.
322
+ """
323
+ if self.db is None:
324
+ return False
325
+
326
+ collection = self.db[self.COLLECTION_ANALYSES]
327
+
328
+ try:
329
+ result = await collection.delete_one({"game_id": game_id})
330
+ return result.deleted_count > 0
331
+ except PyMongoError as e:
332
+ logger.error(f"Błąd usuwania cache: {e}")
333
+ return False
334
+
335
+ # ========== Steam API Error Cache ==========
336
+
337
+ async def get_steam_error(self, app_id: str) -> dict[str, Any] | None:
338
+ """
339
+ Sprawdza czy app_id ma cached error.
340
+
341
+ Returns:
342
+ Dict z polami app_id, status_code, expires_at lub None.
343
+ """
344
+ if self.db is None:
345
+ return None
346
+
347
+ collection = self.db[self.COLLECTION_STEAM_ERRORS]
348
+
349
+ try:
350
+ document = await collection.find_one({"app_id": app_id})
351
+ if not document:
352
+ return None
353
+
354
+ document.pop("_id", None)
355
+ return document
356
+
357
+ except PyMongoError as e:
358
+ logger.error(f"Błąd odczytu steam error cache: {e}")
359
+ return None
360
+
361
+ async def cache_steam_error(
362
+ self, app_id: str, status_code: int, ttl_seconds: int
363
+ ) -> None:
364
+ """
365
+ Cachuje błąd Steam API z automatycznym TTL.
366
+
367
+ MongoDB TTL index automatycznie usunie dokument po expires_at.
368
+ """
369
+ if self.db is None:
370
+ return
371
+
372
+ collection = self.db[self.COLLECTION_STEAM_ERRORS]
373
+
374
+ document = {
375
+ "app_id": app_id,
376
+ "status_code": status_code,
377
+ "cached_at": datetime.now(timezone.utc),
378
+ "expires_at": datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds),
379
+ }
380
+
381
+ try:
382
+ await collection.update_one(
383
+ {"app_id": app_id},
384
+ {"$set": document},
385
+ upsert=True,
386
+ )
387
+ logger.info(
388
+ f"Cached Steam error {status_code} for app {app_id} (TTL: {ttl_seconds}s)"
389
+ )
390
+ except PyMongoError as e:
391
+ logger.error(f"Błąd zapisu steam error cache: {e}")
392
+
393
+ # ========== Metody dla listy gier (autouzupełnianie) ==========
394
+
395
+ async def get_games_count(self) -> int:
396
+ """Zwraca liczbę gier w bazie."""
397
+ if self.db is None:
398
+ return 0
399
+
400
+ collection = self.db[self.COLLECTION_GAMES]
401
+ return await collection.count_documents({})
402
+
403
+ async def save_games_batch(self, games: list[dict[str, str]]) -> int:
404
+ """
405
+ Zapisuje partię gier do bazy (bulk insert).
406
+
407
+ Args:
408
+ games: Lista słowników z kluczami 'appid', 'name', opcjonalnie 'developer', 'publisher'.
409
+
410
+ Returns:
411
+ Liczba zapisanych gier.
412
+ """
413
+ if self.db is None or not games:
414
+ return 0
415
+
416
+ collection = self.db[self.COLLECTION_GAMES]
417
+
418
+ # Dodaj pole name_lower dla wyszukiwania case-insensitive
419
+ documents = []
420
+ for game in games:
421
+ if not game.get("name"):
422
+ continue
423
+
424
+ doc = {
425
+ "appid": game["appid"],
426
+ "name": game["name"],
427
+ "name_lower": game["name"].lower(),
428
+ }
429
+
430
+ # Dodaj opcjonalne pola
431
+ if game.get("developer"):
432
+ doc["developer"] = game["developer"]
433
+ if game.get("publisher"):
434
+ doc["publisher"] = game["publisher"]
435
+
436
+ documents.append(doc)
437
+
438
+ try:
439
+ # Użyj ordered=False żeby kontynuować mimo duplikatów
440
+ result = await collection.insert_many(documents, ordered=False)
441
+ return len(result.inserted_ids)
442
+ except BulkWriteError as e:
443
+ # Duplicates are expected with ordered=False — count successful inserts
444
+ inserted = e.details.get("nInserted", 0)
445
+ logger.debug(f"Pominięto duplikaty podczas zapisu gier ({inserted} inserted)")
446
+ return inserted
447
+ except PyMongoError as e:
448
+ logger.error(f"Błąd zapisu gier: {e}")
449
+ return 0
450
+
451
+ async def clear_games(self) -> None:
452
+ """Usuwa wszystkie gry z bazy."""
453
+ if self.db is None:
454
+ return
455
+
456
+ collection = self.db[self.COLLECTION_GAMES]
457
+ await collection.delete_many({})
458
+ logger.info("Usunięto wszystkie gry z bazy")
459
+
460
+ async def upsert_game(self, game_data: dict[str, Any]) -> None:
461
+ """
462
+ Dodaje lub aktualizuje pojedynczą grę w bazie danych.
463
+ Używane głównie przez mechanizm Fallback Search.
464
+ """
465
+ if self.db is None:
466
+ return
467
+
468
+ collection = self.db[self.COLLECTION_GAMES]
469
+ appid = str(game_data["appid"])
470
+
471
+ # Przygotuj dokument
472
+ update_doc = {
473
+ "appid": appid,
474
+ "name": game_data["name"],
475
+ "name_lower": game_data["name"].lower(),
476
+ }
477
+
478
+ if game_data.get("name_cn"):
479
+ update_doc["name_cn"] = game_data["name_cn"]
480
+ update_doc["cn_name_checked"] = True
481
+ elif game_data.get("cn_name_checked"):
482
+ update_doc["cn_name_checked"] = True
483
+
484
+ if game_data.get("header_image") is not None:
485
+ update_doc["header_image"] = game_data["header_image"]
486
+ if game_data.get("total_reviews") is not None:
487
+ update_doc["total_reviews"] = game_data["total_reviews"]
488
+
489
+ # Worker-supplied fields
490
+ for field in (
491
+ "positive", "negative", "tags", "genre", "ccu",
492
+ "last_game_update_at", "synced_at", "developer", "publisher",
493
+ "app_type", "parent_appid", "dlc_checked_at",
494
+ ):
495
+ if game_data.get(field) is not None:
496
+ update_doc[field] = game_data[field]
497
+
498
+ try:
499
+ await collection.update_one(
500
+ {"appid": appid},
501
+ {"$set": update_doc},
502
+ upsert=True
503
+ )
504
+ logger.debug(f"Zsynchronizowano grę {appid} w MongoDB")
505
+ except PyMongoError as e:
506
+ logger.error(f"Błąd upsert gry {appid}: {e}")
507
+
508
    async def search_games(self, query: str, limit: int = 10) -> list[dict[str, Any]]:
        """
        Search games by name (EN or CN).

        Uses case-insensitive substring matching. Results are ranked:
        exact match first, then prefix match, then other substring matches;
        within a rank, base games outrank DLC/demos and higher review
        counts come first.

        Args:
            query: Text to search for (min. 2 characters after trimming).
            limit: Maximum number of results.

        Returns:
            Matching games (appid, name, name_cn, developer, publisher,
            app_type, parent_appid).
        """
        normalized_query = query.strip()
        if self.db is None or not normalized_query or len(normalized_query) < 2:
            return []

        collection = self.db[self.COLLECTION_GAMES]

        try:
            # Escape the query so user input is matched literally, and
            # prebuild exact/prefix anchored variants for ranking.
            query_lower = normalized_query.lower()
            name_pattern = re.escape(query_lower)
            name_prefix_pattern = f"^{name_pattern}"
            name_exact_pattern = f"^{name_pattern}$"
            cn_pattern = re.escape(normalized_query)
            cn_prefix_pattern = f"^{cn_pattern}"
            cn_exact_pattern = f"^{cn_pattern}$"

            match_filter: dict[str, Any] = {
                "$or": [
                    {"name_lower": {"$regex": name_pattern}},
                    {"name_cn": {"$regex": cn_pattern, "$options": "i"}},
                ]
            }
            if not settings.dlc_visible_in_search:
                match_filter["app_type"] = {"$ne": "dlc"}

            pipeline = [
                {"$match": match_filter},
                {
                    "$addFields": {
                        # match_rank: 0 = exact, 1 = prefix, 2 = substring only
                        "match_rank": {
                            "$switch": {
                                "branches": [
                                    {
                                        "case": {
                                            "$or": [
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_lower", ""]},
                                                        "regex": name_exact_pattern,
                                                    }
                                                },
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_cn", ""]},
                                                        "regex": cn_exact_pattern,
                                                        "options": "i",
                                                    }
                                                },
                                            ]
                                        },
                                        "then": 0,
                                    },
                                    {
                                        "case": {
                                            "$or": [
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_lower", ""]},
                                                        "regex": name_prefix_pattern,
                                                    }
                                                },
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_cn", ""]},
                                                        "regex": cn_prefix_pattern,
                                                        "options": "i",
                                                    }
                                                },
                                            ]
                                        },
                                        "then": 1,
                                    },
                                ],
                                "default": 2,
                            }
                        },
                        # type_rank: base games/unknown first, then DLC, then demos
                        "type_rank": {
                            "$switch": {
                                "branches": [
                                    {
                                        "case": {
                                            "$in": [
                                                {"$ifNull": ["$app_type", "unknown"]},
                                                ["game", "unknown"],
                                            ]
                                        },
                                        "then": 0,
                                    },
                                    {"case": {"$eq": ["$app_type", "dlc"]}, "then": 1},
                                    {"case": {"$eq": ["$app_type", "demo"]}, "then": 2},
                                ],
                                "default": 1,
                            }
                        },
                        # Popularity proxy: positive + negative review counts
                        "review_count": {
                            "$add": [
                                {"$ifNull": ["$positive", 0]},
                                {"$ifNull": ["$negative", 0]},
                            ]
                        },
                    }
                },
                {
                    "$sort": {
                        "match_rank": 1,
                        "type_rank": 1,
                        "review_count": -1,
                        "name": 1,
                    }
                },
                {"$limit": limit},
                {
                    "$project": {
                        "_id": 0,
                        "appid": 1,
                        "name": 1,
                        "name_cn": 1,
                        "developer": 1,
                        "publisher": 1,
                        "app_type": 1,
                        "parent_appid": 1,
                    }
                },
            ]

            cursor = collection.aggregate(pipeline)
            results = await cursor.to_list(length=limit)
            return results

        except PyMongoError as e:
            logger.error(f"Błąd wyszukiwania gier: {e}")
            return []
652
+
653
+
654
+ async def get_game_update_date(self, app_id: str) -> datetime | None:
655
+ """Get the last game update timestamp for a game."""
656
+ if self.db is None:
657
+ return None
658
+
659
+ collection = self.db[self.COLLECTION_GAMES]
660
+ try:
661
+ doc = await collection.find_one(
662
+ {"appid": str(app_id)},
663
+ {"_id": 0, "last_game_update_at": 1},
664
+ )
665
+ if doc and doc.get("last_game_update_at"):
666
+ val = doc["last_game_update_at"]
667
+ if isinstance(val, datetime):
668
+ return val
669
+ return None
670
+ return None
671
+ except PyMongoError as e:
672
+ logger.error(f"Error getting game update date for {app_id}: {e}")
673
+ return None
674
+
675
+ async def get_games_without_cn_name(self, limit: int = 200) -> list[dict[str, Any]]:
676
+ """
677
+ Pobiera gry, które nie mają jeszcze nazwy chińskiej i nie były sprawdzane.
678
+ Sortuje po liczbie pozytywnych recenzji (jeśli dostępne, dla priorytetyzacji).
679
+ """
680
+ if self.db is None:
681
+ return []
682
+
683
+ collection = self.db[self.COLLECTION_GAMES]
684
+ try:
685
+ pipeline = [
686
+ {"$match": {
687
+ "name_cn": {"$exists": False},
688
+ "cn_name_checked": {"$ne": True}, # Pomiń już sprawdzone
689
+ }},
690
+ # Sortowanie po positive (DESC), ale gry bez tego pola trafią na koniec (sparse index handling)
691
+ {"$sort": {"positive": -1}},
692
+ {"$limit": limit},
693
+ {"$project": {"_id": 0, "appid": 1, "name": 1}},
694
+ ]
695
+ cursor = collection.aggregate(pipeline)
696
+ return await cursor.to_list(length=limit)
697
+ except PyMongoError as e:
698
+ logger.error(f"Error getting games without CN name: {e}")
699
+ return []
700
+
701
+ async def mark_cn_name_checked(self, app_id: str, name_cn: str | None = None) -> None:
702
+ """
703
+ Oznacza grę jako sprawdzoną pod kątem chińskiej nazwy.
704
+ Opcjonalnie zapisuje znalezioną nazwę.
705
+ """
706
+ if self.db is None:
707
+ return
708
+
709
+ collection = self.db[self.COLLECTION_GAMES]
710
+ update_doc: dict[str, Any] = {"cn_name_checked": True}
711
+ if name_cn:
712
+ update_doc["name_cn"] = name_cn
713
+
714
+ try:
715
+ await collection.update_one(
716
+ {"appid": str(app_id)},
717
+ {"$set": update_doc}
718
+ )
719
+ except PyMongoError as e:
720
+ logger.error(f"Error marking CN name checked for {app_id}: {e}")
721
+
722
    async def get_games_missing_app_type(self, limit: int = 200) -> list[dict[str, Any]]:
        """
        Return high-signal games that still need Steam Store type enrichment.

        We prioritize already-priority games first, then any app with enough reviews
        to qualify a DLC for worker-managed analysis.

        Args:
            limit: Maximum number of candidates to return.

        Returns:
            List of ``{"appid", "name"}`` projections; empty on error or no DB.
        """
        if self.db is None:
            return []

        collection = self.db[self.COLLECTION_GAMES]
        try:
            pipeline = [
                # Compute review volume up front so both $match and $sort can use it.
                {
                    "$addFields": {
                        "total_reviews_sum": {
                            "$add": [
                                {"$ifNull": ["$positive", 0]},
                                {"$ifNull": ["$negative", 0]},
                            ]
                        }
                    }
                },
                # Candidates: never enriched, and either priority or popular
                # enough to clear the DLC analysis threshold.
                {
                    "$match": {
                        "dlc_checked_at": {"$exists": False},
                        "$or": [
                            {"is_priority": True},
                            {
                                "total_reviews_sum": {
                                    "$gte": settings.dlc_min_reviews_for_analysis
                                }
                            },
                        ],
                    }
                },
                # Priority games first, then by popularity.
                {"$sort": {"is_priority": -1, "total_reviews_sum": -1}},
                {"$limit": limit},
                {"$project": {"_id": 0, "appid": 1, "name": 1}},
            ]
            cursor = collection.aggregate(pipeline)
            return await cursor.to_list(length=limit)
        except PyMongoError as e:
            logger.error(f"Error getting games missing app type: {e}")
            return []
767
+
768
+ async def mark_app_type_checked(
769
+ self,
770
+ app_id: str,
771
+ *,
772
+ app_type: str,
773
+ parent_appid: str | None = None,
774
+ ) -> None:
775
+ """Persist Steam Store app type metadata."""
776
+ if self.db is None:
777
+ return
778
+
779
+ collection = self.db[self.COLLECTION_GAMES]
780
+ update_doc: dict[str, Any] = {
781
+ "app_type": app_type,
782
+ "parent_appid": str(parent_appid) if parent_appid else None,
783
+ "dlc_checked_at": datetime.now(timezone.utc),
784
+ }
785
+
786
+ try:
787
+ await collection.update_one(
788
+ {"appid": str(app_id)},
789
+ {"$set": update_doc},
790
+ )
791
+ except PyMongoError as e:
792
+ logger.error(f"Error marking app type checked for {app_id}: {e}")
793
+
794
+ # ========== Worker Methods ==========
795
+
796
+ async def upsert_games_batch(self, games: list[dict[str, Any]]) -> tuple[int, int]:
797
+ """
798
+ Bulk upsert games via UpdateOne operations.
799
+
800
+ Returns:
801
+ (upserted_count, modified_count)
802
+ """
803
+ if self.db is None or not games:
804
+ return (0, 0)
805
+
806
+ collection = self.db[self.COLLECTION_GAMES]
807
+ operations = []
808
+
809
+ for game in games:
810
+ appid = str(game.get("appid", ""))
811
+ name = game.get("name", "")
812
+ if not appid or not name:
813
+ continue
814
+
815
+ update_doc: dict[str, Any] = {
816
+ "appid": appid,
817
+ "name": name,
818
+ "name_lower": name.lower(),
819
+ }
820
+ for field in (
821
+ "developer", "publisher", "positive", "negative",
822
+ "tags", "genre", "ccu", "synced_at",
823
+ "app_type", "parent_appid", "dlc_checked_at",
824
+ ):
825
+ if game.get(field) is not None:
826
+ update_doc[field] = game[field]
827
+
828
+ operations.append(
829
+ UpdateOne({"appid": appid}, {"$set": update_doc}, upsert=True)
830
+ )
831
+
832
+ if not operations:
833
+ return (0, 0)
834
+
835
+ try:
836
+ result = await collection.bulk_write(operations, ordered=False)
837
+ return (result.upserted_count, result.modified_count)
838
+ except BulkWriteError as e:
839
+ details = e.details or {}
840
+ return (details.get("nUpserted", 0), details.get("nModified", 0))
841
+ except PyMongoError as e:
842
+ logger.error(f"Error in upsert_games_batch: {e}")
843
+ return (0, 0)
844
+
845
+ async def get_top_games_by_reviews(self, limit: int = 500) -> list[dict[str, Any]]:
846
+ """Top N games sorted by total review count (positive + negative) DESC."""
847
+ if self.db is None:
848
+ return []
849
+
850
+ collection = self.db[self.COLLECTION_GAMES]
851
+ try:
852
+ pipeline = [
853
+ {"$match": {"positive": {"$exists": True}, "negative": {"$exists": True}}},
854
+ {"$addFields": {"total_reviews_sum": {"$add": ["$positive", "$negative"]}}},
855
+ {"$sort": {"total_reviews_sum": -1}},
856
+ {"$limit": limit},
857
+ {"$project": {"_id": 0}},
858
+ ]
859
+ cursor = collection.aggregate(pipeline)
860
+ return await cursor.to_list(length=limit)
861
+ except PyMongoError as e:
862
+ logger.error(f"Error getting top games: {e}")
863
+ return []
864
+
865
+ async def update_game_update_date(self, app_id: str, update_at: datetime) -> None:
866
+ """Store the latest game update timestamp."""
867
+ if self.db is None:
868
+ return
869
+
870
+ collection = self.db[self.COLLECTION_GAMES]
871
+ try:
872
+ await collection.update_one(
873
+ {"appid": str(app_id)},
874
+ {"$set": {"last_game_update_at": update_at}},
875
+ )
876
+ except PyMongoError as e:
877
+ logger.error(f"Error updating game update date for {app_id}: {e}")
878
+
879
+ async def update_game_patch_date(self, app_id: str, patch_date: datetime) -> None:
880
+ """Store the latest confirmed major-update timestamp."""
881
+ if self.db is None:
882
+ return
883
+
884
+ collection = self.db[self.COLLECTION_GAMES]
885
+ try:
886
+ await collection.update_one(
887
+ {"appid": str(app_id)},
888
+ {"$set": {"current_patch_at": patch_date}},
889
+ )
890
+ except PyMongoError as e:
891
+ logger.error(f"Error updating game patch date for {app_id}: {e}")
892
+
893
+ async def update_news_cursor(self, app_id: str, gid: str, date: datetime) -> None:
894
+ """Store the latest seen news GID and its date as an incremental scan cursor."""
895
+ if self.db is None:
896
+ return
897
+
898
+ collection = self.db[self.COLLECTION_GAMES]
899
+ try:
900
+ await collection.update_one(
901
+ {"appid": str(app_id)},
902
+ {"$set": {"last_seen_news_gid": gid, "last_seen_news_at": date}},
903
+ )
904
+ except PyMongoError as e:
905
+ logger.error(f"Error updating news cursor for {app_id}: {e}")
906
+
907
+ async def get_game_patch_date(self, app_id: str) -> datetime | None:
908
+ """Get the latest confirmed major-update timestamp for a game."""
909
+ if self.db is None:
910
+ return None
911
+
912
+ collection = self.db[self.COLLECTION_GAMES]
913
+ try:
914
+ doc = await collection.find_one(
915
+ {"appid": str(app_id)},
916
+ {"_id": 0, "current_patch_at": 1},
917
+ )
918
+ if doc and doc.get("current_patch_at"):
919
+ val = doc["current_patch_at"]
920
+ if isinstance(val, datetime):
921
+ return val
922
+ return None
923
+ return None
924
+ except PyMongoError as e:
925
+ logger.error(f"Error getting game patch date for {app_id}: {e}")
926
+ return None
927
+
928
+ async def upsert_refresh_schedule(self, schedule: dict[str, Any]) -> None:
929
+ """Create or replace a refresh schedule document."""
930
+ if self.db is None:
931
+ return
932
+
933
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
934
+ try:
935
+ await collection.update_one(
936
+ {"app_id": schedule["app_id"]},
937
+ {"$set": schedule},
938
+ upsert=True,
939
+ )
940
+ except PyMongoError as e:
941
+ logger.error(f"Error upserting refresh schedule for {schedule.get('app_id')}: {e}")
942
+
943
+ async def get_active_schedules(self) -> list[dict[str, Any]]:
944
+ """All schedules with status: 'active'."""
945
+ if self.db is None:
946
+ return []
947
+
948
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
949
+ try:
950
+ cursor = collection.find({"status": "active"}, {"_id": 0})
951
+ return await cursor.to_list(length=10000)
952
+ except PyMongoError as e:
953
+ logger.error(f"Error getting active schedules: {e}")
954
+ return []
955
+
956
+ async def has_due_refresh_schedule(self, app_id: str) -> bool:
957
+ """True when an active schedule has at least one due, incomplete checkpoint."""
958
+ if self.db is None:
959
+ return False
960
+
961
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
962
+ now = datetime.now(timezone.utc)
963
+ try:
964
+ document = await collection.find_one(
965
+ {
966
+ "app_id": str(app_id),
967
+ "status": "active",
968
+ "checkpoints": {
969
+ "$elemMatch": {
970
+ "completed": False,
971
+ "due_at": {"$lte": now},
972
+ }
973
+ },
974
+ },
975
+ {"_id": 0, "app_id": 1},
976
+ )
977
+ return document is not None
978
+ except PyMongoError as e:
979
+ logger.error(f"Error checking due refresh schedule for {app_id}: {e}")
980
+ return False
981
+
982
+ async def mark_checkpoint_completed(self, app_id: str, offset_hours: int) -> None:
983
+ """Mark a specific checkpoint as completed using positional $ update."""
984
+ if self.db is None:
985
+ return
986
+
987
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
988
+ try:
989
+ await collection.update_one(
990
+ {"app_id": str(app_id), "checkpoints.offset_hours": offset_hours},
991
+ {"$set": {"checkpoints.$.completed": True}},
992
+ )
993
+ except PyMongoError as e:
994
+ logger.error(f"Error marking checkpoint for {app_id}/{offset_hours}h: {e}")
995
+
996
+ async def complete_schedule(self, app_id: str) -> None:
997
+ """Set schedule status to 'completed'."""
998
+ if self.db is None:
999
+ return
1000
+
1001
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
1002
+ try:
1003
+ await collection.update_one(
1004
+ {"app_id": str(app_id)},
1005
+ {"$set": {"status": "completed"}},
1006
+ )
1007
+ except PyMongoError as e:
1008
+ logger.error(f"Error completing schedule for {app_id}: {e}")
1009
+
1010
+ # ========== Priority Games Methods ==========
1011
+
1012
+ async def get_priority_games(self) -> list[dict[str, Any]]:
1013
+ """All games with is_priority == True, all fields except _id."""
1014
+ if self.db is None:
1015
+ return []
1016
+
1017
+ collection = self.db[self.COLLECTION_GAMES]
1018
+ try:
1019
+ cursor = collection.find({"is_priority": True}, {"_id": 0})
1020
+ return await cursor.to_list(length=10000)
1021
+ except PyMongoError as e:
1022
+ logger.error(f"Error getting priority games: {e}")
1023
+ return []
1024
+
1025
    async def get_priority_games_for_analysis(self) -> list[dict[str, Any]]:
        """
        Priority games eligible for worker-managed analysis.

        DLC stays linked to the priority universe via is_priority, but low-review DLC
        falls back to on-demand mode instead of occupying worker capacity.

        Returns:
            Matching game documents (all fields except ``_id``); empty on error
            or when no database connection exists.
        """
        if self.db is None:
            return []

        collection = self.db[self.COLLECTION_GAMES]
        if settings.dlc_worker_analysis_enabled:
            # Non-DLC always qualifies; DLC qualifies only when its review
            # volume ($expr computes positive + negative, missing -> 0) clears
            # the configured threshold.
            query: dict[str, Any] = {
                "is_priority": True,
                "$or": [
                    {"app_type": {"$ne": "dlc"}},
                    {
                        "$expr": {
                            "$gte": [
                                {
                                    "$add": [
                                        {"$ifNull": ["$positive", 0]},
                                        {"$ifNull": ["$negative", 0]},
                                    ]
                                },
                                settings.dlc_min_reviews_for_analysis,
                            ]
                        }
                    },
                ],
            }
        else:
            # DLC worker analysis disabled: restrict to priority non-DLC apps.
            query = {
                "is_priority": True,
                "app_type": {"$ne": "dlc"},
            }

        try:
            cursor = collection.find(query, {"_id": 0})
            return await cursor.to_list(length=10000)
        except PyMongoError as e:
            logger.error(f"Error getting priority games for analysis: {e}")
            return []
1068
+
1069
+ async def get_priority_game_ids(self) -> set[str]:
1070
+ """Lightweight set of appids for is_priority == True games."""
1071
+ if self.db is None:
1072
+ return set()
1073
+
1074
+ collection = self.db[self.COLLECTION_GAMES]
1075
+ try:
1076
+ cursor = collection.find({"is_priority": True}, {"_id": 0, "appid": 1})
1077
+ docs = await cursor.to_list(length=10000)
1078
+ return {str(d["appid"]) for d in docs if d.get("appid")}
1079
+ except PyMongoError as e:
1080
+ logger.error(f"Error getting priority game ids: {e}")
1081
+ return set()
1082
+
1083
+ async def get_priority_game_ids_for_analysis(self) -> set[str]:
1084
+ """App IDs that should behave as worker-managed in runtime decisions."""
1085
+ docs = await self.get_priority_games_for_analysis()
1086
+ return {str(d["appid"]) for d in docs if d.get("appid")}
1087
+
1088
+ async def get_dlcs_by_parent_appid(self, parent_appid: str) -> list[dict[str, Any]]:
1089
+ """Return DLC documents linked to a given base game."""
1090
+ if self.db is None:
1091
+ return []
1092
+
1093
+ collection = self.db[self.COLLECTION_GAMES]
1094
+ try:
1095
+ cursor = collection.find(
1096
+ {"app_type": "dlc", "parent_appid": str(parent_appid)},
1097
+ {"_id": 0},
1098
+ )
1099
+ return await cursor.to_list(length=1000)
1100
+ except PyMongoError as e:
1101
+ logger.error(f"Error getting DLCs for parent {parent_appid}: {e}")
1102
+ return []
1103
+
1104
+ async def get_existing_appids(self, appids: set[str]) -> set[str]:
1105
+ """Return the subset of the given appids that have a document in games."""
1106
+ if self.db is None or not appids:
1107
+ return set()
1108
+
1109
+ collection = self.db[self.COLLECTION_GAMES]
1110
+ try:
1111
+ cursor = collection.find(
1112
+ {"appid": {"$in": list(appids)}},
1113
+ {"_id": 0, "appid": 1},
1114
+ )
1115
+ docs = await cursor.to_list(length=len(appids) + 1)
1116
+ return {str(d["appid"]) for d in docs if d.get("appid")}
1117
+ except PyMongoError as e:
1118
+ logger.error(f"Error in get_existing_appids: {e}")
1119
+ return set()
1120
+
1121
+ async def bulk_update_priority_fields(self, updates: list[tuple[str, dict]]) -> int:
1122
+ """
1123
+ Batch UpdateOne operations for priority fields.
1124
+
1125
+ Args:
1126
+ updates: List of (appid, fields_dict) tuples.
1127
+
1128
+ Returns:
1129
+ modified_count
1130
+ """
1131
+ if self.db is None or not updates:
1132
+ return 0
1133
+
1134
+ collection = self.db[self.COLLECTION_GAMES]
1135
+ operations = [
1136
+ UpdateOne({"appid": appid}, {"$set": fields})
1137
+ for appid, fields in updates
1138
+ ]
1139
+
1140
+ try:
1141
+ result = await collection.bulk_write(operations, ordered=False)
1142
+ return result.modified_count
1143
+ except BulkWriteError as e:
1144
+ details = e.details or {}
1145
+ return details.get("nModified", 0)
1146
+ except PyMongoError as e:
1147
+ logger.error(f"Error in bulk_update_priority_fields: {e}")
1148
+ return 0
1149
+
1150
+
1151
# Global module-level instance (singleton) shared by the app and the worker.
mongodb = MongoDB()
backend/app/main.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from contextlib import asynccontextmanager
4
+ from typing import AsyncGenerator
5
+
6
+ from fastapi import FastAPI, Query, Request, Response
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.staticfiles import StaticFiles
9
+ from fastapi.responses import FileResponse, JSONResponse
10
+ from slowapi import _rate_limit_exceeded_handler
11
+ from slowapi.errors import RateLimitExceeded
12
+ from starlette.middleware.base import BaseHTTPMiddleware
13
+
14
+ from app.core.config import settings
15
+ from app.core.rate_limit import limiter
16
+ from app.core.worker_logging import (
17
+ LIVE_LOG_WHITELIST,
18
+ read_log_tail,
19
+ resolve_log_path,
20
+ setup_app_logging,
21
+ setup_structured_logger,
22
+ )
23
+ from app.db.mongodb import mongodb
24
+ from app.routers import analyze, games
25
+ from app.services.nlp_service import get_nlp_service
26
+ from app.services.steam_service import steam_service
27
+
28
+
29
# Logging configuration (applied at import time for the whole process).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
34
+
35
+
36
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Attach conservative browser-security headers to every response."""

    _HEADERS = {
        "X-Content-Type-Options": "nosniff",
        "X-Frame-Options": "SAMEORIGIN",
        "Referrer-Policy": "strict-origin-when-cross-origin",
    }

    async def dispatch(self, request: Request, call_next):
        response: Response = await call_next(request)
        for header_name, header_value in self._HEADERS.items():
            response.headers[header_name] = header_value
        return response
43
+
44
+
45
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """
    Manage the application lifecycle.

    Establishes the MongoDB connection on startup and closes it
    (together with the Steam HTTP client) on shutdown.
    """
    # Fail fast: the app cannot function without a database URL.
    if not settings.mongodb_url:
        raise RuntimeError(
            "MONGODB_URL is not set. Please configure it in .env or environment variables."
        )
    await mongodb.connect()
    # The structured "live" log is set up before app loggers attach to it.
    setup_structured_logger("live")
    setup_app_logging()
    yield
    await steam_service.close()
    await mongodb.disconnect()
63
+
64
+
65
app = FastAPI(
    title="SentimentStream API",
    description="API do analizy sentymentu recenzji gier Steam w czasie rzeczywistym",
    version="1.0.0",
    lifespan=lifespan,
)

# Rate limiter (slowapi) wired into app state plus its 429 handler.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)  # type: ignore[arg-type]

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins_list,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["Content-Type", "Accept"],
)

# Security headers
app.add_middleware(SecurityHeadersMiddleware)

# Router registration
app.include_router(analyze.router, prefix="/api", tags=["analyze"])
app.include_router(games.router, prefix="/api", tags=["games"])
91
+
92
+
93
@app.get("/api/logs")
async def get_logs(
    request: Request,
    lines: int = Query(default=100, ge=1, le=1000),
    level: str | None = Query(default=None),
    event: str | None = Query(default=None),
    file: str = Query(default="live"),
):
    """Token-protected endpoint to read structured log tail.

    Auth: ``Authorization: Bearer <worker_trigger_token>`` when a token is
    configured; without a configured token the endpoint is open (dev mode).

    Fix: the token is now compared with ``secrets.compare_digest`` instead of
    ``!=`` to avoid leaking token contents via response-timing differences.
    """
    import secrets  # local stdlib import: only needed for the auth comparison

    expected = settings.worker_trigger_token
    if expected:
        auth = request.headers.get("Authorization", "")
        if not auth.startswith("Bearer ") or not secrets.compare_digest(auth[7:], expected):
            return JSONResponse(status_code=401, content={"detail": "Unauthorized"})

    log_path = resolve_log_path(file, LIVE_LOG_WHITELIST)
    if log_path is None:
        return JSONResponse(
            status_code=400,
            content={"detail": f"Unknown log file: '{file}'. Valid: {list(LIVE_LOG_WHITELIST.keys())}"},
        )

    entries = read_log_tail(log_path, lines=lines, level=level, event=event)
    return {"entries": entries, "count": len(entries)}
117
+
118
+
119
@app.get("/health")
async def health_check() -> dict:
    """Liveness/readiness probe that actually pings dependencies.

    Returns status "healthy" only when MongoDB answers a ping AND the NLP
    model is loaded; "degraded" otherwise. Always responds HTTP 200.

    Fix: the MongoDB ping failure is no longer silently swallowed — it is
    logged at warning level (probe stays best-effort and never raises).
    """
    mongo_ok = False
    if mongodb.client is not None:
        try:
            await mongodb.client.admin.command("ping")
            mongo_ok = True
        except Exception as exc:  # best-effort probe: report, never crash
            logging.getLogger(__name__).warning("MongoDB health ping failed: %s", exc)

    nlp_svc = get_nlp_service()
    model_ok = hasattr(nlp_svc, "classifier") and nlp_svc.classifier is not None

    overall = "healthy" if (mongo_ok and model_ok) else "degraded"
    return {
        "status": overall,
        "mongodb": "connected" if mongo_ok else "disconnected",
        "model": "loaded" if model_ok else "not_loaded",
    }
139
+
140
+
141
# Static frontend handling — only when the build exists (e.g. in Docker,
# where the path is /app/frontend/dist). Locally the vite dev server serves
# the frontend, so this section is skipped.
static_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "frontend", "dist")

if settings.app_mode != "api" and os.path.exists(static_dir):
    app.mount("/assets", StaticFiles(directory=os.path.join(static_dir, "assets")), name="assets")

    # Catch-all for the SPA (React Router handles client-side routes).
    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        if full_path.startswith("api"):
            return {"error": "API route not found"}

        # Fix: resolve the path and require it to stay inside static_dir —
        # the previous os.path.join allowed "../" traversal out of the root.
        root = os.path.realpath(static_dir)
        file_path = os.path.realpath(os.path.join(static_dir, full_path))
        if file_path.startswith(root + os.sep) and os.path.isfile(file_path):
            return FileResponse(file_path)

        # Fall back to the SPA entry point for client-side routes.
        return FileResponse(os.path.join(static_dir, "index.html"))
backend/app/models/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Modele danych Pydantic."""
2
+
3
+ from app.models.schemas import (
4
+ AnalysisProgress,
5
+ AnalysisResult,
6
+ GameInfo,
7
+ ReviewBatch,
8
+ SentimentType,
9
+ TopicSentiment,
10
+ )
11
+
12
+ __all__ = [
13
+ "AnalysisProgress",
14
+ "AnalysisResult",
15
+ "GameInfo",
16
+ "ReviewBatch",
17
+ "SentimentType",
18
+ "TopicSentiment",
19
+ ]
backend/app/models/schemas.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Modele danych Pydantic.
3
+
4
+ Definiuje struktury danych używane w API oraz do walidacji.
5
+ """
6
+
7
+ from datetime import datetime
8
+ from enum import Enum
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+
13
+ class SentimentType(str, Enum):
14
+ """Typ sentymentu dla tematu."""
15
+
16
+ POSITIVE = "positive"
17
+ NEGATIVE = "negative"
18
+ NEUTRAL = "neutral"
19
+
20
+
21
+ class PredictionType(str, Enum):
22
+ """Typ przewidywanego trendu liczby graczy."""
23
+
24
+ INCREASING = "increasing"
25
+ DECREASING = "decreasing"
26
+ STABLE = "stable"
27
+ UNCERTAIN = "uncertain"
28
+
29
+
30
+ class UserCountPrediction(BaseModel):
31
+ """
32
+ Przewidywanie trendu liczby graczy.
33
+
34
+ Attributes:
35
+ trend: Przewidywany kierunek (wzrost/spadek).
36
+ confidence: Pewność predykcji (0.0 - 1.0).
37
+ reasoning: Krótkie uzasadnienie.
38
+ """
39
+
40
+ trend: PredictionType
41
+ confidence: float
42
+ reasoning: str
43
+
44
+
45
+ class GameInfo(BaseModel):
46
+ """
47
+ Informacje o grze ze Steam.
48
+
49
+ Attributes:
50
+ app_id: Unikalny identyfikator gry na Steam.
51
+ name: Nazwa gry.
52
+ name_cn: Chińska nazwa gry (jeśli dostępna).
53
+ header_image: URL obrazka nagłówkowego.
54
+ total_reviews: Całkowita liczba recenzji.
55
+ target_count: Docelowa liczba recenzji do analizy (sample size).
56
+ """
57
+
58
+ app_id: str
59
+ name: str
60
+ name_cn: str | None = None
61
+ header_image: str | None = None
62
+ total_reviews: int = 0
63
+ target_count: int | None = None
64
+ last_game_update_at: int | None = None
65
+
66
+
67
+ class TopicSentiment(BaseModel):
68
+ """
69
+ Sentyment dla pojedynczego tematu.
70
+
71
+ Attributes:
72
+ topic: Nazwa tematu (np. "Grafika", "Gameplay").
73
+ sentiment: Typ sentymentu.
74
+ score: Wynik sentymentu (-1.0 do 1.0).
75
+ mention_count: Liczba wzmianek o temacie.
76
+ example: Przykładowe zdanie z recenzji.
77
+ """
78
+
79
+ topic: str
80
+ sentiment: SentimentType
81
+ score: float = Field(ge=-1.0, le=1.0)
82
+ mention_count: int = 0
83
+ example: str | None = None
84
+ example_score: float | None = None # score przykładu do porównań przy agregacji
85
+
86
+
87
+ class Highlight(BaseModel):
88
+ """Czesto powtarzana fraza z recenzji."""
89
+
90
+ phrase: str
91
+ mention_count: int
92
+ sentiment: SentimentType
93
+ score: float
94
+ ngram_size: int
95
+
96
+
97
+ class TopicHighlights(BaseModel):
98
+ """Highlights dla konkretnego tematu."""
99
+
100
+ topic: str
101
+ highlights: list[Highlight]
102
+
103
+
104
+ class AnalysisProgress(BaseModel):
105
+ """
106
+ Postęp analizy (wysyłany przez SSE).
107
+
108
+ Attributes:
109
+ processed: Liczba przetworzonych recenzji.
110
+ total: Całkowita liczba recenzji do przetworzenia.
111
+ current_topics: Aktualne wyniki tematów.
112
+ skipped_count: Liczba zdań pominiętych (brak słów kluczowych).
113
+ """
114
+
115
+ processed: int
116
+ total: int
117
+ current_topics: list[TopicSentiment] = []
118
+ skipped_count: int = 0
119
+
120
+
121
+ class AnalysisResult(BaseModel):
122
+ """
123
+ Końcowy wynik analizy.
124
+
125
+ Attributes:
126
+ game: Informacje o grze.
127
+ general_topics: Lista tematów z sentymentem (pełny agregat).
128
+ prediction: Przewidywanie trendu liczby graczy.
129
+ analyzed_reviews: Liczba przeanalizowanych recenzji.
130
+ skipped_count: Łączna liczba pominiętych zdań.
131
+ cached_at: Data zapisania w cache.
132
+ """
133
+
134
+ game: GameInfo
135
+ general_topics: list[TopicSentiment]
136
+ prediction: UserCountPrediction | None = None
137
+ analyzed_reviews: int
138
+ skipped_count: int = 0
139
+ general_highlights: list[Highlight] = []
140
+ recent_highlights: list[Highlight] | None = None
141
+ current_patch_highlights: list[Highlight] | None = None
142
+ topic_highlights: list[TopicHighlights] = []
143
+ cached_at: datetime | None = None
144
+ recent_topics: list[TopicSentiment] | None = None
145
+ recent_reviews_count: int = 0
146
+ current_patch_topics: list[TopicSentiment] | None = None
147
+ current_patch_reviews_count: int = 0
148
+ last_patch_topics: list[TopicSentiment] | None = None
149
+ last_patch_reviews_count: int = 0
150
+ current_patch_timestamp: int | None = None
151
+ analysis_date: datetime | None = None
152
+ current_patch_date: datetime | None = None
153
+ preferred_context: str | None = None
154
+ freshness_status: str | None = None
155
+ staleness_reason: str | None = None
156
+ is_refreshing: bool = False
157
+
158
+
159
+ class ReviewItem(BaseModel):
160
+ """Single review with metadata for incremental tracking."""
161
+
162
+ text: str
163
+ recommendation_id: str
164
+ timestamp_created: int
165
+
166
+
167
+ class ReviewBatch(BaseModel):
168
+ """
169
+ Partia recenzji do przetworzenia.
170
+
171
+ Attributes:
172
+ reviews: Lista tekstów recenzji.
173
+ review_items: Recenzje z metadanymi (do incremental analysis).
174
+ cursor: Kursor do paginacji Steam API.
175
+ """
176
+
177
+ reviews: list[str]
178
+ review_items: list[ReviewItem] = []
179
+ cursor: str | None = None
180
+
181
+
182
+ class SSEEvent(BaseModel):
183
+ """
184
+ Wydarzenie Server-Sent Events.
185
+
186
+ Attributes:
187
+ event: Typ wydarzenia (progress/complete/error).
188
+ data: Dane wydarzenia.
189
+ """
190
+
191
+ event: str
192
+ data: AnalysisProgress | AnalysisResult | dict
193
+
194
+
195
+ class CachedAnalysis(BaseModel):
196
+ """
197
+ Dokument cache w MongoDB.
198
+
199
+ Przechowuje wyniki analizy z timestampem dla walidacji TTL.
200
+
201
+ Attributes:
202
+ game_id: Identyfikator gry Steam (klucz cache).
203
+ results: Wyniki analizy sentymentu.
204
+ cached_at: Data i czas zapisania do cache.
205
+ """
206
+
207
+ game_id: str
208
+ results: AnalysisResult
209
+ cached_at: datetime
210
+ analyzed_at: datetime | None = None
backend/app/routers/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Routery API."""
2
+
3
+ from app.routers import analyze
4
+
5
+ __all__ = ["analyze"]
backend/app/routers/analyze.py ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Router API do analizy sentymentu.
3
+
4
+ Zawiera endpoint do streamowania wyników analizy przez SSE.
5
+ """
6
+
7
+ import asyncio
8
+ import contextlib
9
+ import json
10
+ import logging
11
+ import time
12
+ from datetime import datetime, timezone
13
+ from typing import Any, AsyncGenerator
14
+
15
+ from fastapi import APIRouter, HTTPException, Depends, Path, Query, Request
16
+ from sse_starlette.sse import EventSourceResponse
17
+
18
+ from app.core.config import settings
19
+ from app.core.freshness import (
20
+ FreshnessStatus,
21
+ evaluate_freshness,
22
+ get_staleness_reason,
23
+ )
24
+ from app.core.sampling import SamplePlan, create_sample_plan
25
+ from app.core.ttl_tiers import get_ttl_hours
26
+ from app.core.worker_logging import get_structured_logger, log_structured
27
+ from app.db.mongodb import mongodb
28
+ from app.core.rate_limit import limiter
29
+ from app.models.schemas import (
30
+ AnalysisProgress,
31
+ AnalysisResult,
32
+ GameInfo,
33
+ Highlight,
34
+ TopicHighlights,
35
+ TopicSentiment,
36
+ )
37
+ from app.services.analysis_utils import (
38
+ aggregate_topics,
39
+ calculate_prediction,
40
+ coerce_utc_datetime,
41
+ compute_preferred_context,
42
+ datetime_from_timestamp,
43
+ filter_topics_by_min_mentions,
44
+ normalize_legacy_results,
45
+ serialize_datetime,
46
+ )
47
+ from app.services.highlights_service import HighlightsCollector
48
+ from app.services.analysis_runner import iter_incremental_analysis_events
49
+ from app.services.nlp_service import NLPService
50
+ from app.services.nlp_service import get_nlp_service as _get_nlp_service_instance
51
+ from app.services.steam_errors import SteamAPIError, SteamRateLimitError
52
+ from app.services.steam_service import SteamService, steam_service
53
+
54
# Module-level logger for this router.
logger = logging.getLogger(__name__)

router = APIRouter()

# Background refresh concurrency control.
# NOTE(review): within this module _refreshing_app_ids is only ever
# discarded (in _background_refresh) — no add() is visible here; confirm
# where entries are registered before relying on it as an in-flight guard.
_refreshing_app_ids: set[str] = set()
_refresh_semaphore = asyncio.Semaphore(3)  # max 3 concurrent background refreshes
61
+
62
# Helper functions for FastAPI dependency injection.
def get_nlp_service() -> NLPService:
    """Provide the shared NLP service instance for FastAPI dependencies."""
    service = _get_nlp_service_instance()
    return service
65
+
66
def get_steam_service() -> SteamService:
    """Provide the module-level Steam service singleton for dependencies."""
    return steam_service
68
+
69
+
70
@router.get("/health")
async def health_check():
    """Application health-check endpoint.

    NOTE(review): the per-service states below are hard-coded, not probed
    live — confirm whether real connectivity checks are intended here.
    """
    service_states = {
        "mongodb": "connected",
        "nlp": "ready",
        "steam_api": "reachable",
    }
    return {"status": "ok", "services": service_states}
83
+
84
+
85
+
86
def _build_analysis_payload(
    document: dict[str, Any],
    freshness_status: FreshnessStatus,
    *,
    current_patch_at: datetime | None = None,
    is_refreshing: bool = False,
) -> dict[str, Any]:
    """Build a JSON-ready analysis payload from a cached Mongo document.

    Normalizes legacy result shapes, resolves the analysis date from the
    first available source, serializes datetimes, and stamps freshness
    metadata. When no confirmed patch date is supplied, all current-patch
    fields are nulled so stale cached values never surface in the UI.
    """
    payload = dict(normalize_legacy_results(document.get("results", {})))

    # First non-None candidate wins as the analysis date.
    analysis_date = next(
        (
            candidate
            for candidate in (
                coerce_utc_datetime(payload.get("analysis_date")),
                coerce_utc_datetime(document.get("analyzed_at")),
                coerce_utc_datetime(payload.get("cached_at")),
                coerce_utc_datetime(document.get("cached_at")),
            )
            if candidate is not None
        ),
        None,
    )

    current_patch_date: datetime | None = current_patch_at
    if current_patch_date is None:
        # No confirmed major update in DB — wipe legacy current-patch fields
        # so cached values don't appear as a valid Current Patch tab.
        payload.update(
            current_patch_topics=None,
            current_patch_reviews_count=0,
            current_patch_highlights=None,
            current_patch_timestamp=None,
        )

    # Prefer the payload's own cached_at; fall back to the document's.
    own_cached_at = payload.get("cached_at")
    if own_cached_at is not None:
        payload["cached_at"] = serialize_datetime(own_cached_at)
    elif document.get("cached_at") is not None:
        payload["cached_at"] = serialize_datetime(document["cached_at"])

    payload["analysis_date"] = serialize_datetime(analysis_date)
    payload["current_patch_date"] = serialize_datetime(current_patch_date)
    payload["freshness_status"] = freshness_status.value
    payload["staleness_reason"] = get_staleness_reason(freshness_status)
    payload["is_refreshing"] = is_refreshing
    # Always recompute preferred_context from the current patch date so cached
    # documents with a stale stored value get the correct tab on read.
    patch_ts_for_context = (
        int(current_patch_date.timestamp()) if current_patch_date else None
    )
    payload["preferred_context"] = compute_preferred_context(patch_ts_for_context)
    return payload
127
+
128
+
129
async def _full_analysis(
    game: GameInfo,
    sample_plan: SamplePlan,
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
    stale_doc: dict[str, Any] | None = None,
) -> AsyncGenerator[dict, None]:
    """Full analysis path — Producer-Consumer queue pattern.

    Yields SSE-style dicts: ``progress`` events while review batches are
    analyzed, then a single ``complete`` event carrying the final
    ``AnalysisResult``. The result is persisted via ``mongodb.save_analysis``
    before ``complete`` is yielded.

    Args:
        game: Resolved game metadata.
        sample_plan: Stratified sampling plan; ``sample_plan.total`` is the
            progress denominator.
        steam_svc: Source of stratified review batches.
        nlp_svc: Topic/sentiment analyzer.
        patch_timestamp: Unix timestamp of the confirmed major update; when
            set, reviews at/after it also feed "current patch" aggregates.
        stale_doc: Previously cached analysis document, used only to carry
            over ``last_patch_*`` archives when the patch changed.
    """
    total_target = sample_plan.total
    ttl_hours = await get_ttl_hours(game.app_id)
    nlp_cumulative_s: float = 0.0  # wall-clock time spent inside NLP calls

    # Producer-Consumer queue (max 5 batches in flight) — bounds memory while
    # letting Steam fetches overlap with NLP processing.
    queue: asyncio.Queue = asyncio.Queue(maxsize=5)

    async def fetch_worker():
        # Producer: pushes batches; errors are relayed as queue items; a
        # trailing None always signals end-of-stream to the consumer.
        try:
            async for batch in steam_svc.fetch_reviews_stratified(game.app_id, sample_plan):
                await queue.put(batch)
        except Exception as e:
            # Relay all exceptions to consumer via queue — they'll be re-raised
            # and caught by the SSE generator's specific exception handlers.
            await queue.put(e)
        finally:
            await queue.put(None)

    fetch_task = asyncio.create_task(fetch_worker())

    processed = 0
    total_skipped = 0
    aggregated_topics: list[TopicSentiment] = []
    recent_processed = 0
    recent_limit = settings.recent_sample_limit
    all_review_ids: list[str] = []       # for the incremental-analysis cache
    latest_timestamp = 0                 # newest review timestamp seen
    highlights_collector = HighlightsCollector()
    current_patch_topics: list[TopicSentiment] = []
    current_patch_count = 0
    # (timestamp, topics) pairs — later sorted to pick the newest reviews
    # for the "recent" aggregate.
    review_topic_results: list[tuple[int, list[TopicSentiment]]] = []

    try:
        while True:
            item = await queue.get()

            if item is None:
                break  # producer finished
            if isinstance(item, Exception):
                raise item  # relayed producer failure

            batch = item
            if not batch.reviews:
                continue

            # Collect review IDs for incremental cache
            for ri in batch.review_items:
                all_review_ids.append(ri.recommendation_id)
                if ri.timestamp_created > latest_timestamp:
                    latest_timestamp = ri.timestamp_created

            # Split by patch timestamp when available and we have review_items
            batch_skipped = 0
            if patch_timestamp and batch.review_items:
                # Per-review NLP calls so each review can carry its own
                # category set (recent / current_patch).
                for ri, text in zip(batch.review_items, batch.reviews):
                    is_recent = recent_processed < recent_limit
                    cat = []
                    if is_recent:
                        cat.append("recent")

                    if ri.timestamp_created >= patch_timestamp:
                        cat.append("current_patch")
                        nlp_start = time.monotonic()
                        res, skipped = await nlp_svc.analyze_batch(
                            [text], highlights_collector=highlights_collector, categories=cat
                        )
                        nlp_cumulative_s += time.monotonic() - nlp_start
                        batch_skipped += skipped
                        if res:
                            aggregated_topics = aggregate_topics(aggregated_topics, res)
                            current_patch_topics = aggregate_topics(current_patch_topics, res)
                            review_topic_results.append((ri.timestamp_created, res))
                            current_patch_count += 1
                    else:
                        nlp_start = time.monotonic()
                        res, skipped = await nlp_svc.analyze_batch(
                            [text], highlights_collector=highlights_collector, categories=cat
                        )
                        nlp_cumulative_s += time.monotonic() - nlp_start
                        batch_skipped += skipped
                        if res:
                            aggregated_topics = aggregate_topics(aggregated_topics, res)
                            review_topic_results.append((ri.timestamp_created, res))

                    recent_processed += 1
            else:
                # No patch split. When review_items is missing, fall back to
                # enumerate() — ri is then an int index and ts defaults to 0.
                for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews):
                    is_recent = recent_processed < recent_limit
                    cat = ["recent"] if is_recent else []

                    nlp_start = time.monotonic()
                    res, skipped = await nlp_svc.analyze_batch(
                        [text], highlights_collector=highlights_collector, categories=cat
                    )
                    nlp_cumulative_s += time.monotonic() - nlp_start
                    batch_skipped += skipped
                    ts = ri.timestamp_created if batch.review_items else 0
                    if res:
                        aggregated_topics = aggregate_topics(aggregated_topics, res)
                        review_topic_results.append((ts, res))
                    recent_processed += 1

            total_skipped += batch_skipped
            processed += len(batch.reviews)

            progress = AnalysisProgress(
                processed=processed,
                total=total_target,
                current_topics=aggregated_topics,
                skipped_count=total_skipped,
            )
            yield {"event": "progress", "data": progress.model_dump_json()}

        # Surface any late producer failure (e.g. after the None sentinel).
        await fetch_task
    except BaseException:
        # BaseException on purpose: also covers GeneratorExit when the SSE
        # client disconnects — the producer must not be left running.
        fetch_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await fetch_task
        raise

    # Build recent_topics from highest-timestamp reviews
    review_topic_results.sort(key=lambda x: x[0], reverse=True)
    recent_entries = review_topic_results[:recent_limit]
    recent_topics: list[TopicSentiment] = []
    for _, topics_batch in recent_entries:
        for ts in topics_batch:
            recent_topics = aggregate_topics(recent_topics, [ts])
    recent_reviews_count = len(recent_entries)

    # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
    aggregated_topics = filter_topics_by_min_mentions(aggregated_topics)
    recent_topics = filter_topics_by_min_mentions(recent_topics)
    current_patch_topics = filter_topics_by_min_mentions(current_patch_topics)

    prediction = calculate_prediction(aggregated_topics)

    highlights_data = highlights_collector.compute_highlights()
    general_highlights = highlights_data["general"]
    recent_highlights = highlights_data["recent"]
    current_patch_highlights = highlights_data["current_patch"]
    topic_highlights_dict = highlights_data["topics"]

    # Restrict topic highlights to topics that survived the min-mentions filter,
    # so the topic_highlights set is always consistent with general_topics.
    _surviving_topics = {t.topic for t in aggregated_topics}
    topic_highlights_list = [
        TopicHighlights(
            topic=topic,
            highlights=[Highlight(**h) for h in highlights],
        )
        for topic, highlights in topic_highlights_dict.items()
        if topic in _surviving_topics
    ]

    # Show recent tab if we have enough reviews to make the split meaningful
    has_recent_split = processed > recent_limit

    has_current_patch = patch_timestamp is not None and current_patch_count > 0
    analysis_generated_at = datetime.now(timezone.utc)
    current_patch_date = datetime_from_timestamp(patch_timestamp)

    # Archive last_patch_topics when this full analysis replaces a doc with a different patch.
    last_patch_topics: list[TopicSentiment] | None = None
    last_patch_reviews_count = 0
    if stale_doc:
        old_r = normalize_legacy_results(stale_doc.get("results", {}))
        old_patch_ts = old_r.get("current_patch_timestamp")
        if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
            # Patch changed: yesterday's current patch becomes last patch.
            raw_cp = old_r.get("current_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None
            last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0)
        else:
            # Same patch: carry the existing last-patch archive forward.
            raw_lp = old_r.get("last_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None
            last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0)

    result = AnalysisResult(
        game=game,
        general_topics=aggregated_topics,
        recent_topics=recent_topics if has_recent_split else None,
        recent_reviews_count=recent_reviews_count if has_recent_split else 0,
        current_patch_topics=current_patch_topics if has_current_patch else None,
        current_patch_reviews_count=current_patch_count if has_current_patch else 0,
        last_patch_topics=last_patch_topics,
        last_patch_reviews_count=last_patch_reviews_count,
        current_patch_timestamp=patch_timestamp,
        analysis_date=analysis_generated_at,
        current_patch_date=current_patch_date,
        prediction=prediction,
        analyzed_reviews=processed,
        skipped_count=total_skipped,
        general_highlights=[Highlight(**h) for h in general_highlights],
        recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None,
        current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None,
        topic_highlights=topic_highlights_list,
        cached_at=analysis_generated_at,
        preferred_context=compute_preferred_context(patch_timestamp),
        freshness_status=FreshnessStatus.FRESH.value,
        is_refreshing=False,
    )
    # Persist before emitting "complete" so a reconnecting client sees the cache.
    await mongodb.save_analysis(
        game.app_id,
        result.model_dump(),
        analyzed_review_ids=all_review_ids,
        latest_review_timestamp=latest_timestamp,
        ttl_hours=ttl_hours,
        analyzed_at=analysis_generated_at,
    )

    # Log structured timing for full analysis
    if get_structured_logger():
        log_structured(
            "full_analysis_complete",
            app_id=game.app_id,
            game_name=game.name if hasattr(game, "name") else str(game.app_id),
            source="live",
            reviews_processed=processed,
            topics_found=len(aggregated_topics),
            detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)},
        )

    yield {"event": "complete", "data": result.model_dump_json()}
360
+
361
+
362
async def _incremental_analysis(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
) -> AsyncGenerator[dict, None]:
    """Delegate incremental analysis to the shared runner, re-yielding its events."""
    event_stream = iter_incremental_analysis_events(
        game,
        stale_doc,
        steam_svc,
        nlp_svc,
        patch_timestamp=patch_timestamp,
        source="live",
    )
    async for sse_event in event_stream:
        yield sse_event
379
+
380
+
381
async def _background_refresh(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_ts: int | None,
) -> None:
    """Fire-and-forget incremental refresh for stale niche caches.

    Bounded by the module-level semaphore; always clears the in-flight
    marker for this app_id when finished, success or failure.
    """
    async with _refresh_semaphore:
        try:
            refresh_events = _incremental_analysis(
                game, stale_doc, steam_svc, nlp_svc, patch_timestamp=patch_ts
            )
            # Drain the generator — events are not forwarded anywhere.
            async for _ in refresh_events:
                pass
        except Exception as exc:
            logger.error(f"Background refresh failed for {game.app_id}: {exc}")
        else:
            logger.info(f"Background refresh completed for {game.app_id}")
        finally:
            _refreshing_app_ids.discard(game.app_id)
400
+
401
+
402
async def analysis_event_generator(
    game_name: str,
    steam_service: SteamService,
    nlp_service: NLPService,
    *,
    appid: str | None = None,
) -> AsyncGenerator[dict, None]:
    """
    Main SSE event generator. Decides between full and incremental analysis paths.

    Event flow:
      * fresh cache  -> single ``result`` event, then return;
      * stale cache  -> ``result`` (stale payload) immediately, then — for
        non-priority games — an inline refresh streaming ``progress``/``complete``;
      * no cache     -> ``game_found`` (plus ``state`` for niche games), then a
        live full analysis streaming ``progress``/``complete``.
    All failure modes are converted to a single ``analysis_error`` event.
    A structured timing record is emitted in ``finally`` regardless of outcome.
    """
    t_start = time.monotonic()
    analysis_type = "unknown"   # refined below; reported in the timing log
    app_id = ""
    resolved_game_name = game_name
    reviews_processed = 0

    try:
        # 1. Resolve game — use appid directly if provided, otherwise search by name
        if appid:
            game = await steam_service.get_game_info(appid)
        else:
            game = await steam_service.search_game(game_name)
        if not game:
            yield {
                "event": "analysis_error",
                "data": json.dumps({"message": "ERROR_GAME_NOT_FOUND"}),
            }
            return

        app_id = game.app_id
        resolved_game_name = game.name if hasattr(game, "name") else game_name

        # 1b. Fetch game patch date for current_patch tab / freshness evaluation
        patch_date = await mongodb.get_game_patch_date(game.app_id)
        patch_ts = int(patch_date.timestamp()) if patch_date else None
        if patch_ts:
            game = game.model_copy(update={"last_game_update_at": patch_ts})

        # 2. Load any existing analysis and evaluate product freshness.
        analysis_doc = await mongodb.get_analysis(game.app_id)
        priority_ids = await mongodb.get_priority_game_ids_for_analysis()
        is_priority = game.app_id in priority_ids
        is_niche = not is_priority

        if analysis_doc and analysis_doc.get("results"):
            freshness_status = evaluate_freshness(analysis_doc, patch_date)

            if freshness_status == FreshnessStatus.FRESH:
                # Fresh cache: serve it and stop — no live work needed.
                analysis_type = "cached"
                payload = _build_analysis_payload(
                    analysis_doc,
                    freshness_status,
                    current_patch_at=patch_date,
                )
                yield {"event": "result", "data": json.dumps(payload)}
                return

            # Stale cache: serve it immediately so the UI has data, then
            # refresh inline (non-priority games only).
            analysis_type = "stale_result"
            is_refreshing = (
                await mongodb.has_due_refresh_schedule(game.app_id)
                if is_priority
                else True
            )
            stale_payload = _build_analysis_payload(
                analysis_doc,
                freshness_status,
                current_patch_at=patch_date,
                is_refreshing=is_refreshing,
            )
            yield {"event": "result", "data": json.dumps(stale_payload)}

            # Priority games are refreshed by the scheduled worker, not here.
            if is_priority:
                return

            try:
                # Incremental refresh when enabled and we know which reviews
                # were already analyzed; otherwise fall back to a full run.
                if settings.incremental_enabled and analysis_doc.get("analyzed_review_ids"):
                    refresh_generator = _incremental_analysis(
                        game, analysis_doc, steam_service, nlp_service, patch_timestamp=patch_ts
                    )
                else:
                    stats = await steam_service.get_review_stats(game.app_id)
                    sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
                    game = game.model_copy(update={"target_count": sample_plan.total})
                    refresh_generator = _full_analysis(
                        game,
                        sample_plan,
                        steam_service,
                        nlp_service,
                        patch_timestamp=patch_ts,
                        stale_doc=analysis_doc,
                    )
                async for event in refresh_generator:
                    # Capture analyzed_reviews from the final event for the
                    # timing log; forward every event unchanged.
                    if event.get("event") == "complete":
                        try:
                            data = json.loads(event["data"])
                            reviews_processed = data.get("analyzed_reviews", 0)
                        except (json.JSONDecodeError, KeyError):
                            pass
                    yield event
                return
            except Exception as e:
                # Stale data was already delivered above, so a failed refresh
                # is logged but not surfaced as an error event.
                logger.error(f"Refresh failed for {game.app_id}: {e}")
                return

        # 3. No cache at all — live analysis

        analysis_type = "full"
        stats = await steam_service.get_review_stats(game.app_id)
        sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
        total_target = sample_plan.total
        game = game.model_copy(update={"target_count": total_target})

        yield {"event": "game_found", "data": game.model_dump_json()}

        if is_niche:
            # Tell the UI this may take a while — first analysis of this game.
            yield {
                "event": "state",
                "data": json.dumps({"type": "first_live_analysis"}),
            }

        async for event in _full_analysis(game, sample_plan, steam_service, nlp_service, patch_timestamp=patch_ts):
            if event.get("event") == "complete":
                try:
                    data = json.loads(event["data"])
                    reviews_processed = data.get("analyzed_reviews", 0)
                except (json.JSONDecodeError, KeyError):
                    pass
            yield event

    except SteamRateLimitError as e:
        logger.warning(f"Steam rate limit: {e}")
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_STEAM_RATE_LIMIT"}),
        }
    except SteamAPIError as e:
        logger.error(f"Steam API error: {e}")
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_STEAM_API"}),
        }
    except Exception as e:
        # Safety net — SSE generator must always send an error event, never crash silently.
        logger.error(f"Analysis error: {e}", exc_info=True)
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_INTERNAL"}),
        }
    finally:
        # Timing record emitted on every exit path (success, error, disconnect).
        elapsed = round(time.monotonic() - t_start, 3)
        if get_structured_logger():
            log_structured(
                "live_analysis",
                app_id=app_id,
                game_name=resolved_game_name,
                analysis_type=analysis_type,
                elapsed_s=elapsed,
                reviews_processed=reviews_processed,
                source="live",
            )
563
+
564
@router.get("/analyze/{game_name}")
@limiter.limit(settings.rate_limit_analyze)
async def analyze_game(
    request: Request,
    game_name: str = Path(..., min_length=1, max_length=200),
    appid: str | None = Query(None, min_length=1, max_length=20),
    steam_service: SteamService = Depends(get_steam_service),
    nlp_service: NLPService = Depends(get_nlp_service),
) -> EventSourceResponse:
    """Stream game sentiment-analysis events to the client over SSE."""
    event_stream = analysis_event_generator(
        game_name, steam_service, nlp_service, appid=appid
    )
    return EventSourceResponse(event_stream)
579
+
580
+
581
@router.get("/game/{game_name}")
@limiter.limit(settings.rate_limit_default)
async def get_game_info(
    request: Request,
    game_name: str = Path(..., min_length=1, max_length=200),
    steam_service: SteamService = Depends(get_steam_service),
) -> dict:
    """Return basic game information, or 404 when no game matches."""
    game = await steam_service.search_game(game_name)
    if not game:
        raise HTTPException(status_code=404, detail="ERROR_GAME_NOT_FOUND")
    return game.model_dump()
backend/app/routers/games.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Router API do wyszukiwania gier.
3
+
4
+ Zawiera endpoint do pobierania sugestii gier dla autouzupełniania.
5
+ """
6
+
7
+ from fastapi import APIRouter, Query, Request
8
+
9
+ from app.core.config import settings
10
+ from app.db.mongodb import mongodb
11
+ from app.core.rate_limit import limiter
12
+
13
+ router = APIRouter()
14
+
15
+
16
@router.get("/games/suggestions")
@limiter.limit(settings.rate_limit_default)
async def get_game_suggestions(
    request: Request,
    q: str = Query(..., min_length=2, max_length=100, description="Tekst do wyszukania"),
    limit: int = Query(10, ge=1, le=20, description="Maksymalna liczba wyników"),
) -> list[dict[str, str]]:
    """Autocomplete endpoint: case-insensitive game-name search.

    Requires at least 2 characters in ``q`` and returns up to ``limit``
    matches.

    Args:
        q: Text to match against game names.
        limit: Maximum number of results (1-20).

    Returns:
        List of matching games, e.g.
        ``[{"appid": "1091500", "name": "Cyberpunk 2077"}, ...]``.
    """
    return await mongodb.search_games(q, limit)
49
+
50
+
51
@router.get("/games/count")
@limiter.limit(settings.rate_limit_default)
async def get_games_count(request: Request) -> dict[str, int]:
    """Return the number of games stored in the database.

    Returns:
        ``{"count": <number of games>}``, e.g. ``{"count": 85432}``.
    """
    total = await mongodb.get_games_count()
    return {"count": total}
backend/app/services/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Serwisy biznesowe aplikacji."""
2
+
3
+ from app.services.nlp_service import get_nlp_service
4
+ from app.services.steam_service import steam_service
5
+
6
+ __all__ = ["get_nlp_service", "steam_service"]
backend/app/services/analysis_runner.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analysis Runner — programmatic (non-SSE) analysis for the Worker.
3
+
4
+ Extracts the core full-analysis logic from analyze.py without SSE wrapping.
5
+ Used by the pre-cache service to run analyses in the background.
6
+ """
7
+
8
+ import asyncio
9
+ import contextlib
10
+ import json
11
+ import logging
12
+ import time
13
+ from datetime import datetime, timezone
14
+ from typing import Any, AsyncGenerator
15
+
16
+ from app.core.config import settings
17
+ from app.core.freshness import FreshnessStatus
18
+ from app.core.sampling import create_sample_plan
19
+ from app.core.ttl_tiers import get_ttl_hours
20
+ from app.core.worker_logging import AsyncTimingContext, get_structured_logger, log_structured
21
+ from app.db.mongodb import mongodb
22
+ from app.models.schemas import (
23
+ AnalysisProgress,
24
+ AnalysisResult,
25
+ GameInfo,
26
+ Highlight,
27
+ TopicHighlights,
28
+ TopicSentiment,
29
+ )
30
+ from app.services.highlights_service import HighlightsCollector
31
+ from app.services.analysis_utils import (
32
+ aggregate_topics,
33
+ calculate_prediction,
34
+ compute_preferred_context,
35
+ datetime_from_timestamp,
36
+ filter_topics_by_min_mentions,
37
+ normalize_legacy_results,
38
+ scale_topics,
39
+ serialize_datetime,
40
+ )
41
+ from app.services.nlp_service import NLPService
42
+ from app.services.steam_service import SteamService
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ async def iter_incremental_analysis_events(
48
+ game: GameInfo,
49
+ stale_doc: dict[str, Any],
50
+ steam_svc: SteamService,
51
+ nlp_svc: NLPService,
52
+ patch_timestamp: int | None = None,
53
+ *,
54
+ source: str = "live",
55
+ ) -> AsyncGenerator[dict[str, str], None]:
56
+ """Yield incremental-analysis progress and final result events."""
57
+ ttl_hours = await get_ttl_hours(game.app_id)
58
+ old_results = normalize_legacy_results(stale_doc.get("results", {}))
59
+ old_review_ids: list[str] = stale_doc.get("analyzed_review_ids", [])
60
+ old_review_ids_set = set(old_review_ids)
61
+ nlp_cumulative_s: float = 0.0
62
+
63
+ old_general = [TopicSentiment(**topic) for topic in old_results.get("general_topics", [])]
64
+ old_recent = (
65
+ [TopicSentiment(**topic) for topic in old_results.get("recent_topics", [])]
66
+ if old_results.get("recent_topics")
67
+ else []
68
+ )
69
+ old_current_patch = (
70
+ [TopicSentiment(**topic) for topic in old_results.get("current_patch_topics", [])]
71
+ if old_results.get("current_patch_topics")
72
+ else []
73
+ )
74
+ old_last_patch = (
75
+ [TopicSentiment(**topic) for topic in old_results.get("last_patch_topics", [])]
76
+ if old_results.get("last_patch_topics")
77
+ else None
78
+ )
79
+ old_last_patch_count = old_results.get("last_patch_reviews_count", 0)
80
+ old_patch_ts = old_results.get("current_patch_timestamp")
81
+
82
+ new_items = await steam_svc.fetch_recent_reviews(
83
+ game.app_id,
84
+ exclude_ids=old_review_ids_set,
85
+ )
86
+
87
+ if not new_items:
88
+ refreshed_at = datetime.now(timezone.utc)
89
+ refreshed_results = {
90
+ **old_results,
91
+ "cached_at": refreshed_at,
92
+ "analysis_date": refreshed_at,
93
+ "current_patch_date": datetime_from_timestamp(
94
+ patch_timestamp if patch_timestamp is not None else old_results.get("current_patch_timestamp")
95
+ ),
96
+ "freshness_status": FreshnessStatus.FRESH.value,
97
+ "staleness_reason": None,
98
+ "is_refreshing": False,
99
+ }
100
+ await mongodb.save_analysis(
101
+ game.app_id,
102
+ refreshed_results,
103
+ analyzed_review_ids=old_review_ids,
104
+ latest_review_timestamp=stale_doc.get("latest_review_timestamp", 0),
105
+ ttl_hours=ttl_hours,
106
+ analyzed_at=refreshed_at,
107
+ )
108
+ yield {
109
+ "event": "complete",
110
+ "data": json.dumps(refreshed_results, default=serialize_datetime),
111
+ }
112
+ return
113
+
114
+ new_texts = [item.text for item in new_items]
115
+ new_review_ids = [item.recommendation_id for item in new_items]
116
+ latest_timestamp = max(
117
+ (item.timestamp_created for item in new_items),
118
+ default=stale_doc.get("latest_review_timestamp", 0),
119
+ )
120
+
121
+ batch_size = settings.review_batch_size
122
+ delta_topics: list[TopicSentiment] = []
123
+ delta_current_patch_topics: list[TopicSentiment] = []
124
+ delta_current_patch_count = 0
125
+ highlights_collector = HighlightsCollector()
126
+ processed = 0
127
+ total_skipped = 0
128
+
129
+ for i in range(0, len(new_texts), batch_size):
130
+ chunk_texts = new_texts[i:i + batch_size]
131
+ chunk_items = new_items[i:i + batch_size]
132
+
133
+ batch_skipped = 0
134
+ if patch_timestamp:
135
+ for review_item, text in zip(chunk_items, chunk_texts):
136
+ categories = ["recent"]
137
+ if review_item.timestamp_created >= patch_timestamp:
138
+ categories.append("current_patch")
139
+
140
+ nlp_start = time.monotonic()
141
+ result_topics, skipped = await nlp_svc.analyze_batch(
142
+ [text],
143
+ highlights_collector=highlights_collector,
144
+ categories=categories,
145
+ )
146
+ nlp_cumulative_s += time.monotonic() - nlp_start
147
+ batch_skipped += skipped
148
+ if result_topics:
149
+ delta_topics = aggregate_topics(delta_topics, result_topics)
150
+ if review_item.timestamp_created >= patch_timestamp:
151
+ delta_current_patch_topics = aggregate_topics(
152
+ delta_current_patch_topics,
153
+ result_topics,
154
+ )
155
+ delta_current_patch_count += 1
156
+ total_skipped += batch_skipped
157
+ else:
158
+ nlp_start = time.monotonic()
159
+ batch_results, batch_skipped = await nlp_svc.analyze_batch(
160
+ chunk_texts,
161
+ highlights_collector=highlights_collector,
162
+ categories=["recent"],
163
+ )
164
+ nlp_cumulative_s += time.monotonic() - nlp_start
165
+ if batch_results:
166
+ delta_topics = aggregate_topics(delta_topics, batch_results)
167
+ total_skipped += batch_skipped
168
+
169
+ processed += len(chunk_texts)
170
+
171
+ progress = AnalysisProgress(
172
+ processed=processed,
173
+ total=len(new_texts),
174
+ current_topics=delta_topics,
175
+ skipped_count=total_skipped,
176
+ )
177
+ yield {"event": "progress", "data": progress.model_dump_json()}
178
+
179
+ new_general = aggregate_topics(old_general, delta_topics)
180
+
181
+ old_recent_count = old_results.get("recent_reviews_count", 0)
182
+ new_count = len(new_texts)
183
+
184
+ if (
185
+ old_recent_count + new_count > settings.recent_sample_limit
186
+ and old_recent
187
+ and old_recent_count > 0
188
+ ):
189
+ overflow = old_recent_count + new_count - settings.recent_sample_limit
190
+ retain_ratio = max(0.2, 1.0 - overflow / old_recent_count)
191
+ scaled_old = scale_topics(old_recent, retain_ratio)
192
+ new_recent = aggregate_topics(scaled_old, delta_topics)
193
+ recent_count = int(old_recent_count * retain_ratio) + new_count
194
+ else:
195
+ new_recent = aggregate_topics(old_recent, delta_topics) if old_recent else delta_topics
196
+ recent_count = old_recent_count + new_count
197
+
198
+ last_patch_topics = old_last_patch
199
+ last_patch_count = old_last_patch_count
200
+
201
+ if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
202
+ last_patch_topics = old_current_patch if old_current_patch else None
203
+ last_patch_count = old_results.get("current_patch_reviews_count", 0)
204
+ old_current_patch = []
205
+
206
+ new_current_patch = (
207
+ aggregate_topics(old_current_patch, delta_current_patch_topics)
208
+ if old_current_patch
209
+ else (delta_current_patch_topics if delta_current_patch_topics else [])
210
+ )
211
+ base_current_patch_count = (
212
+ 0
213
+ if (patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts)
214
+ else old_results.get("current_patch_reviews_count", 0)
215
+ )
216
+ new_current_patch_count = base_current_patch_count + delta_current_patch_count
217
+ has_current_patch = patch_timestamp is not None and (
218
+ new_current_patch_count > 0 or bool(old_current_patch)
219
+ )
220
+
221
+ # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
222
+ new_general = filter_topics_by_min_mentions(new_general)
223
+ new_recent = filter_topics_by_min_mentions(new_recent)
224
+ new_current_patch = filter_topics_by_min_mentions(new_current_patch)
225
+
226
+ prediction = calculate_prediction(new_general)
227
+
228
+ highlights_data = highlights_collector.compute_highlights()
229
+ general_highlights = highlights_data["general"]
230
+ recent_highlights = highlights_data["recent"]
231
+ current_patch_highlights = highlights_data["current_patch"]
232
+ topic_highlights_dict = highlights_data["topics"]
233
+
234
+ # Restrict topic highlights to topics that survived the min-mentions filter,
235
+ # so the topic_highlights set is always consistent with general_topics.
236
+ _surviving_topics = {t.topic for t in new_general}
237
+ topic_highlights_list = [
238
+ TopicHighlights(
239
+ topic=topic,
240
+ highlights=[Highlight(**highlight) for highlight in highlights],
241
+ )
242
+ for topic, highlights in topic_highlights_dict.items()
243
+ if topic in _surviving_topics
244
+ ]
245
+
246
+ merged_review_ids = old_review_ids + new_review_ids
247
+
248
+ analysis_generated_at = datetime.now(timezone.utc)
249
+ result = AnalysisResult(
250
+ game=game,
251
+ general_topics=new_general,
252
+ recent_topics=new_recent,
253
+ recent_reviews_count=recent_count,
254
+ current_patch_topics=new_current_patch if has_current_patch else None,
255
+ current_patch_reviews_count=new_current_patch_count if has_current_patch else 0,
256
+ last_patch_topics=last_patch_topics,
257
+ last_patch_reviews_count=last_patch_count,
258
+ current_patch_timestamp=patch_timestamp,
259
+ analysis_date=analysis_generated_at,
260
+ current_patch_date=datetime_from_timestamp(patch_timestamp),
261
+ prediction=prediction,
262
+ analyzed_reviews=old_results.get("analyzed_reviews", 0) + processed,
263
+ skipped_count=old_results.get("skipped_count", 0) + total_skipped,
264
+ general_highlights=[Highlight(**highlight) for highlight in general_highlights],
265
+ recent_highlights=[Highlight(**highlight) for highlight in recent_highlights] if recent_highlights else None,
266
+ current_patch_highlights=[Highlight(**highlight) for highlight in current_patch_highlights] if current_patch_highlights else None,
267
+ topic_highlights=topic_highlights_list,
268
+ cached_at=analysis_generated_at,
269
+ preferred_context=compute_preferred_context(patch_timestamp),
270
+ freshness_status=FreshnessStatus.FRESH.value,
271
+ is_refreshing=False,
272
+ )
273
+ await mongodb.save_analysis(
274
+ game.app_id,
275
+ result.model_dump(),
276
+ analyzed_review_ids=merged_review_ids,
277
+ latest_review_timestamp=latest_timestamp,
278
+ ttl_hours=ttl_hours,
279
+ analyzed_at=analysis_generated_at,
280
+ )
281
+
282
+ if get_structured_logger():
283
+ log_structured(
284
+ "incremental_analysis_complete",
285
+ app_id=game.app_id,
286
+ game_name=game.name if hasattr(game, "name") else str(game.app_id),
287
+ source=source,
288
+ reviews_processed=processed,
289
+ topics_found=len(new_general),
290
+ detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)},
291
+ )
292
+
293
+ yield {"event": "complete", "data": result.model_dump_json()}
294
+
295
+
296
async def run_incremental_analysis(
    app_id: str,
    game_name: str,
    steam_svc: SteamService,
    nlp_svc: NLPService,
) -> dict[str, Any] | None:
    """Run a non-SSE incremental analysis for worker jobs.

    Drives iter_incremental_analysis_events() and keeps only the final
    "complete" event payload. Falls back to run_full_analysis() when no
    usable prior analysis exists or the review gap is too large.

    Args:
        app_id: Steam application id of the game to analyze.
        game_name: Human-readable game name (used for logging and stub fallback).
        steam_svc: Steam API service used for game info and review fetching.
        nlp_svc: NLP service used for review analysis.

    Returns:
        The parsed "complete" event payload dict, or None on error (or if the
        event stream never emitted a "complete" event).
    """
    slog = get_structured_logger()

    try:
        stale_doc = await mongodb.get_analysis(app_id)
        # Incremental analysis requires a prior result plus the set of review
        # ids already analyzed; without both, run the full pipeline instead.
        if not stale_doc or not stale_doc.get("results") or not stale_doc.get("analyzed_review_ids"):
            return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc)

        # Long gap guard: if the most recent review we have is too old, Steam's cursor-based
        # API may not reliably surface all reviews since then. Fall back to full analysis.
        latest_ts = stale_doc.get("latest_review_timestamp", 0)
        if latest_ts > 0:
            gap_days = (time.time() - latest_ts) / 86400
            if gap_days > settings.incremental_max_gap_days:
                logger.info(
                    f"Incremental gap {gap_days:.0f}d > {settings.incremental_max_gap_days}d "
                    f"for {app_id} ({game_name}) — falling back to full analysis"
                )
                return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc)

        game = await steam_svc.get_game_info(app_id)
        if not game:
            # Steam lookup failed: reuse the cached game blob if present,
            # otherwise construct a minimal stub from the known id/name.
            cached_game = stale_doc.get("results", {}).get("game")
            if isinstance(cached_game, dict):
                game = GameInfo(**cached_game)
            else:
                game = GameInfo(app_id=app_id, name=game_name)

        patch_date = await mongodb.get_game_patch_date(app_id)
        patch_timestamp = int(patch_date.timestamp()) if patch_date else None
        if patch_timestamp:
            game = game.model_copy(update={"last_game_update_at": patch_timestamp})

        # Drain the SSE-style event stream; only the "complete" event carries
        # the final result payload. Progress events are discarded here.
        final_payload: dict[str, Any] | None = None
        async for event in iter_incremental_analysis_events(
            game,
            stale_doc,
            steam_svc,
            nlp_svc,
            patch_timestamp=patch_timestamp,
            source="worker",
        ):
            if event.get("event") == "complete":
                final_payload = json.loads(event["data"])

        return final_payload
    except Exception as e:
        logger.error(f"Incremental analysis runner error for {app_id} ({game_name}): {e}", exc_info=True)
        if slog:
            log_structured(
                "analysis_error",
                level=logging.ERROR,
                app_id=app_id,
                game_name=game_name,
                source="worker",
                error=str(e),
            )
        return None
360
+
361
+
362
async def run_full_analysis(
    app_id: str,
    game_name: str,
    steam_svc: SteamService,
    nlp_svc: NLPService,
    stale_doc: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
    """
    Run a full analysis for a game (no SSE, no streaming).

    Three timed phases:
      1. Setup: game info, review stats, sampling plan, patch timestamp.
      2. Fetch + analyze: producer-consumer loop (fetch task feeds a queue,
         this coroutine consumes batches and runs NLP per review).
      3. Save: aggregate filtering, highlights, result assembly, MongoDB save.

    Args:
        app_id: Steam application id.
        game_name: Human-readable game name (logging only).
        steam_svc: Steam API service.
        nlp_svc: NLP analysis service.
        stale_doc: Previously persisted analysis doc, if any — used only to
            carry forward last_patch_topics across a patch change.

    Returns:
        Analysis result dict, or None on error.
    """
    slog = get_structured_logger()

    try:
        # Phase 1: Setup — game info + review stats + sample plan
        async with AsyncTimingContext() as t_setup:
            # 1. Get game info
            game = await steam_svc.get_game_info(app_id)
            if not game:
                logger.warning(f"Analysis runner: game info not found for {app_id}")
                return None

            # 2. Get review stats
            stats = await steam_svc.get_review_stats(app_id)
            if stats.total == 0:
                logger.warning(f"Analysis runner: no reviews for {app_id}")
                return None

            # 3. Create sample plan
            sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
            ttl_hours = await get_ttl_hours(app_id)

            # 3b. Fetch game patch date for current_patch splitting
            patch_date = await mongodb.get_game_patch_date(app_id)
            patch_timestamp = int(patch_date.timestamp()) if patch_date else None
            if patch_timestamp and isinstance(game, GameInfo):
                game = game.model_copy(update={"last_game_update_at": patch_timestamp})

        # Phase 2: Fetch + Analyze — producer-consumer loop
        nlp_cumulative_s: float = 0.0

        async with AsyncTimingContext() as t_fetch_analyze:
            # 4. Producer-consumer fetch + analyze.
            # Queue protocol: review batches, then an Exception on fetch error,
            # then a None sentinel (always enqueued, even after an error).
            queue: asyncio.Queue = asyncio.Queue(maxsize=5)

            async def fetch_worker():
                try:
                    async for batch in steam_svc.fetch_reviews_stratified(app_id, sample_plan):
                        await queue.put(batch)
                except Exception as e:
                    await queue.put(e)
                finally:
                    await queue.put(None)

            fetch_task = asyncio.create_task(fetch_worker())

            processed = 0
            total_skipped = 0
            aggregated_topics: list[TopicSentiment] = []
            recent_processed = 0
            recent_limit = settings.recent_sample_limit
            all_review_ids: list[str] = []
            latest_timestamp = 0
            highlights_collector = HighlightsCollector()
            current_patch_topics: list[TopicSentiment] = []
            current_patch_count = 0
            # (review timestamp, per-review topic results) — later sorted to
            # rebuild the "recent" window from the newest reviews.
            review_topic_results: list[tuple[int, list[TopicSentiment]]] = []

            try:
                while True:
                    item = await queue.get()

                    if item is None:
                        break
                    if isinstance(item, Exception):
                        raise item

                    batch = item
                    if not batch.reviews:
                        continue

                    # Track review ids and the newest review timestamp seen.
                    for ri in batch.review_items:
                        all_review_ids.append(ri.recommendation_id)
                        if ri.timestamp_created > latest_timestamp:
                            latest_timestamp = ri.timestamp_created

                    batch_skipped = 0
                    if patch_timestamp and batch.review_items:
                        # Patch-aware path: reviews created on/after the patch
                        # additionally feed the current_patch aggregate.
                        for ri, text in zip(batch.review_items, batch.reviews):
                            is_recent = recent_processed < recent_limit
                            cat = []
                            if is_recent:
                                cat.append("recent")

                            if ri.timestamp_created >= patch_timestamp:
                                cat.append("current_patch")
                                nlp_start = time.monotonic()
                                res, skipped = await nlp_svc.analyze_batch(
                                    [text], highlights_collector=highlights_collector, categories=cat
                                )
                                nlp_cumulative_s += time.monotonic() - nlp_start
                                batch_skipped += skipped
                                if res:
                                    aggregated_topics = aggregate_topics(aggregated_topics, res)
                                    current_patch_topics = aggregate_topics(current_patch_topics, res)
                                    review_topic_results.append((ri.timestamp_created, res))
                                current_patch_count += 1
                            else:
                                nlp_start = time.monotonic()
                                res, skipped = await nlp_svc.analyze_batch(
                                    [text], highlights_collector=highlights_collector, categories=cat
                                )
                                nlp_cumulative_s += time.monotonic() - nlp_start
                                batch_skipped += skipped
                                if res:
                                    aggregated_topics = aggregate_topics(aggregated_topics, res)
                                    review_topic_results.append((ri.timestamp_created, res))
                            recent_processed += 1
                    else:
                        # No patch timestamp (or no review metadata): single path.
                        # NOTE(review): when review_items is absent, enumerate()
                        # yields (index, text) and timestamps fall back to 0.
                        for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews):
                            is_recent = recent_processed < recent_limit
                            cat = ["recent"] if is_recent else []

                            nlp_start = time.monotonic()
                            res, skipped = await nlp_svc.analyze_batch(
                                [text], highlights_collector=highlights_collector, categories=cat
                            )
                            nlp_cumulative_s += time.monotonic() - nlp_start
                            batch_skipped += skipped
                            ts = ri.timestamp_created if batch.review_items else 0
                            if res:
                                aggregated_topics = aggregate_topics(aggregated_topics, res)
                                review_topic_results.append((ts, res))
                            recent_processed += 1

                    total_skipped += batch_skipped
                    processed += len(batch.reviews)

                await fetch_task
            except BaseException:
                # Also covers CancelledError: stop the producer before re-raising.
                fetch_task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await fetch_task
                raise

        # Phase 3: Save — highlights + MongoDB save
        async with AsyncTimingContext() as t_save:
            # 5. Compute prediction + highlights

            # Build recent_topics from highest-timestamp reviews
            review_topic_results.sort(key=lambda x: x[0], reverse=True)
            recent_entries = review_topic_results[:recent_limit]
            recent_topics: list[TopicSentiment] = []
            for _, topics_batch in recent_entries:
                for ts in topics_batch:
                    recent_topics = aggregate_topics(recent_topics, [ts])
            recent_reviews_count = len(recent_entries)

            # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
            aggregated_topics = filter_topics_by_min_mentions(aggregated_topics)
            recent_topics = filter_topics_by_min_mentions(recent_topics)
            current_patch_topics = filter_topics_by_min_mentions(current_patch_topics)

            prediction = calculate_prediction(aggregated_topics)

            highlights_data = highlights_collector.compute_highlights()
            general_highlights = highlights_data["general"]
            recent_highlights = highlights_data["recent"]
            current_patch_highlights = highlights_data["current_patch"]
            topic_highlights_dict = highlights_data["topics"]

            # Restrict topic highlights to topics that survived the min-mentions filter,
            # so the topic_highlights set is always consistent with general_topics.
            _surviving_topics = {t.topic for t in aggregated_topics}
            topic_highlights_list = [
                TopicHighlights(
                    topic=topic,
                    highlights=[Highlight(**h) for h in highlights],
                )
                for topic, highlights in topic_highlights_dict.items()
                if topic in _surviving_topics
            ]

            has_recent_split = processed > recent_limit
            has_current_patch = patch_timestamp is not None and current_patch_count > 0
            analysis_generated_at = datetime.now(timezone.utc)
            current_patch_date = (
                datetime.fromtimestamp(patch_timestamp, tz=timezone.utc)
                if patch_timestamp is not None
                else None
            )

            # Archive last_patch_topics when full analysis replaces a doc with a different patch.
            last_patch_topics: list[TopicSentiment] | None = None
            last_patch_reviews_count = 0
            if stale_doc:
                old_r = normalize_legacy_results(stale_doc.get("results", {}))
                old_patch_ts = old_r.get("current_patch_timestamp")
                if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
                    # Patch changed: yesterday's current_patch becomes last_patch.
                    raw_cp = old_r.get("current_patch_topics")
                    last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None
                    last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0)
                else:
                    # Same patch: carry the old last_patch data forward unchanged.
                    raw_lp = old_r.get("last_patch_topics")
                    last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None
                    last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0)

            result = AnalysisResult(
                game=game,
                general_topics=aggregated_topics,
                recent_topics=recent_topics if has_recent_split else None,
                recent_reviews_count=recent_reviews_count if has_recent_split else 0,
                current_patch_topics=current_patch_topics if has_current_patch else None,
                current_patch_reviews_count=current_patch_count if has_current_patch else 0,
                last_patch_topics=last_patch_topics,
                last_patch_reviews_count=last_patch_reviews_count,
                current_patch_timestamp=patch_timestamp,
                analysis_date=analysis_generated_at,
                current_patch_date=current_patch_date,
                prediction=prediction,
                analyzed_reviews=processed,
                skipped_count=total_skipped,
                general_highlights=[Highlight(**h) for h in general_highlights],
                recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None,
                current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None,
                topic_highlights=topic_highlights_list,
                cached_at=analysis_generated_at,
                preferred_context=compute_preferred_context(patch_timestamp),
                freshness_status=FreshnessStatus.FRESH.value,
                is_refreshing=False,
            )

            # 6. Save to cache
            await mongodb.save_analysis(
                game.app_id,
                result.model_dump(),
                analyzed_review_ids=all_review_ids,
                latest_review_timestamp=latest_timestamp,
                ttl_hours=ttl_hours,
                analyzed_at=analysis_generated_at,
            )

        total_elapsed = t_setup.elapsed_s + t_fetch_analyze.elapsed_s + t_save.elapsed_s

        logger.info(
            f"Analysis runner: completed {app_id} ({game_name}) — "
            f"{processed} reviews, {len(aggregated_topics)} topics"
        )

        if slog:
            log_structured(
                "analysis_complete",
                app_id=app_id,
                game_name=game_name,
                elapsed_s=round(total_elapsed, 3),
                source="worker",
                breakdown={
                    "setup_s": t_setup.elapsed_s,
                    "fetch_analyze_s": t_fetch_analyze.elapsed_s,
                    "nlp_cumulative_s": round(nlp_cumulative_s, 3),
                    "save_s": t_save.elapsed_s,
                },
                reviews_processed=processed,
                topics_found=len(aggregated_topics),
            )

        return result.model_dump()

    except Exception as e:
        logger.error(f"Analysis runner error for {app_id} ({game_name}): {e}", exc_info=True)
        if slog:
            log_structured(
                "analysis_error",
                level=logging.ERROR,
                app_id=app_id,
                game_name=game_name,
                source="worker",
                error=str(e),
            )
        return None
backend/app/services/analysis_utils.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared analysis helpers used by both live and worker paths."""
2
+
3
+ import time
4
+ from datetime import datetime, timezone
5
+ from typing import Any
6
+
7
+ from app.core.config import settings
8
+ from app.models.schemas import (
9
+ PredictionType,
10
+ SentimentType,
11
+ TopicSentiment,
12
+ UserCountPrediction,
13
+ )
14
+
15
+
16
def calculate_prediction(topics: list[TopicSentiment]) -> UserCountPrediction:
    """Compute the player-count trend prediction from aggregated topics.

    Priority order: a strong Retention signal wins outright; next, poor
    technical sentiment (Bugs/Performance); next, Gameplay/Fun sentiment;
    otherwise the prediction is a low-confidence STABLE.
    """
    by_name = {entry.topic: entry for entry in topics}

    # 1. Retention dominates when it has enough mentions to be meaningful.
    retention = by_name.get("Retention")
    if retention is not None and retention.mention_count > 5:
        # Confidence grows with mention volume, capped at 0.95.
        retention_conf = min(0.95, 0.5 + (retention.mention_count / 100))
        if retention.score > settings.prediction_retention_threshold_pos:
            return UserCountPrediction(
                trend=PredictionType.INCREASING,
                confidence=retention_conf,
                reasoning="PREDICTION_REASONING_RETENTION_HIGH",
            )
        if retention.score < settings.prediction_retention_threshold_neg:
            return UserCountPrediction(
                trend=PredictionType.DECREASING,
                confidence=retention_conf,
                reasoning="PREDICTION_REASONING_RETENTION_LOW",
            )

    # 2. Average technical sentiment (Bugs + Performance, whichever exist).
    tech_topics = [t for t in (by_name.get("Bugs"), by_name.get("Performance")) if t is not None]
    if tech_topics and (sum(t.score for t in tech_topics) / len(tech_topics)) < -0.3:
        return UserCountPrediction(
            trend=PredictionType.DECREASING,
            confidence=0.75,
            reasoning="PREDICTION_REASONING_TECH_ISSUES",
        )

    # 3. Average enjoyment sentiment (Gameplay + Fun, whichever exist).
    enjoyment_topics = [t for t in (by_name.get("Gameplay"), by_name.get("Fun")) if t is not None]
    if enjoyment_topics:
        enjoyment_avg = sum(t.score for t in enjoyment_topics) / len(enjoyment_topics)
        if enjoyment_avg > 0.4:
            return UserCountPrediction(
                trend=PredictionType.INCREASING,
                confidence=0.8,
                reasoning="PREDICTION_REASONING_GAMEPLAY_HIGH",
            )
        if enjoyment_avg < -0.2:
            return UserCountPrediction(
                trend=PredictionType.DECREASING,
                confidence=0.6,
                reasoning="PREDICTION_REASONING_GAMEPLAY_LOW",
            )

    # 4. Nothing decisive: default to a low-confidence stable trend.
    return UserCountPrediction(
        trend=PredictionType.STABLE,
        confidence=0.5,
        reasoning="PREDICTION_REASONING_STABLE",
    )
86
+
87
+
88
def aggregate_topics(
    existing: list[TopicSentiment],
    new_batch: list[TopicSentiment],
) -> list[TopicSentiment]:
    """Merge topic aggregates using weighted mention counts.

    Scores are combined as a mention-weighted average, clamped to [-1, 1].
    Each topic keeps the example with the largest absolute example_score,
    but only surfaces it if it agrees with the final sentiment direction.
    Result is sorted by mention_count descending.
    """
    buckets: dict[str, dict[str, Any]] = {}

    def pick_example(
        current: tuple[str, float] | None,
        candidate: tuple[str, float] | None,
    ) -> tuple[str, float] | None:
        # Keep whichever example has the stronger absolute score; ties keep current.
        if candidate is None:
            return current
        if current is None:
            return candidate
        return candidate if abs(candidate[1]) > abs(current[1]) else current

    # Fold both input lists (existing first, then new) into per-topic buckets.
    for entry in (*existing, *new_batch):
        bucket = buckets.setdefault(entry.topic, {"weighted": 0.0, "count": 0, "example": None})
        bucket["weighted"] += entry.score * entry.mention_count
        bucket["count"] += entry.mention_count
        candidate = (
            (entry.example, entry.example_score)
            if entry.example and entry.example_score is not None
            else None
        )
        bucket["example"] = pick_example(bucket["example"], candidate)

    merged: list[TopicSentiment] = []
    for topic_name, bucket in buckets.items():
        mention_total = bucket["count"]
        if mention_total == 0:
            continue

        clamped_score = max(-1.0, min(1.0, bucket["weighted"] / mention_total))

        if clamped_score > settings.sentiment_positive_threshold:
            sentiment = SentimentType.POSITIVE
        elif clamped_score < settings.sentiment_negative_threshold:
            sentiment = SentimentType.NEGATIVE
        else:
            sentiment = SentimentType.NEUTRAL

        chosen_example = None
        chosen_example_score = None
        if bucket["example"]:
            example_text, candidate_score = bucket["example"]
            # Only attach the example when its polarity matches the aggregate
            # sentiment (neutral accepts anything).
            sign_matches = (
                sentiment == SentimentType.NEUTRAL
                or (sentiment == SentimentType.POSITIVE and candidate_score > 0)
                or (sentiment == SentimentType.NEGATIVE and candidate_score < 0)
            )
            if sign_matches:
                chosen_example = example_text
                chosen_example_score = candidate_score

        merged.append(
            TopicSentiment(
                topic=topic_name,
                sentiment=sentiment,
                score=round(clamped_score, 3),
                mention_count=mention_total,
                example=chosen_example,
                example_score=chosen_example_score,
            )
        )

    merged.sort(key=lambda item: item.mention_count, reverse=True)
    return merged
177
+
178
+
179
def scale_topics(topics: list[TopicSentiment], factor: float) -> list[TopicSentiment]:
    """Scale mention counts for the approximate recent sliding window.

    Each topic's mention_count is multiplied by `factor` (truncated toward
    zero) and floored at 1 so no topic disappears from the window.
    """
    scaled: list[TopicSentiment] = []
    for topic in topics:
        adjusted_count = max(1, int(topic.mention_count * factor))
        scaled.append(topic.model_copy(update={"mention_count": adjusted_count}))
    return scaled
185
+
186
+
187
def filter_topics_by_min_mentions(
    topics: list[TopicSentiment],
    min_mentions: int | None = None,
) -> list[TopicSentiment]:
    """Filter topics below the minimum mention threshold.

    Preserves existing sort order. Only filters — does not modify score or
    sentiment. Applied at the final aggregate level, never per-review.

    Args:
        topics: Aggregated topic list to filter.
        min_mentions: Explicit threshold; defaults to settings.topic_min_mentions.
    """
    if min_mentions is None:
        min_mentions = settings.topic_min_mentions
    return [topic for topic in topics if topic.mention_count >= min_mentions]
198
+
199
+
200
def compute_preferred_context(patch_timestamp: int | None) -> str:
    """Choose the default user-facing context tab.

    Returns 'current_patch' only when a recent enough major patch exists
    (age within settings.patch_context_max_age_days); otherwise 'general'
    so the UI defaults to the full-picture view.
    """
    if patch_timestamp is None:
        return "general"
    age_days = (time.time() - patch_timestamp) / 86400
    return "general" if age_days > settings.patch_context_max_age_days else "current_patch"
212
+
213
+
214
# Renames applied to documents persisted under the pre-"patch" schema.
_LEGACY_FIELD_MAP = {
    "topics": "general_topics",
    "historical_topics": "general_topics",
    "post_update_topics": "current_patch_topics",
    "post_update_reviews_count": "current_patch_reviews_count",
    "post_update_highlights": "current_patch_highlights",
    "previous_update_topics": "last_patch_topics",
    "previous_update_reviews_count": "last_patch_reviews_count",
    "last_update_timestamp": "current_patch_timestamp",
}


def normalize_legacy_results(results: dict[str, Any]) -> dict[str, Any]:
    """Map legacy persisted result fields to the current schema.

    Drops the retired "is_incremental" flag. When several legacy keys map to
    the same target, the first one encountered (in input order) wins.
    """
    normalized: dict[str, Any] = {}
    for key, value in results.items():
        if key == "is_incremental":
            continue
        target_key = _LEGACY_FIELD_MAP.get(key, key)
        normalized.setdefault(target_key, value)
    return normalized
235
+ return normalized
236
+
237
+
238
+ def serialize_datetime(value: Any) -> str | Any:
239
+ """Serialize datetimes in SSE payloads and persisted compatibility helpers."""
240
+ if isinstance(value, datetime):
241
+ return value.isoformat()
242
+ return value
243
+
244
+
245
+ def coerce_utc_datetime(value: Any) -> datetime | None:
246
+ """Coerce persisted datetime values into timezone-aware UTC datetimes."""
247
+ if isinstance(value, datetime):
248
+ return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
249
+ if isinstance(value, str):
250
+ parsed = datetime.fromisoformat(value)
251
+ return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
252
+ return None
253
+
254
+
255
+ def datetime_from_timestamp(timestamp: int | None) -> datetime | None:
256
+ """Convert a unix timestamp into UTC datetime."""
257
+ if timestamp is None:
258
+ return None
259
+ return datetime.fromtimestamp(timestamp, tz=timezone.utc)
backend/app/services/game_sync_service.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Game Sync Service — fetches game data from SteamSpy and upserts to MongoDB.
3
+
4
+ Replaces the manual scripts/fetch_games_to_mongodb.py with an automated,
5
+ rate-limited sync that runs as part of the Worker cycle.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ from datetime import datetime, timezone
11
+ from typing import Any
12
+
13
+ import httpx
14
+
15
+ from app.core.config import settings
16
+ from app.db.mongodb import mongodb
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ STEAMSPY_API_URL = "https://steamspy.com/api.php"
21
+ STEAM_STORE_API_URL = "https://store.steampowered.com/api"
22
+
23
+
24
class GameSyncService:
    """Syncs game data from SteamSpy into MongoDB.

    All network calls go through a single shared httpx.AsyncClient. The client
    may be injected (caller owns its lifetime) or lazily created here (closed
    by close()). Methods are sequential and sleep between requests to respect
    SteamSpy / Steam Store rate limits.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When no client is injected we create our own lazily and own its lifetime.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=30.0)
        return self._client

    async def close(self) -> None:
        """Close the HTTP client, but only if this service created it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def sync_all_games(self) -> tuple[int, int]:
        """
        Fetch all games from SteamSpy (paginated, up to 90 pages).

        Stops early on an empty page or any HTTP/request error; partial
        progress is kept since each page is upserted as it arrives.

        Returns:
            (total_upserted, total_modified)
        """
        client = await self._get_client()
        total_upserted = 0
        total_modified = 0
        now = datetime.now(timezone.utc)

        for page in range(90):
            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "all", "page": page},
                )
                resp.raise_for_status()
                data = resp.json()

                # An empty page means we've paged past the end of the catalog.
                if not data:
                    logger.info(f"SteamSpy page {page} empty — sync complete")
                    break

                games = self._parse_all_response(data, now)
                if games:
                    upserted, modified = await mongodb.upsert_games_batch(games)
                    total_upserted += upserted
                    total_modified += modified

                logger.info(
                    f"SteamSpy page {page}: {len(games)} games "
                    f"(upserted={total_upserted}, modified={total_modified})"
                )

            except httpx.HTTPStatusError as e:
                logger.error(f"SteamSpy HTTP error on page {page}: {e}")
                break
            except httpx.RequestError as e:
                logger.error(f"SteamSpy request error on page {page}: {e}")
                break

            # Rate limit: SteamSpy allows ~1 request per minute
            if page < 89:
                await asyncio.sleep(settings.game_sync_steamspy_delay)

        logger.info(
            f"Game sync complete: upserted={total_upserted}, modified={total_modified}"
        )
        return (total_upserted, total_modified)

    async def sync_top_game_details(self, limit: int | None = None) -> int:
        """
        Enrich top N games with detailed info (tags, genre, ccu) from SteamSpy.

        Args:
            limit: Number of games to enrich; defaults to settings.game_sync_top_n_details.

        Returns:
            Number of games enriched.
        """
        limit = limit or settings.game_sync_top_n_details
        client = await self._get_client()

        top_games = await mongodb.get_top_games_by_reviews(limit)
        enriched = 0

        for game in top_games:
            appid = game.get("appid", "")
            if not appid:
                continue

            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "appdetails", "appid": appid},
                )
                resp.raise_for_status()
                detail = resp.json()

                update = self._parse_detail_response(detail)
                if update:
                    # NOTE(review): assumes each top-games doc has a "name"
                    # field — a missing key would raise KeyError here; verify
                    # against get_top_games_by_reviews().
                    await mongodb.upsert_game({"appid": appid, "name": game["name"], **update})
                    enriched += 1

            except httpx.HTTPStatusError as e:
                logger.warning(f"SteamSpy detail error for {appid}: {e}")
            except httpx.RequestError as e:
                logger.warning(f"SteamSpy detail request error for {appid}: {e}")

            await asyncio.sleep(settings.game_sync_details_delay)

        logger.info(f"Enriched {enriched}/{len(top_games)} games with details")
        return enriched

    async def enrich_cn_names(self, limit: int | None = None) -> int:
        """
        Enrich games with Chinese names from Steam Store API.

        Marks each game as checked whether or not a translation was found, so
        the same game is not re-queried every cycle; network errors skip the
        mark so the game is retried next cycle.

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_cn_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_without_cn_name(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            name_en = game.get("name")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                if app_data and app_data.get("success"):
                    info = app_data.get("data", {})
                    name_cn = info.get("name")

                    # If names are different, we found a translation
                    if name_cn and name_cn != name_en:
                        await mongodb.mark_cn_name_checked(appid, name_cn)
                    else:
                        await mongodb.mark_cn_name_checked(appid)
                else:
                    # Not found or error in API - still mark as checked
                    await mongodb.mark_cn_name_checked(appid)

                processed += 1

            except httpx.HTTPError as e:
                logger.warning(f"Error fetching CN name for {appid}: {e}")
                # Don't mark as checked on network error, try again next cycle

            # Respect rate limits
            await asyncio.sleep(settings.game_sync_cn_enrichment_delay)

        logger.info(f"Enriched CN names for {processed}/{len(games_to_check)} games")
        return processed

    async def enrich_app_types(self, limit: int | None = None) -> int:
        """
        Enrich app_type/parent_appid using Steam Store appdetails.

        Games are marked checked even when the Store lookup fails logically
        (non-success payload), falling back to app_type="unknown".

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_app_type_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_missing_app_type(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                info = app_data.get("data", {}) if app_data and app_data.get("success") else {}

                parsed = self._parse_store_type_response(info)
                await mongodb.mark_app_type_checked(
                    appid,
                    app_type=parsed["app_type"],
                    parent_appid=parsed["parent_appid"],
                )
                processed += 1

            except httpx.HTTPError as e:
                logger.warning(f"Error fetching app type for {appid}: {e}")

            await asyncio.sleep(settings.game_sync_app_type_enrichment_delay)

        logger.info(f"Enriched app types for {processed}/{len(games_to_check)} games")
        return processed

    @staticmethod
    def _parse_all_response(
        data: dict[str, Any], synced_at: datetime
    ) -> list[dict[str, Any]]:
        """Parse SteamSpy 'all' response into list of game dicts.

        Entries without a name are dropped; appids are normalized to strings.
        """
        games: list[dict[str, Any]] = []
        for appid_str, info in data.items():
            name = info.get("name", "")
            if not name:
                continue

            games.append({
                "appid": str(appid_str),
                "name": name,
                "developer": info.get("developer", ""),
                "publisher": info.get("publisher", ""),
                "positive": info.get("positive", 0),
                "negative": info.get("negative", 0),
                "synced_at": synced_at,
            })
        return games

    @staticmethod
    def _parse_detail_response(detail: dict[str, Any]) -> dict[str, Any]:
        """Parse SteamSpy 'appdetails' response into enrichment fields.

        Returns only the fields actually present in the payload; an empty
        dict means there is nothing to enrich.
        """
        update: dict[str, Any] = {}

        tags = detail.get("tags")
        if isinstance(tags, dict) and tags:
            # Sort by vote count descending, keep top 20 tag names
            sorted_tags = sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]
            update["tags"] = [tag_name for tag_name, _ in sorted_tags]

        genre = detail.get("genre")
        if genre:
            update["genre"] = genre

        ccu = detail.get("ccu")
        if ccu is not None:
            update["ccu"] = ccu

        return update

    @staticmethod
    def _parse_store_type_response(info: dict[str, Any]) -> dict[str, Any]:
        """Extract app_type and (for DLC) the parent game's appid from a Store payload."""
        app_type = info.get("type") or "unknown"
        fullgame = info.get("fullgame")

        # Only DLC entries carry a meaningful parent reference ("fullgame").
        parent_appid = None
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])

        return {
            "app_type": str(app_type),
            "parent_appid": parent_appid,
        }

    @staticmethod
    async def _fetch_store_app_data(
        client: httpx.AsyncClient, appid: str
    ) -> dict[str, Any] | None:
        """Fetch one appdetails payload from Steam Store.

        Requests simplified-Chinese localization (l=schinese, cc=CN); the
        Store response is keyed by appid, so we return just that entry.
        Raises httpx.HTTPStatusError on non-2xx responses.
        """
        resp = await client.get(
            f"{STEAM_STORE_API_URL}/appdetails",
            params={
                "appids": appid,
                "l": "schinese",
                "cc": "CN",
            },
        )
        resp.raise_for_status()
        data = resp.json()
        return data.get(str(appid))
backend/app/services/highlights_service.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Serwis ekstrakcji Community Highlights z recenzji.
3
+ Uzywa n-gramow (2-5 tokenow) + TF-IDF do identyfikacji najczesciej uzywanych fraz.
4
+ """
5
+
6
+ import math
7
+ from collections import Counter, defaultdict
8
+ from typing import Any
9
+
10
+ import jieba
11
+
12
+ from app.core.config import settings
13
+ from app.core.stopwords_zh import is_stopword
14
+
15
+
16
class HighlightsCollector:
    """
    Stateful collector — incrementally accumulates n-gram statistics across the
    whole analysis cycle to conserve RAM. Highlights are computed once, at the
    end, via compute_highlights().
    """

    def __init__(self) -> None:
        # Per-topic and per-category n-gram frequency counters.
        self._topic_ngrams: dict[str, Counter] = defaultdict(Counter)
        self._category_ngrams: dict[str, Counter] = defaultdict(Counter)
        # Global term frequency of each n-gram across all sentences.
        self._global_counts: Counter = Counter()
        # Document frequency: number of distinct reviews containing the n-gram.
        self._ngram_doc_freq: Counter = Counter()
        # Running sum and count of sentence sentiment scores per n-gram.
        self._ngram_sentiment_sum: dict[str, float] = defaultdict(float)
        self._ngram_sentiment_count: Counter = Counter()
        self._review_count = 0
        # N-grams already seen in the review currently being processed, so
        # document frequency is incremented at most once per review.
        self._current_review_seen_ngrams: set[str] = set()

    def start_review(self) -> None:
        """Signal the start of a new review (used for Document Frequency)."""
        self._review_count += 1
        self._current_review_seen_ngrams = set()

    def add_sentence(
        self,
        review_idx: int,  # Kept for compatibility; use start_review() to separate reviews
        sentence: str,
        topics: list[str],
        sentiment_score: float,
        categories: list[str] | None = None,
    ) -> None:
        """Called once per sentence during analyze_batch()."""
        # Simple ASCII detection for English phrases (avoids jieba mis-segmentation)
        is_ascii = all(ord(c) < 128 for c in sentence)
        if is_ascii:
            words = [w for w in sentence.split() if not is_stopword(w) and len(w.strip()) > 0]
        else:
            words = [w for w in jieba.lcut(sentence) if not is_stopword(w) and len(w.strip()) > 0]

        if len(words) < 2:
            return

        for n in range(settings.highlights_ngram_min, settings.highlights_ngram_max + 1):
            for i in range(len(words) - n + 1):
                ngram = " ".join(words[i : i + n])

                # 1. Global counters
                self._global_counts[ngram] += 1
                self._ngram_sentiment_sum[ngram] += sentiment_score
                self._ngram_sentiment_count[ngram] += 1

                # 2. Incremental Document Frequency (once per review)
                if ngram not in self._current_review_seen_ngrams:
                    self._ngram_doc_freq[ngram] += 1
                    self._current_review_seen_ngrams.add(ngram)

                # 3. Topic and category counters
                for topic in topics:
                    self._topic_ngrams[topic][ngram] += 1
                if categories:
                    for category in categories:
                        self._category_ngrams[category][ngram] += 1

        # NOTE(review): this fires for every sentence of each 500th review, so
        # pruning may run several times within that review — confirm intended.
        if self._review_count % 500 == 0:
            self._prune_singletons()

    def _prune_singletons(self) -> None:
        """Deep-prune n-grams with count<=1 (memory saving)."""
        singletons = [k for k, v in self._global_counts.items() if v <= 1]
        for k in singletons:
            del self._global_counts[k]
            if k in self._ngram_sentiment_sum:
                del self._ngram_sentiment_sum[k]
                del self._ngram_sentiment_count[k]
                del self._ngram_doc_freq[k]

            # Prune from topic counters
            for topic in self._topic_ngrams:
                if k in self._topic_ngrams[topic]:
                    del self._topic_ngrams[topic][k]

            # Prune from category counters
            for cat in self._category_ngrams:
                if k in self._category_ngrams[cat]:
                    del self._category_ngrams[cat][k]

    def compute_highlights(self) -> dict[str, Any]:
        """
        Compute the final highlights once the analysis cycle has finished.
        """
        if self._review_count == 0:
            return {
                "general": [],
                "recent": [],
                "current_patch": [],
                "topics": {}
            }

        results: dict[str, Any] = {
            "general": self._compute_tfidf_highlights(
                self._global_counts,
                top_n=settings.highlights_top_n_general,
            ),
            "recent": self._compute_tfidf_highlights(
                self._category_ngrams.get("recent", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "current_patch": self._compute_tfidf_highlights(
                self._category_ngrams.get("current_patch", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "topics": {}
        }

        for topic, counter in self._topic_ngrams.items():
            h = self._compute_tfidf_highlights(
                counter,
                top_n=settings.highlights_top_n_per_topic,
            )
            if h:
                results["topics"][topic] = h

        return results

    def _compute_tfidf_highlights(self, counter: Counter, top_n: int) -> list[dict]:
        """TF-IDF scoring + filtering + dedup."""
        candidates = []
        n = self._review_count
        total_count = sum(counter.values()) if counter.values() else 1

        for ngram, count in counter.items():
            # NOTE: df is the GLOBAL document frequency, even for per-topic counters.
            df = self._ngram_doc_freq.get(ngram, 0)

            # Drop rare phrases and near-ubiquitous ones (low signal).
            if df < settings.highlights_min_mentions:
                continue
            if df / n > settings.highlights_max_doc_freq_ratio:
                continue

            idf = math.log(n / df) if df > 0 else 0
            tf = count / total_count
            tfidf = tf * idf
            rank_score = count * tfidf

            # Compute mean sentiment from the running sum and count
            s_sum = self._ngram_sentiment_sum.get(ngram, 0.0)
            s_count = self._ngram_sentiment_count.get(ngram, 0)
            avg_score = s_sum / s_count if s_count > 0 else 0.0

            candidates.append({
                "phrase": ngram,
                "mention_count": df,
                "score": round(avg_score, 3),
                "sentiment": (
                    "positive" if avg_score > settings.sentiment_positive_threshold
                    else "negative" if avg_score < settings.sentiment_negative_threshold
                    else "neutral"
                ),
                "ngram_size": len(ngram.split()),
                "_rank": rank_score,
            })

        candidates.sort(key=lambda x: x["_rank"], reverse=True)

        # Substring absorption: a shorter phrase contained in a higher-ranked
        # phrase is dropped, unless their negation markers differ.
        absorbed: set[int] = set()
        for i, c in enumerate(candidates):
            if i in absorbed:
                continue
            for j in range(i + 1, len(candidates)):
                if j in absorbed:
                    continue
                if candidates[j]["phrase"] in c["phrase"]:
                    parent_has_neg = any(neg in c["phrase"] for neg in ["不", "没", "无"])
                    child_has_neg = any(neg in candidates[j]["phrase"] for neg in ["不", "没", "无"])
                    if parent_has_neg == child_has_neg:
                        absorbed.add(j)

        results = [c for i, c in enumerate(candidates) if i not in absorbed]

        # Re-sort by mention_count descending for display order.
        # TF-IDF sort above selected the top candidates; this ensures the final
        # list the UI receives is ordered from most-mentioned to least-mentioned,
        # with score and phrase as stable tie-breakers.
        results.sort(key=lambda x: (-x["mention_count"], -x["score"], x["phrase"]))

        for r in results[:top_n]:
            r.pop("_rank", None)

        return results[:top_n]
backend/app/services/nlp_service.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Serwis NLP do analizy sentymentu i modelowania tematów.
3
+
4
+ Architektura: Local Inference (CPU).
5
+ Wykorzystuje model Transformer (DistilBERT) uruchamiany bezpośrednio w aplikacji,
6
+ co eliminuje opóźnienia sieciowe i zapewnia deterministyczny czas wykonania.
7
+
8
+ Optymalizacje:
9
+ 1. Pre-kompilacja wzorców Regex (O(1) matching).
10
+ 2. Wykonywanie inferencji w Executorze (nie blokuje Event Loop).
11
+ 3. Batching zapytań do modelu (wykorzystanie instrukcji wektorowych CPU).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import asyncio
17
+ import logging
18
+ import re
19
+ from collections import OrderedDict, defaultdict
20
+ from concurrent.futures import ThreadPoolExecutor
21
+ from typing import TYPE_CHECKING
22
+
23
+ from pathlib import Path
24
+
25
+ import jieba
26
+ from transformers import AutoTokenizer, pipeline
27
+ from optimum.onnxruntime import ORTModelForSequenceClassification
28
+ from zhconv import convert
29
+
30
+ from app.core.config import settings
31
+ from app.core.keywords import EXCLUSIONS, TOPIC_KEYWORDS
32
+ from app.models.schemas import SentimentType, TopicSentiment
33
+
34
+ if TYPE_CHECKING:
35
+ from app.services.highlights_service import HighlightsCollector
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
# Token-context whitelists for the ambiguous single character "卡" ("lag"/"card"):
# prefixes that may precede it inside a token, and standalone previous tokens
# that validate it as a complaint about lag.
CARD_LAG_PREFIXES = frozenset({"不", "很", "好", "太", "真", "挺", "老", "总"})
CARD_STANDALONE_PREVIOUS_TOKENS = frozenset({"有点", "一直", "偶尔"})

# Unicode ranges for emoji and pictographic symbols.
# NOTE: the previous pattern "\U000024C2-\U0001F251" was far too broad and
# stripped Chinese characters! We now use precise emoji-only ranges.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Misc Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U0000FE00-\U0000FE0F"  # Variation Selectors
    "]+",
    flags=re.UNICODE,
)

# Smart sentence splitting (supports English and Chinese)
# Chinese terminators: 。!?;
# English terminators: .!?
# Punctuation stripped when building deduplication keys (EN + ZH)
DEDUP_PUNCTUATION = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~。!?,、;:""''【】()《》~…·]')

# Splits on sentence-final punctuation and before adversative conjunctions
# (EN "but/however/..." and ZH "但是/然而/..."), so opposing clauses are scored
# separately. The inline annotations below are part of the re.VERBOSE pattern
# string itself and are ignored by the regex engine.
SENTENCE_SPLIT_PATTERN = re.compile(r"""
    (?<=[.!?。!?;])\s*  # Koniec zdania (EN + ZH punctuation)
    |  # LUB
    (?<=[a-z]),\s+  # Przecinek po literze + spacja...
    (?=but\b|however\b|although\b|though\b)  # ...przed spójnikiem przeciwstawnym (EN)
    |
    \s+(?=but\b|however\b|although\b|though\b)  # Spójnik bez przecinka (EN)
    |
    (?<=。|!|?|;)  # Po chińskiej interpunkcji (bez spacji)
    |
    (?=但是|然而|虽然|不过|可是)  # Przed chińskim spójnikiem przeciwstawnym
    """, re.VERBOSE | re.IGNORECASE)
78
+
79
+
80
class NLPService:
    """
    Hybrid NLP service:
    1. Keyword matching (regex) -> topic detection.
    2. DistilBERT (local ONNX model) -> sentiment analysis.
    """

    def __init__(self) -> None:
        """
        Initialise the ML pipeline and compile text patterns.
        The model is loaded once at application start (singleton pattern).

        Raises:
            FileNotFoundError: if the quantized ONNX model is missing.
            Exception: any model-loading failure is logged and re-raised.
        """
        logger.info("Inicjalizacja serwisu NLP (ONNX Optimized)...")

        # 0. Jieba user dict — gaming terminology
        userdict_path = Path(__file__).parent.parent / "core" / "jieba_userdict.txt"
        if userdict_path.exists():
            jieba.load_userdict(str(userdict_path))
            logger.info(f"Załadowano jieba user dict: {userdict_path}")

        # 1. Regex compilation
        # Keywords are merged into one efficient "automaton" (regex).
        # NOTE: \b does not work with Chinese characters, so ASCII keywords
        # use \b boundaries while Chinese keywords do not.
        self.topic_patterns = {}
        self.single_char_topic_keywords = {}
        self.exclusion_patterns = {}

        for topic, keyword_groups in TOPIC_KEYWORDS.items():
            ascii_keywords: list[str] = []
            chinese_keywords: list[str] = []
            chinese_single_char_keywords: list[str] = []

            for group_name, group in keyword_groups.items():
                for keyword in group:
                    if keyword.isascii():
                        ascii_keywords.append(keyword)
                    elif group_name == "single_char" and len(keyword) == 1:
                        # Ambiguous single Chinese characters get token-level
                        # validation instead of plain regex matching.
                        chinese_single_char_keywords.append(keyword)
                    else:
                        chinese_keywords.append(keyword)

            self.single_char_topic_keywords[topic] = chinese_single_char_keywords

            patterns = []
            if ascii_keywords:
                # Use word boundaries for ASCII keywords
                sorted_ascii = sorted(ascii_keywords, key=len, reverse=True)
                patterns.append(r'\b(' + '|'.join(re.escape(k) for k in sorted_ascii) + r')\b')
            if chinese_keywords:
                # No word boundaries for Chinese (they don't have spaces),
                # but prefer longer keywords so compounds win over partial overlaps.
                sorted_chinese = sorted(chinese_keywords, key=len, reverse=True)
                patterns.append('(' + '|'.join(re.escape(k) for k in sorted_chinese) + ')')

            if patterns:
                combined_pattern = '|'.join(patterns)
                self.topic_patterns[topic] = re.compile(combined_pattern, re.IGNORECASE)

        for keyword, exclusions in EXCLUSIONS.items():
            if exclusions:
                pattern_str = '|'.join(re.escape(e) for e in exclusions)
                self.exclusion_patterns[keyword] = re.compile(pattern_str, re.IGNORECASE)

        # 2. Load the ONNX model
        logger.info(f"Ładowanie modelu ONNX {settings.hf_sentiment_model}...")
        try:
            from onnxruntime import GraphOptimizationLevel, SessionOptions

            # OPTIMISATION FOR HF SPACES (shared CPU)
            # The free tier provides 2 vCPUs. Capping thread counts prevents
            # context switching and resource contention.
            session_options = SessionOptions()
            session_options.intra_op_num_threads = settings.nlp_onnx_intra_threads
            session_options.inter_op_num_threads = settings.nlp_onnx_inter_threads
            session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

            # Load pre-built quantized INT8 ONNX model (no PyTorch needed at runtime)
            quantized_path = Path(__file__).resolve().parent.parent.parent / "models" / "quantized"
            model_file = quantized_path / "model_quantized.onnx"
            if not model_file.exists():
                raise FileNotFoundError(
                    f"Quantized ONNX model not found at {model_file}. "
                    "Run 'python scripts/quantize_model.py' to generate it."
                )

            logger.info(f"Loading quantized INT8 model from {quantized_path}")
            model = ORTModelForSequenceClassification.from_pretrained(
                str(quantized_path),
                file_name="model_quantized.onnx",
                session_options=session_options,
            )
            tokenizer = AutoTokenizer.from_pretrained(str(quantized_path))

            self.classifier = pipeline(
                "sentiment-analysis",
                model=model,
                tokenizer=tokenizer,
                device="cpu",
            )

            logger.info("Model NLP ONNX ready: INT8 quantized, graph_optimization=ALL")
        except Exception as e:
            # Deliberate broad catch — model loading can fail with OSError, RuntimeError,
            # ONNX errors, HF Hub errors, etc. Always fatal, always re-raised.
            logger.error(f"Krytyczny błąd ładowania modelu ONNX: {e}")
            raise

        # Thread pool so heavy AI inference does not block the event loop
        self.executor = ThreadPoolExecutor(max_workers=1)

        # Sentiment cache: normalized_text -> (label_str, score)
        self._sentiment_cache: OrderedDict[str, tuple[str, float]] = OrderedDict()
        self._cache_maxsize = settings.dedup_cache_maxsize

    def clean_text(self, text: str) -> str:
        """Remove noise (emoji, excess whitespace) and normalise the text."""
        text = EMOJI_PATTERN.sub("", text)
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        max_len = settings.text_max_length
        return text[:max_len] if len(text) > max_len else text

    def _normalize_for_dedup(self, text: str) -> str:
        """Normalise a sentence into a deduplication key (word order preserved)."""
        text = DEDUP_PUNCTUATION.sub("", text).lower()
        text = re.sub(r"\s+", " ", text).strip()
        # Fold Traditional Chinese into Simplified so variants dedupe together.
        return convert(text, 'zh-cn')

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split a review into logical units (sentences/clauses)."""
        parts = SENTENCE_SPLIT_PATTERN.split(text)
        return [p.strip() for p in parts if p and p.strip()]

    def _has_negation(self, text: str, position: int) -> bool:
        """
        Detect a negation marker before a keyword (within the configured window).
        Useful for more precise aspect-level analysis in Chinese.
        """
        window = settings.nlp_negation_window
        left_context = text[max(0, position-window):position]
        return any(neg in left_context for neg in ["不", "没", "别", "无"])

    @staticmethod
    def _is_valid_single_char_token(keyword: str, token: str, previous_token: str | None) -> bool:
        """Validate a single-character Chinese keyword in its full-token context."""
        if keyword != "卡":
            return True
        if token == keyword:
            # Bare "卡" only counts after adverbs like "有点" ("a bit laggy").
            return previous_token is None or previous_token in CARD_STANDALONE_PREVIOUS_TOKENS
        return token.endswith(keyword) and token[:-1] in CARD_LAG_PREFIXES

    def _find_single_char_keyword_match(self, sentence: str, keywords: list[str]) -> tuple[int, str] | None:
        """Return the first valid match for a single-character Chinese keyword."""
        if not keywords:
            return None

        keyword_set = set(keywords)
        tokenized_sentence = list(jieba.tokenize(sentence))
        for index, (token, start, _) in enumerate(tokenized_sentence):
            previous_token = tokenized_sentence[index - 1][0] if index > 0 else None
            for offset, char in enumerate(token):
                if char not in keyword_set:
                    continue
                if self._is_valid_single_char_token(char, token, previous_token):
                    return start + offset, char
        return None

    def _detect_topics_regex(self, sentence: str) -> dict[str, bool]:
        """
        Fast topic detection using precompiled regexes.
        Complexity: O(N) in sentence length, independent of keyword count.

        Returns:
            Mapping of topic -> whether the matched keyword was negated.
        """
        detected = {}

        # TEMPORARY conversion to Simplified Chinese for matching only.
        # The original text (Traditional/Simplified) is preserved in storage,
        # while the keywords.py dictionary can stay in zh-cn.
        sentence_simp = convert(sentence, 'zh-cn')

        for topic in TOPIC_KEYWORDS:
            regex_match = None
            if topic in self.topic_patterns:
                regex_match = self.topic_patterns[topic].search(sentence_simp)

            single_char_match = self._find_single_char_keyword_match(
                sentence_simp,
                self.single_char_topic_keywords.get(topic, []),
            )

            matched_word: str | None = None
            match_start: int | None = None

            # Prefer whichever match occurs earlier in the sentence.
            if regex_match and single_char_match:
                if single_char_match[0] < regex_match.start():
                    match_start, matched_word = single_char_match
                else:
                    match_start = regex_match.start()
                    matched_word = regex_match.group(0).lower()
            elif regex_match:
                match_start = regex_match.start()
                matched_word = regex_match.group(0).lower()
            elif single_char_match:
                match_start, matched_word = single_char_match

            if matched_word is not None and match_start is not None:
                is_excluded = False

                # Drop the match when an exclusion phrase co-occurs.
                if matched_word in self.exclusion_patterns:
                    if self.exclusion_patterns[matched_word].search(sentence_simp):
                        is_excluded = True

                if not is_excluded:
                    negated = self._has_negation(sentence_simp, match_start)
                    detected[topic] = negated

        return detected

    def _run_inference(self, texts: list[str]) -> list[dict]:
        """Wrapper around the Hugging Face pipeline, executed in a worker thread."""
        # batch_size=16 optimises matrix ops on CPU (AVX)
        # truncation=True, max_length=512 keeps inputs within the ONNX position
        # limit (max_position_embeddings=512); the pipeline accounts for special
        # tokens automatically
        return self.classifier(texts, batch_size=16, truncation=True, max_length=512)

    @staticmethod
    def _map_label(label_str: str, score: float) -> tuple[SentimentType, float]:
        """Map a raw model label onto (SentimentType, signed score)."""
        label_lower = label_str.lower()
        if 'positive' in label_lower or 'label_1' in label_lower:
            return (SentimentType.POSITIVE, score)
        elif 'negative' in label_lower or 'label_0' in label_lower:
            return (SentimentType.NEGATIVE, -score)
        return (SentimentType.NEUTRAL, 0.0)

    def _cache_put(self, key: str, value: tuple[str, float]) -> None:
        """Insert into the LRU cache, evicting the oldest entries past the limit."""
        self._sentiment_cache[key] = value
        self._sentiment_cache.move_to_end(key)
        while len(self._sentiment_cache) > self._cache_maxsize:
            self._sentiment_cache.popitem(last=False)

    async def analyze_sentiment_batch(
        self, texts: list[str]
    ) -> list[tuple[SentimentType, float]]:
        """
        Async interface to sentiment analysis.
        Offloads computation to a worker thread without blocking the API.
        Uses an LRU cache to skip previously seen sentences.
        """
        cleaned_texts = [self.clean_text(t) for t in texts]
        norm_keys = [self._normalize_for_dedup(t) for t in cleaned_texts]

        # Split into cache hits and misses
        final_sentiments: list[tuple[SentimentType, float]] = [(SentimentType.NEUTRAL, 0.0)] * len(texts)
        miss_indices: list[int] = []  # indices in cleaned_texts that must go to the model
        miss_texts: list[str] = []

        for i, (cleaned, key) in enumerate(zip(cleaned_texts, norm_keys)):
            if not cleaned:
                continue
            cached = self._sentiment_cache.get(key)
            if cached is not None:
                self._sentiment_cache.move_to_end(key)
                final_sentiments[i] = self._map_label(cached[0], cached[1])
            else:
                miss_indices.append(i)
                miss_texts.append(cleaned)

        cache_hits = len(texts) - len(miss_texts)
        logger.debug(f"Cache: {cache_hits} hits, {len(miss_texts)} misses (cache size: {len(self._sentiment_cache)})")

        if not miss_texts:
            return final_sentiments

        # Run the model ONLY on cache misses.
        # FIX: get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() here is deprecated since Python 3.10.
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(self.executor, self._run_inference, miss_texts)

        for j, res in enumerate(results):
            original_idx = miss_indices[j]
            label_str = res['label']
            score = res['score']

            # Store the raw result in the cache
            self._cache_put(norm_keys[original_idx], (label_str, score))

            final_sentiments[original_idx] = self._map_label(label_str, score)

        return final_sentiments

    async def analyze_batch(
        self,
        reviews: list[str],
        highlights_collector: HighlightsCollector | None = None,
        categories: list[str] | None = None,
    ) -> tuple[list[TopicSentiment], int]:
        """
        Main entry point for processing a batch of reviews.
        Combines segmentation, topic detection, and sentiment analysis.

        Returns:
            (per-topic sentiment aggregates, number of skipped sentences).
        """
        if not reviews:
            return [], 0

        # Step 1: Pre-processing and selection of sentences to analyse
        sentiment_tasks = []
        skipped_sentences = 0

        for review_idx, review in enumerate(reviews):
            if highlights_collector:
                highlights_collector.start_review()

            cleaned = self.clean_text(review)
            if not cleaned or len(cleaned) < 5:
                continue

            sentences = self._split_into_sentences(cleaned)
            for sentence in sentences:
                topics_map = self._detect_topics_regex(sentence)
                if topics_map:
                    for topic, is_negated in topics_map.items():
                        sentiment_tasks.append((review_idx, topic, sentence, is_negated))
                else:
                    skipped_sentences += 1

        if not sentiment_tasks:
            return [], skipped_sentences

        # Step 2: Deduplication + sentiment analysis
        all_sentences = [task[2] for task in sentiment_tasks]

        # Dedup: normalise -> find unique -> run inference only on unique texts
        norm_keys = [self._normalize_for_dedup(s) for s in all_sentences]
        unique_map: dict[str, int] = {}  # normalized_key -> index in unique_texts
        unique_texts: list[str] = []

        for i, key in enumerate(norm_keys):
            if key not in unique_map:
                unique_map[key] = len(unique_texts)
                unique_texts.append(all_sentences[i])

        dedup_total = len(all_sentences)
        dedup_unique = len(unique_texts)
        dedup_pct = round((1 - dedup_unique / dedup_total) * 100) if dedup_total else 0
        logger.debug(f"Dedup: {dedup_total} -> {dedup_unique} sentences ({dedup_pct}% reduced)")

        unique_results = await self.analyze_sentiment_batch(unique_texts)

        # Map results from unique texts back to all sentences
        sentiment_results = [unique_results[unique_map[key]] for key in norm_keys]

        # Step 3: Aggregation
        # review_id -> topic -> list of scores
        review_topic_scores: dict[int, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
        # topic -> (sentence, score) - online selection of the best example
        topic_best_example: dict[str, tuple[str, float]] = {}

        for i, (review_idx, topic, sentence, is_negated) in enumerate(sentiment_tasks):
            _, score = sentiment_results[i]

            # BULLETPROOF PIPELINE: if a negation was detected (e.g. "don't like
            # the gameplay") but the model still returned a positive score,
            # flip the sign.
            if is_negated and score > 0:
                score = -score

            review_topic_scores[review_idx][topic].append(score)

            if highlights_collector:
                highlights_collector.add_sentence(
                    review_idx=review_idx,
                    sentence=sentence,
                    topics=[topic],
                    sentiment_score=score,
                    categories=categories,
                )

            # Online selection - keep the candidate with the larger |score|
            if len(sentence) > 20:
                current = topic_best_example.get(topic)
                if current is None or abs(score) > abs(current[1]):
                    topic_best_example[topic] = (sentence, score)

        # Global aggregation: mean per review -> global sum
        global_topic_stats: dict[str, dict[str, float]] = defaultdict(lambda: {"sum_score": 0.0, "count": 0.0})

        for review_idx, topics_data in review_topic_scores.items():
            for topic, scores in topics_data.items():
                avg_review_score = sum(scores) / len(scores)
                global_topic_stats[topic]["sum_score"] += avg_review_score
                global_topic_stats[topic]["count"] += 1.0

        # Step 4: Final formatting
        final_results: list[TopicSentiment] = []

        for topic_name, stats in global_topic_stats.items():
            count = int(stats["count"])
            if count == 0:
                continue

            avg_global_score = stats["sum_score"] / stats["count"]
            normalized_score = max(-1.0, min(1.0, avg_global_score))

            if normalized_score > settings.sentiment_positive_threshold:
                sentiment = SentimentType.POSITIVE
            elif normalized_score < settings.sentiment_negative_threshold:
                sentiment = SentimentType.NEGATIVE
            else:
                sentiment = SentimentType.NEUTRAL

            # Pick the best example and validate its polarity
            best_example = None
            example_score = None
            candidate = topic_best_example.get(topic_name)
            if candidate:
                ex_sentence, ex_score = candidate
                # Validation: the example must agree with the sentiment direction
                if sentiment == SentimentType.NEUTRAL or \
                   (sentiment == SentimentType.POSITIVE and ex_score > 0) or \
                   (sentiment == SentimentType.NEGATIVE and ex_score < 0):
                    best_example = ex_sentence
                    example_score = ex_score

            final_results.append(
                TopicSentiment(
                    topic=topic_name,
                    sentiment=sentiment,
                    score=round(normalized_score, 3),
                    mention_count=count,
                    example=best_example,
                    example_score=example_score,
                )
            )

        final_results.sort(key=lambda x: x.mention_count, reverse=True)
        return final_results, skipped_sentences
515
+
516
+
517
# Module-level singleton instance; populated lazily by get_nlp_service().
_nlp_service: "NLPService | None" = None


def get_nlp_service() -> "NLPService":
    """Return the shared NLPService, constructing it on first call (lazy singleton)."""
    global _nlp_service
    if _nlp_service is None:
        _nlp_service = NLPService()
    return _nlp_service
backend/app/services/precache_service.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pre-cache Service — schedules and executes background analyses for top games.
3
+
4
+ Creates refresh schedules with checkpoints (e.g. 6h, 12h, 24h after update)
5
+ and processes due analyses each cycle, prioritized by game popularity.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ from datetime import datetime, timedelta, timezone
11
+ from typing import Any
12
+
13
+ from app.core.config import settings
14
+ from app.db.mongodb import mongodb
15
+ from app.services.analysis_runner import run_full_analysis, run_incremental_analysis
16
+ from app.services.nlp_service import NLPService
17
+ from app.services.steam_service import SteamService
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class PreCacheService:
23
+ """Manages refresh schedules and triggers pre-cache analyses."""
24
+
25
    def __init__(
        self, steam_svc: SteamService, nlp_svc: NLPService
    ) -> None:
        # Injected service dependencies used when running analyses.
        self._steam_svc = steam_svc
        self._nlp_svc = nlp_svc
30
+
31
+ def create_schedule(
32
+ self, app_id: str, game_name: str, update_at: datetime, *, is_release: bool = False
33
+ ) -> dict[str, Any]:
34
+ """Build a schedule document with checkpoints from config."""
35
+ checkpoints = []
36
+ for offset_hours in settings.precache_checkpoints_list:
37
+ checkpoints.append({
38
+ "offset_hours": offset_hours,
39
+ "due_at": update_at + timedelta(hours=offset_hours),
40
+ "completed": False,
41
+ })
42
+
43
+ return {
44
+ "app_id": str(app_id),
45
+ "game_name": game_name,
46
+ "update_at": update_at,
47
+ "checkpoints": checkpoints,
48
+ "is_release": is_release,
49
+ "status": "active",
50
+ "created_at": datetime.now(timezone.utc),
51
+ }
52
+
53
+ def create_bootstrap_schedule(
54
+ self, app_id: str, game_name: str
55
+ ) -> dict[str, Any]:
56
+ """Release schedule for a newly prioritized game, starting at 6h."""
57
+ now = datetime.now(timezone.utc)
58
+ return self.create_schedule(app_id, game_name, now, is_release=True)
59
+
60
    async def create_schedules_for_updates(
        self, updated_games: list[dict[str, Any]]
    ) -> int:
        """Bulk-create schedules for games that received updates.

        Existing active schedules are only replaced when the incoming update
        is strictly newer, so in-flight checkpoints are not reset by stale data.

        Returns:
            Number of schedules created or replaced.
        """
        # Fetch all active schedules once for O(1) per-game lookup.
        active_schedules = await mongodb.get_active_schedules()
        active_by_app_id = {s["app_id"]: s for s in active_schedules}

        created = 0
        for game in updated_games:
            app_id = str(game.get("appid", ""))
            name = game.get("name", "")
            update_at = game.get("update_at", datetime.now(timezone.utc))

            existing = active_by_app_id.get(app_id)
            if existing:
                existing_update_at = existing.get("update_at")
                # NOTE(review): assumes both datetimes share tz-awareness —
                # a naive/aware mix would raise TypeError here; confirm upstream.
                if existing_update_at and update_at <= existing_update_at:
                    continue  # Same or older patch — don't reset checkpoints

            schedule = self.create_schedule(app_id, name, update_at)
            await mongodb.upsert_refresh_schedule(schedule)
            created += 1

        logger.info(f"Created {created} refresh schedules for updated games")
        return created
85
+
86
+ async def bootstrap_missing_analyses(
87
+ self, top_games: list[dict[str, Any]]
88
+ ) -> int:
89
+ """For top games with no cached analysis, create release schedules."""
90
+ # Pre-fetch active schedule app_ids for O(1) lookup
91
+ active_schedules = await mongodb.get_active_schedules()
92
+ scheduled_app_ids = {s["app_id"] for s in active_schedules}
93
+
94
+ created = 0
95
+ for game in top_games:
96
+ app_id = str(game.get("appid", ""))
97
+ if not app_id or app_id in scheduled_app_ids:
98
+ continue
99
+
100
+ # Check if analysis already cached
101
+ cached = await mongodb.get_cached_analysis(app_id)
102
+ if cached is not None:
103
+ continue
104
+
105
+ schedule = self.create_bootstrap_schedule(app_id, game.get("name", ""))
106
+ await mongodb.upsert_refresh_schedule(schedule)
107
+ scheduled_app_ids.add(app_id)
108
+ created += 1
109
+
110
+ logger.info(f"Bootstrap: created {created} release schedules")
111
+ return created
112
+
113
+ async def process_due_analyses(self) -> int:
114
+ """
115
+ Main processing loop: find due checkpoints, prioritize, execute.
116
+
117
+ Returns:
118
+ Number of analyses executed.
119
+ """
120
+ now = datetime.now(timezone.utc)
121
+ schedules = await mongodb.get_active_schedules()
122
+ max_per_cycle = settings.precache_max_analyses_per_cycle
123
+ delay = settings.precache_batch_delay_seconds
124
+
125
+ # Find one due checkpoint per game
126
+ due_items: list[dict[str, Any]] = []
127
+ for schedule in schedules:
128
+ for cp in schedule.get("checkpoints", []):
129
+ if cp.get("completed"):
130
+ continue
131
+ if cp["due_at"] <= now:
132
+ due_items.append({
133
+ "app_id": schedule["app_id"],
134
+ "game_name": schedule.get("game_name", ""),
135
+ "offset_hours": cp["offset_hours"],
136
+ "due_at": cp["due_at"],
137
+ "positive": schedule.get("positive", 0),
138
+ "negative": schedule.get("negative", 0),
139
+ })
140
+ break # Only first due checkpoint per game
141
+
142
+ if not due_items:
143
+ logger.info("Pre-cache: no due analyses")
144
+ return 0
145
+
146
+ # Sort by popularity DESC, then due_at ASC
147
+ due_items.sort(
148
+ key=lambda x: (-(x.get("positive", 0) + x.get("negative", 0)), x["due_at"])
149
+ )
150
+
151
+ # Execute up to max_per_cycle
152
+ executed = 0
153
+ for item in due_items[:max_per_cycle]:
154
+ app_id = item["app_id"]
155
+ game_name = item["game_name"]
156
+ offset_hours = item["offset_hours"]
157
+
158
+ logger.info(f"Pre-cache: analyzing {app_id} ({game_name}) — checkpoint {offset_hours}h")
159
+
160
+ existing = await mongodb.get_analysis(app_id)
161
+ if existing and existing.get("results"):
162
+ result = await run_incremental_analysis(
163
+ app_id, game_name, self._steam_svc, self._nlp_svc
164
+ )
165
+ else:
166
+ result = await run_full_analysis(
167
+ app_id, game_name, self._steam_svc, self._nlp_svc
168
+ )
169
+
170
+ if result is not None:
171
+ executed += 1
172
+
173
+ # Mark checkpoint completed regardless of success
174
+ await mongodb.mark_checkpoint_completed(app_id, offset_hours)
175
+
176
+ # Check if all checkpoints done → complete schedule
177
+ await self._check_schedule_completion(app_id)
178
+
179
+ if executed < max_per_cycle and item != due_items[-1]:
180
+ await asyncio.sleep(delay)
181
+
182
+ logger.info(f"Pre-cache: executed {executed}/{len(due_items)} due analyses")
183
+ return executed
184
+
185
+ @staticmethod
186
+ async def _check_schedule_completion(app_id: str) -> None:
187
+ """If all checkpoints completed, mark schedule as completed."""
188
+ schedules = await mongodb.get_active_schedules()
189
+ for schedule in schedules:
190
+ if schedule["app_id"] != str(app_id):
191
+ continue
192
+ all_done = all(
193
+ cp.get("completed", False)
194
+ for cp in schedule.get("checkpoints", [])
195
+ )
196
+ if all_done:
197
+ await mongodb.complete_schedule(app_id)
198
+ logger.info(f"Schedule completed for {app_id}")
199
+ break
backend/app/services/priority_refresh_service.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Priority Refresh Service — maintains canonical priority game state in MongoDB.
3
+
4
+ Priority sources:
5
+ - top500: top 500 games by review count (local DB)
6
+ - top_sellers / new_releases / specials: Steam store featured categories
7
+
8
+ Priority state fields on games documents:
9
+ is_priority bool
10
+ priority_sources list[str]
11
+ priority_grace_until datetime | None
12
+ priority_last_confirmed_at datetime | None
13
+ """
14
+
15
+ import asyncio
16
+ import logging
17
+ from datetime import datetime, timedelta, timezone
18
+ from typing import Any
19
+
20
+ import httpx
21
+
22
+ from app.core.config import settings
23
+ from app.db.mongodb import mongodb
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class PriorityRefreshService:
    """Refreshes priority flags on the games collection each worker cycle.

    Active sources are the local top-500-by-reviews list plus Steam store
    featured categories. A game leaving all sources keeps priority for a
    grace period (settings.steam_priority_grace_days) before being demoted.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When no client is injected, one is created lazily and its
        # lifecycle is owned (closed) by this service.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the HTTP client, lazily creating an owned one on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=15.0)
        return self._client

    async def close(self) -> None:
        """Close the HTTP client, but only if this service created it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def refresh_priorities(self) -> dict[str, Any]:
        """
        Recompute is_priority for all games and write changes to MongoDB.

        Returns a summary dict with counts.
        """
        now = datetime.now(timezone.utc)
        grace_deadline = now + timedelta(days=settings.steam_priority_grace_days)

        # 1. Build active sources map (DLCs are excluded from top500)
        top500_ids: set[str] = {
            g["appid"]
            for g in await mongodb.get_top_games_by_reviews(500)
            if g.get("app_type") != "dlc"
        }

        category_ids: dict[str, set[str]] = await self._fetch_store_categories()

        # appid -> list of source labels ("top500" and/or category names)
        active_sources: dict[str, list[str]] = {}
        for appid in top500_ids:
            active_sources.setdefault(appid, []).append("top500")
        for cat_name, ids in category_ids.items():
            for appid in ids:
                active_sources.setdefault(appid, []).append(cat_name)

        # 1b. Bootstrap category games that are missing from the local DB.
        # top500 appids are safe — they come from existing DB records.
        # Category appids may reference games not yet in our DB.
        all_category_appids: set[str] = set()
        for ids in category_ids.values():
            all_category_appids.update(ids)

        bootstrap_summary: dict[str, Any] = {}
        if all_category_appids:
            _, bootstrap_summary = await self._bootstrap_missing_games(all_category_appids)
            # After bootstrap, remove from active_sources any category appid that
            # still has no DB record (failed bootstrap / delisted / per-cycle limit).
            # This prevents bulk_update_priority_fields from silently no-oping.
            existing_in_db = await mongodb.get_existing_appids(all_category_appids)
            for appid in all_category_appids - existing_in_db:
                active_sources.pop(appid, None)

        # 2. Load current priority state (only games that already have is_priority field)
        existing_priority_docs: list[dict[str, Any]] = []
        if mongodb.db is not None:
            try:
                collection = mongodb.db[mongodb.COLLECTION_GAMES]
                cursor = collection.find(
                    {"is_priority": {"$exists": True}},
                    {
                        "_id": 0,
                        "appid": 1,
                        "app_type": 1,
                        "is_priority": 1,
                        "priority_grace_until": 1,
                        "priority_sources": 1,
                    },
                )
                # Hard cap of 10k priority docs per cycle.
                existing_priority_docs = await cursor.to_list(length=10000)
            except Exception as e:
                # Best-effort: proceed with an empty map; every active game
                # will then simply count as newly-priority this cycle.
                logger.warning(f"Failed to load existing priority docs: {e}")

        existing_by_appid: dict[str, dict] = {
            str(d["appid"]): d for d in existing_priority_docs
        }

        # 2b. DLC inherits effective priority from its parent game.
        if settings.dlc_worker_analysis_enabled:
            priority_parent_ids: set[str] = set(active_sources.keys())
            for appid, doc in existing_by_appid.items():
                if doc.get("app_type") == "dlc":
                    continue
                if not doc.get("is_priority") or appid in active_sources:
                    continue

                # Parent still counts as priority while inside its grace window.
                # NOTE(review): assumes priority_grace_until round-trips as a
                # tz-aware datetime; a naive value from the driver would raise
                # on comparison with the aware `now` — confirm codec options.
                grace_until = doc.get("priority_grace_until")
                if grace_until is None or grace_until >= now:
                    priority_parent_ids.add(appid)

            for parent_appid in priority_parent_ids:
                dlcs = await mongodb.get_dlcs_by_parent_appid(parent_appid)
                for dlc in dlcs:
                    dlc_appid = str(dlc.get("appid", ""))
                    if dlc_appid:
                        # Inherited priority replaces (not extends) other sources.
                        active_sources[dlc_appid] = ["parent_priority"]

        # 2c. When DLC worker analysis is disabled, remove any DLC that entered
        # active_sources via other paths (e.g. Steam store categories).
        if not settings.dlc_worker_analysis_enabled:
            dlc_appids_to_remove = {
                appid
                for appid in active_sources
                if existing_by_appid.get(appid, {}).get("app_type") == "dlc"
            }
            for appid in dlc_appids_to_remove:
                del active_sources[appid]

        # 3. Compute updates
        updates: list[tuple[str, dict]] = []
        became_priority = 0
        entered_grace = 0
        expired_grace = 0
        reactivated = 0
        removed_parent_priority = 0

        # Active games — either new or confirming existing priority
        for appid, sources in active_sources.items():
            existing = existing_by_appid.get(appid)
            fields: dict[str, Any] = {
                "is_priority": True,
                "priority_sources": sources,
                "priority_grace_until": None,
                "priority_last_confirmed_at": now,
            }
            if existing is None or not existing.get("is_priority"):
                became_priority += 1
            elif existing.get("priority_grace_until") is not None:
                reactivated += 1
            updates.append((appid, fields))

        # Games that were priority but are no longer in any active source
        for appid, doc in existing_by_appid.items():
            if appid in active_sources:
                continue  # already handled above
            if not doc.get("is_priority"):
                continue  # already marked non-priority, skip

            # Inherited (parent_priority) flags get no grace period.
            if "parent_priority" in (doc.get("priority_sources") or []):
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                removed_parent_priority += 1
                continue

            grace_until = doc.get("priority_grace_until")

            if grace_until is None:
                # Just left all sources — start grace period
                updates.append((appid, {
                    "priority_grace_until": grace_deadline,
                    "priority_sources": [],
                }))
                entered_grace += 1
            elif grace_until < now:
                # Grace expired — remove priority
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                expired_grace += 1
            # else: still in grace and not expired — no update needed

        modified = await mongodb.bulk_update_priority_fields(updates)

        result = {
            "total_active": len(active_sources),
            "top500_count": len(top500_ids),
            "category_counts": {k: len(v) for k, v in category_ids.items()},
            "bootstrap": bootstrap_summary,
            "became_priority": became_priority,
            "reactivated": reactivated,
            "entered_grace": entered_grace,
            "expired_grace": expired_grace,
            "removed_parent_priority": removed_parent_priority,
            "db_modified": modified,
        }
        logger.info(f"Priority refresh complete: {result}")
        return result

    @staticmethod
    def _parse_app_type(data: dict[str, Any]) -> dict[str, Any]:
        """Parse app_type and parent_appid from an appdetails data block."""
        app_type = data.get("type") or "unknown"
        fullgame = data.get("fullgame")
        parent_appid = None
        # Only DLC entries carry a usable parent reference ("fullgame" block).
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])
        return {"app_type": str(app_type), "parent_appid": parent_appid}

    async def _fetch_app_details_bilingual(self, appid: str) -> dict[str, Any] | None:
        """
        Fetch appdetails for a single game in both english and schinese.

        Returns a minimal game dict (name, name_cn, app_type, parent_appid,
        header_image, cn_name_checked) or None on failure / not found.
        """
        client = await self._get_client()
        store_url = "https://store.steampowered.com/api/appdetails"

        async def _fetch_one(lang: str) -> dict[str, Any]:
            # Best-effort fetch for one language; {} on any failure.
            try:
                resp = await client.get(
                    store_url,
                    params={"appids": appid, "l": lang, "cc": settings.steam_region},
                )
                if resp.status_code != 200:
                    return {}
                entry = resp.json().get(str(appid))
                if entry and entry.get("success"):
                    return entry.get("data") or {}
                return {}
            except Exception as e:
                logger.warning(f"appdetails error for {appid} (lang={lang}): {e}")
                return {}

        data_en, data_cn = await asyncio.gather(
            _fetch_one("english"),
            _fetch_one("schinese"),
        )

        if not data_en and not data_cn:
            logger.warning(f"No appdetails for {appid} — skipping bootstrap")
            return None

        name_en = data_en.get("name") or data_cn.get("name")
        if not name_en:
            logger.warning(f"No name in appdetails for {appid} — skipping bootstrap")
            return None

        name_cn = data_cn.get("name")
        # Prefer the english payload for type/image; fall back to schinese.
        base = data_en or data_cn
        type_info = self._parse_app_type(base)

        return {
            "appid": appid,
            "name": name_en,
            # Store the Chinese name only when it actually differs.
            "name_cn": name_cn if name_cn and name_cn != name_en else None,
            "cn_name_checked": True,
            "app_type": type_info["app_type"],
            "parent_appid": type_info["parent_appid"],
            "header_image": base.get("header_image"),
        }

    async def _bootstrap_missing_games(
        self,
        category_appids: set[str],
    ) -> tuple[set[str], dict[str, Any]]:
        """
        Fetch Steam Store data and upsert games missing from the local DB.

        Returns:
            (bootstrapped_appids, summary_dict)
            bootstrapped_appids: set of appids that were newly upserted
        """
        existing = await mongodb.get_existing_appids(category_appids)
        missing = category_appids - existing

        if not missing:
            return set(), {"bootstrapped": 0, "failed": 0, "skipped_existing": len(existing)}

        # Cap work per cycle; leftover appids are retried on later cycles.
        limit = settings.steam_bootstrap_max_per_cycle
        appids_to_fetch = list(missing)[:limit]
        bootstrapped: set[str] = set()
        failed = 0

        for i, appid in enumerate(appids_to_fetch):
            game_data = await self._fetch_app_details_bilingual(appid)
            if game_data is None:
                failed += 1
            else:
                await mongodb.upsert_game(game_data)
                bootstrapped.add(appid)

            # Throttle between store calls (no delay after the last one).
            if i < len(appids_to_fetch) - 1:
                await asyncio.sleep(settings.steam_bootstrap_delay)

        summary = {
            "bootstrapped": len(bootstrapped),
            "failed": failed,
            "skipped_existing": len(existing),
            "missing_over_limit": max(0, len(missing) - limit),
        }
        if bootstrapped or failed:
            logger.info(f"Bootstrap missing games: {summary}")
        return bootstrapped, summary

    async def _fetch_region_categories(self, region: str) -> dict[str, set[str]]:
        """
        Fetch featured categories for a single Steam region (cc=region).

        Returns dict mapping category name -> set of appid strings.
        On any failure, returns {} so the caller can continue with other regions.
        """
        try:
            client = await self._get_client()
            resp = await client.get(
                settings.steam_priority_categories_url,
                params={"cc": region, "l": "schinese"},
            )
            if resp.status_code != 200:
                logger.warning(
                    f"Steam featuredcategories [{region}] returned {resp.status_code} — skipping region"
                )
                return {}

            data = resp.json()
        except Exception as e:
            logger.warning(
                f"Failed to fetch Steam store categories [{region}]: {e} — skipping region"
            )
            return {}

        result: dict[str, set[str]] = {}
        for cat_name in settings.steam_priority_categories_list:
            cat_data = data.get(cat_name)
            if not cat_data:
                continue
            items = cat_data.get("items", [])
            # NOTE(review): `type == 0` presumably selects game entries in the
            # featuredcategories payload — confirm against Steam's API docs.
            appids: set[str] = {
                str(item["id"])
                for item in items
                if item.get("type") == 0 and item.get("id") is not None
            }
            result[cat_name] = appids

        return result

    async def _fetch_store_categories(self) -> dict[str, set[str]]:
        """
        Fetch game appids from Steam store featured categories across all configured regions.

        Iterates over steam_priority_regions_list (default: CN, US) and merges results.
        If one region fails, the other is still used. If all fail, returns {} (fallback
        to top-500 only).

        Returns dict mapping category name -> set of appid strings.
        """
        regions = settings.steam_priority_regions_list
        if not regions:
            logger.warning(
                "steam_priority_regions is empty — skipping store categories fetch (top500 only)"
            )
            return {}

        merged: dict[str, set[str]] = {}
        for region in regions:
            region_data = await self._fetch_region_categories(region)
            for cat_name, appids in region_data.items():
                merged.setdefault(cat_name, set()).update(appids)
        return merged
backend/app/services/steam_errors.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom exceptions for Steam API errors.
3
+
4
+ Separate module to avoid circular imports between mongodb.py and steam_service.py.
5
+ """
6
+
7
+
8
class SteamAPIError(Exception):
    """Non-retryable Steam API failure (404, 403, other 4xx).

    Carries the HTTP status code and the affected app id so callers can
    decide how to react (e.g. cache the error).
    """

    def __init__(self, status_code: int, app_id: str, message: str = "") -> None:
        self.status_code = status_code
        self.app_id = app_id
        if not message:
            message = f"Steam API error {status_code} for app {app_id}"
        self.message = message
        super().__init__(message)
16
+
17
+
18
class SteamRateLimitError(SteamAPIError):
    """Steam answered 429 and every retry attempt was exhausted."""

    def __init__(self, app_id: str) -> None:
        rate_limit_message = f"Steam API rate limited for app {app_id}"
        super().__init__(status_code=429, app_id=app_id, message=rate_limit_message)
backend/app/services/steam_service.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Serwis do komunikacji ze Steam API.
3
+
4
+ Odpowiada za pobieranie informacji o grach oraz recenzji.
5
+ Wykorzystuje publiczne API Steam (nie wymaga klucza API).
6
+ Implementuje statystyczne próbkowanie recenzji (stratified sampling).
7
+ Retry z exponential backoff dla 429/5xx/timeout.
8
+ """
9
+
10
+ import asyncio
11
+ import logging
12
+ from dataclasses import dataclass
13
+ from typing import Any, AsyncGenerator
14
+
15
+ import httpx
16
+
17
+ from app.core.config import settings
18
+ from app.core.sampling import SamplePlan, create_sample_plan
19
+ from app.db.mongodb import mongodb
20
+ from app.models.schemas import GameInfo, ReviewBatch, ReviewItem
21
+ from app.services.steam_errors import SteamAPIError, SteamRateLimitError
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Status codes that should be retried: 429 (rate limited) plus transient 5xx.
_RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}
27
+
28
+
29
@dataclass
class ReviewStats:
    """Aggregate review statistics for a game.

    Populated from Steam's appreviews ``query_summary`` block
    (see ``get_review_stats``).
    """

    total: int     # total_reviews
    positive: int  # total_positive
    negative: int  # total_negative
36
+
37
+
38
+ class SteamService:
39
+ """
40
+ Serwis do pobierania danych ze Steam API.
41
+ """
42
+
43
+ STORE_API_URL = "https://store.steampowered.com/api"
44
+ REVIEW_API_URL = "https://store.steampowered.com/appreviews"
45
+ SEARCH_API_URL = "https://store.steampowered.com/api/storesearch"
46
+
47
+ def __init__(self, timeout: float = 30.0) -> None:
48
+ self.timeout = timeout
49
+ self.client = httpx.AsyncClient(timeout=self.timeout)
50
+
51
    async def close(self) -> None:
        """Close the shared HTTP client. Call once at shutdown."""
        await self.client.aclose()
54
+
55
    async def _request_with_retry(
        self,
        client: httpx.AsyncClient,
        url: str,
        params: dict[str, Any],
        context: str = "",
    ) -> httpx.Response:
        """
        Perform a GET request with retry and exponential backoff.

        Retries 429/5xx responses and timeouts/connection errors up to
        settings.steam_retry_max_attempts times; other 4xx errors raise
        immediately.

        Args:
            client: HTTP client to issue the request with.
            url: Target URL.
            params: Query parameters.
            context: App id or label used in error/log messages.

        Returns:
            The successful (HTTP 200) response.

        Raises:
            SteamAPIError: Non-retryable status, or retries exhausted.
            SteamRateLimitError: 429 persisted through all retries.
        """
        max_attempts = settings.steam_retry_max_attempts
        base_delay = settings.steam_retry_base_delay
        max_delay = settings.steam_retry_max_delay
        last_exception: Exception | None = None

        for attempt in range(max_attempts):
            try:
                response = await client.get(url, params=params)
                status = response.status_code

                if status == 200:
                    return response

                # Non-retryable client errors
                if status == 404:
                    raise SteamAPIError(404, context, f"Not found: {url}")
                if status == 403:
                    raise SteamAPIError(403, context, f"Forbidden: {url}")
                if 400 <= status < 500 and status not in _RETRYABLE_STATUS_CODES:
                    raise SteamAPIError(status, context, f"Client error {status}: {url}")

                # Retryable errors (429, 5xx)
                if attempt < max_attempts - 1:
                    # Exponential backoff, capped at max_delay.
                    delay = min(base_delay * (2 ** attempt), max_delay)

                    # Respect Retry-After header for 429
                    if status == 429:
                        retry_after = response.headers.get("Retry-After")
                        if retry_after:
                            try:
                                delay = min(float(retry_after), max_delay)
                            except ValueError:
                                # Non-numeric Retry-After — keep backoff delay.
                                pass

                    logger.warning(
                        f"Steam API {status} for {context}, "
                        f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                    )
                    await asyncio.sleep(delay)
                else:
                    # Exhausted retries
                    if status == 429:
                        raise SteamRateLimitError(context)
                    raise SteamAPIError(status, context, f"Server error {status} after {max_attempts} attempts: {url}")

            except (httpx.TimeoutException, httpx.ConnectError) as e:
                # Network-level failures use the same backoff schedule.
                last_exception = e
                if attempt < max_attempts - 1:
                    delay = min(base_delay * (2 ** attempt), max_delay)
                    logger.warning(
                        f"Steam API {type(e).__name__} for {context}, "
                        f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                    )
                    await asyncio.sleep(delay)
                else:
                    # status_code 0 marks a connection-level (non-HTTP) failure.
                    raise SteamAPIError(
                        0, context,
                        f"Connection failed after {max_attempts} attempts: {e}"
                    ) from e

        # Should not reach here, but just in case
        raise SteamAPIError(0, context, "Unexpected retry exhaustion") from last_exception
127
+
128
+ async def search_game(self, query: str) -> GameInfo | None:
129
+ """Wyszukuje grę po nazwie używając publicznego API wyszukiwarki Steam."""
130
+ client = self.client
131
+ params = {
132
+ "term": query,
133
+ "l": settings.steam_review_language,
134
+ "cc": settings.steam_region,
135
+ }
136
+
137
+ try:
138
+ response = await self._request_with_retry(
139
+ client, self.SEARCH_API_URL, params, context=f"search:{query}"
140
+ )
141
+ data = response.json()
142
+ except (SteamAPIError, SteamRateLimitError) as e:
143
+ logger.error(f"Błąd wyszukiwania gry '{query}': {e}")
144
+ return None
145
+
146
+ items = data.get("items", [])
147
+ if not items:
148
+ logger.warning(f"Nie znaleziono gry: {query}")
149
+ return None
150
+
151
+ first_result = items[0]
152
+ app_id = str(first_result.get("id"))
153
+
154
+ game_info = await self.get_game_info(app_id)
155
+
156
+ if game_info:
157
+ await mongodb.upsert_game({
158
+ "appid": game_info.app_id,
159
+ "name": game_info.name,
160
+ "name_cn": game_info.name_cn,
161
+ "cn_name_checked": True,
162
+ "header_image": game_info.header_image,
163
+ "total_reviews": game_info.total_reviews
164
+ })
165
+
166
+ return game_info
167
+
168
    async def get_game_info(self, app_id: str) -> GameInfo | None:
        """Fetch detailed game metadata (header image, names) from appdetails.

        Queries both schinese and english localizations concurrently and
        combines them into one GameInfo. Short-circuits to None when a
        previous Steam error for this app is still cached.
        """
        cached_error = await mongodb.get_steam_error(app_id)
        if cached_error:
            logger.info(
                f"Skipping Steam API for app {app_id} — "
                f"cached error {cached_error.get('status_code')}"
            )
            return None

        client = self.client
        details_url = f"{self.STORE_API_URL}/appdetails"

        async def fetch_localized(lang: str) -> dict[str, Any]:
            # Any SteamAPIError yields an empty dict (so asyncio.gather below
            # never raises); only 404 is persisted to the error cache.
            try:
                params = {"appids": app_id, "l": lang, "cc": settings.steam_region}
                resp = await self._request_with_retry(
                    client, details_url, params, context=app_id
                )
                return resp.json().get(app_id, {})
            except SteamAPIError as e:
                if e.status_code == 404:
                    await mongodb.cache_steam_error(
                        app_id, 404, settings.steam_error_cache_ttl_404
                    )
                return {}

        data_zh, data_en = await asyncio.gather(
            fetch_localized("schinese"),
            fetch_localized("english")
        )

        if not data_en.get("success") and not data_zh.get("success"):
            logger.warning(f"Nie znaleziono szczegółów gry: {app_id}")
            return None

        # NOTE(review): assumes every success=true payload carries a "data"
        # block; base_data would be None otherwise — confirm with the API.
        base_data = data_en.get("data") or data_zh.get("data")
        # Prefer the english name; fall back to whatever base payload has.
        name_en = data_en.get("data", {}).get("name") or base_data.get("name")
        name_zh = data_zh.get("data", {}).get("name")

        # Review totals come from a separate appreviews call.
        stats = await self.get_review_stats(app_id)

        return GameInfo(
            app_id=app_id,
            name=name_en,
            # Keep the Chinese name only when it differs from the english one.
            name_cn=name_zh if name_zh != name_en else None,
            header_image=base_data.get("header_image"),
            total_reviews=stats.total,
        )
217
+
218
    async def get_review_stats(self, app_id: str) -> ReviewStats:
        """Fetch aggregate review statistics needed to plan the sample.

        Uses the appreviews endpoint with num_per_page=0 so only the
        query_summary block is returned. Returns zeroed stats on any error
        or when a cached Steam error exists for this app.
        """
        cached_error = await mongodb.get_steam_error(app_id)
        if cached_error:
            logger.info(
                f"Skipping review stats for app {app_id} — "
                f"cached error {cached_error.get('status_code')}"
            )
            return ReviewStats(total=0, positive=0, negative=0)

        client = self.client
        url = f"{self.REVIEW_API_URL}/{app_id}"
        params = {
            "json": "1",
            "filter": "all",
            "num_per_page": "0",  # summary only, no review bodies
        }

        try:
            response = await self._request_with_retry(
                client, url, params, context=app_id
            )
            data = response.json()

            summary = data.get("query_summary", {})
            return ReviewStats(
                total=summary.get("total_reviews", 0),
                positive=summary.get("total_positive", 0),
                negative=summary.get("total_negative", 0),
            )
        except SteamAPIError as e:
            # Cache 404/429 so follow-up calls skip the Steam API for a while.
            if e.status_code in (404, 429):
                ttl = (
                    settings.steam_error_cache_ttl_429
                    if e.status_code == 429
                    else settings.steam_error_cache_ttl_404
                )
                await mongodb.cache_steam_error(app_id, e.status_code, ttl)
            logger.error(f"Błąd pobierania statystyk recenzji: {e}")
            return ReviewStats(total=0, positive=0, negative=0)
258
+
259
+ async def _fetch_reviews_batch(
260
+ self,
261
+ client: httpx.AsyncClient,
262
+ app_id: str,
263
+ review_type: str,
264
+ filter_type: str,
265
+ num_per_page: int,
266
+ cursor: str | None,
267
+ ) -> tuple[list[str], list[ReviewItem], str | None]:
268
+ """Pobiera pojedynczą paczkę recenzji (do 100 sztuk)."""
269
+ url = f"{self.REVIEW_API_URL}/{app_id}"
270
+ params: dict[str, Any] = {
271
+ "json": "1",
272
+ "filter": filter_type,
273
+ "review_type": review_type,
274
+ "language": settings.steam_review_language,
275
+ "num_per_page": str(num_per_page),
276
+ "cursor": cursor or "*",
277
+ "purchase_type": "all",
278
+ }
279
+
280
+ try:
281
+ response = await self._request_with_retry(
282
+ client, url, params, context=app_id
283
+ )
284
+ data = response.json()
285
+ except SteamRateLimitError:
286
+ await mongodb.cache_steam_error(
287
+ app_id, 429, settings.steam_error_cache_ttl_429
288
+ )
289
+ logger.error(f"Rate limited fetching reviews for {app_id}")
290
+ return [], [], None
291
+ except SteamAPIError as e:
292
+ logger.error(f"Błąd pobierania recenzji: {e}")
293
+ return [], [], None
294
+
295
+ if not data.get("success"):
296
+ return [], [], None
297
+
298
+ reviews_data = data.get("reviews", [])
299
+ review_texts: list[str] = []
300
+ review_items: list[ReviewItem] = []
301
+
302
+ for review in reviews_data:
303
+ text = review.get("review")
304
+ if not text:
305
+ continue
306
+ review_texts.append(text)
307
+ review_items.append(ReviewItem(
308
+ text=text,
309
+ recommendation_id=str(review.get("recommendationid", "")),
310
+ timestamp_created=review.get("timestamp_created", 0),
311
+ ))
312
+
313
+ new_cursor = data.get("cursor")
314
+ return review_texts, review_items, new_cursor
315
+
316
    async def fetch_reviews_stratified(
        self,
        app_id: str,
        sample_plan: SamplePlan,
    ) -> AsyncGenerator[ReviewBatch, None]:
        """Fetch reviews in stratified phases and yield them as batches.

        Phase 1 pulls the top "helpful" reviews (filter "all"/"all"); phases
        2a/2b pull the most recent positive and negative reviews.  Reviews
        already yielded by an earlier phase are deduplicated by review text.
        Each phase stops early when the API returns no reviews, repeats a
        cursor (loop guard), or returns the sentinel cursor "*".

        Args:
            app_id: Steam application id.
            sample_plan: Per-stratum targets (top_helpful, positive_count,
                negative_count).

        Yields:
            ReviewBatch objects containing only previously-unseen reviews.
        """
        batch_size = settings.review_batch_size
        all_reviews: set[str] = set()  # review texts seen so far (cross-phase dedup)
        seen_cursors: set[str] = set()
        client = self.client

        # --- PHASE 1: TOP HELPFUL ---
        cursor: str | None = "*"
        fetched = 0

        while fetched < sample_plan.top_helpful:
            to_fetch = min(batch_size, sample_plan.top_helpful - fetched)
            reviews, review_items, cursor = await self._fetch_reviews_batch(
                client, app_id, "all", "all", to_fetch, cursor
            )

            if not reviews:
                break
            # A repeated cursor means the API is looping — bail out rather than spin.
            if cursor and cursor in seen_cursors:
                logger.warning(f"Repeated cursor {cursor} for {app_id} (top_helpful). Shortfall: {sample_plan.top_helpful - fetched}")
                break
            if cursor:
                seen_cursors.add(cursor)

            all_reviews.update(reviews)
            fetched += len(reviews)
            yield ReviewBatch(reviews=reviews, review_items=review_items, cursor=cursor)

            # "*" (or no cursor) signals the end of the feed.
            if not cursor or cursor == "*":
                break

        # --- PHASE 2a: RECENT POSITIVE ---
        positive_target = sample_plan.positive_count
        if positive_target > 0:
            cursor = "*"
            fetched = 0
            seen_cursors_pos: set[str] = set()

            while fetched < positive_target:
                to_fetch = min(batch_size, positive_target - fetched)
                # If we hit many duplicates, keep requesting full batches rather
                # than only the remaining target (but never more than batch_size).
                if fetched > 0:
                    to_fetch = batch_size

                reviews, review_items, cursor = await self._fetch_reviews_batch(
                    client, app_id, "positive", "recent", to_fetch, cursor or "*"
                )
                if not reviews:
                    break
                if cursor and cursor in seen_cursors_pos:
                    logger.warning(f"Repeated cursor {cursor} for {app_id} (positive). Shortfall: {positive_target - fetched}")
                    break
                if cursor:
                    seen_cursors_pos.add(cursor)

                # Drop texts already seen in phase 1; keep only matching items.
                new_reviews = [r for r in reviews if r not in all_reviews]
                new_texts_set = set(new_reviews)
                new_items = [ri for ri in review_items if ri.text in new_texts_set]
                all_reviews.update(new_reviews)
                fetched += len(new_reviews)

                if new_reviews:
                    yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor)
                if not cursor or cursor == "*":
                    break

        # --- PHASE 2b: RECENT NEGATIVE ---
        negative_target = sample_plan.negative_count
        if negative_target > 0:
            cursor = "*"
            fetched = 0
            seen_cursors_neg: set[str] = set()

            while fetched < negative_target:
                to_fetch = min(batch_size, negative_target - fetched)
                # Same duplicate-compensation strategy as phase 2a.
                if fetched > 0:
                    to_fetch = batch_size

                reviews, review_items, cursor = await self._fetch_reviews_batch(
                    client, app_id, "negative", "recent", to_fetch, cursor or "*"
                )
                if not reviews:
                    break
                if cursor and cursor in seen_cursors_neg:
                    logger.warning(f"Repeated cursor {cursor} for {app_id} (negative). Shortfall: {negative_target - fetched}")
                    break
                if cursor:
                    seen_cursors_neg.add(cursor)

                new_reviews = [r for r in reviews if r not in all_reviews]
                new_texts_set = set(new_reviews)
                new_items = [ri for ri in review_items if ri.text in new_texts_set]
                all_reviews.update(new_reviews)
                fetched += len(new_reviews)

                if new_reviews:
                    yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor)
                if not cursor or cursor == "*":
                    break

        logger.info(f"Pobrano łącznie {len(all_reviews)} unikalnych recenzji")
424
+
425
    async def fetch_recent_reviews(
        self,
        app_id: str,
        exclude_ids: set[str] | None = None,
    ) -> list[ReviewItem]:
        """Fetch recent reviews for incremental analysis.

        Walks the "recent" review feed and returns items whose
        ``recommendation_id`` is not in ``exclude_ids``.  When no exclusions
        are given the game is treated as new and the total is additionally
        capped at 500; for known games, fetching stops early once >80% of a
        batch is already known (the boundary of previously-seen reviews has
        been crossed).

        Args:
            app_id: Steam application id.
            exclude_ids: recommendation_ids already stored; None/empty
                means "new game".

        Returns:
            Up to ``max_total`` previously-unseen ReviewItem objects.
        """
        is_new_game = not exclude_ids
        exclude_ids = exclude_ids or set()
        batch_size = settings.review_batch_size

        # Incremental fetch limit for new games
        if is_new_game:
            stats = await self.get_review_stats(app_id)
            max_total = min(stats.total, settings.recent_sample_limit, 500)
        else:
            max_total = settings.recent_sample_limit

        client = self.client
        cursor: str | None = "*"
        seen_cursors: set[str] = set()
        new_items: list[ReviewItem] = []

        while len(new_items) < max_total:
            to_fetch = min(batch_size, max_total - len(new_items))
            _, review_items, cursor = await self._fetch_reviews_batch(
                client, app_id, "all", "recent", to_fetch, cursor
            )

            if not review_items:
                break
            # Loop guard: a repeated cursor means the API would page forever.
            if cursor and cursor in seen_cursors:
                logger.warning(f"Repeated cursor {cursor} for {app_id} (recent). Shortfall: {max_total - len(new_items)}")
                break
            if cursor:
                seen_cursors.add(cursor)

            # Filter out already-known reviews
            batch_new = [ri for ri in review_items if ri.recommendation_id not in exclude_ids]

            # Early exit: if >80% of batch is known, we've passed the boundary
            known_ratio = 1 - (len(batch_new) / len(review_items)) if review_items else 0
            new_items.extend(batch_new)

            if not is_new_game and known_ratio > 0.8:
                logger.info(
                    f"Early exit for {app_id}: {known_ratio:.0%} of batch already known"
                )
                break

            # "*" (or no cursor) means the feed is exhausted.
            if not cursor or cursor == "*":
                break

        logger.info(f"Incremental fetch for {app_id}: {len(new_items)} new reviews")
        return new_items[:max_total]
481
+
482
+ async def fetch_reviews(
483
+ self,
484
+ app_id: str,
485
+ batch_size: int | None = None,
486
+ max_reviews: int | None = None,
487
+ ) -> AsyncGenerator[ReviewBatch, None]:
488
+ """Wrapper dla zachowania kompatybilności."""
489
+ stats = await self.get_review_stats(app_id)
490
+ if stats.total == 0:
491
+ return
492
+
493
+ sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
494
+ async for batch in self.fetch_reviews_stratified(app_id, sample_plan):
495
+ yield batch
496
+
497
+
498
# Global service instance (singleton) — shared by all importers of this module.
steam_service = SteamService()
backend/app/services/update_detection_service.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Update Detection Service — checks Steam News API for game updates.

Compares the latest news/patch date with the stored `last_game_update_at`
to detect games that have been recently updated.
"""

import logging
import re
from datetime import datetime, timezone
from typing import Any, NamedTuple, cast

import httpx

from app.core.config import settings
from app.db.mongodb import mongodb

logger = logging.getLogger(__name__)

STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"

# Matches two-segment versions: 1.2, v2.0, 0.6, 123.4
# Excludes three-segment (0.6.1) via negative lookahead, 4-digit years via \d{1,3},
# and sub-segments of longer versions (e.g. "6.1" within "0.6.1") via lookbehind.
VERSION_RE = re.compile(r'(?<!\d\.)\bv?\d{1,3}\.\d+\b(?!\.\d)')

# Phase 1 regex constants — "is this news item update-related at all?"
RELEASE_PHRASE_RE = re.compile(
    r'\b(out now|is out|is live|now live|now available|full release|'
    r'leaving early access|out of early access)\b',
    re.IGNORECASE
)
CONTENT_UPDATE_RE = re.compile(
    r'\b(major update|content update|big update|biggest update)\b',
    re.IGNORECASE
)
ACTION_WORD_RE = re.compile(
    r'\b(update|patch|release|available|launch|live|out)\b',
    re.IGNORECASE
)
HOTFIX_RE = re.compile(r'\b(hotfix|hot.?fix)\b', re.IGNORECASE)
BRANCH_RE = re.compile(
    r'\b(experimental branch|experimental.{0,10}patch|experimental.{0,10}build|'
    r'public.?test|pts build|beta branch|'
    r'on experimental|for experimental)\b',
    re.IGNORECASE
)
# NOTE(review): MAJOR_RELEASE_RE / MAJOR_CONTENT_RE are textually identical to
# RELEASE_PHRASE_RE / CONTENT_UPDATE_RE — kept as separate names so phase-1
# ("is update") and majorness wording can diverge independently later.
MAJOR_RELEASE_RE = re.compile(
    r'\b(out now|is out|is live|now live|now available|full release|'
    r'leaving early access|out of early access)\b',
    re.IGNORECASE
)
MAJOR_CONTENT_RE = re.compile(
    r'\b(major update|content update|big update|biggest update)\b',
    re.IGNORECASE
)
ONE_ZERO_RE = re.compile(r'\b1\.0\b(?!\.\d)')

# Phase 2 regex constants — "is this update major?" (blockers + positives)
EVENT_FESTIVAL_RE = re.compile(
    r'\b(festival|anniversary\s+event|community\s+event|'
    r'in-game\s+event|roadmap|preview)\b',
    re.IGNORECASE
)
UPDATE_OR_PATCH_RE = re.compile(r'\b(update|patch)\b', re.IGNORECASE)
NAMED_VERSION_RE = re.compile(r'\bV\d+\b')  # case-sensitive: uppercase V only
UPDATE_WORD_RE = re.compile(r'\bupdate\b', re.IGNORECASE)
PATCH_WORD_RE = re.compile(r'\bpatch\b', re.IGNORECASE)
MAINT_LANGUAGE_RE = re.compile(
    r'\b(fix(?:es|ed)?|bug\s*fix|improv(?:es?|ed|ements?)|stability|performance|tweak)\b',
    re.IGNORECASE
)

_NEWS_MAX_PAGES = 5  # Max pages in incremental mode (5 * 5 = 25 items)
75
+
76
+
77
class NewsCheckResult(NamedTuple):
    """Outcome of scanning one game's Steam news feed for updates."""

    latest_update_date: datetime | None  # date of most recent update-related item
    is_major: bool  # whether any item qualifies as major
    major_date: datetime | None  # date of most recent major item; None if not major
    newest_seen_gid: str | None = None  # GID of newest news item (for cursor persistence)
    newest_seen_at: datetime | None = None  # timestamp of newest news item
83
+
84
+
85
class UpdateDetectionService:
    """Detects game updates via Steam News API.

    Supports an optional injected httpx client (for tests/transports); when
    none is given the service lazily creates and owns one.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        self._client = client
        # Only close clients we created ourselves; injected ones belong to the caller.
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the HTTP client, lazily creating an owned one on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=15.0)
        return self._client

    async def close(self) -> None:
        """Close the owned client (no-op for injected clients)."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    @staticmethod
    def _is_update_related(item: dict) -> bool:
        """Return True if news item is update-related.

        Conditions (any one is sufficient):
        A: 'patchnotes' in tags
        B: feedlabel == 'Product Update'
        C: title matches release-style phrases
        D: title matches large content update phrases
        E: title has a version number AND an action word
        """
        tags = item.get("tags")
        if isinstance(tags, list):
            is_patch = "patchnotes" in tags
        else:
            # tags may arrive as a string; substring check is the fallback.
            is_patch = "patchnotes" in (tags or "")
        feedlabel = item.get("feedlabel") or ""
        if is_patch or feedlabel == "Product Update":
            return True

        # Conditions C/D/E: title-based signals — restricted to developer feed only.
        # Third-party news sites (GamingOnLinux etc.) can write about updates using
        # the same language, so we only trust these signals from the developer's own feed.
        if item.get("feedname") != "steam_community_announcements":
            return False

        title = item.get("title", "")
        if RELEASE_PHRASE_RE.search(title):
            return True
        if CONTENT_UPDATE_RE.search(title):
            return True
        if VERSION_RE.search(title) and ACTION_WORD_RE.search(title):
            return True
        # F: named version (V70) + "update" in title (developer feed only)
        if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title):
            return True

        return False

    @staticmethod
    def _is_major_update(item: dict) -> bool:
        """Return True if the news item represents a major update.

        Negative signals (blockers) are checked first:
        - hotfix keyword → not major
        - experimental branch / public test branch → not major
        - event/festival wording without update/patch → not major
        - "patch" + maintenance language (fixes/tweaks/...) → not major

        Positive signals (any one is sufficient):
        - version number in title (VERSION_RE)
        - release language (MAJOR_RELEASE_RE)
        - standalone '1.0' (ONE_ZERO_RE)
        - large content phrases (MAJOR_CONTENT_RE)
        - named version (V70) + "update"
        """
        title = item.get("title", "")

        if HOTFIX_RE.search(title):
            return False
        if BRANCH_RE.search(title):
            return False
        if EVENT_FESTIVAL_RE.search(title) and not UPDATE_OR_PATCH_RE.search(title):
            return False
        if PATCH_WORD_RE.search(title) and MAINT_LANGUAGE_RE.search(title):
            return False

        if VERSION_RE.search(title):
            return True
        if MAJOR_RELEASE_RE.search(title):
            return True
        if ONE_ZERO_RE.search(title):
            return True
        if MAJOR_CONTENT_RE.search(title):
            return True
        if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title):
            return True

        return False

    @staticmethod
    def _collect_update_candidates(
        news_items: list[dict],
    ) -> tuple[datetime | None, datetime | None]:
        """Scan all items, return (latest_update_date, major_date).

        latest_update_date: max date of all update-related items (or None)
        major_date: max date of major items (or None if no major found)
        """
        latest_update_ts: int | None = None
        major_ts: int | None = None

        for item in news_items:
            if not UpdateDetectionService._is_update_related(item):
                continue
            ts = item.get("date") or 0
            if not ts:
                continue  # undated items can't participate in max-date logic
            if latest_update_ts is None or ts > latest_update_ts:
                latest_update_ts = ts
            if UpdateDetectionService._is_major_update(item):
                if major_ts is None or ts > major_ts:
                    major_ts = ts

        latest_update_date = (
            datetime.fromtimestamp(latest_update_ts, tz=timezone.utc)
            if latest_update_ts is not None
            else None
        )
        major_date = (
            datetime.fromtimestamp(major_ts, tz=timezone.utc)
            if major_ts is not None
            else None
        )
        return latest_update_date, major_date

    @staticmethod
    async def _fetch_news_page(
        client: httpx.AsyncClient,
        app_id: str,
        count: int,
        enddate: int | None = None,
    ) -> list[dict]:
        """Fetch a single page of news items from Steam API.

        ``enddate`` (unix ts) asks Steam for items strictly at/before that time,
        which is how pagination is implemented.  Returns [] on HTTP error or
        request failure (callers treat [] as "page unavailable").
        """
        params: dict[str, Any] = {
            "appid": app_id,
            "count": count,
            "maxlength": 0,
        }
        if enddate is not None:
            params["enddate"] = enddate

        try:
            resp = await client.get(STEAM_NEWS_API_URL, params=params)
            if resp.status_code != 200:
                return []
            data = resp.json()
            return data.get("appnews", {}).get("newsitems", [])
        except (httpx.RequestError, ValueError, KeyError) as e:
            logger.debug(f"News page fetch failed for {app_id}: {e}")
            return []

    @staticmethod
    def _scan_batch_with_stopping(
        items: list[dict],
        last_seen_gid: str | None,
        last_seen_at_ts: int | None,
        refresh_cutoff_ts: int | None,
    ) -> tuple[list[dict], bool]:
        """Scan items (newest→oldest), collecting until a stop condition is met.

        Stop conditions (item is NOT included):
        - gid matches last_seen_gid
        - item date <= last_seen_at_ts
        - item date < refresh_cutoff_ts

        Returns (accepted_items, hit_stop).
        """
        accepted: list[dict] = []
        for item in items:
            gid = str(item.get("gid", ""))
            ts = item.get("date") or 0

            if last_seen_gid and gid and gid == last_seen_gid:
                return accepted, True
            if last_seen_at_ts is not None and ts and ts <= last_seen_at_ts:
                return accepted, True
            if refresh_cutoff_ts is not None and ts and ts < refresh_cutoff_ts:
                return accepted, True

            accepted.append(item)

        return accepted, False

    async def _get_latest_news_date(
        self,
        app_id: str,
        last_seen_gid: str | None = None,
        last_seen_at: datetime | None = None,
    ) -> NewsCheckResult:
        """Fetch and scan Steam news for update candidates.

        In initial mode (no cursor): fetches count=20, single page.
        In incremental mode (cursor present): fetches count=5 with pagination,
        stopping at the known cursor or the refresh window boundary.

        The returned cursor fields are only populated when the scan completed
        cleanly (``scan_complete``); an aborted pagination must not advance
        the persisted cursor or items could be skipped next time.
        """
        client = await self._get_client()

        is_incremental = last_seen_gid is not None or last_seen_at is not None
        count = settings.news_incremental_count if is_incremental else settings.news_initial_count

        # Compute stop thresholds for incremental mode
        last_seen_at_ts: int | None = None
        refresh_cutoff_ts: int | None = None
        if is_incremental:
            last_seen_at_ts = int(last_seen_at.timestamp()) if last_seen_at else None
            now_ts = int(datetime.now(timezone.utc).timestamp())
            cutoff_ts = now_ts - (settings.news_refresh_window_hours * 3600)

            # If cursor is older than the refresh window (worker was down),
            # disable the time cutoff and scan to the cursor instead.
            # _NEWS_MAX_PAGES protects against unbounded pagination.
            if last_seen_at_ts is not None and last_seen_at_ts < cutoff_ts:
                refresh_cutoff_ts = None
            else:
                refresh_cutoff_ts = cutoff_ts

        all_accepted: list[dict] = []
        newest_gid: str | None = None
        newest_ts: int = 0
        scan_complete = False
        pages_fetched = 0
        enddate: int | None = None

        while True:
            items = await self._fetch_news_page(client, app_id, count, enddate)

            if not items:
                if pages_fetched == 0:
                    # First page empty (no news or HTTP error) — newest_gid stays None
                    pass
                # Pagination page empty → incomplete scan → don't update cursor
                break

            pages_fetched += 1

            # Track newest item (from first page only)
            if newest_gid is None:
                for item in items:
                    gid = str(item.get("gid", ""))
                    ts = item.get("date") or 0
                    if gid and ts:
                        newest_gid = gid
                        newest_ts = ts
                        break

            if is_incremental:
                accepted, hit_stop = self._scan_batch_with_stopping(
                    items, last_seen_gid, last_seen_at_ts, refresh_cutoff_ts
                )
                all_accepted.extend(accepted)

                if hit_stop:
                    scan_complete = True
                    break
                if len(items) < count:
                    scan_complete = True  # API has no more items
                    break
                if pages_fetched >= _NEWS_MAX_PAGES:
                    scan_complete = True  # page limit reached
                    break
                oldest_ts = items[-1].get("date") or 0
                if not oldest_ts:
                    break  # can't paginate → incomplete scan
                enddate = oldest_ts - 1
            else:
                # Initial mode: single fetch, always clean
                all_accepted.extend(items)
                scan_complete = True
                break

        latest_update_date, major_date = self._collect_update_candidates(all_accepted)

        cursor_gid: str | None = None
        cursor_at: datetime | None = None
        if scan_complete and newest_gid:
            cursor_gid = newest_gid
            cursor_at = datetime.fromtimestamp(newest_ts, tz=timezone.utc)

        if latest_update_date is None:
            return NewsCheckResult(
                None, False, None,
                newest_seen_gid=cursor_gid,
                newest_seen_at=cursor_at,
            )

        return NewsCheckResult(
            latest_update_date=latest_update_date,
            is_major=major_date is not None,
            major_date=major_date,
            newest_seen_gid=cursor_gid,
            newest_seen_at=cursor_at,
        )

    async def check_for_updates(
        self, games: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """
        Check Steam News API for each game. Return games with confirmed major updates.

        Non-major patchnotes update last_game_update_at but do not trigger a schedule.
        DLCs are never checked directly; a parent's major update is propagated to
        its DLCs via the dlcs_by_parent index.
        """
        updated_games: list[dict[str, Any]] = []
        dlcs_by_parent: dict[str, list[dict[str, Any]]] = {}

        # Index DLCs by parent appid so a parent's patch date can be fanned out.
        for game in games:
            if game.get("app_type") == "dlc" and game.get("parent_appid"):
                dlcs_by_parent.setdefault(str(game["parent_appid"]), []).append(game)

        for game in games:
            app_id = str(game.get("appid", ""))
            if not app_id:
                continue

            if game.get("app_type") == "dlc":
                continue

            last_known = game.get("last_game_update_at")
            # Normalize last_known to datetime if it's a timestamp
            if last_known is not None and not isinstance(last_known, datetime):
                try:
                    last_known = datetime.fromtimestamp(float(last_known), tz=timezone.utc)
                except (ValueError, TypeError):
                    last_known = None

            result = await self._get_latest_news_date(
                app_id,
                last_seen_gid=game.get("last_seen_news_gid"),
                last_seen_at=game.get("last_seen_news_at"),
            )

            # Persist cursor before any early-continue — even if no updates found
            if result.newest_seen_gid:
                await mongodb.update_news_cursor(
                    app_id, result.newest_seen_gid, cast(datetime, result.newest_seen_at)
                )

            if result.latest_update_date is None:
                continue

            if last_known is None or result.latest_update_date > last_known:
                await mongodb.update_game_update_date(app_id, result.latest_update_date)

                if result.is_major:
                    current_patch_at = game.get("current_patch_at")
                    # NOTE(review): current_patch_at is compared as-is — assumes it is
                    # stored as a datetime (unlike last_game_update_at, which is
                    # normalized above); confirm against the DB schema.
                    patch_date = cast(datetime, result.major_date)  # always not None when is_major=True
                    if current_patch_at is None or patch_date > current_patch_at:
                        await mongodb.update_game_patch_date(app_id, patch_date)
                        updated_games.append({**game, "update_at": patch_date})

                        # Propagate the parent's patch date to all of its DLCs.
                        for dlc in dlcs_by_parent.get(app_id, []):
                            dlc_appid = str(dlc.get("appid", ""))
                            if not dlc_appid:
                                continue

                            await mongodb.update_game_patch_date(dlc_appid, patch_date)
                            updated_games.append({**dlc, "update_at": patch_date})

        logger.info(
            f"Update detection: {len(updated_games)}/{len(games)} games have new updates"
        )
        return updated_games
backend/pytest.ini ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [pytest]
2
+ testpaths = tests
3
+ python_files = test_*.py
4
+ python_functions = test_*
5
+ asyncio_mode = auto
6
+ addopts = -v --tb=short
backend/requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web Framework
2
+ fastapi==0.109.0
3
+ uvicorn[standard]==0.27.0
4
+ sse-starlette==1.8.2
5
+
6
+ # Database
7
+ motor==3.3.2
8
+ pymongo==4.6.1
9
+
10
+ # Data Validation
11
+ pydantic==2.5.3
12
+ pydantic-settings==2.1.0
13
+
14
+ # HTTP Client
15
+ httpx==0.26.0
16
+
17
+ # AI/ML - Local Inference (ONNX Runtime only, no PyTorch needed at runtime)
18
+ numpy<2.0.0
19
+ transformers==4.37.2
20
+ optimum[onnxruntime]==1.16.2
21
+ huggingface-hub==0.20.3
22
+
23
+ # Rate Limiting
24
+ slowapi==0.1.9
25
+
26
+ # Utilities
27
+ python-dotenv==1.0.0
28
+ jieba==0.42.1
29
+
30
+ # Keyword Expansion (FastText)
31
+ gensim==4.3.3
32
+
33
+ # Code Quality
34
+ ruff==0.1.14
35
+ mypy==1.8.0
36
+
37
+ # Testing
38
+ pytest==7.4.4
39
+ pytest-asyncio==0.23.3
40
+ pytest-cov==4.1.0
41
+ anyio==4.12.1
42
zhconv==1.4.3  # Chinese simplified/traditional conversion (runtime dependency, not test-only)
backend/scripts/smoke_news_cursor.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Smoke Test: Incremental Steam News Cursor Flow
3
+
4
+ Validates that UpdateDetectionService correctly uses cursor-based incremental
5
+ news fetching against the real Steam API.
6
+
7
+ Test game: Factorio (427520) — stable, always has news, uses patchnotes tags.
8
+
9
+ Usage:
10
+ cd /mnt/d/sentiment_summarizer/backend
11
+ ../venv/bin/python scripts/smoke_news_cursor.py
12
+ """
13
+
14
+ import asyncio
15
+ import sys
16
+ from datetime import datetime, timezone
17
+ from pathlib import Path
18
+ from unittest.mock import AsyncMock, patch
19
+
20
+ import httpx
21
+
22
+ # Ensure backend/app is importable
23
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
24
+
25
+ from app.services.update_detection_service import UpdateDetectionService # noqa: E402
26
+
27
+ TEST_APP_ID = "427520" # Factorio
28
+
29
+
30
+ # ── helpers ──────────────────────────────────────────────────────────
31
+
32
+
33
+ def _ts() -> str:
34
+ return datetime.now(timezone.utc).strftime("%H:%M:%S")
35
+
36
+
37
def _print(status: str, msg: str) -> None:
    """Print one timestamped status line; known statuses get ANSI colours."""
    colour_tags = {
        "OK": "\033[32mOK\033[0m",
        "FAIL": "\033[31mFAIL\033[0m",
        "SKIP": "\033[33mSKIP\033[0m",
        "INFO": "\033[36mINFO\033[0m",
    }
    label = colour_tags.get(status, status)
    print(f"[{_ts()}] [{label}] {msg}")
45
+
46
+
47
class RecordingTransport(httpx.AsyncBaseTransport):
    """Pass-through transport that logs each request's URL and query params.

    Every request is forwarded to a real ``httpx.AsyncHTTPTransport``;
    ``recorded`` accumulates one ``{"url", "params"}`` dict per request so
    the smoke test can inspect what was actually sent.
    """

    def __init__(self) -> None:
        self.recorded: list[dict] = []
        self._delegate = httpx.AsyncHTTPTransport()

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        self.recorded.append(
            {"url": str(request.url), "params": dict(request.url.params)}
        )
        return await self._delegate.handle_async_request(request)

    async def aclose(self) -> None:
        await self._delegate.aclose()
61
+
62
+
63
+ # ── main ─────────────────────────────────────────────────────────────
64
+
65
+
66
async def run_smoke_test() -> int:
    """Run all five cursor smoke checks against the live Steam API.

    Returns the process exit code: 0 when all checks pass (or the API is
    unreachable, in which case checks are skipped), 1 when any check fails.
    """
    print(f"\nSteam News Cursor Smoke Test — Factorio ({TEST_APP_ID})")
    print("=" * 60)
    failures = 0

    # ── Check 1: initial scan returns cursor fields ───────────────────
    _print("INFO", f"Check 1: initial scan for {TEST_APP_ID} (Factorio)")
    result_initial = None
    svc1 = UpdateDetectionService()
    try:
        result_initial = await svc1._get_latest_news_date(TEST_APP_ID)
    finally:
        await svc1.close()

    if result_initial.newest_seen_gid is None:
        _print("SKIP", "No news items returned — Steam API may be rate-limiting or unreachable; skipping all checks")
        return 0

    c1_ok = True
    if not isinstance(result_initial.newest_seen_gid, str) or not result_initial.newest_seen_gid:
        _print("FAIL", f"newest_seen_gid is empty/non-string: {result_initial.newest_seen_gid!r}")
        c1_ok = False

    now = datetime.now(timezone.utc)
    if result_initial.newest_seen_at is None:
        _print("FAIL", "newest_seen_at is None")
        c1_ok = False
    elif not (
        datetime(2020, 1, 1, tzinfo=timezone.utc)
        <= result_initial.newest_seen_at
        <= datetime(now.year + 1, 1, 1, tzinfo=timezone.utc)
    ):
        _print("FAIL", f"newest_seen_at out of expected range: {result_initial.newest_seen_at!r}")
        c1_ok = False

    if c1_ok:
        _print(
            "OK",
            f"cursor GID={result_initial.newest_seen_gid}, "
            f"at={result_initial.newest_seen_at.isoformat()}",
        )
    else:
        failures += 1

    # NOTE(review): cursor_at may be None here if check 1 failed; check 3's
    # comparison against it would then raise — confirm this path is acceptable
    # for a smoke script.
    cursor_gid = result_initial.newest_seen_gid
    cursor_at = result_initial.newest_seen_at

    # ── Check 2: incremental scan uses count=5 ────────────────────────
    _print("INFO", "Check 2: incremental scan uses count=5")
    transport = RecordingTransport()
    client = httpx.AsyncClient(transport=transport, timeout=15.0)
    svc2 = UpdateDetectionService(client=client)
    result_inc = None
    try:
        result_inc = await svc2._get_latest_news_date(
            TEST_APP_ID, last_seen_gid=cursor_gid, last_seen_at=cursor_at
        )
    finally:
        await client.aclose()

    if not transport.recorded:
        _print("SKIP", "No requests recorded — Steam API may be unreachable")
    else:
        c2_ok = True
        for i, req in enumerate(transport.recorded):
            count_val = req["params"].get("count")
            enddate_val = req["params"].get("enddate", "n/a")
            if str(count_val) != "5":
                _print("FAIL", f"Request {i + 1}: count={count_val!r}, expected '5'")
                c2_ok = False
            else:
                _print("INFO", f"  Request {i + 1}: count=5 ✓ enddate={enddate_val}")
        if c2_ok:
            _print("OK", f"All {len(transport.recorded)} request(s) used count=5")
        else:
            failures += 1

    # ── Check 3: no items older than cursor boundary ──────────────────
    _print("INFO", "Check 3: incremental result respects cursor boundary")
    if result_inc is None:
        _print("SKIP", "No incremental result available")
    else:
        c3_ok = True
        if result_inc.latest_update_date is not None:
            if result_inc.latest_update_date <= cursor_at:
                _print(
                    "FAIL",
                    f"latest_update_date {result_inc.latest_update_date.isoformat()} "
                    f"is not strictly newer than cursor {cursor_at.isoformat()}",
                )
                c3_ok = False
            else:
                _print(
                    "INFO",
                    f"  latest_update_date={result_inc.latest_update_date.isoformat()} "
                    f"> cursor (new update found between scans)",
                )
        else:
            _print("INFO", "  latest_update_date=None (no new updates since cursor) — expected")
        if c3_ok:
            _print("OK", "Cursor boundary respected")
        else:
            failures += 1

    # ── Check 4: latest_update_date / major_date invariants ──────────
    _print("INFO", "Check 4: structural invariants on initial scan result")
    c4_ok = True
    if result_initial.latest_update_date is None:
        if result_initial.is_major or result_initial.major_date is not None:
            _print(
                "FAIL",
                f"latest_update_date=None but is_major={result_initial.is_major}, "
                f"major_date={result_initial.major_date!r}",
            )
            c4_ok = False
    elif result_initial.is_major:
        if result_initial.major_date is None:
            _print("FAIL", "is_major=True but major_date is None")
            c4_ok = False
        elif result_initial.major_date > result_initial.latest_update_date:
            _print(
                "FAIL",
                f"major_date {result_initial.major_date.isoformat()} "
                f"> latest_update_date {result_initial.latest_update_date.isoformat()}",
            )
            c4_ok = False
    else:
        if result_initial.major_date is not None:
            _print("FAIL", f"is_major=False but major_date={result_initial.major_date!r}")
            c4_ok = False
    if c4_ok:
        _print(
            "OK",
            f"invariants hold: latest_update_date={result_initial.latest_update_date}, "
            f"is_major={result_initial.is_major}, major_date={result_initial.major_date}",
        )
    else:
        failures += 1

    # ── Check 5: check_for_updates end-to-end, mocked DB ─────────────
    _print("INFO", "Check 5: check_for_updates end-to-end (mocked DB)")
    mock_mongodb = AsyncMock()
    svc5 = UpdateDetectionService()
    updated = None
    try:
        with patch("app.services.update_detection_service.mongodb", mock_mongodb):
            updated = await svc5.check_for_updates(
                [{"appid": TEST_APP_ID, "name": "Factorio"}]
            )
    finally:
        await svc5.close()

    c5_ok = True
    if not isinstance(updated, list):
        _print("FAIL", f"check_for_updates returned {type(updated).__name__}, expected list")
        c5_ok = False

    call_count = mock_mongodb.update_news_cursor.call_count
    if call_count == 0:
        # API may have failed between checks (swallowed internally by the service);
        # treat as skip — not a hard failure per the plan.
        _print("SKIP", "update_news_cursor not called — Steam API may have been unreachable for this call")
    elif call_count > 1:
        _print("FAIL", f"update_news_cursor called {call_count} times, expected 1")
        c5_ok = False
    else:
        args = mock_mongodb.update_news_cursor.call_args[0]
        if not (
            isinstance(args[0], str)
            and isinstance(args[1], str)
            and isinstance(args[2], datetime)
        ):
            _print(
                "FAIL",
                f"update_news_cursor arg types wrong: "
                f"{[type(a).__name__ for a in args]} — expected (str, str, datetime)",
            )
            c5_ok = False
        else:
            _print(
                "OK",
                f"check_for_updates returned list; "
                f"update_news_cursor({args[0]!r}, {args[1]!r}, {args[2].isoformat()!r})",
            )
    if not c5_ok:
        failures += 1

    # ── Summary ───────────────────────────────────────────────────────
    print("=" * 60)
    if failures == 0:
        _print("OK", "All checks passed")
        return 0
    else:
        _print("FAIL", f"{failures} check(s) failed")
        return 1
260
+ return 1
261
+
262
+
263
if __name__ == "__main__":
    # Exit with the smoke test's status code (0 = all checks passed/skipped).
    sys.exit(asyncio.run(run_smoke_test()))
backend/scripts/smoke_test.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Smoke Test — local verification of worker cycle and analysis pipeline.
3
+
4
+ Usage:
5
+ cd backend
6
+ python scripts/smoke_test.py analyze <appid> # run full analysis for a game
7
+ python scripts/smoke_test.py cycle # mini worker cycle (1 game)
8
+ """
9
+
10
+ import argparse
11
+ import asyncio
12
+ import logging
13
+ import sys
14
+ import time
15
+ from datetime import datetime, timezone
16
+ from pathlib import Path
17
+
18
+ # Ensure backend/app is importable
19
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
20
+
21
+ from app.core.config import settings # noqa: E402
22
+ from app.db.mongodb import mongodb # noqa: E402
23
+ from app.services.nlp_service import NLPService # noqa: E402
24
+ from app.services.steam_service import SteamService # noqa: E402
25
+ from app.services.update_detection_service import UpdateDetectionService # noqa: E402
26
+ from app.services.precache_service import PreCacheService # noqa: E402
27
+ from app.services.analysis_runner import run_full_analysis # noqa: E402
28
+
29
+ logging.basicConfig(
30
+ level=logging.INFO,
31
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
32
+ )
33
+ logger = logging.getLogger("smoke_test")
34
+
35
+
36
+ def _ts() -> str:
37
+ return datetime.now(timezone.utc).strftime("%H:%M:%S")
38
+
39
+
40
def _print(status: str, msg: str) -> None:
    """Print *msg* prefixed with a UTC timestamp and an ANSI-colored status tag.

    Unknown statuses are printed verbatim (uncolored).
    """
    colored = {
        "OK": "\033[32mOK\033[0m",
        "FAIL": "\033[31mFAIL\033[0m",
        "SKIP": "\033[33mSKIP\033[0m",
        "INFO": "\033[36mINFO\033[0m",
    }
    label = colored.get(status, status)
    print(f"[{_ts()}] [{label}] {msg}")
43
+
44
+
45
+ # ── analyze subcommand ──────────────────────────────────────────────
46
+
47
+
48
async def cmd_analyze(app_id: str) -> None:
    """Run the full analysis pipeline for one game and verify the cache write.

    Connects to MongoDB, runs ``run_full_analysis``, reports the result
    counts, then confirms the analysis document actually landed in the
    cache collection. Services and the DB connection are always closed.
    """
    _print("INFO", f"Starting analysis for app_id={app_id}")
    _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... / DB: {settings.mongodb_db_name}")

    await mongodb.connect()

    steam_svc = SteamService()
    nlp_svc = NLPService()

    try:
        started = time.monotonic()
        result = await run_full_analysis(app_id, f"smoke-{app_id}", steam_svc, nlp_svc)
        took = time.monotonic() - started

        if result is None:
            _print("FAIL", "run_full_analysis returned None")
            return

        game = result.get("game", {})
        _print("OK", f"Analysis complete in {took:.1f}s")
        _print("OK", f" Game: {game.get('name', '?')} (appid {game.get('app_id', '?')})")
        _print("OK", f" Reviews analyzed: {result.get('analyzed_reviews', 0)}")
        _print("OK", f" Topics found: {len(result.get('topics', []))}")
        _print("OK", f" General highlights: {len(result.get('general_highlights', []))}")

        # Confirm the pipeline persisted its output to MongoDB.
        cached = await mongodb.get_cached_analysis(app_id)
        if cached:
            _print("OK", " Cache write verified — document found in MongoDB")
        else:
            _print("FAIL", " Cache write verification FAILED — no document in MongoDB")
    finally:
        await steam_svc.close()
        await mongodb.disconnect()
87
+
88
+
89
+ # ── cycle subcommand ─────────────────────────────────────────────────
90
+
91
+
92
async def cmd_cycle() -> None:
    """Run a miniature worker cycle against the single top game.

    Steps: fetch top game → check synced_at datetime arithmetic → update
    detection → bootstrap missing analyses → process at most one due
    analysis. All services and the DB connection are closed on exit.
    """
    _print("INFO", "Starting mini worker cycle")
    _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... / DB: {settings.mongodb_db_name}")

    await mongodb.connect()

    steam_svc = SteamService()
    nlp_svc = NLPService()
    update_svc = UpdateDetectionService()

    try:
        # Step 1: pick the single game with the most reviews.
        _print("INFO", "Step 1: Fetching top game by reviews...")
        top_games = await mongodb.get_top_games_by_reviews(1)
        if not top_games:
            _print("SKIP", "No games in DB — run game sync first or use 'analyze' subcommand")
            return

        game = top_games[0]
        app_id = str(game.get("appid", ""))
        name = game.get("name", "?")
        _print("OK", f" Top game: {name} (appid {app_id})")

        # Step 2: naive-vs-aware datetime subtraction regression check.
        _print("INFO", "Step 2: Testing synced_at datetime comparison...")
        synced_at = game.get("synced_at")
        if not synced_at:
            _print("SKIP", " No synced_at field — game sync not run yet")
        else:
            try:
                age = datetime.now(timezone.utc) - synced_at
                hours = age.total_seconds() / 3600
                _print("OK", f" synced_at delta: {hours:.1f}h (tz={synced_at.tzinfo})")
            except TypeError as e:
                _print("FAIL", f" datetime subtraction failed: {e}")
                return

        # Step 3: update detection on just this game.
        _print("INFO", "Step 3: Update detection...")
        detect_start = time.monotonic()
        updated = await update_svc.check_for_updates([game])
        detect_took = time.monotonic() - detect_start
        _print("OK", f" Updates detected: {len(updated)} in {detect_took:.1f}s")

        # Step 4: bootstrap analyses that have never been run.
        _print("INFO", "Step 4: Bootstrap missing analyses...")
        precache_svc = PreCacheService(steam_svc, nlp_svc)
        bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games)
        _print("OK", f" Bootstrapped: {bootstrapped}")

        # Step 5: process due analyses, capped at one for the smoke run.
        _print("INFO", "Step 5: Processing due analyses (max 1)...")
        saved_limit = settings.precache_max_analyses_per_cycle
        # object.__setattr__ bypasses a (possibly) frozen settings model.
        object.__setattr__(settings, "precache_max_analyses_per_cycle", 1)
        try:
            executed = await precache_svc.process_due_analyses()
            _print("OK", f" Executed: {executed}")
        finally:
            object.__setattr__(settings, "precache_max_analyses_per_cycle", saved_limit)

        _print("OK", "Mini cycle complete")

    finally:
        await update_svc.close()
        await steam_svc.close()
        await mongodb.disconnect()
159
+
160
+
161
+ # ── main ─────────────────────────────────────────────────────────────
162
+
163
+
164
def main() -> None:
    """CLI entry point: dispatch to the 'analyze' or 'cycle' subcommand."""
    parser = argparse.ArgumentParser(description="SentimentStream smoke test")
    sub = parser.add_subparsers(dest="command")

    analyze_parser = sub.add_parser("analyze", help="Run full analysis for a game")
    analyze_parser.add_argument("appid", help="Steam app ID (e.g. 730)")
    sub.add_parser("cycle", help="Run mini worker cycle (top 1 game)")

    args = parser.parse_args()

    if args.command == "analyze":
        asyncio.run(cmd_analyze(args.appid))
    elif args.command == "cycle":
        asyncio.run(cmd_cycle())
    else:
        # No subcommand given: show usage and exit non-zero.
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()
backend/worker_main.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Worker Main App — lightweight FastAPI for background game sync and pre-cache.
3
+
4
+ Endpoints:
5
+ GET /health — MongoDB ping, last cycle summary, cycle_running flag
6
+ POST /trigger — token-protected, starts a worker cycle as background task
7
+ GET /logs — token-protected, read structured log tail
8
+ """
9
+
10
+ import asyncio
11
+ import logging
12
+ import os
13
+ import uuid
14
+ from contextlib import asynccontextmanager
15
+ from datetime import datetime, timezone
16
+ from typing import Any
17
+
18
+ from fastapi import FastAPI, Query, Request
19
+ from fastapi.responses import JSONResponse
20
+
21
+ from app.core.config import settings
22
+ from app.core.worker_logging import (
23
+ AsyncTimingContext,
24
+ WORKER_LOG_WHITELIST,
25
+ log_structured,
26
+ read_log_tail,
27
+ resolve_log_path,
28
+ set_cycle_id,
29
+ setup_app_logging,
30
+ setup_structured_logger,
31
+ )
32
+ from app.db.mongodb import mongodb
33
+ from app.services.game_sync_service import GameSyncService
34
+ from app.services.nlp_service import NLPService
35
+ from app.services.precache_service import PreCacheService
36
+ from app.services.steam_service import SteamService
37
+ from app.services.priority_refresh_service import PriorityRefreshService
38
+ from app.services.update_detection_service import UpdateDetectionService
39
+
40
+ logging.basicConfig(
41
+ level=logging.INFO,
42
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
43
+ )
44
+ logger = logging.getLogger(__name__)
45
+
46
+ # Cycle state
47
+ _cycle_running = False
48
+ _last_cycle_summary: dict[str, Any] = {}
49
+
50
+
51
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Connect MongoDB on startup, disconnect on shutdown.

    The teardown is wrapped in try/finally so the MongoDB connection is
    released even when the application body raises or is cancelled while
    suspended at ``yield`` — the original skipped disconnect in that case.
    """
    await mongodb.connect()
    setup_structured_logger("worker")
    setup_app_logging()
    logger.info("Worker started — MongoDB connected, structured logging initialized")
    try:
        yield
    finally:
        await mongodb.disconnect()
        logger.info("Worker shutting down")
61
+
62
+
63
# ASGI application instance, wired to the lifespan handler defined above.
app = FastAPI(title="SentimentStream Worker", lifespan=lifespan)
64
+
65
+
66
@app.get("/health")
async def health():
    """Health check: MongoDB ping plus last/current cycle status.

    Never raises — a failed ping degrades the status instead of erroring,
    so monitoring always gets an answer.
    """
    mongo_ok = False
    try:
        client = mongodb.client
        if client:
            await client.admin.command("ping")
            mongo_ok = True
    except Exception:
        # Deliberate best-effort: report "degraded" rather than fail.
        pass

    return {
        "status": "ok" if mongo_ok else "degraded",
        "mongodb": "connected" if mongo_ok else "disconnected",
        "cycle_running": _cycle_running,
        "last_cycle": _last_cycle_summary,
    }
83
+
84
+
85
def _check_bearer_token(request: Request) -> bool:
    """Validate the Bearer token in the Authorization header.

    Returns True only when a trigger token is configured AND the header is
    exactly ``Bearer <token>`` with a matching token. The comparison uses
    ``hmac.compare_digest`` so an attacker cannot recover the token via
    response-timing differences (the original used ``==``).
    """
    import hmac  # function-scope stdlib import keeps this fix self-contained

    auth = request.headers.get("Authorization", "")
    expected = settings.worker_trigger_token
    if not expected or not auth.startswith("Bearer "):
        return False
    supplied = auth[7:]
    return hmac.compare_digest(supplied.encode(), expected.encode())
90
+
91
+
92
@app.post("/trigger")
async def trigger(request: Request):
    """Token-protected trigger to start a worker cycle.

    Responses:
        401 — missing or invalid Bearer token
        503 — a cycle is already running
        200 — ``{"status": "started"}`` (cycle runs as a background task)
    """
    global _cycle_running, _cycle_task

    if not _check_bearer_token(request):
        return JSONResponse(status_code=401, content={"detail": "Unauthorized"})

    if _cycle_running:
        return JSONResponse(status_code=503, content={"detail": "Cycle already running"})

    # Keep a strong reference to the task: the event loop only holds weak
    # references, so a fire-and-forget task can be garbage-collected mid-run.
    _cycle_task = asyncio.create_task(_run_cycle())
    return {"status": "started"}
105
+
106
+
107
@app.get("/logs")
async def get_logs(
    request: Request,
    lines: int = Query(default=100, ge=1, le=1000),
    level: str | None = Query(default=None),
    event: str | None = Query(default=None),
    file: str = Query(default="worker"),
):
    """Token-protected endpoint to read structured log tail."""
    # Auth guard first — identical 401 shape to the /trigger endpoint.
    if not _check_bearer_token(request):
        return JSONResponse(status_code=401, content={"detail": "Unauthorized"})

    # Only whitelisted log names resolve; anything else is a 400.
    log_path = resolve_log_path(file, WORKER_LOG_WHITELIST)
    if log_path is None:
        detail = f"Unknown log file: '{file}'. Valid: {list(WORKER_LOG_WHITELIST.keys())}"
        return JSONResponse(status_code=400, content={"detail": detail})

    entries = read_log_tail(log_path, lines=lines, level=level, event=event)
    return {"entries": entries, "count": len(entries)}
128
+
129
+
130
async def _run_cycle() -> None:
    """Execute a full worker cycle.

    Pipeline: game sync (throttled to ~once per 20h) → CN-name / app-type
    enrichment → priority refresh → update detection → schedule creation →
    bootstrap missing analyses → process due analyses.

    Fix vs. original: the cycle-state finalization (summary, cycle_end log,
    ``_cycle_running = False``) now lives inside ``finally`` and each
    service ``close()`` is isolated — previously a raising close() (or task
    cancellation) left ``_cycle_running`` stuck True, so every subsequent
    /trigger returned 503 forever.
    """
    global _cycle_running, _last_cycle_summary
    _cycle_running = True
    started = datetime.now(timezone.utc)
    summary: dict[str, Any] = {"started_at": started.isoformat()}

    cycle_id = uuid.uuid4().hex[:8]
    set_cycle_id(cycle_id)
    log_structured("cycle_start", cycle_id=cycle_id)

    steam_svc = SteamService()
    nlp_svc = NLPService()
    game_sync_svc = GameSyncService()
    priority_svc = PriorityRefreshService()
    update_svc = UpdateDetectionService()

    try:
        # 1. Game sync (if enabled and not synced recently)
        if settings.game_sync_enabled:
            top_games = await mongodb.get_top_games_by_reviews(1)
            last_synced = top_games[0].get("synced_at") if top_games else None
            hours_since_sync = None
            if last_synced:
                delta = datetime.now(timezone.utc) - last_synced
                hours_since_sync = delta.total_seconds() / 3600

            if hours_since_sync is None or hours_since_sync > 20:
                async with AsyncTimingContext() as t_sync:
                    logger.info("Starting game sync...")
                    upserted, modified = await game_sync_svc.sync_all_games()
                    summary["game_sync"] = {"upserted": upserted, "modified": modified}
                log_structured("game_sync", elapsed_s=t_sync.elapsed_s,
                               detail=summary["game_sync"])

                async with AsyncTimingContext() as t_details:
                    enriched = await game_sync_svc.sync_top_game_details()
                    summary["game_details"] = {"enriched": enriched}
                log_structured("game_details", elapsed_s=t_details.elapsed_s,
                               detail=summary["game_details"])
            else:
                summary["game_sync"] = "skipped (recent)"
                log_structured("game_sync", detail="skipped (recent)")

            # ALWAYS enrich CN names if sync is enabled, even if main sync skipped
            async with AsyncTimingContext() as t_cn:
                cn_processed = await game_sync_svc.enrich_cn_names()
                summary["cn_enrichment"] = {"processed": cn_processed}
            log_structured("cn_enrichment", elapsed_s=t_cn.elapsed_s,
                           detail=summary["cn_enrichment"])

            async with AsyncTimingContext() as t_app_types:
                app_types_processed = await game_sync_svc.enrich_app_types()
                summary["app_type_enrichment"] = {"processed": app_types_processed}
            log_structured("app_type_enrichment", elapsed_s=t_app_types.elapsed_s,
                           detail=summary["app_type_enrichment"])

        # 1b. Priority refresh
        async with AsyncTimingContext() as t_priority:
            priority_result = await priority_svc.refresh_priorities()
            summary["priority_refresh"] = priority_result
        log_structured("priority_refresh", elapsed_s=t_priority.elapsed_s, detail=priority_result)

        # 2. Update detection
        async with AsyncTimingContext() as t_update:
            top_games = await mongodb.get_priority_games_for_analysis()
            updated_games = await update_svc.check_for_updates(top_games)
            summary["updates_detected"] = len(updated_games)
        log_structured("update_detection", elapsed_s=t_update.elapsed_s,
                       detail={"updates_detected": len(updated_games)})

        # 3. Create schedules for updated games
        precache_svc = PreCacheService(steam_svc, nlp_svc)

        async with AsyncTimingContext() as t_sched:
            if updated_games:
                await precache_svc.create_schedules_for_updates(updated_games)
        log_structured("create_schedules", elapsed_s=t_sched.elapsed_s,
                       detail={"updated_games": len(updated_games) if updated_games else 0})

        # 4. Bootstrap missing analyses
        async with AsyncTimingContext() as t_boot:
            bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games)
            summary["bootstrapped"] = bootstrapped
        log_structured("bootstrap_missing", elapsed_s=t_boot.elapsed_s,
                       detail={"bootstrapped": bootstrapped})

        # 5. Process due analyses
        if settings.precache_enabled:
            async with AsyncTimingContext() as t_analyses:
                executed = await precache_svc.process_due_analyses()
                summary["analyses_executed"] = executed
            log_structured("process_due_analyses", elapsed_s=t_analyses.elapsed_s,
                           detail={"executed": executed})
        else:
            summary["precache"] = "disabled"

    except Exception as e:
        logger.error(f"Cycle error: {e}", exc_info=True)
        summary["error"] = str(e)
        log_structured("cycle_error", level=logging.ERROR, error=str(e))
    finally:
        # Close each service independently so one bad close() cannot skip
        # the others or the state reset below.
        for svc in (game_sync_svc, priority_svc, update_svc, steam_svc):
            try:
                await svc.close()
            except Exception:
                logger.warning("Service close failed during cycle teardown", exc_info=True)

        # Finalize cycle state unconditionally — the worker must never be
        # left with _cycle_running stuck True.
        elapsed = (datetime.now(timezone.utc) - started).total_seconds()
        summary["elapsed_seconds"] = round(elapsed, 1)
        _last_cycle_summary = summary
        _cycle_running = False
        log_structured("cycle_end", elapsed_s=round(elapsed, 1),
                       detail=summary)
        set_cycle_id(None)
        logger.info(f"Cycle complete in {elapsed:.1f}s: {summary}")
scripts/benchmark_major_update.py ADDED
@@ -0,0 +1,848 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Benchmark script for the major update detection heuristic.
4
+
5
+ Evaluates UpdateDetectionService._is_update_related, _collect_update_candidates,
6
+ and _is_major_update against a curated set of Steam games.
7
+
8
+ Three modes:
9
+ --discover Fetch news for all games (count=20 by default, matches
10
+ production) and display all items with classification
11
+ details. Use this to identify ground truth.
12
+ --evaluate Item-level evaluation: for each ItemCase, find the item
13
+ by gid and check if _is_update_related / _is_major_update
14
+ match expectations.
15
+ --evaluate-service Service-level evaluation: for each ServiceCase, run the
16
+ full selection pipeline and compare the outcome.
17
+
18
+ Both --evaluate and --evaluate-service run by default when no mode is specified.
19
+
20
+ Examples:
21
+ python scripts/benchmark_major_update.py --discover
22
+ python scripts/benchmark_major_update.py --discover --count 50
23
+ python scripts/benchmark_major_update.py --evaluate
24
+ python scripts/benchmark_major_update.py --evaluate-service
25
+ python scripts/benchmark_major_update.py # runs both evaluate modes
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import argparse
31
+ import sys
32
+ from dataclasses import dataclass
33
+ from datetime import datetime, timezone
34
+ from pathlib import Path
35
+ from typing import Literal
36
+
37
+ import httpx
38
+
39
+ # ── import project service ────────────────────────────────────────────────────
40
+ sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
41
+ from app.services.update_detection_service import UpdateDetectionService # noqa: E402
42
+
43
+ STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"
44
+
45
# ── benchmark games ───────────────────────────────────────────────────────────
# (display name, Steam appid) pairs exercised by the benchmark.
GAMES: list[tuple[str, str]] = [
    ("Going Medieval", "1029780"),
    ("Timberborn", "1062090"),
    ("Hades II", "1145350"),
    ("Against the Storm", "1336490"),
    ("Valheim", "892970"),
    ("Manor Lords", "1363080"),
    ("Project Zomboid", "108600"),
    ("Dwarf Fortress", "975370"),
    ("Helldivers 2", "553850"),
    ("Deep Rock Galactic", "548430"),
    ("Lethal Company", "1966720"),
    ("Factorio", "427520"),
    ("Satisfactory", "526870"),
]
61
+
62
# ── ground truth structures ───────────────────────────────────────────────────

@dataclass
class ItemCase:
    """Per-item ground truth: is this specific event major?"""
    game_name: str
    appid: str
    gid: str
    title: str  # for display
    expected: Literal["major", "not_major", "ambiguous"]
    reasoning: str
73
+
74
+
75
@dataclass
class ServiceCase:
    """Per-game ground truth: what should the production code do?"""
    game_name: str
    appid: str
    expected_major: bool | None  # True / False / None = ambiguous
    reasoning: str
82
+
83
+
84
# ── item-level ground truth ───────────────────────────────────────────────────
# Populated from --discover run on 2026-03-19.
# Entries are positional ItemCase(game_name, appid, gid, title, expected, reasoning).
ITEM_CASES: list[ItemCase] = [
    # ── Going Medieval ────────────────────────────────────────────────────────
    ItemCase(
        "Going Medieval", "1029780", "1826992588604105",
        "Going Medieval is out now in 1.0!",
        "major",
        "1.0 full release out of Early Access — unambiguously major. "
        "Phase 1: RELEASE_PHRASE_RE matches 'is out now' → update-related. "
        "ONE_ZERO_RE matches '1.0' → major.",
    ),
    ItemCase(
        "Going Medieval", "1029780", "1827626365751261",
        "Experimental Branch Patch (1.0.48)",
        "not_major",
        "Experimental branch incremental patch. Three-segment version (1.0.48) "
        "excluded by VERSION_RE. BRANCH_RE blocks major classification.",
    ),
    ItemCase(
        "Going Medieval", "1029780", "1827626365750723",
        "Patch Notes (1.0.47)",
        "not_major",
        "Incremental stable patch, three-segment version. not_major is correct.",
    ),
    # ── Timberborn ────────────────────────────────────────────────────────────
    ItemCase(
        "Timberborn", "1062090", "1826992588592887",
        "Timberborn 1.0 is live!",
        "major",
        "1.0 full release out of Early Access — unambiguously major. "
        "Phase 1: RELEASE_PHRASE_RE matches 'is live' → update-related. "
        "ONE_ZERO_RE matches '1.0' → major.",
    ),
    ItemCase(
        "Timberborn", "1062090", "1826992588603124",
        "Patch notes 2026-03-17 (experimental)",
        "not_major",
        "Experimental branch date-based patch notes. No version number. not_major is correct.",
    ),
    # ── Hades II ──────────────────────────────────────────────────────────────
    ItemCase(
        "Hades II", "1145350", "1816215235360707",
        "Hades II v1.0 Hotfix 3",
        "not_major",
        "A bugfix hotfix on top of the v1.0 launch — not a content update. "
        "Phase 1: HOTFIX_RE blocks major classification. Correct: not_major.",
    ),
    ItemCase(
        "Hades II", "1145350", "1811772772516846",
        "Hades II v1.0 Hotfix 2",
        "not_major",
        "Same pattern: HOTFIX_RE blocks 'v1.0 Hotfix N' from being classified as major.",
    ),
    ItemCase(
        "Hades II", "1145350", "1811772772248738",
        "Hades II v1.0 Is Now Available!",
        "major",
        "v1.0 full launch — unambiguously major. "
        "Phase 1: RELEASE_PHRASE_RE matches 'Is Now Available' → update-related. "
        "No hotfix/branch blocker. VERSION_RE matches 'v1.0' → major.",
    ),
    # ── Against the Storm ─────────────────────────────────────────────────────
    ItemCase(
        "Against the Storm", "1336490", "1818752592135840",
        "Demo Update 1.9.6",
        "not_major",
        "Demo game update, three-segment version 1.9.6. "
        "Service correctly classifies as not_major.",
    ),
    ItemCase(
        "Against the Storm", "1336490", "1816849002010836",
        "Brineworks Update (1.9) available!",
        "major",
        "Named major content update with version 1.9. "
        "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
        "→ update-related. VERSION_RE → major.",
    ),
    # ── Valheim ───────────────────────────────────────────────────────────────
    ItemCase(
        "Valheim", "892970", "1825093633184197",
        "Patch 0.221.12",
        "not_major",
        "Three-segment maintenance patch. Correctly classified as not_major.",
    ),
    ItemCase(
        "Valheim", "892970", "1809869179994587",
        "Patch 0.221.4 (Public Test)",
        "not_major",
        "Public test branch three-segment patch. Correctly classified as not_major.",
    ),
    # ── Manor Lords ───────────────────────────────────────────────────────────
    ItemCase(
        "Manor Lords", "1363080", "1827626365750540",
        "Major Update #6: Battlefield Changes, New Map, and Family Based Progression",
        "major",
        "Developer-declared major content drop. "
        "Phase 1: CONTENT_UPDATE_RE matches 'Major Update' → update-related and major.",
    ),
    ItemCase(
        "Manor Lords", "1363080", "1826992588603500",
        "New BETA version is available for testing (0.8.065)",
        "not_major",
        "Beta/testing build announcement, not a production major update. "
        "Current heuristic misses it entirely, which is acceptable for this benchmark case.",
    ),
    # ── Project Zomboid ───────────────────────────────────────────────────────
    ItemCase(
        "Project Zomboid", "108600", "1826992588590120",
        "42.15.2 UNSTABLE HOTFIX Released",
        "not_major",
        "Unstable-branch hotfix. patchnotes tag makes it update-related, "
        "but HOTFIX_RE correctly blocks major classification.",
    ),
    ItemCase(
        "Project Zomboid", "108600", "1826362059930323",
        "Build 42.15.0 Unstable Released",
        "not_major",
        "Unstable build release, not a production major update. "
        "Current heuristic does not classify it as update-related because the three-segment "
        "build number fails VERSION_RE.",
    ),
    # ── Dwarf Fortress ────────────────────────────────────────────────────────
    ItemCase(
        "Dwarf Fortress", "975370", "1826362059918689",
        "Food fixes, AMA, community spotlight and more! Dwarf Fortress Patch 53.11",
        "not_major",
        "Maintenance patch with Dwarf Fortress' two-segment numbering scheme. "
        "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fixes' "
        "→ maintenance blocker fires before VERSION_RE → not_major.",
    ),
    ItemCase(
        "Dwarf Fortress", "975370", "1821288646585998",
        "Aquatic portraits, Naked dwarf fix and more Dwarf Fortress Patch 53.10",
        "not_major",
        "Another maintenance patch under the same numbering scheme. "
        "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fix' "
        "→ maintenance blocker fires → not_major.",
    ),
    # ── Helldivers 2 ──────────────────────────────────────────────────────────
    ItemCase(
        "Helldivers 2", "553850", "1826992588603352",
        "Machinery of Oppression: 6.1.0",
        "major",
        "Named content drop with new missions/enemies. This should count as a major update. "
        "Useful to test whether named major drops with three-segment versions are still found.",
    ),
    ItemCase(
        "Helldivers 2", "553850", "1826992588603981",
        "Revealing our Machinery of Oppression Content Roadmap!",
        "not_major",
        "Roadmap/announcement post, not the update itself. Should not be treated as major.",
    ),
    # ── Deep Rock Galactic ────────────────────────────────────────────────────
    ItemCase(
        "Deep Rock Galactic", "548430", "1825727806720055",
        "'Eight Years in Orbit' Anniversary Event is live now!",
        "not_major",
        "Live event announcement, not a game patch. "
        "Phase 2: EVENT_FESTIVAL_RE matches 'anniversary event'; no 'update'/'patch' in title "
        "→ UPDATE_OR_PATCH_RE guard fails → event blocker fires → not_major.",
    ),
    ItemCase(
        "Deep Rock Galactic", "548430", "1824644522847377",
        "Lunar Festival 2026 is now live!",
        "not_major",
        "Seasonal event announcement, not a major patch/update. "
        "Phase 2: EVENT_FESTIVAL_RE matches 'festival'; no 'update'/'patch' → event blocker fires → not_major.",
    ),
    # ── Lethal Company ────────────────────────────────────────────────────────
    ItemCase(
        "Lethal Company", "1966720", "1800991756395986",
        "V70 - The Incubating Update",
        "major",
        "Named major content update. "
        "Phase 2: NAMED_VERSION_RE matches 'V70'; UPDATE_WORD_RE matches 'Update' "
        "→ condition F makes it update-related; named version positive signal → major.",
    ),
    ItemCase(
        "Lethal Company", "1966720", "1801617199407807",
        "V72 Bug fix patch",
        "not_major",
        "Small bug-fix patch. patchnotes tag makes it update-related. "
        "Phase 2: PATCH_WORD_RE matches 'patch'; MAINT_LANGUAGE_RE matches 'bug fix' "
        "→ maintenance blocker fires → not_major.",
    ),
    # ── Factorio ──────────────────────────────────────────────────────────────
    ItemCase(
        "Factorio", "427520", "1827626365752749",
        "Version 2.0.76 released as stable",
        "not_major",
        "Stable maintenance patch under a three-segment versioning scheme. "
        "Useful as a clean true negative.",
    ),
    # ── Satisfactory ──────────────────────────────────────────────────────────
    ItemCase(
        "Satisfactory", "526870", "1826992588604352",
        "Update 1.2 is out now on Experimental!",
        "not_major",
        "Experimental-branch release, not a production major update. "
        "Phase 2: extended BRANCH_RE matches 'on Experimental' → branch blocker fires → not_major.",
    ),
    ItemCase(
        "Satisfactory", "526870", "1825093633185794",
        "Experimental Hotfix v1.1.3.1",
        "not_major",
        "Experimental hotfix on a three-segment version. Correct behavior is not_major.",
    ),
]
391
+
392
# ── service-level ground truth ────────────────────────────────────────────────
# What SHOULD the production code do for this game given the current news window?
# Populated from --discover run on 2026-03-19.
# Phase 1 semantics: verdict based on is_major (major_date is not None), not on selected item title.
# NOTE(review): these expectations are snapshots of a live Steam news feed — as
# each game's 20-item window rolls forward, individual cases may go stale and
# need re-grounding via another --discover run.
SERVICE_CASES: list[ServiceCase] = [
    ServiceCase(
        game_name="Going Medieval",
        appid="1029780",
        expected_major=True,
        reasoning=(
            "Game released 1.0 on 2026-03-17. Phase 1: 'is out now in 1.0!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Timberborn",
        appid="1062090",
        expected_major=True,
        reasoning=(
            "Game reached 1.0 on 2026-03-12. Phase 1: '1.0 is live!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Hades II",
        appid="1145350",
        expected_major=True,
        reasoning=(
            "Game launched v1.0 on 2025-09-25. Phase 1: 'v1.0 Is Now Available!' matches "
            "RELEASE_PHRASE_RE → update-related (developer feed). VERSION_RE matches 'v1.0' → major. "
            "Subsequent hotfixes (v1.0 Hotfix 2, 3) are correctly blocked by HOTFIX_RE. "
            "major_date = v1.0 launch date, latest_update_date = most recent hotfix date. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Against the Storm",
        appid="1336490",
        expected_major=True,
        reasoning=(
            "'Brineworks Update (1.9) available!' is a named major content update. "
            "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
            "→ update-related (developer feed). VERSION_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Valheim",
        appid="892970",
        expected_major=False,
        reasoning=(
            "Top items are three-segment maintenance patches. "
            "Correctly classified as not_major. TN."
        ),
    ),
    ServiceCase(
        game_name="Manor Lords",
        appid="1363080",
        expected_major=True,
        reasoning=(
            "Current window contains a clearly labeled 'Major Update #6' post. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Project Zomboid",
        appid="108600",
        expected_major=False,
        reasoning=(
            # NOTE(review): wording in this ground-truth string reads "These should
            # update activity" — presumably "should count as update activity".
            "Current window is dominated by unstable builds and hotfixes. "
            "These should update activity, but should not count as major releases."
        ),
    ),
    ServiceCase(
        game_name="Dwarf Fortress",
        appid="975370",
        expected_major=False,
        reasoning=(
            "Current window contains only maintenance patches (53.11/53.10/53.09 plus hotfixes). "
            "Phase 2: maintenance blocker (patch + fix language) correctly blocks all of them → no major_date."
        ),
    ),
    ServiceCase(
        game_name="Helldivers 2",
        appid="553850",
        expected_major=True,
        reasoning=(
            "Current window contains 'Machinery of Oppression: 6.1.0', a named content update. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Lethal Company",
        appid="1966720",
        expected_major=True,
        reasoning=(
            "Current window contains 'V70 - The Incubating Update', a named major content drop, "
            "plus newer bug-fix patches. Phase 2: NAMED_VERSION_RE + UPDATE_WORD_RE detects V70 → major_date set."
        ),
    ),
    ServiceCase(
        game_name="Factorio",
        appid="427520",
        expected_major=False,
        reasoning=(
            "Current window contains only three-segment stable maintenance releases (2.0.x). "
            "Expected: not_major."
        ),
    ),
    ServiceCase(
        game_name="Satisfactory",
        appid="526870",
        expected_major=False,
        reasoning=(
            "Current window contains an experimental 1.2 rollout and experimental hotfixes. "
            "Phase 2: extended BRANCH_RE ('on Experimental') blocks the 1.2 rollout → no major_date."
        ),
    ),
]
513
+
514
+
515
+ # ── helpers ───────────────────────────────────────────────────────────────────
516
+
517
+ def _fmt_ts(ts: int | None) -> str:
518
+ if not ts:
519
+ return "—"
520
+ try:
521
+ return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
522
+ except (OSError, ValueError):
523
+ return "—"
524
+
525
+
526
+ def _fmt_dt(dt: datetime | None) -> str:
527
+ if dt is None:
528
+ return "—"
529
+ return dt.strftime("%Y-%m-%d")
530
+
531
+
532
+ def _trunc(s: str, n: int) -> str:
533
+ return (s[:n] + "…") if len(s) > n else s
534
+
535
+
536
def _fetch_news(client: httpx.Client, appid: str, count: int) -> list[dict]:
    """Fetch up to *count* news items for *appid* from the Steam news API.

    Best-effort: any HTTP error or transport failure is reported to stderr
    and an empty list is returned, so one bad game does not abort a full run.
    """
    try:
        resp = client.get(
            STEAM_NEWS_API_URL,
            # maxlength=0 asks Steam for the full, untruncated body.
            params={"appid": appid, "count": count, "maxlength": 0},
        )
        if resp.status_code != 200:
            print(f" [WARN] HTTP {resp.status_code} for appid {appid}", file=sys.stderr)
            return []
        data = resp.json()
        # Steam nests items under appnews.newsitems; either level may be absent.
        return data.get("appnews", {}).get("newsitems", []) or []
    except Exception as exc:
        print(f" [WARN] Request failed for appid {appid}: {exc}", file=sys.stderr)
        return []
550
+
551
+
552
+ # ── Mode 1: discover ──────────────────────────────────────────────────────────
553
+
554
def run_discover(count: int) -> None:
    """Fetch news for every game in GAMES and print per-item classification.

    For each news item, shows whether the production heuristics
    (``_is_update_related`` / ``_is_major_update``) fire, then the
    service-level aggregate (``latest_update_date`` / ``major_date``)
    for the whole window — used to populate the ground-truth case lists.

    Args:
        count: News items to request per game (production uses 20).
    """
    if count != 20:
        print(f"NOTE: count={count} — beyond production window (prod uses count=20)\n")

    # Fixed column widths for the per-item report table.
    col_idx = 4
    col_gid = 20
    col_date = 10
    col_title = 40
    col_fl = 16
    col_tags = 24
    col_ur = 9
    col_maj = 7

    header = (
        f"{'#':<{col_idx}} "
        f"{'gid':<{col_gid}} "
        f"{'date':<{col_date}} "
        f"{'title':<{col_title}} "
        f"{'feedlabel':<{col_fl}} "
        f"{'tags':<{col_tags}} "
        f"{'upd_rel?':<{col_ur}} "
        f"{'major?':<{col_maj}}"
    )
    sep = "-" * len(header)

    with httpx.Client(timeout=30.0) as client:
        for game_name, appid in GAMES:
            print(f"\n{'=' * len(header)}")
            print(f" {game_name} (appid={appid})")
            print(f"{'=' * len(header)}")
            print(header)
            print(sep)

            items = _fetch_news(client, appid, count)
            if not items:
                print(" (no items returned)")
                continue

            for idx, item in enumerate(items, start=1):
                gid = str(item.get("gid") or "")[:col_gid]
                date_str = _fmt_ts(item.get("date"))
                title = _trunc(item.get("title", ""), col_title)
                feedlabel = _trunc(item.get("feedlabel") or "", col_fl)
                tags = _trunc(str(item.get("tags") or ""), col_tags)

                # Per-item verdicts from the production heuristics.
                is_ur = UpdateDetectionService._is_update_related(item)
                is_maj = UpdateDetectionService._is_major_update(item)

                ur_str = "Yes" if is_ur else "No"
                maj_str = "Yes" if is_maj else "No"

                print(
                    f"{idx:<{col_idx}} "
                    f"{gid:<{col_gid}} "
                    f"{date_str:<{col_date}} "
                    f"{title:<{col_title}} "
                    f"{feedlabel:<{col_fl}} "
                    f"{tags:<{col_tags}} "
                    f"{ur_str:<{col_ur}} "
                    f"{maj_str:<{col_maj}}"
                )

            # Aggregate over the whole window, exactly as the service does.
            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items)
            print(f"\n >> latest_update_date: {_fmt_dt(latest_update_date)} | major_date: {_fmt_dt(major_date)}")
            verdict = "MAJOR" if major_date is not None else "not_major"
            print(f" >> Service result: {verdict}")
620
+
621
+
622
+ # ── Mode 2: evaluate (item-level) ─────────────────────────────────────────────
623
+
624
def run_evaluate() -> None:
    """Item-level evaluation: classify each ITEM_CASES entry and score it.

    Fetches the current news window for every appid referenced by ITEM_CASES,
    looks each case up by gid, runs the production per-item heuristics, and
    prints a confusion-matrix style report (TP/TN/FP/FN + precision/recall).
    Cases whose gid has rolled out of the 20-item window are reported as
    NOT FOUND and excluded from the metrics.
    """
    if not ITEM_CASES:
        print("[evaluate] No item-level ground truth defined yet.")
        print(" Run --discover first, then populate ITEM_CASES in this script.")
        return

    # Build lookup: appid → {gid → item}
    gid_index: dict[str, dict[str, dict]] = {}
    needed_appids = {case.appid for case in ITEM_CASES}

    with httpx.Client(timeout=30.0) as client:
        for appid in needed_appids:
            items = _fetch_news(client, appid, count=20)
            gid_index[appid] = {str(item.get("gid", "")): item for item in items}

    tp = tn = fp = fn = amb = not_found = 0
    rows: list[tuple] = []  # one row per case, in ITEM_CASES order

    for case in ITEM_CASES:
        item = gid_index.get(case.appid, {}).get(case.gid)
        if item is None:
            not_found += 1
            # Truncate the title like the found-row branch so columns align.
            rows.append((case.game_name, _trunc(case.title, 30), "—", "—", "—", case.expected, "NOT FOUND"))
            continue

        is_ur = UpdateDetectionService._is_update_related(item)
        is_maj = UpdateDetectionService._is_major_update(item)

        predicted = "major" if (is_ur and is_maj) else "not_major"
        expected = case.expected

        if expected == "ambiguous":
            verdict = "ambiguous"
            amb += 1
        elif predicted == expected:
            verdict = "PASS"
            if expected == "major":
                tp += 1
            else:
                tn += 1
        elif predicted == "major":  # expected == "not_major"
            verdict = "FAIL (FP)"
            fp += 1
        else:
            verdict = "FAIL (FN)"
            fn += 1

        rows.append((
            case.game_name,
            _trunc(case.title, 30),
            _fmt_ts(item.get("date")),
            str(item.get("tags", ""))[:20],
            item.get("feedlabel", "")[:16],
            expected,
            "Yes" if is_ur else "No",
            "Yes" if is_maj else "No",
            verdict,
        ))

    # Print report
    print("\n" + "=" * 110)
    print("REPORT A — Item-level classification")
    print("=" * 110)
    hdr = f"{'Game':<18} {'Title':<30} {'Date':<10} {'Tags':<20} {'FeedLabel':<16} {'Expected':<10} {'UpdRel?':<8} {'Major?':<7} Verdict"
    print(hdr)
    print("-" * 110)
    for row in rows:
        if len(row) == 7:  # NOT FOUND rows carry no classification columns
            print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {'—':<20} {'—':<16} {row[5]:<10} {'—':<8} {'—':<7} {row[6]}")
        else:
            print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {row[3]:<20} {row[4]:<16} {row[5]:<10} {row[6]:<8} {row[7]:<7} {row[8]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total cases : {len(ITEM_CASES)} | not found: {not_found} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # Pair each case with its own row directly. The previous implementation
    # re-located rows via ITEM_CASES.index(c) (O(n²), wrong for duplicate
    # cases) and detected failures with a substring match on str(row).
    fps = [c for c, row in zip(ITEM_CASES, rows) if row[-1] == "FAIL (FP)"]
    fns = [c for c, row in zip(ITEM_CASES, rows) if row[-1] == "FAIL (FN)"]
    if fps:
        print("\nFalse Positives:")
        for c in fps:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
    if fns:
        print("\nFalse Negatives:")
        for c in fns:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
717
+
718
+
719
+ # ── Mode 3: evaluate-service (end-to-end) ─────────────────────────────────────
720
+
721
def run_evaluate_service() -> None:
    """Service-level (end-to-end) evaluation against SERVICE_CASES.

    For each game, fetches the live 20-item news window, runs the production
    aggregation (``_collect_update_candidates``), and compares the resulting
    ``major_date is not None`` verdict against the expected boolean.
    Prints a table plus TP/TN/FP/FN summary and per-failure reasoning.
    """
    if not SERVICE_CASES:
        print("[evaluate-service] No service-level ground truth defined yet.")
        print(" Run --discover first, then populate SERVICE_CASES in this script.")
        return

    tp = tn = fp = fn = amb = 0
    rows: list[tuple] = []  # one row per case, in SERVICE_CASES order

    with httpx.Client(timeout=30.0) as client:
        for case in SERVICE_CASES:
            items = _fetch_news(client, case.appid, count=20)
            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items)
            # Phase 1 semantics: the verdict is simply "was a major date found?"
            is_maj = major_date is not None

            latest_str = _fmt_dt(latest_update_date)
            major_str = _fmt_dt(major_date)
            maj_label = "Yes" if is_maj else "No"

            if case.expected_major is None:
                verdict = "ambiguous"
                amb += 1
            elif is_maj == case.expected_major:
                verdict = "PASS"
                if case.expected_major:
                    tp += 1
                else:
                    tn += 1
            else:
                if is_maj and not case.expected_major:
                    verdict = "FAIL (FP)"
                    fp += 1
                else:
                    verdict = "FAIL (FN)"
                    fn += 1

            rows.append((
                case.game_name,
                latest_str,
                major_str,
                maj_label,
                # None (ambiguous) is falsy, so the outer conditional falls
                # through to the nested one, which renders "None".
                "True" if case.expected_major else ("None" if case.expected_major is None else "False"),
                verdict,
            ))

    print("\n" + "=" * 100)
    print("REPORT B — Service-level (end-to-end)")
    print("=" * 100)
    hdr = f"{'Game':<18} {'LatestUpdate':<13} {'MajorDate':<11} {'Major?':<7} {'Expected':<9} Verdict"
    print(hdr)
    print("-" * 100)
    for row in rows:
        print(f"{row[0]:<18} {row[1]:<13} {row[2]:<11} {row[3]:<7} {row[4]:<9} {row[5]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total games : {len(SERVICE_CASES)} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # Echo the stored reasoning for every failing case (rows align 1:1 with cases).
    for idx, case in enumerate(SERVICE_CASES):
        verdict = rows[idx][5]
        if verdict.startswith("FAIL"):
            print(f"\n [{case.game_name}] {verdict} — {case.reasoning}")
789
+
790
+
791
+ # ── main ──────────────────────────────────────────────────────────────────────
792
+
793
+ def _parse_args() -> argparse.Namespace:
794
+ p = argparse.ArgumentParser(
795
+ description="Benchmark the major update detection heuristic against real Steam games."
796
+ )
797
+ p.add_argument(
798
+ "--discover",
799
+ action="store_true",
800
+ help="Fetch news for all games and display per-item classification details.",
801
+ )
802
+ p.add_argument(
803
+ "--evaluate",
804
+ action="store_true",
805
+ help="Run item-level evaluation against ITEM_CASES ground truth.",
806
+ )
807
+ p.add_argument(
808
+ "--evaluate-service",
809
+ action="store_true",
810
+ dest="evaluate_service",
811
+ help="Run service-level end-to-end evaluation against SERVICE_CASES ground truth.",
812
+ )
813
+ p.add_argument(
814
+ "--count",
815
+ type=int,
816
+ default=20,
817
+ help="Number of news items to fetch (default: 20, matches production). "
818
+ "Values > 20 are beyond the production window.",
819
+ )
820
+ return p.parse_args()
821
+
822
+
823
def main() -> int:
    """CLI entry point: run the selected benchmark mode(s).

    When no mode flag is given, both evaluation reports are produced
    (discover stays opt-in). Always returns 0.
    """
    args = _parse_args()

    want_discover = args.discover
    want_item_eval = args.evaluate
    want_svc_eval = args.evaluate_service

    # No explicit mode selected → default to both evaluation reports.
    if not (want_discover or want_item_eval or want_svc_eval):
        want_item_eval = want_svc_eval = True

    if want_discover:
        run_discover(count=args.count)
    if want_item_eval:
        run_evaluate()
    if want_svc_eval:
        run_evaluate_service()

    return 0
845
+
846
+
847
+ if __name__ == "__main__":
848
+ raise SystemExit(main())
scripts/check_db_stats.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import os
from motor.motor_asyncio import AsyncIOMotorClient
from dotenv import load_dotenv

# Load .env from the repository root or from the backend/ directory.
load_dotenv(".env")
load_dotenv("backend/.env")
9
+
10
async def check_stats():
    """Print coverage stats for Chinese names in the ``games`` collection.

    Reads MONGODB_URL / MONGODB_DB_NAME from the environment (populated from
    .env) and reports how many game documents carry a usable ``name_cn``.
    Connection/query errors are printed, not raised.
    """
    # Pull connection parameters loaded from .env
    mongo_url = os.getenv("MONGODB_URL")
    db_name = os.getenv("MONGODB_DB_NAME", "sentimentSummary")

    if not mongo_url:
        print("ERROR: MONGODB_URL not found in .env file!")
        return

    # Show only the part after '@' (host) so credentials are not echoed.
    print(f"Connecting to MongoDB: {mongo_url.split('@')[-1]}...")

    client = None
    try:
        client = AsyncIOMotorClient(mongo_url)
        db = client[db_name]
        collection = db["games"]

        total = await collection.count_documents({})
        # "Usable" name_cn: present, not None, and not a junk placeholder.
        with_cn = await collection.count_documents({
            "name_cn": {"$exists": True, "$ne": None, "$nin": ["", "null", "None"]}
        })

        print("\n" + "=" * 30)
        print("DATABASE STATS")
        print("=" * 30)
        print(f"Total games: {total}")
        print(f"With Chinese: {with_cn}")

        if total > 0:
            percentage = (with_cn / total) * 100
            print(f"Coverage: {percentage:.2f}%")
        # Closing separator now printed even when the collection is empty
        # (previously it was skipped for total == 0).
        print("=" * 30)
    except Exception as e:
        print(f"ERROR: Could not connect or query DB: {e}")
    finally:
        # Always release the client, even when a query fails mid-way
        # (previously close() was skipped on any exception).
        if client is not None:
            client.close()
45
+
46
+ if __name__ == "__main__":
47
+ asyncio.run(check_stats())
scripts/expand_keywords/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword expansion toolkit using FastText.
3
+
4
+ This package provides tools to:
5
+ 1. Fetch reviews from Steam games
6
+ 2. Train FastText models on review corpus
7
+ 3. Expand existing keyword dictionary with semantically similar words
8
+ """
scripts/expand_keywords/__main__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Allow running as: python -m scripts.expand_keywords"""
2
+
3
+ from .main import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
scripts/expand_keywords/config.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration for keyword expansion: game list and settings.
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ # Base directories
8
+ BASE_DIR = Path(__file__).parent
9
+ DATA_DIR = BASE_DIR / "data"
10
+ REVIEWS_DIR = DATA_DIR / "reviews"
11
+ MODELS_DIR = DATA_DIR / "models"
12
+ OUTPUT_DIR = DATA_DIR / "output"
13
+
14
+ # Ensure directories exist
15
+ for dir_path in [REVIEWS_DIR, MODELS_DIR, OUTPUT_DIR]:
16
+ dir_path.mkdir(parents=True, exist_ok=True)
17
+
18
+ # Game list: (app_id, name, genre)
19
+ # Selected for variety across genres to get diverse vocabulary
20
+ GAMES: list[tuple[str, str, str]] = [
21
+ # Action RPG
22
+ ("1245620", "Elden Ring", "action_rpg"),
23
+ ("374320", "Dark Souls III", "action_rpg"),
24
+ # CRPG
25
+ ("1086940", "Baldur's Gate 3", "crpg"),
26
+ ("435150", "Divinity: Original Sin 2", "crpg"),
27
+ ("1184370", "Pathfinder: Wrath of the Righteous", "crpg"),
28
+ # Open World RPG
29
+ ("292030", "The Witcher 3", "open_world_rpg"),
30
+ ("489830", "Skyrim Special Edition", "open_world_rpg"),
31
+ ("1091500", "Cyberpunk 2077", "open_world_rpg"),
32
+ # FPS
33
+ ("730", "Counter-Strike 2", "fps_competitive"),
34
+ ("782330", "DOOM Eternal", "fps_single"),
35
+ ("1237970", "Titanfall 2", "fps_single"),
36
+ # Survival
37
+ ("892970", "Valheim", "survival"),
38
+ ("252490", "Rust", "survival"),
39
+ ("264710", "Subnautica", "survival"),
40
+ ("242760", "The Forest", "survival"),
41
+ # Strategy
42
+ ("289070", "Civilization VI", "strategy"),
43
+ ("1142710", "Total War: Warhammer III", "strategy"),
44
+ ("1466860", "Age of Empires IV", "strategy"),
45
+ # Roguelike
46
+ ("1145360", "Hades", "roguelike"),
47
+ ("588650", "Dead Cells", "roguelike"),
48
+ ("646570", "Slay the Spire", "roguelike"),
49
+ # Metroidvania
50
+ ("367520", "Hollow Knight", "metroidvania"),
51
+ ("1057090", "Ori and the Will of the Wisps", "metroidvania"),
52
+ # Simulation
53
+ ("255710", "Cities: Skylines", "simulation"),
54
+ ("427520", "Factorio", "simulation"),
55
+ ("526870", "Satisfactory", "simulation"),
56
+ # Horror
57
+ ("1196590", "Resident Evil Village", "horror"),
58
+ ("739630", "Phasmophobia", "horror"),
59
+ ("381210", "Dead by Daylight", "horror"),
60
+ # Live Service
61
+ ("1085660", "Destiny 2", "live_service"),
62
+ ("230410", "Warframe", "live_service"),
63
+ ("238960", "Path of Exile", "live_service"),
64
+ # Racing
65
+ ("1551360", "Forza Horizon 5", "racing"),
66
+ # Story Driven
67
+ ("1174180", "Red Dead Redemption 2", "story_driven"),
68
+ # Casual
69
+ ("413150", "Stardew Valley", "casual"),
70
+ ("105600", "Terraria", "casual"),
71
+ ]
72
+
73
+ # Fetching settings
74
+ SETTINGS = {
75
+ # Review fetching
76
+ "reviews_per_game": 2700, # ~80k total across ~30 games
77
+ "batch_size": 100, # Steam API batch size
78
+ "sleep_between_batches": 1.5, # Seconds between API calls
79
+ "sleep_between_games": 5.0, # Longer pause between games
80
+ "min_review_length": 50, # Filter short reviews (chars)
81
+ "max_retries": 3, # Retry count on failure
82
+ "retry_base_delay": 10.0, # Base delay for exponential backoff
83
+
84
+ # Preprocessing
85
+ "phrase_min_count": 10, # Min occurrences for phrase detection
86
+ "phrase_threshold": 10.0, # Phrase detection threshold
87
+
88
+ # FastText training
89
+ "fasttext_vector_size": 150,
90
+ "fasttext_window": 5,
91
+ "fasttext_min_count": 5,
92
+ "fasttext_epochs": 10,
93
+ "fasttext_workers": 4,
94
+
95
+ # Expansion
96
+ "similarity_threshold": 0.55,
97
+ "max_suggestions_per_seed": 20,
98
+ "min_frequency": 10, # Min word frequency in corpus
99
+ "auto_approve_threshold": 0.70, # Score threshold for auto-approval
100
+ }
101
+
102
+ # Steam API endpoint
103
+ STEAM_REVIEWS_API = "https://store.steampowered.com/appreviews/{app_id}"
104
+
105
+ # Steam language setting for reviews
106
+ STEAM_REVIEW_LANGUAGE = "schinese" # schinese, english, tchinese, etc.
scripts/expand_keywords/expander.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword dictionary expansion with exclusive category assignment.
3
+
4
+ Key principle: Each word can only belong to ONE category.
5
+ This prevents cross-contamination where a word like "unplayable"
6
+ might be counted in both Bugs and Performance categories.
7
+
8
+ Algorithm:
9
+ 1. For each category: find candidate words similar to seed keywords
10
+ 2. Collect ALL candidates in a global pool
11
+ 3. Assign each word to the category with highest score
12
+ 4. Filter by similarity threshold and frequency
13
+ """
14
+
15
+ import json
16
+ import logging
17
+ import math
18
+ from collections import defaultdict
19
+ from dataclasses import dataclass, field
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+
23
+ from gensim.models import FastText
24
+
25
+ from .config import OUTPUT_DIR, SETTINGS
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
@dataclass
class Candidate:
    """A candidate word for dictionary expansion.

    Carries the word itself, its best similarity to any seed keyword,
    its corpus frequency, and the seed(s) that surfaced it.
    """

    word: str
    similarity: float
    frequency: int
    source_seeds: list[str] = field(default_factory=list)

    @property
    def score(self) -> float:
        """
        Combined score from similarity and frequency.

        Formula: 0.7 * similarity + 0.3 * normalized_log_frequency
        Frequency factor normalized to ~0-1 range.
        """
        log_freq = math.log10(max(self.frequency, 1) + 1)
        return self.similarity * 0.7 + (log_freq / 5) * 0.3

    def to_dict(self) -> dict:
        """Serialize for JSON export; underscores in the word become spaces."""
        return {
            "word": self.word.replace("_", " "),
            "similarity": round(self.similarity, 3),
            "frequency": self.frequency,
            "score": round(self.score, 3),
            "source_seeds": self.source_seeds,
        }
58
+
59
+
60
class KeywordExpander:
    """
    Expands keyword dictionary using trained FastText model.

    Uses exclusive category assignment to prevent words
    appearing in multiple categories.
    """

    def __init__(
        self,
        model: FastText,
        existing_keywords: dict[str, list[str]],
        word_frequencies: dict[str, int],
        similarity_threshold: float | None = None,
        max_suggestions_per_seed: int | None = None,
        min_frequency: int | None = None,
    ):
        """
        Initialize expander.

        Args:
            model: Trained FastText model
            existing_keywords: Current TOPIC_KEYWORDS dictionary
            word_frequencies: Word frequency counts from corpus
            similarity_threshold: Minimum similarity for candidates
            max_suggestions_per_seed: Max similar words per seed
            min_frequency: Minimum corpus frequency
        """
        self.model = model
        self.existing = existing_keywords
        self.word_freq = word_frequencies

        # NOTE(review): `or` makes explicit 0 / 0.0 fall back to SETTINGS —
        # presumably intentional since 0 thresholds are not useful; confirm.
        self.similarity_threshold = similarity_threshold or SETTINGS["similarity_threshold"]
        self.max_suggestions = max_suggestions_per_seed or SETTINGS["max_suggestions_per_seed"]
        self.min_frequency = min_frequency or SETTINGS["min_frequency"]

        # Build set of all existing words (normalized to lowercase,
        # with spaces replaced by underscores to match corpus tokens).
        self.existing_words: set[str] = set()
        for words in existing_keywords.values():
            for w in words:
                self.existing_words.add(w.lower().replace(" ", "_"))

        logger.info(f"Expander initialized with {len(self.existing_words)} existing keywords")

    def _find_candidates_for_category(
        self,
        category: str,
        seeds: list[str],
    ) -> dict[str, Candidate]:
        """
        Find candidate words for a single category.

        Returns dict[word -> Candidate] with best similarity per word.
        """
        candidates: dict[str, Candidate] = {}

        for seed in seeds:
            # Normalize seed (e.g., "frame rate" -> "frame_rate")
            seed_normalized = seed.lower().replace(" ", "_")

            # Skip if seed not in vocabulary
            if seed_normalized not in self.model.wv:
                continue

            # Get similar words
            try:
                similar = self.model.wv.most_similar(
                    seed_normalized,
                    topn=self.max_suggestions,
                )
            except KeyError:
                continue

            for word, similarity in similar:
                # Skip existing words
                if word in self.existing_words:
                    continue

                # Skip below threshold
                if similarity < self.similarity_threshold:
                    continue

                # Check frequency
                freq = self.word_freq.get(word, 0)
                if freq < self.min_frequency:
                    continue

                # Update or add candidate
                if word in candidates:
                    # Keep higher similarity
                    # NOTE(review): the seed is only appended when it beats the
                    # current best similarity — a later seed with a lower score
                    # is not recorded in source_seeds. Confirm this is intended.
                    if similarity > candidates[word].similarity:
                        candidates[word].similarity = similarity
                        candidates[word].source_seeds.append(seed)
                else:
                    candidates[word] = Candidate(
                        word=word,
                        similarity=similarity,
                        frequency=freq,
                        source_seeds=[seed],
                    )

        return candidates

    def expand_all_exclusive(self) -> dict[str, list[Candidate]]:
        """
        Expand all categories with exclusive assignment.

        Each word is assigned only to the category where it has
        the highest score.

        Returns:
            Dict mapping category -> list of Candidates (sorted by score)
        """
        logger.info("Starting exclusive expansion...")

        # Step 1: Collect candidates from all categories
        # Format: word -> [(category, Candidate), ...]
        all_candidates: dict[str, list[tuple[str, Candidate]]] = defaultdict(list)

        for category, seeds in self.existing.items():
            category_candidates = self._find_candidates_for_category(category, seeds)
            for word, candidate in category_candidates.items():
                all_candidates[word].append((category, candidate))

            logger.info(f"[{category}] Found {len(category_candidates)} raw candidates")

        # Step 2: Assign each word to category with highest score
        final_assignments: dict[str, list[Candidate]] = defaultdict(list)

        for word, category_candidates in all_candidates.items():
            # Find category with highest score
            best_category, best_candidate = max(
                category_candidates,
                key=lambda x: x[1].score,
            )
            final_assignments[best_category].append(best_candidate)

        # Step 3: Sort candidates in each category by score
        for category in final_assignments:
            final_assignments[category].sort(key=lambda c: c.score, reverse=True)

        # Log results
        total = sum(len(cands) for cands in final_assignments.values())
        logger.info(f"Exclusive assignment complete: {total} total candidates")

        for category, cands in sorted(final_assignments.items()):
            logger.info(f" {category}: {len(cands)} candidates")

        return dict(final_assignments)

    def export_candidates(
        self,
        path: Path | str | None = None,
        include_threshold_in_name: bool = False,
    ) -> Path:
        """
        Export candidates to JSON for manual review.

        Args:
            path: Output path (default: output/candidates.json)
            include_threshold_in_name: Add threshold to filename for comparison

        Returns:
            Path to exported file
        """
        if path:
            path = Path(path)
        elif include_threshold_in_name:
            path = OUTPUT_DIR / f"candidates_t{self.similarity_threshold:.2f}.json"
        else:
            path = OUTPUT_DIR / "candidates.json"

        # NOTE: recomputes the full expansion on every call (no caching).
        results = self.expand_all_exclusive()

        export_data = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "similarity_threshold": self.similarity_threshold,
                "min_frequency": self.min_frequency,
                "total_candidates": sum(len(c) for c in results.values()),
            },
            "categories": {},
        }

        for category, candidates in sorted(results.items()):
            export_data["categories"][category] = [c.to_dict() for c in candidates]

        with open(path, "w", encoding="utf-8") as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Exported candidates to {path}")
        return path

    def generate_keywords_py(
        self,
        output_path: Path | str | None = None,
        auto_approve_threshold: float | None = None,
    ) -> Path:
        """
        Generate new keywords.py with expanded dictionary.

        Words with score >= auto_approve_threshold are added directly.
        Words below threshold are added as comments for manual review.

        Args:
            output_path: Output path (default: output/keywords_expanded.py)
            auto_approve_threshold: Score threshold for auto-approval

        Returns:
            Path to generated file
        """
        output_path = Path(output_path) if output_path else OUTPUT_DIR / "keywords_expanded.py"
        auto_approve = auto_approve_threshold or SETTINGS["auto_approve_threshold"]

        # NOTE: recomputes the full expansion on every call (no caching).
        results = self.expand_all_exclusive()

        # Emit a syntactically valid Python module line by line.
        lines = [
            '"""',
            "Expanded keyword dictionary for game review topic detection.",
            f"Generated: {datetime.now().isoformat()}",
            f"Auto-approve threshold: {auto_approve}",
            '"""',
            "",
            "TOPIC_KEYWORDS = {",
        ]

        for category, seeds in self.existing.items():
            lines.append(f'    "{category}": [')

            # Existing keywords
            lines.append("        # Existing")
            for seed in seeds:
                lines.append(f'        "{seed}",')

            # New candidates
            candidates = results.get(category, [])
            if candidates:
                # Auto-approved
                auto_approved = [c for c in candidates if c.score >= auto_approve]
                if auto_approved:
                    lines.append(f"        # NEW (auto-approved, score >= {auto_approve})")
                    for c in auto_approved:
                        word_display = c.word.replace("_", " ")
                        lines.append(f'        "{word_display}",  # score={c.score:.2f}')

                # Candidates requiring review
                review_needed = [c for c in candidates if c.score < auto_approve]
                if review_needed:
                    lines.append(f"        # CANDIDATES (score < {auto_approve}, require review)")
                    for c in review_needed:
                        word_display = c.word.replace("_", " ")
                        lines.append(f'        # "{word_display}",  # score={c.score:.2f}')

            lines.append("    ],")
            lines.append("")

        lines.append("}")
        lines.append("")

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))

        logger.info(f"Generated keywords file at {output_path}")
        return output_path

    def get_expansion_stats(self) -> dict:
        """Get statistics about the expansion."""
        # NOTE: recomputes the full expansion on every call (no caching).
        results = self.expand_all_exclusive()
        auto_threshold = SETTINGS["auto_approve_threshold"]

        stats = {
            "total_candidates": 0,
            "auto_approved": 0,
            "needs_review": 0,
            "by_category": {},
        }

        for category, candidates in results.items():
            auto = sum(1 for c in candidates if c.score >= auto_threshold)
            review = len(candidates) - auto

            stats["by_category"][category] = {
                "total": len(candidates),
                "auto_approved": auto,
                "needs_review": review,
            }
            stats["total_candidates"] += len(candidates)
            stats["auto_approved"] += auto
            stats["needs_review"] += review

        return stats
scripts/expand_keywords/fetcher.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Review fetcher with rate limiting and progress tracking.
3
+
4
+ Downloads reviews from Steam API with:
5
+ - Cursor-based pagination
6
+ - Sleep between requests to respect rate limits
7
+ - Progress persistence (JSONL per game + progress.json)
8
+ - Resume capability
9
+ """
10
+
11
+ import asyncio
12
+ import json
13
+ import logging
14
+ from dataclasses import dataclass, field
15
+ from datetime import datetime
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import httpx
20
+
21
+ from .config import GAMES, REVIEWS_DIR, SETTINGS, STEAM_REVIEW_LANGUAGE, STEAM_REVIEWS_API
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
@dataclass
class FetchProgress:
    """Tracks review-download progress for a single game."""
    app_id: str
    name: str
    # Number of reviews we aim to collect for this game.
    target: int
    # Reviews collected so far.
    fetched: int = 0
    # Steam pagination cursor; "*" means "start from the beginning".
    cursor: str = "*"
    completed: bool = False
    # ISO timestamp string of the last progress write ("" = never).
    last_updated: str = ""

    def to_dict(self) -> dict:
        """Serialize to a plain dict for JSON persistence."""
        return {
            key: getattr(self, key)
            for key in (
                "app_id",
                "name",
                "target",
                "fetched",
                "cursor",
                "completed",
                "last_updated",
            )
        }

    @classmethod
    def from_dict(cls, data: dict) -> "FetchProgress":
        """Rebuild from a dict, tolerating missing optional fields."""
        optional = {"fetched": 0, "cursor": "*", "completed": False, "last_updated": ""}
        extras = {key: data.get(key, default) for key, default in optional.items()}
        return cls(data["app_id"], data["name"], data["target"], **extras)
59
+
60
+
61
@dataclass
class ReviewFetcher:
    """
    Fetches reviews from Steam with rate limiting.

    Features:
    - Async HTTP client with timeout
    - Exponential backoff on rate limiting
    - Progress persistence (resume capability)
    - JSONL output per game
    """

    # Per-request HTTP timeout in seconds.
    timeout: float = 30.0
    # Where per-game progress is persisted between runs.
    progress_file: Path = field(default_factory=lambda: REVIEWS_DIR / "progress.json")

    def __post_init__(self):
        # In-memory progress keyed by app_id, hydrated from disk when present.
        self._progress: dict[str, FetchProgress] = {}
        self._load_progress()

    def _load_progress(self) -> None:
        """Load progress from file if exists; a corrupt file resets progress."""
        if self.progress_file.exists():
            try:
                with open(self.progress_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                for app_id, progress_data in data.items():
                    self._progress[app_id] = FetchProgress.from_dict(progress_data)
                logger.info(f"Loaded progress for {len(self._progress)} games")
            except (json.JSONDecodeError, KeyError) as e:
                # Don't abort the run on a damaged progress file; start fresh.
                logger.warning(f"Failed to load progress: {e}")
                self._progress = {}

    def _save_progress(self) -> None:
        """Save progress to file (pretty-printed JSON keyed by app_id)."""
        data = {app_id: prog.to_dict() for app_id, prog in self._progress.items()}
        with open(self.progress_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def get_progress(self) -> dict[str, dict]:
        """Get current progress for all games as plain dicts."""
        return {app_id: prog.to_dict() for app_id, prog in self._progress.items()}

    def _get_reviews_file(self, app_id: str) -> Path:
        """Get path to reviews JSONL file for a game."""
        return REVIEWS_DIR / f"{app_id}.jsonl"

    def _append_reviews(self, app_id: str, reviews: list[str]) -> None:
        """Append reviews to the game's JSONL file, one JSON object per line."""
        reviews_file = self._get_reviews_file(app_id)
        with open(reviews_file, "a", encoding="utf-8") as f:
            for review in reviews:
                f.write(json.dumps({"text": review}, ensure_ascii=False) + "\n")

    def load_reviews(self, app_id: str) -> list[str]:
        """Load reviews from JSONL file; malformed lines are skipped."""
        reviews_file = self._get_reviews_file(app_id)
        if not reviews_file.exists():
            return []

        reviews = []
        with open(reviews_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                    reviews.append(data["text"])
                except (json.JSONDecodeError, KeyError):
                    continue
        return reviews

    def load_all_reviews(self) -> list[str]:
        """Load all reviews from all downloaded games (concatenated)."""
        all_reviews = []
        for app_id, _, _ in GAMES:
            reviews = self.load_reviews(app_id)
            all_reviews.extend(reviews)
        logger.info(f"Loaded {len(all_reviews)} total reviews")
        return all_reviews

    async def _fetch_batch(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        cursor: str,
        batch_size: int,
    ) -> tuple[list[str], str | None]:
        """Fetch a single batch of reviews.

        Returns (reviews, next_cursor). ([], None) signals an HTTP error or an
        unsuccessful API response.
        """
        url = STEAM_REVIEWS_API.format(app_id=app_id)
        params: dict[str, Any] = {
            "json": "1",
            "filter": "recent",  # "recent" has more reviews available than "all"
            "review_type": "all",
            "language": STEAM_REVIEW_LANGUAGE,
            "num_per_page": str(batch_size),
            "cursor": cursor,
            "purchase_type": "all",
        }

        try:
            response = await client.get(url, params=params)
            response.raise_for_status()
            data = response.json()
        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching reviews for {app_id}: {e}")
            return [], None

        if not data.get("success"):
            logger.warning(f"API returned success=false for {app_id}")
            return [], None

        reviews_data = data.get("reviews", [])
        min_length = SETTINGS["min_review_length"]

        # Keep only non-empty review texts at or above the configured minimum length.
        reviews = [
            review.get("review", "").strip()
            for review in reviews_data
            if review.get("review") and len(review.get("review", "").strip()) >= min_length
        ]

        new_cursor = data.get("cursor")
        return reviews, new_cursor

    async def _fetch_with_backoff(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        cursor: str,
        batch_size: int,
    ) -> tuple[list[str], str | None]:
        """Fetch a batch, retrying with exponential backoff on empty responses.

        An empty batch that still carries a cursor is treated as possible rate
        limiting and retried up to SETTINGS["max_retries"] times.
        """
        max_retries = SETTINGS["max_retries"]
        base_delay = SETTINGS["retry_base_delay"]

        for attempt in range(max_retries):
            reviews, new_cursor = await self._fetch_batch(client, app_id, cursor, batch_size)

            # Success, or a definitive failure/end-of-data (cursor is None).
            if reviews or new_cursor is None:
                return reviews, new_cursor

            # Empty reviews with cursor - might be rate limited
            delay = base_delay * (2 ** attempt)
            logger.warning(f"Empty response, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
            await asyncio.sleep(delay)

        return [], None

    async def fetch_game_reviews(
        self,
        app_id: str,
        name: str,
        target: int,
        resume: bool = True,
    ) -> int:
        """
        Fetch reviews for a single game.

        Args:
            app_id: Steam app id.
            name: Human-readable game name (used in logs).
            target: Number of reviews to aim for (may overshoot by one batch).
            resume: Continue from persisted progress instead of starting over.

        Returns number of reviews fetched.
        """
        # Check if already completed
        if resume and app_id in self._progress:
            progress = self._progress[app_id]
            if progress.completed:
                logger.info(f"[{name}] Already completed ({progress.fetched} reviews)")
                return progress.fetched
            cursor = progress.cursor
            fetched = progress.fetched
        else:
            # Start fresh - clear existing file
            reviews_file = self._get_reviews_file(app_id)
            if reviews_file.exists():
                reviews_file.unlink()
            cursor = "*"
            fetched = 0

        # Initialize progress
        self._progress[app_id] = FetchProgress(
            app_id=app_id,
            name=name,
            target=target,
            fetched=fetched,
            cursor=cursor,
        )

        batch_size = SETTINGS["batch_size"]
        sleep_between = SETTINGS["sleep_between_batches"]
        # Cursors already seen in this run; used to break pagination loops.
        seen_cursors: set[str] = set()

        logger.info(f"[{name}] Starting fetch: target={target}, already={fetched}")

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            while fetched < target:
                reviews, new_cursor = await self._fetch_with_backoff(
                    client, app_id, cursor, batch_size
                )

                if not reviews:
                    logger.warning(f"[{name}] No more reviews available")
                    break

                if new_cursor and new_cursor in seen_cursors:
                    logger.warning(f"[{name}] Cursor loop detected")
                    break

                if new_cursor:
                    seen_cursors.add(new_cursor)

                # Save reviews
                self._append_reviews(app_id, reviews)
                fetched += len(reviews)

                # Update progress (persisted after every batch so runs can resume).
                self._progress[app_id].fetched = fetched
                self._progress[app_id].cursor = new_cursor or cursor
                # Local time; used only as a human-readable stamp.
                self._progress[app_id].last_updated = datetime.now().isoformat()
                self._save_progress()

                logger.info(f"[{name}] Fetched {fetched}/{target} reviews")

                # "*" means the API wrapped back to the start; stop either way.
                if not new_cursor or new_cursor == "*":
                    break

                cursor = new_cursor
                await asyncio.sleep(sleep_between)

        # Mark as completed
        self._progress[app_id].completed = True
        self._progress[app_id].last_updated = datetime.now().isoformat()
        self._save_progress()

        logger.info(f"[{name}] Completed with {fetched} reviews")
        return fetched

    async def fetch_all(
        self,
        resume: bool = True,
        limit_games: int | None = None,
    ) -> dict[str, int]:
        """
        Fetch reviews for all configured games.

        Args:
            resume: Continue from previous progress
            limit_games: Limit number of games (for testing)

        Returns:
            Dict mapping app_id to number of reviews fetched
        """
        results: dict[str, int] = {}
        sleep_between_games = SETTINGS["sleep_between_games"]
        reviews_per_game = SETTINGS["reviews_per_game"]

        games = GAMES[:limit_games] if limit_games else GAMES

        for i, (app_id, name, genre) in enumerate(games):
            logger.info(f"Processing game {i + 1}/{len(games)}: {name} ({genre})")

            count = await self.fetch_game_reviews(
                app_id=app_id,
                name=name,
                target=reviews_per_game,
                resume=resume,
            )
            results[app_id] = count

            # Sleep between games (except for last one)
            if i < len(games) - 1:
                logger.info(f"Sleeping {sleep_between_games}s before next game...")
                await asyncio.sleep(sleep_between_games)

        total = sum(results.values())
        logger.info(f"Total reviews fetched: {total}")
        return results

    def get_stats(self) -> dict:
        """Get statistics about fetched reviews (counts read from the JSONL files)."""
        stats = {
            "games_total": len(GAMES),
            "games_completed": 0,
            "games_in_progress": 0,
            "reviews_total": 0,
            "reviews_per_game": {},
        }

        for app_id, name, _ in GAMES:
            reviews_file = self._get_reviews_file(app_id)
            if reviews_file.exists():
                # Count lines with a context manager so the handle is closed
                # deterministically (previously the file was opened inline in a
                # generator expression and never explicitly closed).
                with open(reviews_file, "r", encoding="utf-8") as f:
                    count = sum(1 for _ in f)
                stats["reviews_per_game"][name] = count
                stats["reviews_total"] += count

                if app_id in self._progress and self._progress[app_id].completed:
                    stats["games_completed"] += 1
                else:
                    stats["games_in_progress"] += 1

        return stats
scripts/expand_keywords/keywords_base.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Keyword lists for detecting topics in game reviews.
Used by the hybrid approach (Keywords + ML Sentiment).

The categories were chosen based on the most common topics
raised in game reviews on the Steam platform.
"""

TOPIC_KEYWORDS = {
    # =========================================================================
    # CORE GAMEPLAY
    # =========================================================================
    "Gameplay": [
        # Basics
        "gameplay", "mechanics", "game mechanics", "core gameplay", "game loop",
        "combat", "combat system", "fighting", "battle", "battles",
        # Progression
        "progression", "leveling", "level up", "experience", "xp", "grind", "grinding",
        "skill tree", "talent tree", "unlock", "unlocks", "unlockables",
        # Quests and activities
        "quests", "quest", "missions", "mission", "objectives", "side quests",
        "main quest", "fetch quests", "puzzles", "puzzle", "exploration",
        # Design
        "game design", "level design", "map design", "pacing",
        "balancing", "balanced", "unbalanced", "overpowered", "underpowered", "meta",
        # Enemies
        "enemies", "enemy", "bosses", "boss fights", "boss battle", "mobs",
        # Movement and abilities
        "movement", "traversal", "parkour", "skills", "abilities", "powers",
        "spells", "weapons", "weapon variety", "builds", "build variety",
    ],

    "Fun": [
        # Positive
        "fun", "enjoyable", "entertaining", "addictive", "addicting", "engaging",
        "exciting", "thrilling", "satisfying", "rewarding", "immersive",
        "masterpiece", "gem", "hidden gem", "must play", "must buy",
        # Negative
        "boring", "tedious", "repetitive", "monotonous", "dull", "bland",
        "frustrating", "annoying", "unfun", "not fun", "waste of time",
        "disappointing", "letdown", "overhyped", "overrated", "underrated",
    ],

    "Difficulty": [
        # Difficulty levels
        "difficulty", "easy", "normal", "hard", "very hard", "nightmare",
        "easy mode", "hard mode", "difficulty settings", "difficulty options",
        # Difficulty descriptors
        "challenging", "too easy", "too hard", "too difficult", "punishing",
        "forgiving", "casual", "hardcore", "souls-like", "soulslike",
        "dark souls", "die a lot", "dying", "deaths", "unfair", "cheap deaths",
        # Difficulty curve
        "learning curve", "steep learning curve", "skill ceiling", "skill floor",
        "newcomer friendly", "beginner friendly", "accessible",
    ],

    # =========================================================================
    # TECHNICAL
    # =========================================================================
    "Performance": [
        # Performance
        "performance", "optimize", "optimized", "optimization", "well optimized",
        "poorly optimized", "unoptimized", "runs well", "runs smooth", "runs poorly",
        # FPS
        "fps", "framerate", "frame rate", "frames per second", "60fps", "30fps",
        "fps drops", "frame drops", "drops", "dips", "stuttering", "stutter",
        "hitching", "micro stutter",
        # Resources
        "cpu", "gpu", "ram", "vram", "memory", "memory leak", "memory usage",
        # Loading
        "loading", "loading times", "load times", "loading screens", "long loading",
        # Stability
        "smooth", "stable", "unstable", "lag", "lagging", "input lag",
    ],

    "Bugs": [
        # General
        "bugs", "bug", "buggy", "glitch", "glitches", "glitchy",
        "broken", "issues", "problems", "jank", "janky",
        # Crashes
        "crash", "crashes", "crashing", "crashed", "freeze", "freezing", "frozen",
        "ctd", "crash to desktop", "black screen", "stuck",
        # Specific bugs
        "save bug", "save corruption", "corrupted save", "lost progress",
        "clipping", "falling through", "invisible", "t-pose",
        "softlock", "soft lock", "softlocked", "game breaking",
        # Game state
        "unplayable", "unfinished", "early access", "beta", "alpha",
        "needs polish", "polished", "rough edges",
    ],

    # =========================================================================
    # AUDIO-VISUAL
    # =========================================================================
    "Graphics": [
        # General
        "graphics", "visuals", "visual", "graphic", "graphically",
        "looks", "look", "looking", "looks good", "looks bad", "looks great",
        # Style
        "art style", "art direction", "artstyle", "aesthetic", "stylized",
        "realistic", "photorealistic", "cartoony", "anime", "pixel art", "retro",
        # Technical
        "textures", "texture", "models", "model", "animations", "animation",
        "lighting", "lights", "shadows", "shadow", "reflections", "ray tracing",
        "rendering", "shaders", "particle effects", "particles",
        # Resolution
        "resolution", "4k", "1440p", "1080p", "720p", "upscaling", "dlss", "fsr",
        # Environment
        "environments", "environment", "scenery", "landscapes", "world design",
        "level of detail", "lod", "draw distance", "pop in", "pop-in",
        # Judgements
        "beautiful", "gorgeous", "stunning", "breathtaking", "pretty",
        "ugly", "dated", "outdated", "aged", "old looking",
    ],

    "Sound": [
        # Music
        "music", "soundtrack", "ost", "score", "composer", "tracks",
        "ambient", "ambient music", "battle music", "menu music",
        # Voice
        "voice", "voice acting", "voice actors", "voice over", "vo",
        "voice lines", "dialogue", "dubbed", "dubbing", "lip sync",
        # Sound effects
        "sound", "sounds", "audio", "sfx", "sound effects", "sound design",
        "footsteps", "gunshots", "explosions",
        # Quality
        "atmosphere", "atmospheric", "immersive audio", "spatial audio",
        "surround", "audio quality", "sound quality",
        # Problems
        "audio bug", "audio glitch", "no sound", "sound cutting", "loud", "quiet",
    ],

    # =========================================================================
    # CONTENT & VALUE
    # =========================================================================
    "Content": [
        # Length
        "hours", "hour", "length", "long", "short", "playtime", "play time",
        "how long", "game length", "campaign length",
        # Amount of content
        "content", "lots of content", "lack of content", "thin", "meaty",
        "activities", "things to do", "side content", "endgame", "end game",
        "post game", "new game plus", "ng+",
        # Replayability
        "replay", "replay value", "replayability", "replayable",
        "multiple endings", "different endings", "choices matter",
        "multiple playthroughs", "completionist", "100%", "100 percent",
    ],

    "Monetization": [
        # Price (ex-Price)
        "price", "pricing", "cost", "costs", "priced",
        "expensive", "overpriced", "cheap", "affordable",
        "value", "worth", "worth it", "not worth", "bang for buck",
        "value for money", "money well spent",
        "sale", "discount", "on sale", "full price", "wait for sale",
        "refund", "refunded", "steam sale",
        "aaa price", "indie price", "budget", "premium",
        "free to play", "f2p", "free",
        # MTX (ex-Microtransactions)
        "microtransactions", "microtransaction", "mtx", "monetization",
        "in app purchases", "iap", "real money", "cash shop", "item shop",
        "pay to win", "p2w", "pay2win", "paywall", "pay wall",
        "pay to progress", "paying", "whale", "whales",
        "loot box", "loot boxes", "lootbox", "gacha", "gambling",
        "rng", "random", "chance",
        "battle pass", "season pass", "battlepass", "seasons",
        "premium currency", "gems", "coins", "points",
        "cosmetics", "cosmetic", "skins", "skin", "outfits",
        "dlc", "expansion", "expansions", "dlcs",
        "cash grab", "money grab", "greedy", "predatory", "scam",
    ],

    # =========================================================================
    # MULTIPLAYER & COMMUNITY
    # =========================================================================
    "Multiplayer": [
        # Modes
        "multiplayer", "multi-player", "online", "offline",
        "co-op", "coop", "co op", "cooperative",
        "pvp", "pve", "pvpve", "versus",
        "singleplayer", "single player", "solo", "solo play",
        # Matchmaking
        "matchmaking", "queue", "queue times", "waiting",
        "servers", "server", "dedicated servers", "p2p", "peer to peer",
        "ping", "latency", "connection", "disconnects", "desync",
        # Players
        "players", "teammates", "team", "squad", "party",
        "randoms", "random teammates", "lobbies", "lobby",
        # Problems
        "cheaters", "cheater", "hackers", "hacker", "hacking", "cheating",
        "aimbots", "wallhacks", "anticheat", "anti cheat",
        "toxic", "toxicity", "griefing", "griefers",
    ],

    "Community": [
        # Community
        "community", "playerbase", "player base", "players",
        "active", "dead game", "dead", "alive", "population",
        # Modding
        "mods", "mod", "modding", "mod support", "workshop",
        "steam workshop", "nexus", "modders", "modded",
        "custom content", "user generated",
        # Developers (interaction)
        "devs", "developers", "dev team", "community manager",
        "communication", "transparent", "listening",
        # Player community
        "helpful", "friendly", "toxic community", "welcoming",
        "guides", "wiki", "tutorials", "newbie friendly",
    ],

    # =========================================================================
    # CONTROLS & UI
    # =========================================================================
    "Controls": [
        # Controls
        "controls", "control", "controlling", "control scheme",
        "keybinds", "keybind", "key bindings", "rebind", "remapping",
        # Devices
        "keyboard", "mouse", "kb+m", "kbm",
        "controller", "gamepad", "joystick", "controller support",
        "xbox controller", "ps controller", "dualsense",
        # Responsiveness
        "responsive", "unresponsive", "clunky", "sluggish", "tight",
        "smooth controls", "floaty", "heavy", "weighty",
        # Aiming
        "aiming", "aim", "aim assist", "auto aim",
        "camera", "camera controls", "camera angle",
    ],

    "UI": [
        # Interface
        "ui", "user interface", "interface", "hud",
        "menu", "menus", "main menu", "pause menu",
        "ux", "user experience",
        # UI design
        "clean ui", "cluttered", "minimalist", "intuitive",
        "confusing", "overwhelming", "readable", "readable text",
        # Elements
        "minimap", "map", "inventory", "crafting menu",
        "skill menu", "quest log", "journal",
        # Problems
        "font size", "text size", "too small", "can't read",
        "navigation", "navigating",
    ],

    # =========================================================================
    # STORY & NARRATIVE
    # =========================================================================
    "Story": [
        # Narrative
        "story", "storyline", "plot", "narrative", "storytelling",
        "writing", "written", "well written", "poorly written",
        # Plot elements
        "characters", "character", "protagonist", "main character",
        "villain", "antagonist", "npcs", "npc", "companions",
        "dialogue", "dialogues", "conversations", "choices",
        # World
        "lore", "world building", "worldbuilding", "universe",
        "setting", "backstory", "history",
        # Emotions
        "emotional", "emotions", "feels", "touching", "heartwarming",
        "dark", "mature", "gritty", "lighthearted",
        # Ending
        "ending", "endings", "conclusion", "finale",
        "twist", "plot twist", "predictable", "unpredictable",
        # Cutscenes
        "cutscenes", "cutscene", "cinematics", "cinematic",
        "script", "scripted", "linear", "open ended",
    ],

    # =========================================================================
    # DEVELOPER SUPPORT
    # =========================================================================
    "Support": [
        # Updates
        "updates", "update", "patch", "patches", "patched",
        "hotfix", "hotfixes", "bug fixes", "fixed",
        # Development status
        "abandoned", "dead", "no updates", "still updating",
        "active development", "roadmap", "planned",
        "early access", "full release", "1.0", "launch",
        # Developers
        "developer", "developers", "dev", "devs", "studio",
        "indie dev", "indie developer", "aaa developer",
        # Support
        "support", "customer support", "response", "feedback",
        "listening to feedback", "ignoring", "communication",
        # Ports
        "port", "ported", "console port", "pc port", "lazy port",
    ],

    # =========================================================================
    # PREDICTION & INTENT (NEW!)
    # =========================================================================
    "Retention": [
        # Positive (high retention)
        "addictive", "addicted", "can't stop playing", "hooked", "drug",
        "thousands of hours", "hundreds of hours", "worth it", "worth every penny",
        "buy it", "must buy", "highly recommend", "masterpiece", "goty",
        "game of the year", "10/10", "best game", "favorite game",
        # Negative (churn)
        "refund", "refunded", "refunding", "uninstalled", "uninstall", "delete",
        "waste of money", "waste of time", "don't buy", "do not buy",
        "regret", "regretting", "boring", "bored", "sleep", "sleepy",
        "wait for sale", "not worth it", "cash grab", "scam",
    ],
}

# =============================================================================
# EXCLUSIONS (context-aware filtering)
# =============================================================================
# Exclusion words - if one of them appears near a keyword, that keyword is
# ignored in the given context.
# Format: "keyword": ["nearby_word", "another_word"]

EXCLUSIONS = {
    # "fps" as a genre (FPS shooter) vs performance (60 fps)
    "fps": ["genre", "shooter", "first person", "fps game", "fps genre"],
    # "free" as in no-cost vs the "free to play" business model
    "free": ["drm free", "bug free", "free roam", "free world"],
    # "control" as input controls vs "controls" in a narrative sense
    "control": ["mind control", "control the world", "control freak"],
}
scripts/expand_keywords/main.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CLI for keyword expansion toolkit.
3
+
4
+ Usage:
5
+ # Fetch reviews from Steam (can be resumed)
6
+ python -m scripts.expand_keywords fetch --resume
7
+
8
+ # Train FastText model
9
+ python -m scripts.expand_keywords train
10
+
11
+ # Expand dictionary and export candidates
12
+ python -m scripts.expand_keywords expand --threshold 0.55
13
+
14
+ # Generate new keywords.py
15
+ python -m scripts.expand_keywords generate --auto-approve 0.7
16
+
17
+ # Run all steps
18
+ python -m scripts.expand_keywords run --resume
19
+
20
+ # Show statistics
21
+ python -m scripts.expand_keywords stats
22
+ """
23
+
24
+ import argparse
25
+ import asyncio
26
+ import logging
27
+ import sys
28
+ from pathlib import Path
29
+
30
+ # Add project root to path for imports
31
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
32
+ sys.path.insert(0, str(PROJECT_ROOT))
33
+
34
+ from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS
35
+ from scripts.expand_keywords.expander import KeywordExpander
36
+ from scripts.expand_keywords.fetcher import ReviewFetcher
37
+ from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords
38
+ from scripts.expand_keywords.trainer import FastTextTrainer
39
+
40
+ # Configure logging
41
+ logging.basicConfig(
42
+ level=logging.INFO,
43
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
44
+ datefmt="%Y-%m-%d %H:%M:%S",
45
+ )
46
+ logger = logging.getLogger(__name__)
47
+
48
+
49
def load_existing_keywords() -> dict[str, list[str]]:
    """Read the production TOPIC_KEYWORDS dict out of keywords.py.

    Raises:
        FileNotFoundError: keywords.py is missing.
        ValueError: the module defines no (non-empty) TOPIC_KEYWORDS.
    """
    path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"
    if not path.exists():
        raise FileNotFoundError(f"Keywords file not found: {path}")

    # NOTE(review): exec of a repo-local file is acceptable here (trusted
    # input), but never point this at untrusted content.
    module_globals: dict = {}
    exec(path.read_text(encoding="utf-8"), module_globals)

    topic_keywords = module_globals.get("TOPIC_KEYWORDS")
    if not topic_keywords:
        raise ValueError("TOPIC_KEYWORDS not found in keywords.py")
    return topic_keywords
65
+
66
+
67
async def cmd_fetch(args: argparse.Namespace) -> None:
    """Download Steam reviews for the configured games (resumable)."""
    logger.info("Starting review fetch...")
    fetcher = ReviewFetcher()

    # Report what earlier runs already persisted before doing any work.
    snapshot = fetcher.get_stats()
    logger.info(f"Current stats: {snapshot['reviews_total']} reviews from {snapshot['games_completed']} games")

    await fetcher.fetch_all(resume=args.resume, limit_games=args.limit)

    # Re-read stats so the summary reflects this run's downloads.
    snapshot = fetcher.get_stats()
    logger.info(f"Final stats: {snapshot['reviews_total']} reviews from {snapshot['games_completed']} games")
86
+
87
+ def cmd_train(args: argparse.Namespace) -> None:
88
+ """Train FastText model."""
89
+ logger.info("Starting model training...")
90
+
91
+ # Load existing keywords for frozen n-grams
92
+ keywords = load_existing_keywords()
93
+ existing_ngrams = extract_ngrams_from_keywords(keywords)
94
+ logger.info(f"Loaded {len(existing_ngrams)} n-grams from existing dictionary")
95
+
96
+ # Load reviews
97
+ fetcher = ReviewFetcher()
98
+ reviews = fetcher.load_all_reviews()
99
+
100
+ if not reviews:
101
+ logger.error("No reviews found. Run 'fetch' first.")
102
+ return
103
+
104
+ logger.info(f"Loaded {len(reviews)} reviews")
105
+
106
+ # Preprocess
107
+ preprocessor = Preprocessor(existing_ngrams=existing_ngrams)
108
+ sentences = preprocessor.preprocess_corpus(reviews)
109
+ preprocessor.save()
110
+
111
+ # Train
112
+ trainer = FastTextTrainer()
113
+ trainer.train(sentences)
114
+ trainer.save()
115
+
116
+ logger.info("Training complete!")
117
+
118
+
119
+ def cmd_expand(args: argparse.Namespace) -> None:
120
+ """Expand dictionary and export candidates."""
121
+ logger.info("Starting dictionary expansion...")
122
+
123
+ # Load components
124
+ keywords = load_existing_keywords()
125
+
126
+ preprocessor = Preprocessor()
127
+ try:
128
+ preprocessor.load()
129
+ except FileNotFoundError:
130
+ logger.error("Preprocessor not found. Run 'train' first.")
131
+ return
132
+
133
+ trainer = FastTextTrainer()
134
+ try:
135
+ model = trainer.load()
136
+ except FileNotFoundError:
137
+ logger.error("Model not found. Run 'train' first.")
138
+ return
139
+
140
+ # Expand
141
+ expander = KeywordExpander(
142
+ model=model,
143
+ existing_keywords=keywords,
144
+ word_frequencies=preprocessor.get_word_frequencies(),
145
+ similarity_threshold=args.threshold,
146
+ )
147
+
148
+ # Export candidates (with threshold in filename if requested)
149
+ expander.export_candidates(include_threshold_in_name=args.compare)
150
+
151
+ # Show stats
152
+ stats = expander.get_expansion_stats()
153
+ logger.info(f"Expansion complete: {stats['total_candidates']} candidates")
154
+ logger.info(f" Auto-approved: {stats['auto_approved']}")
155
+ logger.info(f" Needs review: {stats['needs_review']}")
156
+
157
+
158
def cmd_compare(args: argparse.Namespace) -> None:
    """Run the expansion at several thresholds and print a comparison table.

    One candidates file per threshold is written (threshold embedded in the
    filename) so the outputs can be diffed afterwards.
    """
    logger.info("Comparing thresholds...")

    keywords = load_existing_keywords()

    # Both artifacts below are produced by the 'train' step.
    preprocessor = Preprocessor()
    trainer = FastTextTrainer()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    results = []
    for threshold in args.thresholds:
        # Each expander gets its own frequency dict (fresh call per loop).
        expander = KeywordExpander(
            model=model,
            existing_keywords=keywords,
            word_frequencies=preprocessor.get_word_frequencies(),
            similarity_threshold=threshold,
        )
        expander.export_candidates(include_threshold_in_name=True)
        results.append((threshold, expander.get_expansion_stats()))

    # Summary table across all thresholds.
    print("\n" + "=" * 60)
    print("THRESHOLD COMPARISON")
    print("=" * 60)
    print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}")
    print("-" * 60)
    for threshold, stats in results:
        print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}")
    print("-" * 60)
    print(f"\nOutput files saved to: {OUTPUT_DIR}/")
    print("Compare candidates_t*.json to see differences.")
209
+
210
+
211
def cmd_generate(args: argparse.Namespace) -> None:
    """Generate an expanded keywords.py from the trained model.

    Candidates scoring at or above ``--auto-approve`` are included
    automatically; requires the 'train' artifacts.
    """
    logger.info("Generating expanded keywords.py...")

    keywords = load_existing_keywords()

    # Both artifacts below are produced by the 'train' step.
    preprocessor = Preprocessor()
    trainer = FastTextTrainer()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    expander = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
    )
    output_path = expander.generate_keywords_py(auto_approve_threshold=args.auto_approve)

    logger.info(f"Generated: {output_path}")
244
+
245
+
246
async def cmd_run(args: argparse.Namespace) -> None:
    """Run the whole pipeline in order: fetch -> train -> expand -> generate."""
    logger.info("Running complete pipeline...")

    # Fetch is async; the remaining steps are plain synchronous calls.
    await cmd_fetch(args)
    for step in (cmd_train, cmd_expand, cmd_generate):
        step(args)

    logger.info("Pipeline complete!")
263
+
264
+
265
def cmd_stats(args: argparse.Namespace) -> None:
    """Print fetch, model, and expansion statistics (whatever exists on disk)."""
    # Fetch progress is always available (may be all zeros).
    fetch_stats = ReviewFetcher().get_stats()

    print("\n=== Fetch Statistics ===")
    print(f"Games configured: {fetch_stats['games_total']}")
    print(f"Games completed: {fetch_stats['games_completed']}")
    print(f"Games in progress: {fetch_stats['games_in_progress']}")
    print(f"Total reviews: {fetch_stats['reviews_total']}")

    per_game = fetch_stats["reviews_per_game"]
    if per_game:
        print("\nReviews per game:")
        for name, count in sorted(per_game.items()):
            print(f"  {name}: {count}")

    # Model section only when a trained model exists.
    model_path = MODELS_DIR / "fasttext.model"
    if model_path.exists():
        print("\n=== Model Statistics ===")
        model = FastTextTrainer().load()
        print(f"Vocabulary size: {len(model.wv)}")

    # Expansion section only when candidates have been exported.
    candidates_path = OUTPUT_DIR / "candidates.json"
    if candidates_path.exists():
        import json
        with open(candidates_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        print("\n=== Expansion Statistics ===")
        print(f"Total candidates: {data['metadata']['total_candidates']}")
        for cat, cands in data["categories"].items():
            print(f"  {cat}: {len(cands)}")
300
+
301
+
302
def cmd_similar(args: argparse.Namespace) -> None:
    """Find and print words similar to ``args.word`` (model smoke test).

    Prints up to ``args.topn`` (word, cosine similarity) matches, or a
    not-found message when the word is outside the vocabulary.
    """
    trainer = FastTextTrainer()
    try:
        # load() is called for its side effect (populates trainer.model);
        # the previous code bound the return value to an unused local.
        trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    similar = trainer.get_similar(args.word, topn=args.topn)

    if similar:
        print(f"\nWords similar to '{args.word}':")
        for w, sim in similar:
            print(f"  {w}: {sim:.3f}")
    else:
        print(f"Word '{args.word}' not found in vocabulary")
322
+
323
+
324
def main() -> None:
    """CLI entry point: build the argument parser and dispatch the command.

    Improvements over the previous version: sub-parsers that take no
    options are no longer bound to unused locals, and the long if/elif
    chain is replaced by a dispatch table.
    """
    parser = argparse.ArgumentParser(
        description="Keyword expansion toolkit using FastText",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # fetch command
    fetch_parser = subparsers.add_parser("fetch", help="Fetch reviews from Steam")
    fetch_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume from previous progress",
    )
    fetch_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )

    # train command (no options)
    subparsers.add_parser("train", help="Train FastText model")

    # expand command
    expand_parser = subparsers.add_parser("expand", help="Expand dictionary")
    expand_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    expand_parser.add_argument(
        "--compare", "-c",
        action="store_true",
        help="Include threshold in output filename (for comparison)",
    )

    # compare command
    compare_parser = subparsers.add_parser("compare", help="Compare multiple thresholds")
    compare_parser.add_argument(
        "--thresholds", "-t",
        type=float,
        nargs="+",
        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
    )

    # generate command
    generate_parser = subparsers.add_parser("generate", help="Generate keywords.py")
    generate_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # run command (all steps)
    run_parser = subparsers.add_parser("run", help="Run all steps")
    run_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume fetch from previous progress",
    )
    run_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )
    run_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    run_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # stats command (no options)
    subparsers.add_parser("stats", help="Show statistics")

    # similar command (for testing)
    similar_parser = subparsers.add_parser("similar", help="Find similar words")
    similar_parser.add_argument("word", help="Word to find similar words for")
    similar_parser.add_argument(
        "--topn", "-n",
        type=int,
        default=20,
        help="Number of results (default: 20)",
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Dispatch table; async commands get their own event loop via asyncio.run().
    handlers = {
        "fetch": lambda: asyncio.run(cmd_fetch(args)),
        "train": lambda: cmd_train(args),
        "expand": lambda: cmd_expand(args),
        "compare": lambda: cmd_compare(args),
        "generate": lambda: cmd_generate(args),
        "run": lambda: asyncio.run(cmd_run(args)),
        "stats": lambda: cmd_stats(args),
        "similar": lambda: cmd_similar(args),
    }
    handlers[args.command]()


if __name__ == "__main__":
    main()
scripts/expand_keywords/preprocessor.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text preprocessing with n-gram detection using gensim.Phrases.
3
+
4
+ Pipeline:
5
+ 1. Tokenization (jieba for Chinese, regex for English/mixed)
6
+ 2. Build Phrases models (bigrams, trigrams)
7
+ 3. Apply frozen n-grams from existing dictionary
8
+ 4. Apply detected phrases
9
+
10
+ This ensures that multi-word concepts like "帧率" or "加载画面"
11
+ are treated as single tokens during FastText training.
12
+
13
+ For Chinese text:
14
+ - Uses jieba for word segmentation (Chinese has no spaces)
15
+ - Keeps English words intact (common in gaming reviews: fps, bug, dlc)
16
+ - Removes punctuation but preserves Chinese characters
17
+ """
18
+
19
+ import logging
20
+ import pickle
21
+ import re
22
+ from collections import Counter
23
+ from pathlib import Path
24
+
25
+ import jieba
26
+ from gensim.models import Phrases
27
+ from gensim.models.phrases import Phraser
28
+
29
+ from .config import MODELS_DIR, SETTINGS
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class Preprocessor:
    """
    Text preprocessor with n-gram detection.

    Uses gensim Phrases for automatic phrase detection plus
    frozen n-grams from the existing keyword dictionary.

    Typical flow: ``preprocess_corpus(reviews)`` tokenizes, fits the
    bigram/trigram phrase models, applies them, and records word
    frequencies; ``save()``/``load()`` persist the fitted state via pickle.
    """

    def __init__(self, existing_ngrams: list[str] | None = None):
        """
        Initialize preprocessor.

        Args:
            existing_ngrams: Multi-word phrases from existing keywords.py
                (e.g., "frame rate", "loading screen")
        """
        # Frozen n-grams are kept as lowercase token tuples for O(1) lookup.
        self.frozen_ngrams: set[tuple[str, ...]] = set()
        if existing_ngrams:
            self.frozen_ngrams = self._normalize_ngrams(existing_ngrams)
            logger.info(f"Loaded {len(self.frozen_ngrams)} frozen n-grams")

        # Fitted by build_phrase_models() (or restored by load()); None before.
        self.bigram_model: Phraser | None = None
        self.trigram_model: Phraser | None = None
        # Token -> count over the processed corpus, filled by preprocess_corpus().
        self.word_frequencies: Counter = Counter()

    def _normalize_ngrams(self, ngrams: list[str]) -> set[tuple[str, ...]]:
        """Convert n-grams to lowercase tuple format for fast lookup.

        Single-word entries (no space) are silently skipped — only true
        multi-word phrases need freezing.
        """
        result = set()
        for ng in ngrams:
            if " " in ng:
                tokens = tuple(ng.lower().split())
                result.add(tokens)
        return result

    def tokenize(self, text: str) -> list[str]:
        """
        Tokenization for Chinese/mixed text using jieba.

        - Uses jieba for Chinese word segmentation
        - Keeps English words intact (common in gaming: fps, bug, dlc)
        - Removes punctuation (both Chinese and English)
        - Lowercases English text
        """
        # Remove URLs before punctuation stripping tears them into fragments.
        text = re.sub(r'https?://\S+', ' ', text)

        # Keep only CJK ideographs (U+4E00-U+9FFF, U+3400-U+4DBF),
        # ASCII alphanumerics, and whitespace; everything else (incl. Chinese
        # punctuation like 。!?,、;:) becomes a space.
        text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbfa-zA-Z0-9\s]', ' ', text)

        # Lowercase English text (no effect on CJK characters).
        text = text.lower()

        # Use jieba to segment Chinese text; it handles mixed
        # Chinese/English input well.
        tokens = list(jieba.cut(text))

        # Filter: remove empty strings and whitespace-only tokens.
        tokens = [t.strip() for t in tokens if t.strip()]

        return tokens

    def build_phrase_models(
        self,
        corpus: list[list[str]],
        min_count: int | None = None,
        threshold: float | None = None,
    ) -> None:
        """
        Build Phrases models for automatic n-gram detection.

        Args:
            corpus: List of tokenized documents
            min_count: Minimum phrase occurrences (default from settings)
            threshold: Scoring threshold (higher = fewer phrases)
        """
        # NOTE(review): `or` makes an explicit 0 fall back to the SETTINGS
        # default — confirm that is intended.
        min_count = min_count or SETTINGS["phrase_min_count"]
        threshold = threshold or SETTINGS["phrase_threshold"]

        logger.info(f"Building phrase models (min_count={min_count}, threshold={threshold})")

        # Build bigram model: "frame rate" -> "frame_rate"
        bigram_phrases = Phrases(
            corpus,
            min_count=min_count,
            threshold=threshold,
            delimiter="_",
        )
        # Phraser is the frozen, memory-light form of a fitted Phrases model.
        self.bigram_model = Phraser(bigram_phrases)

        # Apply bigrams to create input for trigram detection.
        bigram_corpus = [self.bigram_model[doc] for doc in corpus]

        # Build trigram model: "dark_souls like" -> "dark_souls_like"
        trigram_phrases = Phrases(
            bigram_corpus,
            min_count=min_count,
            threshold=threshold,
            delimiter="_",
        )
        self.trigram_model = Phraser(trigram_phrases)

        # Log detected phrases
        bigram_count = len(bigram_phrases.export_phrases())
        trigram_count = len(trigram_phrases.export_phrases())
        logger.info(f"Detected {bigram_count} bigrams, {trigram_count} trigrams")

    def _apply_frozen_ngrams(self, tokens: list[str]) -> list[str]:
        """
        Apply frozen n-grams from existing dictionary.

        These are always joined, even if not detected by Phrases.
        Greedy left-to-right scan: trigram matches take precedence over
        bigram matches at the same position; matches never overlap.
        """
        result = []
        i = 0

        while i < len(tokens):
            matched = False

            # Try trigrams first (longer matches preferred)
            if i + 2 < len(tokens):
                trigram = (tokens[i], tokens[i + 1], tokens[i + 2])
                if trigram in self.frozen_ngrams:
                    result.append("_".join(trigram))
                    i += 3
                    matched = True

            # Try bigrams
            if not matched and i + 1 < len(tokens):
                bigram = (tokens[i], tokens[i + 1])
                if bigram in self.frozen_ngrams:
                    result.append("_".join(bigram))
                    i += 2
                    matched = True

            if not matched:
                result.append(tokens[i])
                i += 1

        return result

    def apply_phrases(self, tokens: list[str]) -> list[str]:
        """
        Apply phrase models and frozen n-grams to tokens.

        Order:
        1. Frozen n-grams (from existing dictionary)
        2. Automatic Phrases (bigrams then trigrams)
        """
        # Apply frozen n-grams first so dictionary phrases win over
        # automatically detected ones.
        tokens = self._apply_frozen_ngrams(tokens)

        # Apply automatic phrase models (no-ops when models are not fitted).
        if self.bigram_model:
            tokens = list(self.bigram_model[tokens])
        if self.trigram_model:
            tokens = list(self.trigram_model[tokens])

        return tokens

    def preprocess_corpus(
        self,
        reviews: list[str],
        build_phrases: bool = True,
    ) -> list[list[str]]:
        """
        Full preprocessing pipeline.

        Args:
            reviews: Raw review texts
            build_phrases: Whether to build phrase models (skip if loading)

        Returns:
            List of tokenized documents with phrases applied
        """
        logger.info(f"Preprocessing {len(reviews)} reviews...")

        # Step 1: Tokenize all reviews
        tokenized = [self.tokenize(review) for review in reviews]
        logger.info("Tokenization complete")

        # Step 2: Build phrase models (skipped when reusing loaded models)
        if build_phrases:
            self.build_phrase_models(tokenized)

        # Step 3: Apply phrases and count frequencies
        processed = []
        for tokens in tokenized:
            phrased = self.apply_phrases(tokens)
            processed.append(phrased)
            self.word_frequencies.update(phrased)

        logger.info(f"Vocabulary size: {len(self.word_frequencies)}")
        return processed

    def get_word_frequencies(self) -> dict[str, int]:
        """Get word frequency dictionary (a fresh plain-dict copy)."""
        return dict(self.word_frequencies)

    def save(self, path: Path | None = None) -> None:
        """Save preprocessor state (phrase models, frequencies) as a pickle."""
        path = path or MODELS_DIR / "preprocessor.pkl"

        data = {
            "frozen_ngrams": self.frozen_ngrams,
            "bigram_model": self.bigram_model,
            "trigram_model": self.trigram_model,
            "word_frequencies": self.word_frequencies,
        }

        with open(path, "wb") as f:
            pickle.dump(data, f)

        logger.info(f"Saved preprocessor to {path}")

    def load(self, path: Path | None = None) -> None:
        """Load preprocessor state previously written by :meth:`save`.

        Raises:
            FileNotFoundError: If no pickle exists at ``path``.
        """
        path = path or MODELS_DIR / "preprocessor.pkl"

        if not path.exists():
            raise FileNotFoundError(f"Preprocessor not found at {path}")

        # NOTE(review): pickle.load can execute arbitrary code from the
        # file — only load artifacts produced locally by save().
        with open(path, "rb") as f:
            data = pickle.load(f)

        self.frozen_ngrams = data["frozen_ngrams"]
        self.bigram_model = data["bigram_model"]
        self.trigram_model = data["trigram_model"]
        self.word_frequencies = data["word_frequencies"]

        logger.info(f"Loaded preprocessor from {path}")
265
+
266
+
267
def extract_ngrams_from_keywords(keywords: dict[str, list[str]]) -> list[str]:
    """
    Extract multi-word phrases from keywords dictionary.

    A phrase is any entry containing a space; order follows the
    dictionary's category order, then each category's word order.

    Args:
        keywords: TOPIC_KEYWORDS dictionary from keywords.py

    Returns:
        List of multi-word phrases (e.g., ["frame rate", "loading screen"])
    """
    return [
        word
        for category_words in keywords.values()
        for word in category_words
        if " " in word
    ]
scripts/expand_keywords/trainer.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastText model training.
3
+
4
+ FastText is preferred over Word2Vec because:
5
+ - Better handling of typos and misspellings (common in reviews)
6
+ - Can generate vectors for out-of-vocabulary words
7
+ - Uses character n-grams internally
8
+ """
9
+
10
+ import logging
11
+ from pathlib import Path
12
+
13
+ from gensim.models import FastText
14
+
15
+ from .config import MODELS_DIR, SETTINGS
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class FastTextTrainer:
    """
    Trains FastText word embeddings on review corpus.

    FastText is used (rather than Word2Vec) for its character-n-gram
    subword handling, which copes with typos and out-of-vocabulary words.
    """

    def __init__(
        self,
        vector_size: int | None = None,
        window: int | None = None,
        min_count: int | None = None,
        epochs: int | None = None,
        workers: int | None = None,
    ):
        """
        Initialize trainer with hyperparameters.

        Any argument left as None falls back to its SETTINGS value.

        Args:
            vector_size: Dimensionality of word vectors
            window: Context window size
            min_count: Minimum word frequency
            epochs: Number of training iterations
            workers: Number of worker threads
        """
        self.vector_size = vector_size or SETTINGS["fasttext_vector_size"]
        self.window = window or SETTINGS["fasttext_window"]
        self.min_count = min_count or SETTINGS["fasttext_min_count"]
        self.epochs = epochs or SETTINGS["fasttext_epochs"]
        self.workers = workers or SETTINGS["fasttext_workers"]

        # Set by train() or load(); query methods require it to be non-None.
        self.model: FastText | None = None

    @staticmethod
    def _as_token(word: str) -> str:
        """Normalize a query word to vocabulary form: lowercase, spaces -> underscores."""
        return word.lower().replace(" ", "_")

    def train(self, sentences: list[list[str]]) -> FastText:
        """
        Train FastText model on tokenized sentences.

        Args:
            sentences: List of tokenized documents (output from preprocessor)

        Returns:
            Trained FastText model
        """
        logger.info(
            f"Training FastText model: "
            f"vector_size={self.vector_size}, window={self.window}, "
            f"min_count={self.min_count}, epochs={self.epochs}"
        )
        logger.info(f"Training on {len(sentences)} documents")

        self.model = FastText(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
            workers=self.workers,
            sg=1,  # Skip-gram (better for semantic similarity)
            min_n=3,  # Minimum character n-gram length
            max_n=6,  # Maximum character n-gram length
        )

        logger.info(f"Training complete. Vocabulary size: {len(self.model.wv)}")
        return self.model

    def save(self, path: Path | str | None = None) -> Path:
        """
        Save trained model.

        Args:
            path: Save path (default: models/fasttext.model)

        Returns:
            Path where model was saved

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model to save. Train first.")

        target = Path(path) if path else MODELS_DIR / "fasttext.model"
        self.model.save(str(target))
        logger.info(f"Saved model to {target}")
        return target

    def load(self, path: Path | str | None = None) -> FastText:
        """
        Load model from file.

        Args:
            path: Model path (default: models/fasttext.model)

        Returns:
            Loaded FastText model

        Raises:
            FileNotFoundError: If no model file exists at ``path``.
        """
        target = Path(path) if path else MODELS_DIR / "fasttext.model"
        if not target.exists():
            raise FileNotFoundError(f"Model not found at {target}")

        self.model = FastText.load(str(target))
        logger.info(f"Loaded model from {target}. Vocabulary size: {len(self.model.wv)}")
        return self.model

    def get_similar(
        self,
        word: str,
        topn: int = 10,
    ) -> list[tuple[str, float]]:
        """
        Get most similar words to a given word.

        Args:
            word: Query word
            topn: Number of results

        Returns:
            List of (word, similarity) tuples; empty on KeyError.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        try:
            return self.model.wv.most_similar(self._as_token(word), topn=topn)
        except KeyError:
            logger.warning(f"Word '{word}' not in vocabulary")
            return []

    def get_similarity(self, word1: str, word2: str) -> float:
        """
        Get similarity between two words.

        Args:
            word1: First word
            word2: Second word

        Returns:
            Cosine similarity (-1 to 1); 0.0 on KeyError.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        try:
            return float(
                self.model.wv.similarity(self._as_token(word1), self._as_token(word2))
            )
        except KeyError as e:
            logger.warning(f"Word not in vocabulary: {e}")
            return 0.0

    def word_in_vocab(self, word: str) -> bool:
        """Check if word is in vocabulary (False when no model is loaded)."""
        return self.model is not None and self._as_token(word) in self.model.wv

    def get_vocab_words(self) -> list[str]:
        """Get all words in vocabulary (empty when no model is loaded)."""
        if self.model is None:
            return []
        return list(self.model.wv.key_to_index)