diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..c93ffabefa7e640255c92ad905ca7d66ac6b0a5f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,23 @@ +# Ignore everything by default +* + +# Allow only what is needed for Docker build +!backend/ +!frontend/ +!scripts/ +!Dockerfile +!README.md +!requirements.txt +!.gitignore + +# Exclude unnecessary subfolders +backend/tests/ +backend/__pycache__/ +backend/.pytest_cache/ +frontend/node_modules/ +frontend/dist/ + +# Exclude specific files +*.pdf +.env +*.log diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..ff1c4d297abb3237b50669cb9f2e96fbf7d9d426 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +# ------------------------------------------------------------------------------ +# Stage 1: Quantize NLP model (torch needed ONLY here for PyTorch -> ONNX export) +# ------------------------------------------------------------------------------ +FROM python:3.11-slim AS model-quantizer + +WORKDIR /app + +RUN pip install --no-cache-dir \ + --extra-index-url https://download.pytorch.org/whl/cpu \ + "torch==2.2.0" \ + "optimum[onnxruntime]==1.16.2" \ + "transformers==4.37.2" \ + "huggingface-hub==0.20.3" \ + "numpy<2.0.0" + +COPY scripts/quantize_model.py scripts/quantize_model.py +RUN python3 scripts/quantize_model.py + + +# ------------------------------------------------------------------------------ +# Stage 2: Runtime (Python FastAPI Worker — no torch, no frontend) +# ------------------------------------------------------------------------------ +FROM python:3.11-slim + +WORKDIR /app + +# Create non-root user for security +RUN useradd -m -u 1000 user +USER user +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH + +# Install Python dependencies (no torch — ~700MB RAM saved) +COPY --chown=user:user backend/requirements.txt backend/requirements.txt +RUN pip install --no-cache-dir --upgrade -r 
backend/requirements.txt + +# Copy Backend code +COPY --chown=user:user backend backend + +# Copy pre-quantized ONNX model from Stage 1 +COPY --chown=user:user --from=model-quantizer /app/backend/models/quantized backend/models/quantized + +WORKDIR /app/backend + +EXPOSE 7860 + +CMD ["uvicorn", "worker_main:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1f35c239582a7050388752b2b0ea5ea221a06254 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +--- +title: SentimentStream Worker +emoji: ⚙️ +colorFrom: gray +colorTo: blue +sdk: docker +app_port: 7860 +pinned: false +license: agpl-3.0 +--- + +# SentimentStream Worker + +Background worker for SentimentStream. Syncs games from SteamSpy, detects updates via Steam News API, and pre-caches sentiment analyses. diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..65cad5644bfce99abbf278195e21b8beb3a3899c --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,38 @@ +# MongoDB +MONGODB_URL=mongodb://admin:password@localhost:27017 +MONGODB_DB_NAME=sentimentSummary + +# App Settings +DEBUG=true +CORS_ORIGINS=http://localhost:5173,http://localhost:3000 + +# Cache Settings +CACHE_TTL_HOURS=24 + +# Steam API Settings +REVIEW_BATCH_SIZE=500 +STEAM_REVIEW_LANGUAGE=schinese +STEAM_REGION=CN + +# Steam API Retry +STEAM_RETRY_MAX_ATTEMPTS=3 +STEAM_RETRY_BASE_DELAY=1.0 +STEAM_RETRY_MAX_DELAY=10.0 + +# Sampling Settings - Statistical sampling parameters +SAMPLE_TOP_HELPFUL=50 +SAMPLE_CONFIDENCE_LEVEL=0.95 +SAMPLE_MARGIN_OF_ERROR=0.01 +SAMPLE_MAX_REVIEWS=3000 + +# NLP Settings - Hugging Face Models +HF_SENTIMENT_MODEL=uer/roberta-base-finetuned-jd-binary-chinese + +# NLP Settings - Analysis Parameters +TEXT_MAX_LENGTH=512 +SENTIMENT_POSITIVE_THRESHOLD=0.1 +SENTIMENT_NEGATIVE_THRESHOLD=-0.1 +TOPIC_MIN_MENTIONS=5 + +# Deduplication Cache +DEDUP_CACHE_MAXSIZE=10000 diff --git 
"""
Application configuration.

Uses Pydantic Settings to manage environment variables.
"""

from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings loaded from environment variables."""

    model_config = SettingsConfigDict(
        env_file=(".env", "backend/.env"),
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # MongoDB
    mongodb_url: str = ""
    mongodb_db_name: str = "sentimentSummary"

    # App mode: "full" = monolith, "api" = API-only (no frontend)
    app_mode: str = "full"

    # App settings
    debug: bool = False
    cors_origins: str = "http://localhost:5173,http://localhost:3000"

    # Cache settings
    cache_ttl_hours: int = 24
    cache_ttl_short_hours: int = 12  # frequently updated games
    cache_ttl_long_hours: int = 168  # stable games (7 days)
    cache_ttl_worker_managed_hours: int = 1440  # 60 days to preserve stale fallback results
    cache_ttl_on_demand_hours: int = 1440  # 60 days to preserve stale fallback results

    # Incremental analysis
    incremental_enabled: bool = True
    incremental_max_stored_ids: int = 5000
    incremental_max_gap_days: int = 90  # fall back to full analysis after this many days without reviews
    recent_sample_limit: int = 1000
    niche_cache_max_age_days: int = 60
    analysis_freshness_max_age_days: int = 60
    patch_context_max_age_days: int = 90
    dlc_min_reviews_for_analysis: int = 50
    dlc_visible_in_search: bool = False  # temporary policy: hide DLC from autocomplete/suggestions
    dlc_worker_analysis_enabled: bool = False  # temporary policy: exclude DLC from worker-managed analysis

    # Steam API settings
    review_batch_size: int = 100
    steam_review_language: str = "schinese"  # review fetch scope; product analyzes Simplified Chinese Steam reviews
    steam_region: str = "CN"  # CN, US, etc.

    # Steam API retry
    steam_retry_max_attempts: int = 3
    steam_retry_base_delay: float = 1.0  # doubles each retry
    steam_retry_max_delay: float = 10.0  # cap

    # Steam API error-cache TTL (seconds)
    steam_error_cache_ttl_404: int = 3600  # 1h
    steam_error_cache_ttl_429: int = 300  # 5min

    # Sampling settings — statistical sampling parameters
    sample_top_helpful: int = 50
    sample_confidence_level: float = 0.95
    sample_margin_of_error: float = 0.02
    sample_max_reviews: int = 3000
    sample_minority_min: int = 100

    # NLP settings — analysis parameters
    text_max_length: int = 512
    sentiment_positive_threshold: float = 0.1
    sentiment_negative_threshold: float = -0.1
    topic_min_mentions: int = 5

    # NLP settings — deduplication cache
    dedup_cache_maxsize: int = 10000

    # NLP settings — performance & logic
    nlp_onnx_intra_threads: int = 2
    nlp_onnx_inter_threads: int = 2
    nlp_negation_window: int = 3

    # Prediction settings
    prediction_retention_threshold_pos: float = 0.2
    prediction_retention_threshold_neg: float = -0.2

    # Community highlights
    highlights_ngram_min: int = 2
    highlights_ngram_max: int = 5
    highlights_min_mentions: int = 3
    highlights_max_doc_freq_ratio: float = 0.4
    highlights_top_n_general: int = 15
    highlights_top_n_per_topic: int = 5

    # Worker — pre-cache
    worker_trigger_token: str = ""
    precache_enabled: bool = False
    precache_top_n_games: int = 500
    precache_batch_delay_seconds: int = 10
    precache_checkpoints_hours: str = "6,12,24,72,168,336"
    precache_max_analyses_per_cycle: int = 50

    # Worker — priority games
    steam_priority_categories: str = "top_sellers,new_releases,specials"
    steam_priority_regions: str = "CN,US"
    steam_priority_grace_days: int = 3
    steam_priority_categories_url: str = "https://store.steampowered.com/api/featuredcategories"
    steam_bootstrap_max_per_cycle: int = 20
    steam_bootstrap_delay: float = 1.5

    # Worker — news scan
    news_refresh_window_hours: int = 6
    news_initial_count: int = 20
    news_incremental_count: int = 5

    # Worker — game sync
    game_sync_enabled: bool = False
    game_sync_steamspy_delay: float = 61.0
    game_sync_details_delay: float = 1.1
    game_sync_top_n_details: int = 500
    game_sync_cn_enrichment_delay: float = 1.5
    game_sync_cn_enrichment_limit: int = 200
    game_sync_app_type_enrichment_delay: float = 1.5
    game_sync_app_type_enrichment_limit: int = 200

    # Logging (both Live API and Worker)
    worker_log_dir: str = "/data/worker_logs"
    worker_log_fallback_dir: str = "/tmp/worker_logs"
    worker_log_max_bytes: int = 5_000_000  # 5 MB per file
    worker_log_backup_count: int = 3  # 3 rotated files = 20 MB max
    nlp_verbose_logging: bool = False  # re-enable NLP debug logs to stdout
    nlp_debug_log_max_bytes: int = 2_000_000  # 2 MB per file
    errors_log_max_bytes: int = 2_000_000  # 2 MB per file

    # Rate limiting
    rate_limit_analyze: str = "10/minute"
    rate_limit_default: str = "30/minute"

    # NLP settings — Hugging Face models
    # Specialized Chinese model (RoBERTa-JD), tuned for product reviews.
    hf_sentiment_model: str = "uer/roberta-base-finetuned-jd-binary-chinese"

    @property
    def cors_origins_list(self) -> list[str]:
        """Allowed CORS origins parsed from the comma-separated string."""
        return [origin.strip() for origin in self.cors_origins.split(",")]

    @property
    def precache_checkpoints_list(self) -> list[int]:
        """Checkpoint hours parsed from the comma-separated string, ascending."""
        pieces = self.precache_checkpoints_hours.split(",")
        return sorted(int(piece.strip()) for piece in pieces)

    @property
    def steam_priority_categories_list(self) -> list[str]:
        """Priority store categories, with empty entries dropped."""
        return [item.strip() for item in self.steam_priority_categories.split(",") if item.strip()]

    @property
    def steam_priority_regions_list(self) -> list[str]:
        """Priority regions, with empty entries dropped."""
        return [item.strip() for item in self.steam_priority_regions.split(",") if item.strip()]


@lru_cache
def get_settings() -> Settings:
    """Return the process-wide Settings singleton (cached by lru_cache)."""
    return Settings()
get_settings() diff --git a/backend/app/core/freshness.py b/backend/app/core/freshness.py new file mode 100644 index 0000000000000000000000000000000000000000..6be09b21b3e09d56f1ed2e1374b570bb5fbcf108 --- /dev/null +++ b/backend/app/core/freshness.py @@ -0,0 +1,71 @@ +""" +Product-level analysis freshness rules. +""" + +from __future__ import annotations + +from datetime import datetime, timezone +from enum import Enum +from typing import Any, cast + +from app.core.config import settings + + +class FreshnessStatus(str, Enum): + """Product freshness state for an existing analysis.""" + + FRESH = "fresh" + STALE_BY_AGE = "stale_by_age" + STALE_BY_PATCH = "stale_by_patch" + + +def _as_utc_datetime(value: Any) -> datetime | None: + if value is None: + return None + if isinstance(value, datetime): + return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc) + if isinstance(value, str): + parsed = datetime.fromisoformat(value) + return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc) + return None + + +def get_analysis_reference_at(document: dict[str, Any]) -> datetime | None: + """Return the best available execution timestamp for freshness checks.""" + raw = document.get("results") + results: dict[str, Any] = cast(dict[str, Any], raw) if isinstance(raw, dict) else {} + return ( + _as_utc_datetime(results.get("analysis_date")) + or _as_utc_datetime(document.get("analyzed_at")) + or _as_utc_datetime(document.get("cached_at")) + ) + + +def evaluate_freshness( + document: dict[str, Any], + current_patch_at: datetime | None, +) -> FreshnessStatus: + """ + Evaluate analysis freshness using product rules: + patch recency first, then max age. 
+ """ + analysis_at = get_analysis_reference_at(document) + if analysis_at is None: + return FreshnessStatus.STALE_BY_AGE + + if current_patch_at is not None and analysis_at < current_patch_at: + return FreshnessStatus.STALE_BY_PATCH + + age_days = (datetime.now(timezone.utc) - analysis_at).days + if age_days > settings.analysis_freshness_max_age_days: + return FreshnessStatus.STALE_BY_AGE + + return FreshnessStatus.FRESH + + +def get_staleness_reason(status: FreshnessStatus) -> str | None: + if status == FreshnessStatus.STALE_BY_AGE: + return "STALE_REASON_AGE" + if status == FreshnessStatus.STALE_BY_PATCH: + return "STALE_REASON_PATCH" + return None diff --git a/backend/app/core/jieba_userdict.txt b/backend/app/core/jieba_userdict.txt new file mode 100644 index 0000000000000000000000000000000000000000..073b6963f2f2b0001992e3d42d4012a2c2fc2719 --- /dev/null +++ b/backend/app/core/jieba_userdict.txt @@ -0,0 +1,14 @@ +boss战 5 n +开放世界 5 n +大逃杀 5 n +战斗通行证 5 n +皮肤系统 5 n +氪金 10 v +开箱 5 v +人机对战 5 n +帧数不稳 5 n +内存泄漏 5 n +手感好 5 a +手感差 5 a +上手简单 5 a +劝退新手 5 v diff --git a/backend/app/core/keywords.py b/backend/app/core/keywords.py new file mode 100644 index 0000000000000000000000000000000000000000..f403d06d0abd5cbceae4585b3e650d0a83b4bdf0 --- /dev/null +++ b/backend/app/core/keywords.py @@ -0,0 +1,273 @@ +""" +Chinese keywords for game review topic detection. +Used in hybrid approach (Keywords + ML Sentiment). + +Categories based on common topics in Steam game reviews. +Seed keywords will be expanded using the expand_keywords pipeline. 
+ +Structure: topic -> {single_char, compound, phrase} +- single_char: standalone Chinese characters (1 char, prone to false positives) +- compound: multi-char Chinese words or short English words +- phrase: multi-word phrases (EN or ZH) +""" + +TOPIC_KEYWORDS: dict[str, dict[str, list[str]]] = { + # ========================================================================= + # CORE GAMEPLAY - 核心玩法 + # ========================================================================= + "Gameplay": { + "single_char": ["刷", "肝"], + "compound": [ + "玩法", "游戏性", "机制", "战斗", "任务", "关卡", + "探索", "技能", "装备", "gameplay", + ], + "phrase": ["战斗系统"], + }, + + "Fun": { + "single_char": ["爽", "烂"], + "compound": [ + # Positive + "好玩", "有趣", "上瘾", "神作", "佳作", "精品", + "沉浸", "过瘾", "带感", "回血", "爽游", + "解压", "杀时间", + # Negative + "无聊", "枯燥", "乏味", "垃圾", "辣鸡", "粪作", + "失望", "无趣", + ], + "phrase": [ + "电子伟哥", "治好了", "精神时光屋", "时光屋", + "电子阳痿", "电子ed", + ], + }, + + "Difficulty": { + "single_char": [], + "compound": [ + "难度", "简单", "困难", "硬核", + "劝退", "手残", "新手", "上手", + "souls", "魂类", + ], + "phrase": ["太难", "太简单"], + }, + + # ========================================================================= + # TECHNICAL - 技术 + # ========================================================================= + "Performance": { + "single_char": ["卡"], + "compound": [ + "优化", "卡顿", "帧率", "帧数", "流畅", "掉帧", + "丝滑", "显卡", "显存", "延迟", + "fps", "cpu", "gpu", + ], + "phrase": [ + "稳60", "锁60", "解锁帧率", "吃配置", "带不动", + "PPT效果", "幻灯片", "帧生成", "输入延迟", "帧数不稳", + ], + }, + + "Bugs": { + "single_char": [], + "compound": [ + "闪退", "崩溃", "卡死", "报错", "存档", + "黑屏", "进不去", "打不开", "未响应", "无响应", + "弹窗", "坏档", "掉线", + "bug", "bugs", + ], + "phrase": [ + "存档损坏", "无法保存", "卡加载", + "加载失败", "连不上", + ], + }, + + # ========================================================================= + # AUDIO-VISUAL - 视听 + # ========================================================================= + "Graphics": { + "single_char": [], + "compound": [ + 
"画面", "画质", "特效", "建模", "贴图", + "美术", "风格", "场景", "光影", + "4k", "hdr", + ], + "phrase": [], + }, + + "Sound": { + "single_char": [], + "compound": [ + "音乐", "音效", "配音", "配乐", "声音", + "原声", + "bgm", "ost", + ], + "phrase": ["中文配音"], + }, + + # ========================================================================= + # CONTENT & VALUE - 内容与价值 + # ========================================================================= + "Content": { + "single_char": [], + "compound": [ + "内容", "时长", "流程", "耐玩", "通关", + "主线", "支线", "收集", "小时", "体量", + "注水", "重复", "换皮", "多周目", + "dlc", + ], + "phrase": [ + "素材复用", "拖时长", "强行延长", "通关后", + ], + }, + + "Monetization": { + "single_char": [], + "compound": [ + # ex-Price + "价格", "定价", "值得", "不值", "贵", "便宜", + "打折", "史低", "入手", "白嫖", "性价比", + # ex-Microtransactions + "氪金", "内购", "充值", "抽卡", "648", + "课金", "首充", "月卡", "战令", "季票", + "开箱", "箱子", "钥匙", "保底", "抽奖", + "p2w", + ], + "phrase": [ + "通行证", "pay to win", + ], + }, + + # ========================================================================= + # MULTIPLAYER & COMMUNITY - 多人与社区 + # ========================================================================= + "Multiplayer": { + "single_char": [], + "compound": [ + "联机", "多人", "匹配", "服务器", "延迟", + "掉线", "开黑", "组队", "单机", "野排", "车队", + "单排", "组排", "路人", "挂机", + "pvp", "pve", "coop", + ], + "phrase": [ + "坑比", "猪队友", "送人头", + ], + }, + + "Community": { + "single_char": [], + "compound": [ + "社区", "玩家", "汉化", + "官方", "民间", + "mod", "mods", + ], + "phrase": ["创意工坊"], + }, + + # ========================================================================= + # CONTROLS & UI - 操控与界面 + # ========================================================================= + "Controls": { + "single_char": [], + "compound": [ + "操作", "手感", "手柄", "键鼠", "键盘", + "摇杆", "触发", "键位", "改键", + "死区", "陀螺仪", "扳机", "震动", + ], + "phrase": [ + "自定义键位", "辅助瞄准", "触觉反馈", "自适应扳机", + ], + }, + + "UI": { + "single_char": [], + "compound": [ + "界面", "菜单", "字幕", "字体", + "中文", "汉化", + 
"ui", "hud", + ], + "phrase": [], + }, + + # ========================================================================= + # STORY & NARRATIVE - 剧情 + # ========================================================================= + "Story": { + "single_char": [], + "compound": [ + "剧情", "故事", "人物", "角色", "结局", + "剧本", "叙事", "世界观", "背景", "喂屎", + "烂尾", "降智", "工具人", "脸谱化", + "剧情杀", "都合主义", + "npc", + ], + "phrase": ["逻辑硬伤"], + }, + + # ========================================================================= + # DEVELOPER SUPPORT - 开发支持 + # ========================================================================= + "Support": { + "single_char": [], + "compound": [ + "更新", "修复", "维护", "开发商", "官方", + "补丁", "版本", + ], + "phrase": [], + }, + + "Localization": { + "single_char": [], + "compound": [ + "本地化", "汉化", "翻译", "机翻", "缺字", "乱码", + "繁体", "简体", + ], + "phrase": [ + "语言支持", "中文支持", "无中文", "不支援中文", + "文本质量", "字幕翻译", "界面翻译", + ], + }, + + # ========================================================================= + # REFINEMENT - 打磨 + # ========================================================================= + "Polish": { + "single_char": [], + "compound": [ + "打磨", "精致", "粗糙", "用心", "敷衍", "细节", + "诚意", "偷懒", "不用心", "精良", "精美", + ], + "phrase": ["粗制滥造"], + }, + + # ========================================================================= + # RETENTION - 留存 + # ========================================================================= + "Retention": { + "single_char": [], + "compound": [ + # Positive (High Retention) + "推荐", "安利", "入正", "入坑", "必玩", + "神作", "年度", "满分", + # Negative (Churn) + "退款", "卸载", "弃坑", "劝退", "不推荐", + "避雷", "踩雷", "退坑", + "回坑", "出坑", "已弃", + ], + "phrase": [ + "坚持玩", "每天玩", "停不下来", "刷了", + "已退", "退款了", + ], + }, +} + +# ============================================================================= +# EXCLUSIONS (Context-aware filtering) +# ============================================================================= +# Words to exclude when they appear in 
certain contexts. +# Format: "keyword": ["context_word1", "context_word2"] + +EXCLUSIONS = { + # "fps" as genre (FPS shooter) vs performance (60 fps) + "fps": ["射击", "枪战", "第一人称"], + # Empty for now - will be expanded based on false positives +} diff --git a/backend/app/core/rate_limit.py b/backend/app/core/rate_limit.py new file mode 100644 index 0000000000000000000000000000000000000000..f75ffc78b61c5dbed37e07e11d9c33b2b061931c --- /dev/null +++ b/backend/app/core/rate_limit.py @@ -0,0 +1,6 @@ +"""Shared rate limiter instance for the application.""" + +from slowapi import Limiter +from slowapi.util import get_remote_address + +limiter = Limiter(key_func=get_remote_address) diff --git a/backend/app/core/sampling.py b/backend/app/core/sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..b7af7851b7b4732c44e2441ba8019e1ab9ffdf7e --- /dev/null +++ b/backend/app/core/sampling.py @@ -0,0 +1,135 @@ +""" +Moduł do obliczania statystycznej wielkości próbki. + +Implementuje wzory statystyczne dla próbkowania populacji. +""" + +import math +from dataclasses import dataclass + +from app.core.config import settings + + +# Wartości Z dla poziomów ufności +Z_SCORES = { + 0.90: 1.645, + 0.95: 1.96, + 0.99: 2.576, +} + + +@dataclass +class SamplePlan: + """ + Plan próbkowania dla gry. + + Attributes: + top_helpful: Liczba najprzydatniejszych recenzji. + statistical_sample: Wielkość próbki statystycznej. + positive_count: Ile pobrać pozytywnych (stratified). + negative_count: Ile pobrać negatywnych (stratified). + total: Łączna liczba recenzji do pobrania. + """ + + top_helpful: int + statistical_sample: int + positive_count: int + negative_count: int + total: int + + +def calculate_sample_size( + population: int, + confidence_level: float | None = None, + margin_of_error: float | None = None, +) -> int: + """ + Oblicza minimalną wielkość próbki dla danej populacji. + Wykorzystuje wzór Cochrana z korektą dla populacji skończonej. 
+ """ + if confidence_level is None: + confidence_level = settings.sample_confidence_level + if margin_of_error is None: + margin_of_error = settings.sample_margin_of_error + + # 1. Pobieramy Z-score (np. 1.96 dla 95% ufności). + # Mówi on, jak bardzo wynik może odbiegać od średniej w jednostkach odchylenia standardowego. + z = Z_SCORES.get(confidence_level, 1.96) + + # 2. Zakładamy p=0.5 (maksymalna zmienność). + # To daje nam najbezpieczniejszą (największą) wielkość próbki. + p = 0.5 + + # 3. Wzór Cochrana dla nieskończonej populacji: + # n0 = (Z^2 * p * (1-p)) / e^2 + # Wyjaśnienie: Z kwadrat razy zmienność, podzielone przez kwadrat błędu. + n_0 = (z ** 2 * p * (1 - p)) / (margin_of_error ** 2) + + # 4. Korekta dla populacji skończonej (Steam ma policzalną liczbę recenzji): + # n = n0 / (1 + (n0 - 1) / N) + # Wyjaśnienie: Zmniejszamy próbkę, bo wiemy dokładnie, ile osób (recenzji) jest w "całym świecie" tej gry. + n = n_0 / (1 + (n_0 - 1) / population) + + # Zaokrąglamy w górę do pełnej recenzji + return math.ceil(n) + + +def create_sample_plan( + total_reviews: int, + positive_reviews: int, + negative_reviews: int, +) -> SamplePlan: + """ + Tworzy plan próbkowania, łącząc dwa podejścia. + """ + top_helpful = settings.sample_top_helpful + max_reviews = settings.sample_max_reviews + + # Obliczamy, ile recenzji musimy pobrać, żeby wynik był wiarygodny + statistical_sample = calculate_sample_size(total_reviews) + + # Pilnujemy, żeby nie przekroczyć ustawionego limitu (np. 
3000) + statistical_sample = min(statistical_sample, max_reviews - top_helpful) + + # Obliczamy jaki procent stanowią pozytywy i negatywy w całości + if total_reviews > 0: + pos_ratio = positive_reviews / total_reviews + neg_ratio = negative_reviews / total_reviews + else: + pos_ratio = 0.5 + neg_ratio = 0.5 + + # Rozdzielamy naszą próbkę proporcjonalnie do tych wyników (Stratified Sampling) + pos_target = math.ceil(statistical_sample * pos_ratio) + neg_target = math.ceil(statistical_sample * neg_ratio) + + # Minority protection: boost the smaller group to minority_min if possible + minority_min = settings.sample_minority_min + + if pos_target < minority_min and positive_reviews > pos_target: + pos_target = min(minority_min, positive_reviews) + + if neg_target < minority_min and negative_reviews > neg_target: + neg_target = min(minority_min, negative_reviews) + + # Final adjustment to stay within statistical_sample limit + if pos_target + neg_target > statistical_sample: + if pos_target > neg_target: + pos_target = max(pos_target - (pos_target + neg_target - statistical_sample), minority_min) + else: + neg_target = max(neg_target - (pos_target + neg_target - statistical_sample), minority_min) + + # Final cap by actual availability + positive_count = min(pos_target, positive_reviews) + negative_count = min(neg_target, negative_reviews) + + # Sumujemy wszystko (Top Helpful + Próbka Statystyczna) + total = top_helpful + positive_count + negative_count + + return SamplePlan( + top_helpful=top_helpful, + statistical_sample=statistical_sample, + positive_count=positive_count, + negative_count=negative_count, + total=total, + ) diff --git a/backend/app/core/stopwords_zh.py b/backend/app/core/stopwords_zh.py new file mode 100644 index 0000000000000000000000000000000000000000..e7b685cbc302f3e6e86d130e5f47010911bc675f --- /dev/null +++ b/backend/app/core/stopwords_zh.py @@ -0,0 +1,39 @@ +""" +Chinskie stop words dla NLP pipeline. 
# Single-character tokens that DO carry signal and must survive filtering.
SINGLE_CHAR_EXCEPTIONS = {"卡", "肝", "爽", "氪", "菜", "毒"}

# Stop words — frequent tokens with no informational value.
STOPWORDS_ZH = {
    # Pronouns
    "我", "你", "他", "她", "它", "我们", "你们", "他们",
    # Particles and conjunctions
    "的", "了", "是", "在", "不", "有", "和", "就",
    "都", "也", "很", "要", "会", "可以", "这", "那",
    "还", "没", "着", "被", "把", "让", "给", "从",
    "到", "对", "但", "而", "或", "与",
    # Modal particles
    "吗", "呢", "啊", "吧", "呀", "嘛", "哦", "哈",
    # Adverbs
    "比较", "非常", "真的", "确实", "其实", "可能",
    "已经", "一直", "马上", "刚刚",
    # Generic verbs
    "觉得", "感觉", "知道", "看到", "说",
    # Numerals and determiners
    "一个", "一些", "这个", "那个", "什么", "怎么",
    "多少", "几个",
    # Filler common in game reviews
    "这游戏", "这个游戏", "游戏", "玩家",
}


def is_stopword(token: str) -> bool:
    """Return True when *token* carries no signal.

    A token is filtered out when it is a listed stop word, or when it is a
    single character outside the whitelist of meaningful single characters.
    """
    if token in STOPWORDS_ZH:
        return True
    return len(token) == 1 and token not in SINGLE_CHAR_EXCEPTIONS
+""" + +from app.core.config import settings + + +async def get_ttl_hours(app_id: str) -> int: + """Return TTL in hours based on whether the game is a priority game.""" + from app.db.mongodb import mongodb + + priority_ids = await mongodb.get_priority_game_ids_for_analysis() + + if app_id in priority_ids: + return settings.cache_ttl_worker_managed_hours # 1440h (60d) + return settings.cache_ttl_on_demand_hours # 1440h (60d) diff --git a/backend/app/core/worker_logging.py b/backend/app/core/worker_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..1b6be9341c3dacc145f6565eb4cd8f6bef0d0675 --- /dev/null +++ b/backend/app/core/worker_logging.py @@ -0,0 +1,316 @@ +""" +Structured logging infrastructure for Worker and Live API. + +Provides JSON-line file logging with rotation, timing context managers, +and module-level accessors for use across the codebase. +""" + +import json +import logging +import logging.handlers +import os +import time +from typing import Any + +from app.core.config import settings + +# Module-level state +_structured_logger: logging.Logger | None = None +_cycle_id: str | None = None +_app_logging_initialized: bool = False + +# Per-process log file whitelists (key → filename) +LIVE_LOG_WHITELIST: dict[str, str] = { + "live": "live.jsonl", + "errors": "errors.log", + "nlp_debug": "nlp_debug.log", +} +WORKER_LOG_WHITELIST: dict[str, str] = { + "worker": "worker.jsonl", + "errors": "errors.log", + "nlp_debug": "nlp_debug.log", +} + + +class DebugOnlyFilter(logging.Filter): + """Pass only DEBUG-level records (blocks INFO and above).""" + + def filter(self, record: logging.LogRecord) -> bool: + return record.levelno == logging.DEBUG + + +def _get_writable_log_dir() -> str: + """Return the first writable log directory (primary or fallback).""" + log_dir = settings.worker_log_dir + try: + os.makedirs(log_dir, exist_ok=True) + test_path = os.path.join(log_dir, ".write_test") + with open(test_path, "w") as f: + f.write("ok") + 
class JsonLineFormatter(logging.Formatter):
    """Render each log record as a single JSON line (JSONL)."""

    # Optional structured attributes copied from the record when present,
    # in this exact output order.
    _FIELDS = (
        "detail", "elapsed_s", "breakdown", "app_id",
        "game_name", "source", "reviews_processed",
        "topics_found", "analysis_type", "cycle_id", "error",
    )

    def format(self, record: logging.LogRecord) -> str:
        """Build the JSON payload for *record* and serialize it on one line."""
        payload: dict[str, Any] = {
            "ts": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            # `event` falls back to the formatted message when not set explicitly.
            "event": getattr(record, "event", record.getMessage()),
        }

        # Copy optional structured fields off the record (None values skipped).
        for field in self._FIELDS:
            value = getattr(record, field, None)
            if value is not None:
                payload[field] = value

        # Fall back to the module-level cycle id when the record carried none.
        if payload.get("cycle_id") is None:
            current = get_cycle_id()
            if current:
                payload["cycle_id"] = current

        # Strip any remaining None values before serializing.
        payload = {key: val for key, val in payload.items() if val is not None}

        return json.dumps(payload, default=str, ensure_ascii=False)
+ """ + logger = logging.getLogger(f"structured.{name}") + logger.setLevel(logging.INFO) + + # Don't add duplicate handlers on re-init + if logger.handlers: + return logger + + log_dir = _get_writable_log_dir() + log_path = os.path.join(log_dir, f"{name}.jsonl") + handler = logging.handlers.RotatingFileHandler( + log_path, + maxBytes=settings.worker_log_max_bytes, + backupCount=settings.worker_log_backup_count, + encoding="utf-8", + ) + handler.setFormatter(JsonLineFormatter()) + logger.addHandler(handler) + + # Also store as module-level default + set_structured_logger(logger) + + return logger + + +class TimingContext: + """Sync context manager that measures wall-clock time via time.monotonic().""" + + def __init__(self) -> None: + self.elapsed_s: float = 0.0 + self._start: float = 0.0 + + def __enter__(self) -> "TimingContext": + self._start = time.monotonic() + return self + + def __exit__(self, *exc: Any) -> None: + self.elapsed_s = round(time.monotonic() - self._start, 3) + + +class AsyncTimingContext: + """Async context manager that measures wall-clock time via time.monotonic().""" + + def __init__(self) -> None: + self.elapsed_s: float = 0.0 + self._start: float = 0.0 + + async def __aenter__(self) -> "AsyncTimingContext": + self._start = time.monotonic() + return self + + async def __aexit__(self, *exc: Any) -> None: + self.elapsed_s = round(time.monotonic() - self._start, 3) + + +def read_log_tail( + path: str, + lines: int = 100, + level: str | None = None, + event: str | None = None, +) -> list[dict[str, Any]]: + """ + Read last N JSON lines from a log file, with optional filtering. + + Args: + path: Path to .jsonl log file. + lines: Max number of lines to return. + level: Filter by log level (e.g. "ERROR"). + event: Filter by event name substring. + + Returns: + List of parsed JSON dicts, newest last. 
+ """ + if not os.path.exists(path): + return [] + + # Read all lines, take last N (simple approach for small-ish files) + with open(path, "r", encoding="utf-8") as f: + all_lines = f.readlines() + + # Parse from the end, collect up to `lines` matching entries + results: list[dict[str, Any]] = [] + for raw in reversed(all_lines): + raw = raw.strip() + if not raw: + continue + try: + entry = json.loads(raw) + except json.JSONDecodeError: + continue + + if level and entry.get("level") != level: + continue + if event and event not in entry.get("event", ""): + continue + + results.append(entry) + if len(results) >= lines: + break + + results.reverse() # Restore chronological order + return results + + +def resolve_log_path(file_key: str, whitelist: dict[str, str]) -> str | None: + """ + Resolve a whitelisted log file key to its absolute path. + + Returns the expected path if the key is in the whitelist, None otherwise. + The file may not exist yet (read_log_tail handles that gracefully). + + Args: + file_key: Logical name for the log file (e.g. "live", "errors"). + whitelist: Mapping of allowed keys to filenames for this process. + + Returns: + Absolute path to the log file, or None if key is not whitelisted. + """ + filename = whitelist.get(file_key) + if not filename: + return None + + primary = os.path.join(settings.worker_log_dir, filename) + if os.path.isdir(settings.worker_log_dir): + return primary + + return os.path.join(settings.worker_log_fallback_dir, filename) + + +def setup_app_logging() -> None: + """ + Set up application-wide file logging handlers. Idempotent. + + Creates: + - errors.log: WARNING+ from all loggers (attached to root logger) + - nlp_debug.log: DEBUG-only NLP trace from app.services.nlp_service + + Call once during app lifespan startup, after setup_structured_logger(). + """ + global _app_logging_initialized + if _app_logging_initialized: + return + _app_logging_initialized = True + + log_dir = _get_writable_log_dir() + + # 1. 
errors.log — WARNING+ from root (catches all loggers via propagation) + errors_handler = logging.handlers.RotatingFileHandler( + os.path.join(log_dir, "errors.log"), + maxBytes=settings.errors_log_max_bytes, + backupCount=settings.worker_log_backup_count, + encoding="utf-8", + ) + errors_handler.setLevel(logging.WARNING) + errors_handler.setFormatter(JsonLineFormatter()) + logging.getLogger().addHandler(errors_handler) + + # 2. nlp_debug.log — DEBUG-only NLP trace (Dedup/Cache messages) + nlp_handler = logging.handlers.RotatingFileHandler( + os.path.join(log_dir, "nlp_debug.log"), + maxBytes=settings.nlp_debug_log_max_bytes, + backupCount=settings.worker_log_backup_count, + encoding="utf-8", + ) + nlp_handler.setLevel(logging.DEBUG) + nlp_handler.addFilter(DebugOnlyFilter()) + nlp_handler.setFormatter(JsonLineFormatter()) + + nlp_logger = logging.getLogger("app.services.nlp_service") + nlp_logger.setLevel(logging.DEBUG) + nlp_logger.addHandler(nlp_handler) + + # 3. Optional: re-enable NLP debug to stdout + if settings.nlp_verbose_logging: + verbose_handler = logging.StreamHandler() + verbose_handler.setLevel(logging.DEBUG) + verbose_handler.setFormatter(logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + )) + nlp_logger.addHandler(verbose_handler) + + +def get_structured_logger() -> logging.Logger | None: + """Get the module-level structured logger (if initialized).""" + return _structured_logger + + +def set_structured_logger(logger: logging.Logger) -> None: + """Set the module-level structured logger.""" + global _structured_logger + _structured_logger = logger + + +def get_cycle_id() -> str | None: + """Get the current worker cycle ID.""" + return _cycle_id + + +def set_cycle_id(cycle_id: str | None) -> None: + """Set the current worker cycle ID.""" + global _cycle_id + _cycle_id = cycle_id + + +def log_structured( + event: str, + level: int = logging.INFO, + **kwargs: Any, +) -> None: + """ + Emit a structured log entry via the 
"""
MongoDB database connection module.

Uses Motor (the async driver) for asynchronous MongoDB communication and
implements an analysis-results cache with a 24h TTL.
"""

import asyncio
import logging
import re
from datetime import datetime, timedelta, timezone
from typing import Any

from bson.codec_options import CodecOptions
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
from pymongo import ASCENDING, DESCENDING, UpdateOne
from pymongo.errors import (
    BulkWriteError,
    ConnectionFailure,
    OperationFailure,
    PyMongoError,
)

from app.core.config import settings

logger = logging.getLogger(__name__)


class MongoDB:
    """
    Manages the MongoDB connection.

    Implemented as a singleton through a global instance. Validates cached
    analysis results against their TTL and stores the Steam game list used
    for autocompletion.

    Attributes:
        client: Motor MongoDB client (None until connect()).
        db: Database handle (None until connect()).
    """

    COLLECTION_ANALYSES = "analyses"
    COLLECTION_GAMES = "games"
    COLLECTION_STEAM_ERRORS = "steam_errors"
    COLLECTION_REFRESH_SCHEDULES = "refresh_schedules"

    def __init__(self) -> None:
        """Create an instance without an active connection."""
        self.client: AsyncIOMotorClient | None = None  # type: ignore
        self.db: AsyncIOMotorDatabase | None = None  # type: ignore

    async def connect(self, max_retries: int = 3) -> None:
        """
        Connect to MongoDB with exponential backoff.

        Also creates the indexes needed for query performance.

        Args:
            max_retries: Maximum number of connection attempts.

        Raises:
            ConnectionError: When the database is unreachable after all attempts.
        """
        for attempt in range(1, max_retries + 1):
            try:
                self.client = AsyncIOMotorClient(settings.mongodb_url, tz_aware=True)
                # tz_aware codec so every datetime read back is timezone-aware.
                codec_options: CodecOptions = CodecOptions(tz_aware=True)
                self.db = self.client.get_database(
                    settings.mongodb_db_name, codec_options=codec_options
                )

                # Verify connectivity before declaring success.
                await self.client.admin.command("ping")
                logger.info(f"Połączono z MongoDB: {settings.mongodb_db_name}")

                await self._create_indexes()
                return

            except (ConnectionFailure, PyMongoError) as e:
                if attempt < max_retries:
                    delay = 2 ** (attempt - 1)  # 1s, 2s, 4s
                    logger.warning(
                        f"MongoDB connection attempt {attempt}/{max_retries} failed: {e}. "
                        f"Retrying in {delay}s..."
                    )
                    await asyncio.sleep(delay)
                else:
                    logger.error(f"MongoDB connection failed after {max_retries} attempts: {e}")
                    raise ConnectionError(
                        f"Nie można połączyć się z MongoDB po {max_retries} próbach: {e}"
                    )

    async def _create_indexes(self) -> None:
        """Create collection indexes (idempotent)."""
        if self.db is None:
            return

        # Analysis cache indexes
        analyses = self.db[self.COLLECTION_ANALYSES]
        await analyses.create_index("game_id", unique=True)

        # Migrate from old global TTL index (cached_at) to per-document TTL (expires_at)
        try:
            existing_indexes = await analyses.index_information()
            for idx_name, idx_info in existing_indexes.items():
                if idx_info.get("expireAfterSeconds") is not None and "cached_at" in str(idx_info.get("key")):
                    await analyses.drop_index(idx_name)
                    logger.info(f"Dropped old TTL index: {idx_name}")
                    break
        except OperationFailure:
            pass  # Old index may not exist

        await analyses.create_index("expires_at", expireAfterSeconds=0)

        # Game-list indexes
        games = self.db[self.COLLECTION_GAMES]
        await games.create_index("appid", unique=True)
        # Index supporting case-insensitive regex search
        await games.create_index("name_lower")
        await games.create_index("name_cn")
        # Sparse flag index: saves space and keeps the lookup fast
        await games.create_index("cn_name_checked", sparse=True)
        await games.create_index("parent_appid", sparse=True)

        # Compound index for sorting games by review count (worker game sync)
        await games.create_index(
            [("positive", DESCENDING), ("negative", DESCENDING)],
            sparse=True,
        )
        await games.create_index(
            [
                ("name_lower", ASCENDING),
                ("app_type", ASCENDING),
                ("positive", DESCENDING),
                ("negative", DESCENDING),
            ]
        )
        await games.create_index("is_priority", sparse=True)

        # Steam API error-cache indexes
        steam_errors = self.db[self.COLLECTION_STEAM_ERRORS]
        await steam_errors.create_index("app_id", unique=True)
        await steam_errors.create_index("expires_at", expireAfterSeconds=0)

        # Indexes for refresh schedules (worker pre-cache)
        schedules = self.db[self.COLLECTION_REFRESH_SCHEDULES]
        await schedules.create_index("app_id", unique=True)
        await schedules.create_index("status")

        logger.debug("Utworzono indeksy MongoDB")
steam_errors.create_index("expires_at", expireAfterSeconds=0) + + # Indexes for refresh schedules (worker pre-cache) + schedules = self.db[self.COLLECTION_REFRESH_SCHEDULES] + await schedules.create_index("app_id", unique=True) + await schedules.create_index("status") + + logger.debug("Utworzono indeksy MongoDB") + + async def disconnect(self) -> None: + """Zamyka połączenie z MongoDB.""" + if self.client: + self.client.close() + logger.info("Rozłączono z MongoDB") + + def _is_document_expired(self, document: dict[str, Any]) -> bool: + """Check if a cache document is expired using expires_at or cached_at fallback. + + With tz_aware=True on the Motor client, all datetimes from MongoDB are + already timezone-aware, so no manual .replace(tzinfo=...) is needed. + """ + now = datetime.now(timezone.utc) + + # New-format: per-document expires_at + expires_at = document.get("expires_at") + if expires_at: + if isinstance(expires_at, str): + expires_at = datetime.fromisoformat(expires_at) + return now >= expires_at + + # Old-format fallback: cached_at + default TTL + cached_at = document.get("cached_at") + if cached_at: + if isinstance(cached_at, str): + cached_at = datetime.fromisoformat(cached_at) + ttl_hours = document.get("ttl_hours", settings.cache_ttl_hours) + return now - cached_at > timedelta(hours=ttl_hours) + + return True # No timestamp info = treat as expired + + async def get_cached_analysis_full(self, game_id: str) -> dict[str, Any] | None: + """ + Returns full cache document (with review IDs, TTL info) or None if expired/missing. 
+ """ + if self.db is None: + return None + + collection = self.db[self.COLLECTION_ANALYSES] + + try: + document = await collection.find_one({"game_id": game_id}) + if not document: + return None + + if self._is_document_expired(document): + logger.info(f"Cache expired for game {game_id}") + return None + + document.pop("_id", None) + return document + + except PyMongoError as e: + logger.error(f"Error reading cache: {e}") + return None + + async def get_stale_analysis(self, game_id: str) -> dict[str, Any] | None: + """ + Returns cache document even if expired. Used by incremental path + to retrieve old review IDs. Returns None only if no document exists. + """ + return await self.get_analysis(game_id) + + async def get_analysis(self, game_id: str) -> dict[str, Any] | None: + """ + Returns an analysis document regardless of TTL. + + Product freshness is evaluated outside MongoDB, so this method is the + canonical read path for "show stale result + refresh" behavior. + """ + if self.db is None: + return None + + collection = self.db[self.COLLECTION_ANALYSES] + + try: + document = await collection.find_one({"game_id": game_id}) + if not document: + return None + + document.pop("_id", None) + return document + + except PyMongoError as e: + logger.error(f"Error reading stale cache: {e}") + return None + + async def get_cached_analysis(self, game_id: str) -> dict[str, Any] | None: + """ + Returns cached analysis results or None if expired/missing. + Backward-compatible wrapper around get_cached_analysis_full. 
+ """ + doc = await self.get_cached_analysis_full(game_id) + if doc is None: + return None + results = doc.get("results") + if isinstance(results, dict) and results.get("cached_at") is None and doc.get("cached_at") is not None: + results = {**results, "cached_at": doc["cached_at"]} + return results + + async def save_analysis( + self, + game_id: str, + results: dict[str, Any], + analyzed_review_ids: list[str] | None = None, + latest_review_timestamp: int = 0, + ttl_hours: int | None = None, + analyzed_at: datetime | None = None, + ) -> None: + """ + Saves analysis results to cache with per-document TTL. + Purges review IDs to keep only the most recent ones (space efficiency). + """ + if self.db is None: + logger.warning("Brak połączenia z MongoDB - nie zapisano cache") + return + + collection = self.db[self.COLLECTION_ANALYSES] + + effective_ttl = ttl_hours or settings.cache_ttl_hours + now = datetime.now(timezone.utc) + analysis_date = analyzed_at + if analysis_date is None: + raw_value = results.get("analysis_date") or results.get("cached_at") + if isinstance(raw_value, str): + analysis_date = datetime.fromisoformat(raw_value) + elif isinstance(raw_value, datetime): + analysis_date = raw_value + if analysis_date is None: + analysis_date = now + + if results.get("analysis_date") is None: + results = {**results, "analysis_date": analysis_date} + + # Purge old IDs — keep only the most recent N + if analyzed_review_ids: + analyzed_review_ids = analyzed_review_ids[-settings.incremental_max_stored_ids:] + + document: dict[str, Any] = { + "game_id": game_id, + "results": results, + "analyzed_review_ids": analyzed_review_ids or [], + "latest_review_timestamp": latest_review_timestamp, + "cached_at": now, + "analyzed_at": analysis_date, + "ttl_hours": effective_ttl, + "expires_at": now + timedelta(hours=effective_ttl), + } + + try: + await collection.update_one( + {"game_id": game_id}, + {"$set": document}, + upsert=True, + ) + logger.info(f"Saved cache for game {game_id} 
(TTL: {effective_ttl}h)") + + except PyMongoError as e: + logger.error(f"Error saving cache: {e}") + + async def delete_cached_analysis(self, game_id: str) -> bool: + """ + Usuwa cache dla danej gry. + + Args: + game_id: Identyfikator gry Steam. + + Returns: + True jeśli usunięto, False w przeciwnym razie. + """ + if self.db is None: + return False + + collection = self.db[self.COLLECTION_ANALYSES] + + try: + result = await collection.delete_one({"game_id": game_id}) + return result.deleted_count > 0 + except PyMongoError as e: + logger.error(f"Błąd usuwania cache: {e}") + return False + + # ========== Steam API Error Cache ========== + + async def get_steam_error(self, app_id: str) -> dict[str, Any] | None: + """ + Sprawdza czy app_id ma cached error. + + Returns: + Dict z polami app_id, status_code, expires_at lub None. + """ + if self.db is None: + return None + + collection = self.db[self.COLLECTION_STEAM_ERRORS] + + try: + document = await collection.find_one({"app_id": app_id}) + if not document: + return None + + document.pop("_id", None) + return document + + except PyMongoError as e: + logger.error(f"Błąd odczytu steam error cache: {e}") + return None + + async def cache_steam_error( + self, app_id: str, status_code: int, ttl_seconds: int + ) -> None: + """ + Cachuje błąd Steam API z automatycznym TTL. + + MongoDB TTL index automatycznie usunie dokument po expires_at. 
+ """ + if self.db is None: + return + + collection = self.db[self.COLLECTION_STEAM_ERRORS] + + document = { + "app_id": app_id, + "status_code": status_code, + "cached_at": datetime.now(timezone.utc), + "expires_at": datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds), + } + + try: + await collection.update_one( + {"app_id": app_id}, + {"$set": document}, + upsert=True, + ) + logger.info( + f"Cached Steam error {status_code} for app {app_id} (TTL: {ttl_seconds}s)" + ) + except PyMongoError as e: + logger.error(f"Błąd zapisu steam error cache: {e}") + + # ========== Metody dla listy gier (autouzupełnianie) ========== + + async def get_games_count(self) -> int: + """Zwraca liczbę gier w bazie.""" + if self.db is None: + return 0 + + collection = self.db[self.COLLECTION_GAMES] + return await collection.count_documents({}) + + async def save_games_batch(self, games: list[dict[str, str]]) -> int: + """ + Zapisuje partię gier do bazy (bulk insert). + + Args: + games: Lista słowników z kluczami 'appid', 'name', opcjonalnie 'developer', 'publisher'. + + Returns: + Liczba zapisanych gier. 
+ """ + if self.db is None or not games: + return 0 + + collection = self.db[self.COLLECTION_GAMES] + + # Dodaj pole name_lower dla wyszukiwania case-insensitive + documents = [] + for game in games: + if not game.get("name"): + continue + + doc = { + "appid": game["appid"], + "name": game["name"], + "name_lower": game["name"].lower(), + } + + # Dodaj opcjonalne pola + if game.get("developer"): + doc["developer"] = game["developer"] + if game.get("publisher"): + doc["publisher"] = game["publisher"] + + documents.append(doc) + + try: + # Użyj ordered=False żeby kontynuować mimo duplikatów + result = await collection.insert_many(documents, ordered=False) + return len(result.inserted_ids) + except BulkWriteError as e: + # Duplicates are expected with ordered=False — count successful inserts + inserted = e.details.get("nInserted", 0) + logger.debug(f"Pominięto duplikaty podczas zapisu gier ({inserted} inserted)") + return inserted + except PyMongoError as e: + logger.error(f"Błąd zapisu gier: {e}") + return 0 + + async def clear_games(self) -> None: + """Usuwa wszystkie gry z bazy.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_GAMES] + await collection.delete_many({}) + logger.info("Usunięto wszystkie gry z bazy") + + async def upsert_game(self, game_data: dict[str, Any]) -> None: + """ + Dodaje lub aktualizuje pojedynczą grę w bazie danych. + Używane głównie przez mechanizm Fallback Search. 
+ """ + if self.db is None: + return + + collection = self.db[self.COLLECTION_GAMES] + appid = str(game_data["appid"]) + + # Przygotuj dokument + update_doc = { + "appid": appid, + "name": game_data["name"], + "name_lower": game_data["name"].lower(), + } + + if game_data.get("name_cn"): + update_doc["name_cn"] = game_data["name_cn"] + update_doc["cn_name_checked"] = True + elif game_data.get("cn_name_checked"): + update_doc["cn_name_checked"] = True + + if game_data.get("header_image") is not None: + update_doc["header_image"] = game_data["header_image"] + if game_data.get("total_reviews") is not None: + update_doc["total_reviews"] = game_data["total_reviews"] + + # Worker-supplied fields + for field in ( + "positive", "negative", "tags", "genre", "ccu", + "last_game_update_at", "synced_at", "developer", "publisher", + "app_type", "parent_appid", "dlc_checked_at", + ): + if game_data.get(field) is not None: + update_doc[field] = game_data[field] + + try: + await collection.update_one( + {"appid": appid}, + {"$set": update_doc}, + upsert=True + ) + logger.debug(f"Zsynchronizowano grę {appid} w MongoDB") + except PyMongoError as e: + logger.error(f"Błąd upsert gry {appid}: {e}") + + async def search_games(self, query: str, limit: int = 10) -> list[dict[str, Any]]: + """ + Wyszukuje gry po nazwie (EN lub CN). + + Używa wyszukiwania case-insensitive z prefiksem. + + Args: + query: Tekst do wyszukania. + limit: Maksymalna liczba wyników. + + Returns: + Lista gier pasujących do zapytania (appid, name, name_cn, developer, publisher). 
+ """ + normalized_query = query.strip() + if self.db is None or not normalized_query or len(normalized_query) < 2: + return [] + + collection = self.db[self.COLLECTION_GAMES] + + try: + query_lower = normalized_query.lower() + name_pattern = re.escape(query_lower) + name_prefix_pattern = f"^{name_pattern}" + name_exact_pattern = f"^{name_pattern}$" + cn_pattern = re.escape(normalized_query) + cn_prefix_pattern = f"^{cn_pattern}" + cn_exact_pattern = f"^{cn_pattern}$" + + match_filter: dict[str, Any] = { + "$or": [ + {"name_lower": {"$regex": name_pattern}}, + {"name_cn": {"$regex": cn_pattern, "$options": "i"}}, + ] + } + if not settings.dlc_visible_in_search: + match_filter["app_type"] = {"$ne": "dlc"} + + pipeline = [ + {"$match": match_filter}, + { + "$addFields": { + "match_rank": { + "$switch": { + "branches": [ + { + "case": { + "$or": [ + { + "$regexMatch": { + "input": {"$ifNull": ["$name_lower", ""]}, + "regex": name_exact_pattern, + } + }, + { + "$regexMatch": { + "input": {"$ifNull": ["$name_cn", ""]}, + "regex": cn_exact_pattern, + "options": "i", + } + }, + ] + }, + "then": 0, + }, + { + "case": { + "$or": [ + { + "$regexMatch": { + "input": {"$ifNull": ["$name_lower", ""]}, + "regex": name_prefix_pattern, + } + }, + { + "$regexMatch": { + "input": {"$ifNull": ["$name_cn", ""]}, + "regex": cn_prefix_pattern, + "options": "i", + } + }, + ] + }, + "then": 1, + }, + ], + "default": 2, + } + }, + "type_rank": { + "$switch": { + "branches": [ + { + "case": { + "$in": [ + {"$ifNull": ["$app_type", "unknown"]}, + ["game", "unknown"], + ] + }, + "then": 0, + }, + {"case": {"$eq": ["$app_type", "dlc"]}, "then": 1}, + {"case": {"$eq": ["$app_type", "demo"]}, "then": 2}, + ], + "default": 1, + } + }, + "review_count": { + "$add": [ + {"$ifNull": ["$positive", 0]}, + {"$ifNull": ["$negative", 0]}, + ] + }, + } + }, + { + "$sort": { + "match_rank": 1, + "type_rank": 1, + "review_count": -1, + "name": 1, + } + }, + {"$limit": limit}, + { + "$project": { + "_id": 0, 
+ "appid": 1, + "name": 1, + "name_cn": 1, + "developer": 1, + "publisher": 1, + "app_type": 1, + "parent_appid": 1, + } + }, + ] + + cursor = collection.aggregate(pipeline) + results = await cursor.to_list(length=limit) + return results + + except PyMongoError as e: + logger.error(f"Błąd wyszukiwania gier: {e}") + return [] + + + async def get_game_update_date(self, app_id: str) -> datetime | None: + """Get the last game update timestamp for a game.""" + if self.db is None: + return None + + collection = self.db[self.COLLECTION_GAMES] + try: + doc = await collection.find_one( + {"appid": str(app_id)}, + {"_id": 0, "last_game_update_at": 1}, + ) + if doc and doc.get("last_game_update_at"): + val = doc["last_game_update_at"] + if isinstance(val, datetime): + return val + return None + return None + except PyMongoError as e: + logger.error(f"Error getting game update date for {app_id}: {e}") + return None + + async def get_games_without_cn_name(self, limit: int = 200) -> list[dict[str, Any]]: + """ + Pobiera gry, które nie mają jeszcze nazwy chińskiej i nie były sprawdzane. + Sortuje po liczbie pozytywnych recenzji (jeśli dostępne, dla priorytetyzacji). + """ + if self.db is None: + return [] + + collection = self.db[self.COLLECTION_GAMES] + try: + pipeline = [ + {"$match": { + "name_cn": {"$exists": False}, + "cn_name_checked": {"$ne": True}, # Pomiń już sprawdzone + }}, + # Sortowanie po positive (DESC), ale gry bez tego pola trafią na koniec (sparse index handling) + {"$sort": {"positive": -1}}, + {"$limit": limit}, + {"$project": {"_id": 0, "appid": 1, "name": 1}}, + ] + cursor = collection.aggregate(pipeline) + return await cursor.to_list(length=limit) + except PyMongoError as e: + logger.error(f"Error getting games without CN name: {e}") + return [] + + async def mark_cn_name_checked(self, app_id: str, name_cn: str | None = None) -> None: + """ + Oznacza grę jako sprawdzoną pod kątem chińskiej nazwy. + Opcjonalnie zapisuje znalezioną nazwę. 
+ """ + if self.db is None: + return + + collection = self.db[self.COLLECTION_GAMES] + update_doc: dict[str, Any] = {"cn_name_checked": True} + if name_cn: + update_doc["name_cn"] = name_cn + + try: + await collection.update_one( + {"appid": str(app_id)}, + {"$set": update_doc} + ) + except PyMongoError as e: + logger.error(f"Error marking CN name checked for {app_id}: {e}") + + async def get_games_missing_app_type(self, limit: int = 200) -> list[dict[str, Any]]: + """ + Return high-signal games that still need Steam Store type enrichment. + + We prioritize already-priority games first, then any app with enough reviews + to qualify a DLC for worker-managed analysis. + """ + if self.db is None: + return [] + + collection = self.db[self.COLLECTION_GAMES] + try: + pipeline = [ + { + "$addFields": { + "total_reviews_sum": { + "$add": [ + {"$ifNull": ["$positive", 0]}, + {"$ifNull": ["$negative", 0]}, + ] + } + } + }, + { + "$match": { + "dlc_checked_at": {"$exists": False}, + "$or": [ + {"is_priority": True}, + { + "total_reviews_sum": { + "$gte": settings.dlc_min_reviews_for_analysis + } + }, + ], + } + }, + {"$sort": {"is_priority": -1, "total_reviews_sum": -1}}, + {"$limit": limit}, + {"$project": {"_id": 0, "appid": 1, "name": 1}}, + ] + cursor = collection.aggregate(pipeline) + return await cursor.to_list(length=limit) + except PyMongoError as e: + logger.error(f"Error getting games missing app type: {e}") + return [] + + async def mark_app_type_checked( + self, + app_id: str, + *, + app_type: str, + parent_appid: str | None = None, + ) -> None: + """Persist Steam Store app type metadata.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_GAMES] + update_doc: dict[str, Any] = { + "app_type": app_type, + "parent_appid": str(parent_appid) if parent_appid else None, + "dlc_checked_at": datetime.now(timezone.utc), + } + + try: + await collection.update_one( + {"appid": str(app_id)}, + {"$set": update_doc}, + ) + except PyMongoError as e: + 
logger.error(f"Error marking app type checked for {app_id}: {e}") + + # ========== Worker Methods ========== + + async def upsert_games_batch(self, games: list[dict[str, Any]]) -> tuple[int, int]: + """ + Bulk upsert games via UpdateOne operations. + + Returns: + (upserted_count, modified_count) + """ + if self.db is None or not games: + return (0, 0) + + collection = self.db[self.COLLECTION_GAMES] + operations = [] + + for game in games: + appid = str(game.get("appid", "")) + name = game.get("name", "") + if not appid or not name: + continue + + update_doc: dict[str, Any] = { + "appid": appid, + "name": name, + "name_lower": name.lower(), + } + for field in ( + "developer", "publisher", "positive", "negative", + "tags", "genre", "ccu", "synced_at", + "app_type", "parent_appid", "dlc_checked_at", + ): + if game.get(field) is not None: + update_doc[field] = game[field] + + operations.append( + UpdateOne({"appid": appid}, {"$set": update_doc}, upsert=True) + ) + + if not operations: + return (0, 0) + + try: + result = await collection.bulk_write(operations, ordered=False) + return (result.upserted_count, result.modified_count) + except BulkWriteError as e: + details = e.details or {} + return (details.get("nUpserted", 0), details.get("nModified", 0)) + except PyMongoError as e: + logger.error(f"Error in upsert_games_batch: {e}") + return (0, 0) + + async def get_top_games_by_reviews(self, limit: int = 500) -> list[dict[str, Any]]: + """Top N games sorted by total review count (positive + negative) DESC.""" + if self.db is None: + return [] + + collection = self.db[self.COLLECTION_GAMES] + try: + pipeline = [ + {"$match": {"positive": {"$exists": True}, "negative": {"$exists": True}}}, + {"$addFields": {"total_reviews_sum": {"$add": ["$positive", "$negative"]}}}, + {"$sort": {"total_reviews_sum": -1}}, + {"$limit": limit}, + {"$project": {"_id": 0}}, + ] + cursor = collection.aggregate(pipeline) + return await cursor.to_list(length=limit) + except PyMongoError as e: + 
logger.error(f"Error getting top games: {e}") + return [] + + async def update_game_update_date(self, app_id: str, update_at: datetime) -> None: + """Store the latest game update timestamp.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_GAMES] + try: + await collection.update_one( + {"appid": str(app_id)}, + {"$set": {"last_game_update_at": update_at}}, + ) + except PyMongoError as e: + logger.error(f"Error updating game update date for {app_id}: {e}") + + async def update_game_patch_date(self, app_id: str, patch_date: datetime) -> None: + """Store the latest confirmed major-update timestamp.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_GAMES] + try: + await collection.update_one( + {"appid": str(app_id)}, + {"$set": {"current_patch_at": patch_date}}, + ) + except PyMongoError as e: + logger.error(f"Error updating game patch date for {app_id}: {e}") + + async def update_news_cursor(self, app_id: str, gid: str, date: datetime) -> None: + """Store the latest seen news GID and its date as an incremental scan cursor.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_GAMES] + try: + await collection.update_one( + {"appid": str(app_id)}, + {"$set": {"last_seen_news_gid": gid, "last_seen_news_at": date}}, + ) + except PyMongoError as e: + logger.error(f"Error updating news cursor for {app_id}: {e}") + + async def get_game_patch_date(self, app_id: str) -> datetime | None: + """Get the latest confirmed major-update timestamp for a game.""" + if self.db is None: + return None + + collection = self.db[self.COLLECTION_GAMES] + try: + doc = await collection.find_one( + {"appid": str(app_id)}, + {"_id": 0, "current_patch_at": 1}, + ) + if doc and doc.get("current_patch_at"): + val = doc["current_patch_at"] + if isinstance(val, datetime): + return val + return None + return None + except PyMongoError as e: + logger.error(f"Error getting game patch date for {app_id}: {e}") + return None + + async 
def upsert_refresh_schedule(self, schedule: dict[str, Any]) -> None: + """Create or replace a refresh schedule document.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_REFRESH_SCHEDULES] + try: + await collection.update_one( + {"app_id": schedule["app_id"]}, + {"$set": schedule}, + upsert=True, + ) + except PyMongoError as e: + logger.error(f"Error upserting refresh schedule for {schedule.get('app_id')}: {e}") + + async def get_active_schedules(self) -> list[dict[str, Any]]: + """All schedules with status: 'active'.""" + if self.db is None: + return [] + + collection = self.db[self.COLLECTION_REFRESH_SCHEDULES] + try: + cursor = collection.find({"status": "active"}, {"_id": 0}) + return await cursor.to_list(length=10000) + except PyMongoError as e: + logger.error(f"Error getting active schedules: {e}") + return [] + + async def has_due_refresh_schedule(self, app_id: str) -> bool: + """True when an active schedule has at least one due, incomplete checkpoint.""" + if self.db is None: + return False + + collection = self.db[self.COLLECTION_REFRESH_SCHEDULES] + now = datetime.now(timezone.utc) + try: + document = await collection.find_one( + { + "app_id": str(app_id), + "status": "active", + "checkpoints": { + "$elemMatch": { + "completed": False, + "due_at": {"$lte": now}, + } + }, + }, + {"_id": 0, "app_id": 1}, + ) + return document is not None + except PyMongoError as e: + logger.error(f"Error checking due refresh schedule for {app_id}: {e}") + return False + + async def mark_checkpoint_completed(self, app_id: str, offset_hours: int) -> None: + """Mark a specific checkpoint as completed using positional $ update.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_REFRESH_SCHEDULES] + try: + await collection.update_one( + {"app_id": str(app_id), "checkpoints.offset_hours": offset_hours}, + {"$set": {"checkpoints.$.completed": True}}, + ) + except PyMongoError as e: + logger.error(f"Error marking checkpoint for 
{app_id}/{offset_hours}h: {e}") + + async def complete_schedule(self, app_id: str) -> None: + """Set schedule status to 'completed'.""" + if self.db is None: + return + + collection = self.db[self.COLLECTION_REFRESH_SCHEDULES] + try: + await collection.update_one( + {"app_id": str(app_id)}, + {"$set": {"status": "completed"}}, + ) + except PyMongoError as e: + logger.error(f"Error completing schedule for {app_id}: {e}") + + # ========== Priority Games Methods ========== + + async def get_priority_games(self) -> list[dict[str, Any]]: + """All games with is_priority == True, all fields except _id.""" + if self.db is None: + return [] + + collection = self.db[self.COLLECTION_GAMES] + try: + cursor = collection.find({"is_priority": True}, {"_id": 0}) + return await cursor.to_list(length=10000) + except PyMongoError as e: + logger.error(f"Error getting priority games: {e}") + return [] + + async def get_priority_games_for_analysis(self) -> list[dict[str, Any]]: + """ + Priority games eligible for worker-managed analysis. + + DLC stays linked to the priority universe via is_priority, but low-review DLC + falls back to on-demand mode instead of occupying worker capacity. 
+ """ + if self.db is None: + return [] + + collection = self.db[self.COLLECTION_GAMES] + if settings.dlc_worker_analysis_enabled: + query: dict[str, Any] = { + "is_priority": True, + "$or": [ + {"app_type": {"$ne": "dlc"}}, + { + "$expr": { + "$gte": [ + { + "$add": [ + {"$ifNull": ["$positive", 0]}, + {"$ifNull": ["$negative", 0]}, + ] + }, + settings.dlc_min_reviews_for_analysis, + ] + } + }, + ], + } + else: + query = { + "is_priority": True, + "app_type": {"$ne": "dlc"}, + } + + try: + cursor = collection.find(query, {"_id": 0}) + return await cursor.to_list(length=10000) + except PyMongoError as e: + logger.error(f"Error getting priority games for analysis: {e}") + return [] + + async def get_priority_game_ids(self) -> set[str]: + """Lightweight set of appids for is_priority == True games.""" + if self.db is None: + return set() + + collection = self.db[self.COLLECTION_GAMES] + try: + cursor = collection.find({"is_priority": True}, {"_id": 0, "appid": 1}) + docs = await cursor.to_list(length=10000) + return {str(d["appid"]) for d in docs if d.get("appid")} + except PyMongoError as e: + logger.error(f"Error getting priority game ids: {e}") + return set() + + async def get_priority_game_ids_for_analysis(self) -> set[str]: + """App IDs that should behave as worker-managed in runtime decisions.""" + docs = await self.get_priority_games_for_analysis() + return {str(d["appid"]) for d in docs if d.get("appid")} + + async def get_dlcs_by_parent_appid(self, parent_appid: str) -> list[dict[str, Any]]: + """Return DLC documents linked to a given base game.""" + if self.db is None: + return [] + + collection = self.db[self.COLLECTION_GAMES] + try: + cursor = collection.find( + {"app_type": "dlc", "parent_appid": str(parent_appid)}, + {"_id": 0}, + ) + return await cursor.to_list(length=1000) + except PyMongoError as e: + logger.error(f"Error getting DLCs for parent {parent_appid}: {e}") + return [] + + async def get_existing_appids(self, appids: set[str]) -> set[str]: + 
"""Return the subset of the given appids that have a document in games.""" + if self.db is None or not appids: + return set() + + collection = self.db[self.COLLECTION_GAMES] + try: + cursor = collection.find( + {"appid": {"$in": list(appids)}}, + {"_id": 0, "appid": 1}, + ) + docs = await cursor.to_list(length=len(appids) + 1) + return {str(d["appid"]) for d in docs if d.get("appid")} + except PyMongoError as e: + logger.error(f"Error in get_existing_appids: {e}") + return set() + + async def bulk_update_priority_fields(self, updates: list[tuple[str, dict]]) -> int: + """ + Batch UpdateOne operations for priority fields. + + Args: + updates: List of (appid, fields_dict) tuples. + + Returns: + modified_count + """ + if self.db is None or not updates: + return 0 + + collection = self.db[self.COLLECTION_GAMES] + operations = [ + UpdateOne({"appid": appid}, {"$set": fields}) + for appid, fields in updates + ] + + try: + result = await collection.bulk_write(operations, ordered=False) + return result.modified_count + except BulkWriteError as e: + details = e.details or {} + return details.get("nModified", 0) + except PyMongoError as e: + logger.error(f"Error in bulk_update_priority_fields: {e}") + return 0 + + +# Globalna instancja (Singleton) +mongodb = MongoDB() diff --git a/backend/app/main.py b/backend/app/main.py new file mode 100644 index 0000000000000000000000000000000000000000..4981123306abe9c6769333cf474a004ad615f697 --- /dev/null +++ b/backend/app/main.py @@ -0,0 +1,159 @@ +import logging +import os +from contextlib import asynccontextmanager +from typing import AsyncGenerator + +from fastapi import FastAPI, Query, Request, Response +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from fastapi.responses import FileResponse, JSONResponse +from slowapi import _rate_limit_exceeded_handler +from slowapi.errors import RateLimitExceeded +from starlette.middleware.base import BaseHTTPMiddleware + +from app.core.config 
import settings +from app.core.rate_limit import limiter +from app.core.worker_logging import ( + LIVE_LOG_WHITELIST, + read_log_tail, + resolve_log_path, + setup_app_logging, + setup_structured_logger, +) +from app.db.mongodb import mongodb +from app.routers import analyze, games +from app.services.nlp_service import get_nlp_service +from app.services.steam_service import steam_service + + +# Konfiguracja logowania +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) + + +class SecurityHeadersMiddleware(BaseHTTPMiddleware): + async def dispatch(self, request: Request, call_next): + response: Response = await call_next(request) + response.headers["X-Content-Type-Options"] = "nosniff" + response.headers["X-Frame-Options"] = "SAMEORIGIN" + response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin" + return response + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: + """ + Zarządza cyklem życia aplikacji. + + Nawiązuje połączenie z MongoDB przy starcie + i zamyka je przy wyłączeniu. + """ + if not settings.mongodb_url: + raise RuntimeError( + "MONGODB_URL is not set. Please configure it in .env or environment variables." 
+ ) + await mongodb.connect() + setup_structured_logger("live") + setup_app_logging() + yield + await steam_service.close() + await mongodb.disconnect() + + +app = FastAPI( + title="SentimentStream API", + description="API do analizy sentymentu recenzji gier Steam w czasie rzeczywistym", + version="1.0.0", + lifespan=lifespan, +) + +# Rate limiter +app.state.limiter = limiter +app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) # type: ignore[arg-type] + +# Konfiguracja CORS +app.add_middleware( + CORSMiddleware, + allow_origins=settings.cors_origins_list, + allow_credentials=True, + allow_methods=["GET", "POST", "OPTIONS"], + allow_headers=["Content-Type", "Accept"], +) + +# Security headers +app.add_middleware(SecurityHeadersMiddleware) + +# Rejestracja routerów +app.include_router(analyze.router, prefix="/api", tags=["analyze"]) +app.include_router(games.router, prefix="/api", tags=["games"]) + + +@app.get("/api/logs") +async def get_logs( + request: Request, + lines: int = Query(default=100, ge=1, le=1000), + level: str | None = Query(default=None), + event: str | None = Query(default=None), + file: str = Query(default="live"), +): + """Token-protected endpoint to read structured log tail.""" + auth = request.headers.get("Authorization", "") + expected = settings.worker_trigger_token + if expected: + if not auth.startswith("Bearer ") or auth[7:] != expected: + return JSONResponse(status_code=401, content={"detail": "Unauthorized"}) + + log_path = resolve_log_path(file, LIVE_LOG_WHITELIST) + if log_path is None: + return JSONResponse( + status_code=400, + content={"detail": f"Unknown log file: '{file}'. 
Valid: {list(LIVE_LOG_WHITELIST.keys())}"}, + ) + + entries = read_log_tail(log_path, lines=lines, level=level, event=event) + return {"entries": entries, "count": len(entries)} + + +@app.get("/health") +async def health_check() -> dict: + """Endpoint sprawdzający stan aplikacji z rzeczywistą weryfikacją zależności.""" + mongo_ok = False + if mongodb.client is not None: + try: + await mongodb.client.admin.command("ping") + mongo_ok = True + except Exception: + pass + + nlp_svc = get_nlp_service() + model_ok = hasattr(nlp_svc, "classifier") and nlp_svc.classifier is not None + + overall = "healthy" if (mongo_ok and model_ok) else "degraded" + return { + "status": overall, + "mongodb": "connected" if mongo_ok else "disconnected", + "model": "loaded" if model_ok else "not_loaded", + } + + +# Obsługa plików statycznych (Frontend) - tylko jeśli istnieją (np. w Dockerze) +# Ścieżka w kontenerze Docker będzie: /app/frontend/dist +# Lokalnie zazwyczaj nie istnieje (bo używamy vite dev server), więc pomijamy +static_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "frontend", "dist") + +if settings.app_mode != "api" and os.path.exists(static_dir): + app.mount("/assets", StaticFiles(directory=os.path.join(static_dir, "assets")), name="assets") + + # Catch-all dla SPA (React Router) + @app.get("/{full_path:path}") + async def serve_spa(full_path: str): + if full_path.startswith("api"): + return {"error": "API route not found"} + + file_path = os.path.join(static_dir, full_path) + if os.path.exists(file_path) and os.path.isfile(file_path): + return FileResponse(file_path) + + return FileResponse(os.path.join(static_dir, "index.html")) diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..66c18974975a728ee650f4c8d77557353ff1e027 --- /dev/null +++ b/backend/app/models/__init__.py @@ -0,0 +1,19 @@ +"""Modele danych Pydantic.""" + +from app.models.schemas 
import ( + AnalysisProgress, + AnalysisResult, + GameInfo, + ReviewBatch, + SentimentType, + TopicSentiment, +) + +__all__ = [ + "AnalysisProgress", + "AnalysisResult", + "GameInfo", + "ReviewBatch", + "SentimentType", + "TopicSentiment", +] diff --git a/backend/app/models/schemas.py b/backend/app/models/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..e81ddd2e3e4e98b514a8f4d32b7f5517d4c704cd --- /dev/null +++ b/backend/app/models/schemas.py @@ -0,0 +1,210 @@ +""" +Modele danych Pydantic. + +Definiuje struktury danych używane w API oraz do walidacji. +""" + +from datetime import datetime +from enum import Enum + +from pydantic import BaseModel, Field + + +class SentimentType(str, Enum): + """Typ sentymentu dla tematu.""" + + POSITIVE = "positive" + NEGATIVE = "negative" + NEUTRAL = "neutral" + + +class PredictionType(str, Enum): + """Typ przewidywanego trendu liczby graczy.""" + + INCREASING = "increasing" + DECREASING = "decreasing" + STABLE = "stable" + UNCERTAIN = "uncertain" + + +class UserCountPrediction(BaseModel): + """ + Przewidywanie trendu liczby graczy. + + Attributes: + trend: Przewidywany kierunek (wzrost/spadek). + confidence: Pewność predykcji (0.0 - 1.0). + reasoning: Krótkie uzasadnienie. + """ + + trend: PredictionType + confidence: float + reasoning: str + + +class GameInfo(BaseModel): + """ + Informacje o grze ze Steam. + + Attributes: + app_id: Unikalny identyfikator gry na Steam. + name: Nazwa gry. + name_cn: Chińska nazwa gry (jeśli dostępna). + header_image: URL obrazka nagłówkowego. + total_reviews: Całkowita liczba recenzji. + target_count: Docelowa liczba recenzji do analizy (sample size). + """ + + app_id: str + name: str + name_cn: str | None = None + header_image: str | None = None + total_reviews: int = 0 + target_count: int | None = None + last_game_update_at: int | None = None + + +class TopicSentiment(BaseModel): + """ + Sentyment dla pojedynczego tematu. + + Attributes: + topic: Nazwa tematu (np. 
"Grafika", "Gameplay"). + sentiment: Typ sentymentu. + score: Wynik sentymentu (-1.0 do 1.0). + mention_count: Liczba wzmianek o temacie. + example: Przykładowe zdanie z recenzji. + """ + + topic: str + sentiment: SentimentType + score: float = Field(ge=-1.0, le=1.0) + mention_count: int = 0 + example: str | None = None + example_score: float | None = None # score przykładu do porównań przy agregacji + + +class Highlight(BaseModel): + """Czesto powtarzana fraza z recenzji.""" + + phrase: str + mention_count: int + sentiment: SentimentType + score: float + ngram_size: int + + +class TopicHighlights(BaseModel): + """Highlights dla konkretnego tematu.""" + + topic: str + highlights: list[Highlight] + + +class AnalysisProgress(BaseModel): + """ + Postęp analizy (wysyłany przez SSE). + + Attributes: + processed: Liczba przetworzonych recenzji. + total: Całkowita liczba recenzji do przetworzenia. + current_topics: Aktualne wyniki tematów. + skipped_count: Liczba zdań pominiętych (brak słów kluczowych). + """ + + processed: int + total: int + current_topics: list[TopicSentiment] = [] + skipped_count: int = 0 + + +class AnalysisResult(BaseModel): + """ + Końcowy wynik analizy. + + Attributes: + game: Informacje o grze. + general_topics: Lista tematów z sentymentem (pełny agregat). + prediction: Przewidywanie trendu liczby graczy. + analyzed_reviews: Liczba przeanalizowanych recenzji. + skipped_count: Łączna liczba pominiętych zdań. + cached_at: Data zapisania w cache. 
+ """ + + game: GameInfo + general_topics: list[TopicSentiment] + prediction: UserCountPrediction | None = None + analyzed_reviews: int + skipped_count: int = 0 + general_highlights: list[Highlight] = [] + recent_highlights: list[Highlight] | None = None + current_patch_highlights: list[Highlight] | None = None + topic_highlights: list[TopicHighlights] = [] + cached_at: datetime | None = None + recent_topics: list[TopicSentiment] | None = None + recent_reviews_count: int = 0 + current_patch_topics: list[TopicSentiment] | None = None + current_patch_reviews_count: int = 0 + last_patch_topics: list[TopicSentiment] | None = None + last_patch_reviews_count: int = 0 + current_patch_timestamp: int | None = None + analysis_date: datetime | None = None + current_patch_date: datetime | None = None + preferred_context: str | None = None + freshness_status: str | None = None + staleness_reason: str | None = None + is_refreshing: bool = False + + +class ReviewItem(BaseModel): + """Single review with metadata for incremental tracking.""" + + text: str + recommendation_id: str + timestamp_created: int + + +class ReviewBatch(BaseModel): + """ + Partia recenzji do przetworzenia. + + Attributes: + reviews: Lista tekstów recenzji. + review_items: Recenzje z metadanymi (do incremental analysis). + cursor: Kursor do paginacji Steam API. + """ + + reviews: list[str] + review_items: list[ReviewItem] = [] + cursor: str | None = None + + +class SSEEvent(BaseModel): + """ + Wydarzenie Server-Sent Events. + + Attributes: + event: Typ wydarzenia (progress/complete/error). + data: Dane wydarzenia. + """ + + event: str + data: AnalysisProgress | AnalysisResult | dict + + +class CachedAnalysis(BaseModel): + """ + Dokument cache w MongoDB. + + Przechowuje wyniki analizy z timestampem dla walidacji TTL. + + Attributes: + game_id: Identyfikator gry Steam (klucz cache). + results: Wyniki analizy sentymentu. + cached_at: Data i czas zapisania do cache. 
+ """ + + game_id: str + results: AnalysisResult + cached_at: datetime + analyzed_at: datetime | None = None diff --git a/backend/app/routers/__init__.py b/backend/app/routers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f7331260aae7f51abaa806826f35dd4ac33b7c --- /dev/null +++ b/backend/app/routers/__init__.py @@ -0,0 +1,5 @@ +"""Routery API.""" + +from app.routers import analyze + +__all__ = ["analyze"] diff --git a/backend/app/routers/analyze.py b/backend/app/routers/analyze.py new file mode 100644 index 0000000000000000000000000000000000000000..7abb65160549e5695f648ad963daebbda5c5cbd1 --- /dev/null +++ b/backend/app/routers/analyze.py @@ -0,0 +1,597 @@ +""" +Router API do analizy sentymentu. + +Zawiera endpoint do streamowania wyników analizy przez SSE. +""" + +import asyncio +import contextlib +import json +import logging +import time +from datetime import datetime, timezone +from typing import Any, AsyncGenerator + +from fastapi import APIRouter, HTTPException, Depends, Path, Query, Request +from sse_starlette.sse import EventSourceResponse + +from app.core.config import settings +from app.core.freshness import ( + FreshnessStatus, + evaluate_freshness, + get_staleness_reason, +) +from app.core.sampling import SamplePlan, create_sample_plan +from app.core.ttl_tiers import get_ttl_hours +from app.core.worker_logging import get_structured_logger, log_structured +from app.db.mongodb import mongodb +from app.core.rate_limit import limiter +from app.models.schemas import ( + AnalysisProgress, + AnalysisResult, + GameInfo, + Highlight, + TopicHighlights, + TopicSentiment, +) +from app.services.analysis_utils import ( + aggregate_topics, + calculate_prediction, + coerce_utc_datetime, + compute_preferred_context, + datetime_from_timestamp, + filter_topics_by_min_mentions, + normalize_legacy_results, + serialize_datetime, +) +from app.services.highlights_service import HighlightsCollector +from app.services.analysis_runner import 
iter_incremental_analysis_events +from app.services.nlp_service import NLPService +from app.services.nlp_service import get_nlp_service as _get_nlp_service_instance +from app.services.steam_errors import SteamAPIError, SteamRateLimitError +from app.services.steam_service import SteamService, steam_service + +logger = logging.getLogger(__name__) + +router = APIRouter() + +# Background refresh concurrency control +_refreshing_app_ids: set[str] = set() +_refresh_semaphore = asyncio.Semaphore(3) # max 3 concurrent background refreshes + +# Funkcje pomocnicze dla Dependency Injection +def get_nlp_service() -> NLPService: + return _get_nlp_service_instance() + +def get_steam_service() -> SteamService: + return steam_service + + +@router.get("/health") +async def health_check(): + """ + Endpoint do sprawdzania stanu aplikacji (Health Check). + """ + return { + "status": "ok", + "services": { + "mongodb": "connected", + "nlp": "ready", + "steam_api": "reachable" + } + } + + + +def _build_analysis_payload( + document: dict[str, Any], + freshness_status: FreshnessStatus, + *, + current_patch_at: datetime | None = None, + is_refreshing: bool = False, +) -> dict[str, Any]: + results = normalize_legacy_results(document.get("results", {})) + payload = dict(results) + analysis_date = ( + coerce_utc_datetime(payload.get("analysis_date")) + or coerce_utc_datetime(document.get("analyzed_at")) + or coerce_utc_datetime(payload.get("cached_at")) + or coerce_utc_datetime(document.get("cached_at")) + ) + if current_patch_at is not None: + current_patch_date: datetime | None = current_patch_at + else: + # No confirmed major update in DB — nullify current_patch fields so + # legacy cached values don't appear as a valid Current Patch tab. 
        current_patch_date = None
        payload["current_patch_topics"] = None
        payload["current_patch_reviews_count"] = 0
        payload["current_patch_highlights"] = None
        payload["current_patch_timestamp"] = None

    # Prefer the cached_at already in results; fall back to the document-level one.
    if payload.get("cached_at") is None and document.get("cached_at") is not None:
        payload["cached_at"] = serialize_datetime(document["cached_at"])
    elif payload.get("cached_at") is not None:
        payload["cached_at"] = serialize_datetime(payload["cached_at"])

    payload["analysis_date"] = serialize_datetime(analysis_date)
    payload["current_patch_date"] = serialize_datetime(current_patch_date)
    payload["freshness_status"] = freshness_status.value
    payload["staleness_reason"] = get_staleness_reason(freshness_status)
    payload["is_refreshing"] = is_refreshing
    # Always recompute preferred_context from the current patch date so cached
    # documents with a stale stored value get the correct tab on read.
    patch_ts_for_context = int(current_patch_date.timestamp()) if current_patch_date else None
    payload["preferred_context"] = compute_preferred_context(patch_ts_for_context)
    return payload


async def _full_analysis(
    game: GameInfo,
    sample_plan: SamplePlan,
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
    stale_doc: dict[str, Any] | None = None,
) -> AsyncGenerator[dict, None]:
    """
    Full analysis path — Producer-Consumer queue pattern.

    Streams `progress` SSE events while review batches are fetched and
    analyzed, then persists the result and yields a final `complete` event.

    Args:
        game: Resolved game metadata.
        sample_plan: Stratified sampling plan driving the Steam fetch.
        steam_svc: Steam review fetcher.
        nlp_svc: Sentiment/topic analyzer.
        patch_timestamp: Unix timestamp of the current patch; enables the
            current-patch split when provided.
        stale_doc: Previously cached analysis document, used to archive
            last-patch topics when the patch changed.
    """
    total_target = sample_plan.total
    ttl_hours = await get_ttl_hours(game.app_id)
    nlp_cumulative_s: float = 0.0

    # Producer-Consumer queue (max 5 batches in flight)
    queue: asyncio.Queue = asyncio.Queue(maxsize=5)

    async def fetch_worker():
        # Producer: pushes review batches; None is the end-of-stream sentinel.
        try:
            async for batch in steam_svc.fetch_reviews_stratified(game.app_id, sample_plan):
                await queue.put(batch)
        except Exception as e:
            # Relay all exceptions to consumer via queue — they'll be re-raised
            # and caught by the SSE generator's specific exception handlers.
            await queue.put(e)
        finally:
            await queue.put(None)

    fetch_task = asyncio.create_task(fetch_worker())

    processed = 0
    total_skipped = 0
    aggregated_topics: list[TopicSentiment] = []
    recent_processed = 0
    recent_limit = settings.recent_sample_limit
    all_review_ids: list[str] = []
    latest_timestamp = 0
    highlights_collector = HighlightsCollector()
    current_patch_topics: list[TopicSentiment] = []
    current_patch_count = 0
    review_topic_results: list[tuple[int, list[TopicSentiment]]] = []

    try:
        while True:
            item = await queue.get()

            if item is None:
                break
            if isinstance(item, Exception):
                raise item

            batch = item
            if not batch.reviews:
                continue

            # Collect review IDs for incremental cache
            for ri in batch.review_items:
                all_review_ids.append(ri.recommendation_id)
                if ri.timestamp_created > latest_timestamp:
                    latest_timestamp = ri.timestamp_created

            # Split by patch timestamp when available and we have review_items
            batch_skipped = 0
            if patch_timestamp and batch.review_items:
                for ri, text in zip(batch.review_items, batch.reviews):
                    is_recent = recent_processed < recent_limit
                    cat = []
                    if is_recent:
                        cat.append("recent")

                    if ri.timestamp_created >= patch_timestamp:
                        # Review written after the patch — feed both the
                        # general and the current-patch aggregates.
                        cat.append("current_patch")
                        nlp_start = time.monotonic()
                        res, skipped = await nlp_svc.analyze_batch(
                            [text], highlights_collector=highlights_collector, categories=cat
                        )
                        nlp_cumulative_s += time.monotonic() - nlp_start
                        batch_skipped += skipped
                        if res:
                            aggregated_topics = aggregate_topics(aggregated_topics, res)
                            current_patch_topics = aggregate_topics(current_patch_topics, res)
                            review_topic_results.append((ri.timestamp_created, res))
                        current_patch_count += 1
                    else:
                        nlp_start = time.monotonic()
                        res, skipped = await nlp_svc.analyze_batch(
                            [text], highlights_collector=highlights_collector, categories=cat
                        )
                        nlp_cumulative_s += time.monotonic() - nlp_start
                        batch_skipped += skipped
                        if res:
                            aggregated_topics = aggregate_topics(aggregated_topics, res)
                            review_topic_results.append((ri.timestamp_created, res))

                    recent_processed += 1
            else:
                # No patch split possible — analyze every review the same way.
                for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews):
                    is_recent = recent_processed < recent_limit
                    cat = ["recent"] if is_recent else []

                    nlp_start = time.monotonic()
                    res, skipped = await nlp_svc.analyze_batch(
                        [text], highlights_collector=highlights_collector, categories=cat
                    )
                    nlp_cumulative_s += time.monotonic() - nlp_start
                    batch_skipped += skipped
                    # With the enumerate() fallback `ri` is an index, not a
                    # ReviewItem, so no timestamp is available.
                    ts = ri.timestamp_created if batch.review_items else 0
                    if res:
                        aggregated_topics = aggregate_topics(aggregated_topics, res)
                        review_topic_results.append((ts, res))
                    recent_processed += 1

            total_skipped += batch_skipped
            processed += len(batch.reviews)

            progress = AnalysisProgress(
                processed=processed,
                total=total_target,
                current_topics=aggregated_topics,
                skipped_count=total_skipped,
            )
            yield {"event": "progress", "data": progress.model_dump_json()}

        await fetch_task
    except BaseException:
        # Consumer failed or was cancelled — stop the producer as well.
        fetch_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await fetch_task
        raise

    # Build recent_topics from highest-timestamp reviews
    review_topic_results.sort(key=lambda x: x[0], reverse=True)
    recent_entries = review_topic_results[:recent_limit]
    recent_topics: list[TopicSentiment] = []
    for _, topics_batch in recent_entries:
        for ts in topics_batch:
            recent_topics = aggregate_topics(recent_topics, [ts])
    recent_reviews_count = len(recent_entries)

    # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
    aggregated_topics = filter_topics_by_min_mentions(aggregated_topics)
    recent_topics = filter_topics_by_min_mentions(recent_topics)
    current_patch_topics = filter_topics_by_min_mentions(current_patch_topics)

    prediction = calculate_prediction(aggregated_topics)

    highlights_data = highlights_collector.compute_highlights()
    general_highlights = highlights_data["general"]
    recent_highlights = highlights_data["recent"]
    current_patch_highlights = highlights_data["current_patch"]
    topic_highlights_dict = highlights_data["topics"]

    # Restrict topic highlights to topics that survived the min-mentions filter,
    # so the topic_highlights set is always consistent with general_topics.
    _surviving_topics = {t.topic for t in aggregated_topics}
    topic_highlights_list = [
        TopicHighlights(
            topic=topic,
            highlights=[Highlight(**h) for h in highlights],
        )
        for topic, highlights in topic_highlights_dict.items()
        if topic in _surviving_topics
    ]

    # Show recent tab if we have enough reviews to make the split meaningful
    has_recent_split = processed > recent_limit

    has_current_patch = patch_timestamp is not None and current_patch_count > 0
    analysis_generated_at = datetime.now(timezone.utc)
    current_patch_date = datetime_from_timestamp(patch_timestamp)

    # Archive last_patch_topics when this full analysis replaces a doc with a different patch.
    last_patch_topics: list[TopicSentiment] | None = None
    last_patch_reviews_count = 0
    if stale_doc:
        old_r = normalize_legacy_results(stale_doc.get("results", {}))
        old_patch_ts = old_r.get("current_patch_timestamp")
        if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
            # Patch changed: yesterday's "current patch" becomes "last patch".
            raw_cp = old_r.get("current_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None
            last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0)
        else:
            # Same patch: carry the previously archived last-patch data forward.
            raw_lp = old_r.get("last_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None
            last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0)

    result = AnalysisResult(
        game=game,
        general_topics=aggregated_topics,
        recent_topics=recent_topics if has_recent_split else None,
        recent_reviews_count=recent_reviews_count if has_recent_split else 0,
        current_patch_topics=current_patch_topics if has_current_patch else None,
        current_patch_reviews_count=current_patch_count if has_current_patch else 0,
        last_patch_topics=last_patch_topics,
        last_patch_reviews_count=last_patch_reviews_count,
        current_patch_timestamp=patch_timestamp,
        analysis_date=analysis_generated_at,
        current_patch_date=current_patch_date,
        prediction=prediction,
        analyzed_reviews=processed,
        skipped_count=total_skipped,
        general_highlights=[Highlight(**h) for h in general_highlights],
        recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None,
        current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None,
        topic_highlights=topic_highlights_list,
        cached_at=analysis_generated_at,
        preferred_context=compute_preferred_context(patch_timestamp),
        freshness_status=FreshnessStatus.FRESH.value,
        is_refreshing=False,
    )
    await mongodb.save_analysis(
        game.app_id,
        result.model_dump(),
        analyzed_review_ids=all_review_ids,
        latest_review_timestamp=latest_timestamp,
        ttl_hours=ttl_hours,
        analyzed_at=analysis_generated_at,
    )

    # Log structured timing for full analysis
    if get_structured_logger():
        log_structured(
            "full_analysis_complete",
            app_id=game.app_id,
            game_name=game.name if hasattr(game, "name") else str(game.app_id),
            source="live",
            reviews_processed=processed,
            topics_found=len(aggregated_topics),
            detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)},
        )

    yield {"event": "complete", "data": result.model_dump_json()}


async def _incremental_analysis(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
) -> AsyncGenerator[dict, None]:
    """Incremental analysis SSE wrapper over the shared service implementation."""
    async for event in iter_incremental_analysis_events(
        game,
        stale_doc,
        steam_svc,
        nlp_svc,
        patch_timestamp=patch_timestamp,
        source="live",
    ):
        yield event


async def _background_refresh(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_ts: int | None,
) -> None:
    """Fire-and-forget incremental analysis for stale niche caches."""
    async with _refresh_semaphore:
        try:
            # Drain the event stream; background refreshes have no SSE client.
            async for _ in _incremental_analysis(
                game, stale_doc, steam_svc, nlp_svc, patch_timestamp=patch_ts
            ):
                pass
            logger.info(f"Background refresh completed for {game.app_id}")
        except Exception as e:
            logger.error(f"Background refresh failed for {game.app_id}: {e}")
        finally:
            # Always release the per-app guard, even on failure.
            _refreshing_app_ids.discard(game.app_id)


async def analysis_event_generator(
    game_name: str,
    steam_service: SteamService,
    nlp_service: NLPService,
    *,
    appid: str | None = None,
) -> AsyncGenerator[dict, None]:
    """
    Main SSE event generator. Decides between full and incremental analysis paths.
+ """ + t_start = time.monotonic() + analysis_type = "unknown" + app_id = "" + resolved_game_name = game_name + reviews_processed = 0 + + try: + # 1. Resolve game — use appid directly if provided, otherwise search by name + if appid: + game = await steam_service.get_game_info(appid) + else: + game = await steam_service.search_game(game_name) + if not game: + yield { + "event": "analysis_error", + "data": json.dumps({"message": "ERROR_GAME_NOT_FOUND"}), + } + return + + app_id = game.app_id + resolved_game_name = game.name if hasattr(game, "name") else game_name + + # 1b. Fetch game patch date for current_patch tab / freshness evaluation + patch_date = await mongodb.get_game_patch_date(game.app_id) + patch_ts = int(patch_date.timestamp()) if patch_date else None + if patch_ts: + game = game.model_copy(update={"last_game_update_at": patch_ts}) + + # 2. Load any existing analysis and evaluate product freshness. + analysis_doc = await mongodb.get_analysis(game.app_id) + priority_ids = await mongodb.get_priority_game_ids_for_analysis() + is_priority = game.app_id in priority_ids + is_niche = not is_priority + + if analysis_doc and analysis_doc.get("results"): + freshness_status = evaluate_freshness(analysis_doc, patch_date) + + if freshness_status == FreshnessStatus.FRESH: + analysis_type = "cached" + payload = _build_analysis_payload( + analysis_doc, + freshness_status, + current_patch_at=patch_date, + ) + yield {"event": "result", "data": json.dumps(payload)} + return + + analysis_type = "stale_result" + is_refreshing = ( + await mongodb.has_due_refresh_schedule(game.app_id) + if is_priority + else True + ) + stale_payload = _build_analysis_payload( + analysis_doc, + freshness_status, + current_patch_at=patch_date, + is_refreshing=is_refreshing, + ) + yield {"event": "result", "data": json.dumps(stale_payload)} + + if is_priority: + return + + try: + if settings.incremental_enabled and analysis_doc.get("analyzed_review_ids"): + refresh_generator = 
_incremental_analysis( + game, analysis_doc, steam_service, nlp_service, patch_timestamp=patch_ts + ) + else: + stats = await steam_service.get_review_stats(game.app_id) + sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative) + game = game.model_copy(update={"target_count": sample_plan.total}) + refresh_generator = _full_analysis( + game, + sample_plan, + steam_service, + nlp_service, + patch_timestamp=patch_ts, + stale_doc=analysis_doc, + ) + async for event in refresh_generator: + if event.get("event") == "complete": + try: + data = json.loads(event["data"]) + reviews_processed = data.get("analyzed_reviews", 0) + except (json.JSONDecodeError, KeyError): + pass + yield event + return + except Exception as e: + logger.error(f"Refresh failed for {game.app_id}: {e}") + return + + # 3. No cache at all — live analysis + + analysis_type = "full" + stats = await steam_service.get_review_stats(game.app_id) + sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative) + total_target = sample_plan.total + game = game.model_copy(update={"target_count": total_target}) + + yield {"event": "game_found", "data": game.model_dump_json()} + + if is_niche: + yield { + "event": "state", + "data": json.dumps({"type": "first_live_analysis"}), + } + + async for event in _full_analysis(game, sample_plan, steam_service, nlp_service, patch_timestamp=patch_ts): + if event.get("event") == "complete": + try: + data = json.loads(event["data"]) + reviews_processed = data.get("analyzed_reviews", 0) + except (json.JSONDecodeError, KeyError): + pass + yield event + + except SteamRateLimitError as e: + logger.warning(f"Steam rate limit: {e}") + yield { + "event": "analysis_error", + "data": json.dumps({"message": "ERROR_STEAM_RATE_LIMIT"}), + } + except SteamAPIError as e: + logger.error(f"Steam API error: {e}") + yield { + "event": "analysis_error", + "data": json.dumps({"message": "ERROR_STEAM_API"}), + } + except Exception as e: + # Safety net — SSE 
generator must always send an error event, never crash silently. + logger.error(f"Analysis error: {e}", exc_info=True) + yield { + "event": "analysis_error", + "data": json.dumps({"message": "ERROR_INTERNAL"}), + } + finally: + elapsed = round(time.monotonic() - t_start, 3) + if get_structured_logger(): + log_structured( + "live_analysis", + app_id=app_id, + game_name=resolved_game_name, + analysis_type=analysis_type, + elapsed_s=elapsed, + reviews_processed=reviews_processed, + source="live", + ) + + +@router.get("/analyze/{game_name}") +@limiter.limit(settings.rate_limit_analyze) +async def analyze_game( + request: Request, + game_name: str = Path(..., min_length=1, max_length=200), + appid: str | None = Query(None, min_length=1, max_length=20), + steam_service: SteamService = Depends(get_steam_service), + nlp_service: NLPService = Depends(get_nlp_service), +) -> EventSourceResponse: + """ + Endpoint do analizy sentymentu gry (SSE Stream). + """ + return EventSourceResponse( + analysis_event_generator(game_name, steam_service, nlp_service, appid=appid) + ) + + +@router.get("/game/{game_name}") +@limiter.limit(settings.rate_limit_default) +async def get_game_info( + request: Request, + game_name: str = Path(..., min_length=1, max_length=200), + steam_service: SteamService = Depends(get_steam_service), +) -> dict: + """ + Endpoint do pobierania informacji o grze. + """ + game = await steam_service.search_game(game_name) + if not game: + raise HTTPException( + status_code=404, detail="ERROR_GAME_NOT_FOUND" + ) + + return game.model_dump() diff --git a/backend/app/routers/games.py b/backend/app/routers/games.py new file mode 100644 index 0000000000000000000000000000000000000000..c5f9237a48ea0414b8585fdac6bd05d2d15a6cb6 --- /dev/null +++ b/backend/app/routers/games.py @@ -0,0 +1,68 @@ +""" +Router API do wyszukiwania gier. + +Zawiera endpoint do pobierania sugestii gier dla autouzupełniania. 
+""" + +from fastapi import APIRouter, Query, Request + +from app.core.config import settings +from app.db.mongodb import mongodb +from app.core.rate_limit import limiter + +router = APIRouter() + + +@router.get("/games/suggestions") +@limiter.limit(settings.rate_limit_default) +async def get_game_suggestions( + request: Request, + q: str = Query(..., min_length=2, max_length=100, description="Tekst do wyszukania"), + limit: int = Query(10, ge=1, le=20, description="Maksymalna liczba wyników"), +) -> list[dict[str, str]]: + """ + Endpoint do pobierania sugestii gier dla autouzupełniania. + + Wyszukuje gry po nazwie (case-insensitive). + Wymaga minimum 2 znaków. + + Args: + q: Tekst do wyszukania w nazwie gry. + limit: Maksymalna liczba wyników (1-20). + + Returns: + Lista gier pasujących do zapytania. + + Example: + ``` + GET /api/games/suggestions?q=cyber&limit=5 + + [ + {"appid": "1091500", "name": "Cyberpunk 2077"}, + {"appid": "12345", "name": "Cyber Shadow"}, + ... + ] + ``` + """ + games = await mongodb.search_games(q, limit) + return games + + +@router.get("/games/count") +@limiter.limit(settings.rate_limit_default) +async def get_games_count(request: Request) -> dict[str, int]: + """ + Endpoint do sprawdzenia liczby gier w bazie. + + Returns: + Liczba gier w bazie danych. 
+ + Example: + ``` + GET /api/games/count + + {"count": 85432} + ``` + """ + count = await mongodb.get_games_count() + return {"count": count} diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e40979676dabf3fdf5969ff281ce7261a3f192e --- /dev/null +++ b/backend/app/services/__init__.py @@ -0,0 +1,6 @@ +"""Business services of the application.""" + +from app.services.nlp_service import get_nlp_service +from app.services.steam_service import steam_service + +__all__ = ["get_nlp_service", "steam_service"] diff --git a/backend/app/services/analysis_runner.py b/backend/app/services/analysis_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..98e720d7ab35d7e9ba5103f6059d9bf9c94a32cc --- /dev/null +++ b/backend/app/services/analysis_runner.py @@ -0,0 +1,643 @@ +""" +Analysis Runner — programmatic (non-SSE) analysis for the Worker. + +Extracts the core full-analysis logic from analyze.py without SSE wrapping. +Used by the pre-cache service to run analyses in the background.
+""" + +import asyncio +import contextlib +import json +import logging +import time +from datetime import datetime, timezone +from typing import Any, AsyncGenerator + +from app.core.config import settings +from app.core.freshness import FreshnessStatus +from app.core.sampling import create_sample_plan +from app.core.ttl_tiers import get_ttl_hours +from app.core.worker_logging import AsyncTimingContext, get_structured_logger, log_structured +from app.db.mongodb import mongodb +from app.models.schemas import ( + AnalysisProgress, + AnalysisResult, + GameInfo, + Highlight, + TopicHighlights, + TopicSentiment, +) +from app.services.highlights_service import HighlightsCollector +from app.services.analysis_utils import ( + aggregate_topics, + calculate_prediction, + compute_preferred_context, + datetime_from_timestamp, + filter_topics_by_min_mentions, + normalize_legacy_results, + scale_topics, + serialize_datetime, +) +from app.services.nlp_service import NLPService +from app.services.steam_service import SteamService + +logger = logging.getLogger(__name__) + + +async def iter_incremental_analysis_events( + game: GameInfo, + stale_doc: dict[str, Any], + steam_svc: SteamService, + nlp_svc: NLPService, + patch_timestamp: int | None = None, + *, + source: str = "live", +) -> AsyncGenerator[dict[str, str], None]: + """Yield incremental-analysis progress and final result events.""" + ttl_hours = await get_ttl_hours(game.app_id) + old_results = normalize_legacy_results(stale_doc.get("results", {})) + old_review_ids: list[str] = stale_doc.get("analyzed_review_ids", []) + old_review_ids_set = set(old_review_ids) + nlp_cumulative_s: float = 0.0 + + old_general = [TopicSentiment(**topic) for topic in old_results.get("general_topics", [])] + old_recent = ( + [TopicSentiment(**topic) for topic in old_results.get("recent_topics", [])] + if old_results.get("recent_topics") + else [] + ) + old_current_patch = ( + [TopicSentiment(**topic) for topic in 
old_results.get("current_patch_topics", [])] + if old_results.get("current_patch_topics") + else [] + ) + old_last_patch = ( + [TopicSentiment(**topic) for topic in old_results.get("last_patch_topics", [])] + if old_results.get("last_patch_topics") + else None + ) + old_last_patch_count = old_results.get("last_patch_reviews_count", 0) + old_patch_ts = old_results.get("current_patch_timestamp") + + new_items = await steam_svc.fetch_recent_reviews( + game.app_id, + exclude_ids=old_review_ids_set, + ) + + if not new_items: + refreshed_at = datetime.now(timezone.utc) + refreshed_results = { + **old_results, + "cached_at": refreshed_at, + "analysis_date": refreshed_at, + "current_patch_date": datetime_from_timestamp( + patch_timestamp if patch_timestamp is not None else old_results.get("current_patch_timestamp") + ), + "freshness_status": FreshnessStatus.FRESH.value, + "staleness_reason": None, + "is_refreshing": False, + } + await mongodb.save_analysis( + game.app_id, + refreshed_results, + analyzed_review_ids=old_review_ids, + latest_review_timestamp=stale_doc.get("latest_review_timestamp", 0), + ttl_hours=ttl_hours, + analyzed_at=refreshed_at, + ) + yield { + "event": "complete", + "data": json.dumps(refreshed_results, default=serialize_datetime), + } + return + + new_texts = [item.text for item in new_items] + new_review_ids = [item.recommendation_id for item in new_items] + latest_timestamp = max( + (item.timestamp_created for item in new_items), + default=stale_doc.get("latest_review_timestamp", 0), + ) + + batch_size = settings.review_batch_size + delta_topics: list[TopicSentiment] = [] + delta_current_patch_topics: list[TopicSentiment] = [] + delta_current_patch_count = 0 + highlights_collector = HighlightsCollector() + processed = 0 + total_skipped = 0 + + for i in range(0, len(new_texts), batch_size): + chunk_texts = new_texts[i:i + batch_size] + chunk_items = new_items[i:i + batch_size] + + batch_skipped = 0 + if patch_timestamp: + for review_item, text in 
zip(chunk_items, chunk_texts): + categories = ["recent"] + if review_item.timestamp_created >= patch_timestamp: + categories.append("current_patch") + + nlp_start = time.monotonic() + result_topics, skipped = await nlp_svc.analyze_batch( + [text], + highlights_collector=highlights_collector, + categories=categories, + ) + nlp_cumulative_s += time.monotonic() - nlp_start + batch_skipped += skipped + if result_topics: + delta_topics = aggregate_topics(delta_topics, result_topics) + if review_item.timestamp_created >= patch_timestamp: + delta_current_patch_topics = aggregate_topics( + delta_current_patch_topics, + result_topics, + ) + delta_current_patch_count += 1 + total_skipped += batch_skipped + else: + nlp_start = time.monotonic() + batch_results, batch_skipped = await nlp_svc.analyze_batch( + chunk_texts, + highlights_collector=highlights_collector, + categories=["recent"], + ) + nlp_cumulative_s += time.monotonic() - nlp_start + if batch_results: + delta_topics = aggregate_topics(delta_topics, batch_results) + total_skipped += batch_skipped + + processed += len(chunk_texts) + + progress = AnalysisProgress( + processed=processed, + total=len(new_texts), + current_topics=delta_topics, + skipped_count=total_skipped, + ) + yield {"event": "progress", "data": progress.model_dump_json()} + + new_general = aggregate_topics(old_general, delta_topics) + + old_recent_count = old_results.get("recent_reviews_count", 0) + new_count = len(new_texts) + + if ( + old_recent_count + new_count > settings.recent_sample_limit + and old_recent + and old_recent_count > 0 + ): + overflow = old_recent_count + new_count - settings.recent_sample_limit + retain_ratio = max(0.2, 1.0 - overflow / old_recent_count) + scaled_old = scale_topics(old_recent, retain_ratio) + new_recent = aggregate_topics(scaled_old, delta_topics) + recent_count = int(old_recent_count * retain_ratio) + new_count + else: + new_recent = aggregate_topics(old_recent, delta_topics) if old_recent else delta_topics + 
recent_count = old_recent_count + new_count + + last_patch_topics = old_last_patch + last_patch_count = old_last_patch_count + + if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts: + last_patch_topics = old_current_patch if old_current_patch else None + last_patch_count = old_results.get("current_patch_reviews_count", 0) + old_current_patch = [] + + new_current_patch = ( + aggregate_topics(old_current_patch, delta_current_patch_topics) + if old_current_patch + else (delta_current_patch_topics if delta_current_patch_topics else []) + ) + base_current_patch_count = ( + 0 + if (patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts) + else old_results.get("current_patch_reviews_count", 0) + ) + new_current_patch_count = base_current_patch_count + delta_current_patch_count + has_current_patch = patch_timestamp is not None and ( + new_current_patch_count > 0 or bool(old_current_patch) + ) + + # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py). + new_general = filter_topics_by_min_mentions(new_general) + new_recent = filter_topics_by_min_mentions(new_recent) + new_current_patch = filter_topics_by_min_mentions(new_current_patch) + + prediction = calculate_prediction(new_general) + + highlights_data = highlights_collector.compute_highlights() + general_highlights = highlights_data["general"] + recent_highlights = highlights_data["recent"] + current_patch_highlights = highlights_data["current_patch"] + topic_highlights_dict = highlights_data["topics"] + + # Restrict topic highlights to topics that survived the min-mentions filter, + # so the topic_highlights set is always consistent with general_topics. 
+ _surviving_topics = {t.topic for t in new_general} + topic_highlights_list = [ + TopicHighlights( + topic=topic, + highlights=[Highlight(**highlight) for highlight in highlights], + ) + for topic, highlights in topic_highlights_dict.items() + if topic in _surviving_topics + ] + + merged_review_ids = old_review_ids + new_review_ids + + analysis_generated_at = datetime.now(timezone.utc) + result = AnalysisResult( + game=game, + general_topics=new_general, + recent_topics=new_recent, + recent_reviews_count=recent_count, + current_patch_topics=new_current_patch if has_current_patch else None, + current_patch_reviews_count=new_current_patch_count if has_current_patch else 0, + last_patch_topics=last_patch_topics, + last_patch_reviews_count=last_patch_count, + current_patch_timestamp=patch_timestamp, + analysis_date=analysis_generated_at, + current_patch_date=datetime_from_timestamp(patch_timestamp), + prediction=prediction, + analyzed_reviews=old_results.get("analyzed_reviews", 0) + processed, + skipped_count=old_results.get("skipped_count", 0) + total_skipped, + general_highlights=[Highlight(**highlight) for highlight in general_highlights], + recent_highlights=[Highlight(**highlight) for highlight in recent_highlights] if recent_highlights else None, + current_patch_highlights=[Highlight(**highlight) for highlight in current_patch_highlights] if current_patch_highlights else None, + topic_highlights=topic_highlights_list, + cached_at=analysis_generated_at, + preferred_context=compute_preferred_context(patch_timestamp), + freshness_status=FreshnessStatus.FRESH.value, + is_refreshing=False, + ) + await mongodb.save_analysis( + game.app_id, + result.model_dump(), + analyzed_review_ids=merged_review_ids, + latest_review_timestamp=latest_timestamp, + ttl_hours=ttl_hours, + analyzed_at=analysis_generated_at, + ) + + if get_structured_logger(): + log_structured( + "incremental_analysis_complete", + app_id=game.app_id, + game_name=game.name if hasattr(game, "name") else 
str(game.app_id), + source=source, + reviews_processed=processed, + topics_found=len(new_general), + detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)}, + ) + + yield {"event": "complete", "data": result.model_dump_json()} + + +async def run_incremental_analysis( + app_id: str, + game_name: str, + steam_svc: SteamService, + nlp_svc: NLPService, +) -> dict[str, Any] | None: + """Run a non-SSE incremental analysis for worker jobs.""" + slog = get_structured_logger() + + try: + stale_doc = await mongodb.get_analysis(app_id) + if not stale_doc or not stale_doc.get("results") or not stale_doc.get("analyzed_review_ids"): + return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc) + + # Long gap guard: if the most recent review we have is too old, Steam's cursor-based + # API may not reliably surface all reviews since then. Fall back to full analysis. + latest_ts = stale_doc.get("latest_review_timestamp", 0) + if latest_ts > 0: + gap_days = (time.time() - latest_ts) / 86400 + if gap_days > settings.incremental_max_gap_days: + logger.info( + f"Incremental gap {gap_days:.0f}d > {settings.incremental_max_gap_days}d " + f"for {app_id} ({game_name}) — falling back to full analysis" + ) + return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc) + + game = await steam_svc.get_game_info(app_id) + if not game: + cached_game = stale_doc.get("results", {}).get("game") + if isinstance(cached_game, dict): + game = GameInfo(**cached_game) + else: + game = GameInfo(app_id=app_id, name=game_name) + + patch_date = await mongodb.get_game_patch_date(app_id) + patch_timestamp = int(patch_date.timestamp()) if patch_date else None + if patch_timestamp: + game = game.model_copy(update={"last_game_update_at": patch_timestamp}) + + final_payload: dict[str, Any] | None = None + async for event in iter_incremental_analysis_events( + game, + stale_doc, + steam_svc, + nlp_svc, + patch_timestamp=patch_timestamp, + source="worker", 
+ ): + if event.get("event") == "complete": + final_payload = json.loads(event["data"]) + + return final_payload + except Exception as e: + logger.error(f"Incremental analysis runner error for {app_id} ({game_name}): {e}", exc_info=True) + if slog: + log_structured( + "analysis_error", + level=logging.ERROR, + app_id=app_id, + game_name=game_name, + source="worker", + error=str(e), + ) + return None + + +async def run_full_analysis( + app_id: str, + game_name: str, + steam_svc: SteamService, + nlp_svc: NLPService, + stale_doc: dict[str, Any] | None = None, +) -> dict[str, Any] | None: + """ + Run a full analysis for a game (no SSE, no streaming). + + Returns: + Analysis result dict, or None on error. + """ + slog = get_structured_logger() + + try: + # Phase 1: Setup — game info + review stats + sample plan + async with AsyncTimingContext() as t_setup: + # 1. Get game info + game = await steam_svc.get_game_info(app_id) + if not game: + logger.warning(f"Analysis runner: game info not found for {app_id}") + return None + + # 2. Get review stats + stats = await steam_svc.get_review_stats(app_id) + if stats.total == 0: + logger.warning(f"Analysis runner: no reviews for {app_id}") + return None + + # 3. Create sample plan + sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative) + ttl_hours = await get_ttl_hours(app_id) + + # 3b. Fetch game patch date for current_patch splitting + patch_date = await mongodb.get_game_patch_date(app_id) + patch_timestamp = int(patch_date.timestamp()) if patch_date else None + if patch_timestamp and isinstance(game, GameInfo): + game = game.model_copy(update={"last_game_update_at": patch_timestamp}) + + # Phase 2: Fetch + Analyze — producer-consumer loop + nlp_cumulative_s: float = 0.0 + + async with AsyncTimingContext() as t_fetch_analyze: + # 4. 
Producer-consumer fetch + analyze + queue: asyncio.Queue = asyncio.Queue(maxsize=5) + + async def fetch_worker(): + try: + async for batch in steam_svc.fetch_reviews_stratified(app_id, sample_plan): + await queue.put(batch) + except Exception as e: + await queue.put(e) + finally: + await queue.put(None) + + fetch_task = asyncio.create_task(fetch_worker()) + + processed = 0 + total_skipped = 0 + aggregated_topics: list[TopicSentiment] = [] + recent_processed = 0 + recent_limit = settings.recent_sample_limit + all_review_ids: list[str] = [] + latest_timestamp = 0 + highlights_collector = HighlightsCollector() + current_patch_topics: list[TopicSentiment] = [] + current_patch_count = 0 + review_topic_results: list[tuple[int, list[TopicSentiment]]] = [] + + try: + while True: + item = await queue.get() + + if item is None: + break + if isinstance(item, Exception): + raise item + + batch = item + if not batch.reviews: + continue + + for ri in batch.review_items: + all_review_ids.append(ri.recommendation_id) + if ri.timestamp_created > latest_timestamp: + latest_timestamp = ri.timestamp_created + + batch_skipped = 0 + if patch_timestamp and batch.review_items: + for ri, text in zip(batch.review_items, batch.reviews): + is_recent = recent_processed < recent_limit + cat = [] + if is_recent: + cat.append("recent") + + if ri.timestamp_created >= patch_timestamp: + cat.append("current_patch") + nlp_start = time.monotonic() + res, skipped = await nlp_svc.analyze_batch( + [text], highlights_collector=highlights_collector, categories=cat + ) + nlp_cumulative_s += time.monotonic() - nlp_start + batch_skipped += skipped + if res: + aggregated_topics = aggregate_topics(aggregated_topics, res) + current_patch_topics = aggregate_topics(current_patch_topics, res) + review_topic_results.append((ri.timestamp_created, res)) + current_patch_count += 1 + else: + nlp_start = time.monotonic() + res, skipped = await nlp_svc.analyze_batch( + [text], highlights_collector=highlights_collector, 
categories=cat + ) + nlp_cumulative_s += time.monotonic() - nlp_start + batch_skipped += skipped + if res: + aggregated_topics = aggregate_topics(aggregated_topics, res) + review_topic_results.append((ri.timestamp_created, res)) + recent_processed += 1 + else: + for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews): + is_recent = recent_processed < recent_limit + cat = ["recent"] if is_recent else [] + + nlp_start = time.monotonic() + res, skipped = await nlp_svc.analyze_batch( + [text], highlights_collector=highlights_collector, categories=cat + ) + nlp_cumulative_s += time.monotonic() - nlp_start + batch_skipped += skipped + ts = ri.timestamp_created if batch.review_items else 0 + if res: + aggregated_topics = aggregate_topics(aggregated_topics, res) + review_topic_results.append((ts, res)) + recent_processed += 1 + + total_skipped += batch_skipped + processed += len(batch.reviews) + + await fetch_task + except BaseException: + fetch_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await fetch_task + raise + + # Phase 3: Save — highlights + MongoDB save + async with AsyncTimingContext() as t_save: + # 5. Compute prediction + highlights + + # Build recent_topics from highest-timestamp reviews + review_topic_results.sort(key=lambda x: x[0], reverse=True) + recent_entries = review_topic_results[:recent_limit] + recent_topics: list[TopicSentiment] = [] + for _, topics_batch in recent_entries: + for ts in topics_batch: + recent_topics = aggregate_topics(recent_topics, [ts]) + recent_reviews_count = len(recent_entries) + + # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py). 
+ aggregated_topics = filter_topics_by_min_mentions(aggregated_topics) + recent_topics = filter_topics_by_min_mentions(recent_topics) + current_patch_topics = filter_topics_by_min_mentions(current_patch_topics) + + prediction = calculate_prediction(aggregated_topics) + + highlights_data = highlights_collector.compute_highlights() + general_highlights = highlights_data["general"] + recent_highlights = highlights_data["recent"] + current_patch_highlights = highlights_data["current_patch"] + topic_highlights_dict = highlights_data["topics"] + + # Restrict topic highlights to topics that survived the min-mentions filter, + # so the topic_highlights set is always consistent with general_topics. + _surviving_topics = {t.topic for t in aggregated_topics} + topic_highlights_list = [ + TopicHighlights( + topic=topic, + highlights=[Highlight(**h) for h in highlights], + ) + for topic, highlights in topic_highlights_dict.items() + if topic in _surviving_topics + ] + + has_recent_split = processed > recent_limit + has_current_patch = patch_timestamp is not None and current_patch_count > 0 + analysis_generated_at = datetime.now(timezone.utc) + current_patch_date = ( + datetime.fromtimestamp(patch_timestamp, tz=timezone.utc) + if patch_timestamp is not None + else None + ) + + # Archive last_patch_topics when full analysis replaces a doc with a different patch. 
+ last_patch_topics: list[TopicSentiment] | None = None + last_patch_reviews_count = 0 + if stale_doc: + old_r = normalize_legacy_results(stale_doc.get("results", {})) + old_patch_ts = old_r.get("current_patch_timestamp") + if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts: + raw_cp = old_r.get("current_patch_topics") + last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None + last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0) + else: + raw_lp = old_r.get("last_patch_topics") + last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None + last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0) + + result = AnalysisResult( + game=game, + general_topics=aggregated_topics, + recent_topics=recent_topics if has_recent_split else None, + recent_reviews_count=recent_reviews_count if has_recent_split else 0, + current_patch_topics=current_patch_topics if has_current_patch else None, + current_patch_reviews_count=current_patch_count if has_current_patch else 0, + last_patch_topics=last_patch_topics, + last_patch_reviews_count=last_patch_reviews_count, + current_patch_timestamp=patch_timestamp, + analysis_date=analysis_generated_at, + current_patch_date=current_patch_date, + prediction=prediction, + analyzed_reviews=processed, + skipped_count=total_skipped, + general_highlights=[Highlight(**h) for h in general_highlights], + recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None, + current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None, + topic_highlights=topic_highlights_list, + cached_at=analysis_generated_at, + preferred_context=compute_preferred_context(patch_timestamp), + freshness_status=FreshnessStatus.FRESH.value, + is_refreshing=False, + ) + + # 6. 
Save to cache + await mongodb.save_analysis( + game.app_id, + result.model_dump(), + analyzed_review_ids=all_review_ids, + latest_review_timestamp=latest_timestamp, + ttl_hours=ttl_hours, + analyzed_at=analysis_generated_at, + ) + + total_elapsed = t_setup.elapsed_s + t_fetch_analyze.elapsed_s + t_save.elapsed_s + + logger.info( + f"Analysis runner: completed {app_id} ({game_name}) — " + f"{processed} reviews, {len(aggregated_topics)} topics" + ) + + if slog: + log_structured( + "analysis_complete", + app_id=app_id, + game_name=game_name, + elapsed_s=round(total_elapsed, 3), + source="worker", + breakdown={ + "setup_s": t_setup.elapsed_s, + "fetch_analyze_s": t_fetch_analyze.elapsed_s, + "nlp_cumulative_s": round(nlp_cumulative_s, 3), + "save_s": t_save.elapsed_s, + }, + reviews_processed=processed, + topics_found=len(aggregated_topics), + ) + + return result.model_dump() + + except Exception as e: + logger.error(f"Analysis runner error for {app_id} ({game_name}): {e}", exc_info=True) + if slog: + log_structured( + "analysis_error", + level=logging.ERROR, + app_id=app_id, + game_name=game_name, + source="worker", + error=str(e), + ) + return None diff --git a/backend/app/services/analysis_utils.py b/backend/app/services/analysis_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f034d3f2aaab04856e136780347af672a7749bda --- /dev/null +++ b/backend/app/services/analysis_utils.py @@ -0,0 +1,259 @@ +"""Shared analysis helpers used by both live and worker paths.""" + +import time +from datetime import datetime, timezone +from typing import Any + +from app.core.config import settings +from app.models.schemas import ( + PredictionType, + SentimentType, + TopicSentiment, + UserCountPrediction, +) + + +def calculate_prediction(topics: list[TopicSentiment]) -> UserCountPrediction: + """Compute the player-count trend prediction from aggregated topics.""" + topic_map = {t.topic: t for t in topics} + + retention = topic_map.get("Retention") + if 
retention and retention.mention_count > 5: + if retention.score > settings.prediction_retention_threshold_pos: + return UserCountPrediction( + trend=PredictionType.INCREASING, + confidence=min(0.95, 0.5 + (retention.mention_count / 100)), + reasoning="PREDICTION_REASONING_RETENTION_HIGH", + ) + if retention.score < settings.prediction_retention_threshold_neg: + return UserCountPrediction( + trend=PredictionType.DECREASING, + confidence=min(0.95, 0.5 + (retention.mention_count / 100)), + reasoning="PREDICTION_REASONING_RETENTION_LOW", + ) + + bugs = topic_map.get("Bugs") + performance = topic_map.get("Performance") + tech_score = 0.0 + tech_count = 0 + + if bugs: + tech_score += bugs.score + tech_count += 1 + if performance: + tech_score += performance.score + tech_count += 1 + + if tech_count > 0 and (tech_score / tech_count) < -0.3: + return UserCountPrediction( + trend=PredictionType.DECREASING, + confidence=0.75, + reasoning="PREDICTION_REASONING_TECH_ISSUES", + ) + + gameplay = topic_map.get("Gameplay") + fun = topic_map.get("Fun") + gameplay_score = 0.0 + gameplay_count = 0 + + if gameplay: + gameplay_score += gameplay.score + gameplay_count += 1 + if fun: + gameplay_score += fun.score + gameplay_count += 1 + + if gameplay_count > 0: + average_gameplay = gameplay_score / gameplay_count + if average_gameplay > 0.4: + return UserCountPrediction( + trend=PredictionType.INCREASING, + confidence=0.8, + reasoning="PREDICTION_REASONING_GAMEPLAY_HIGH", + ) + if average_gameplay < -0.2: + return UserCountPrediction( + trend=PredictionType.DECREASING, + confidence=0.6, + reasoning="PREDICTION_REASONING_GAMEPLAY_LOW", + ) + + return UserCountPrediction( + trend=PredictionType.STABLE, + confidence=0.5, + reasoning="PREDICTION_REASONING_STABLE", + ) + + +def aggregate_topics( + existing: list[TopicSentiment], + new_batch: list[TopicSentiment], +) -> list[TopicSentiment]: + """Merge topic aggregates using weighted mention counts.""" + topic_data: dict[str, dict[str, Any]] = 
def aggregate_topics(
    existing: list[TopicSentiment],
    new_batch: list[TopicSentiment],
) -> list[TopicSentiment]:
    """Merge topic aggregates using weighted mention counts.

    Scores are combined as mention-count-weighted averages clamped to
    [-1, 1]; the example kept per topic is the one with the largest
    absolute score that agrees with the final aggregate sentiment.
    The result is sorted by mention_count descending.
    """
    topic_data: dict[str, dict[str, Any]] = {}

    def better_example(
        current: tuple[str, float] | None,
        new: tuple[str, float] | None,
    ) -> tuple[str, float] | None:
        # Prefer the candidate whose score is furthest from neutral.
        if new is None:
            return current
        if current is None:
            return new
        return new if abs(new[1]) > abs(current[1]) else current

    # Both inputs are folded with identical rules, so process them in a
    # single pass instead of two duplicated loops.
    for topic in (*existing, *new_batch):
        data = topic_data.setdefault(
            topic.topic, {"scores": [], "count": 0, "example": None}
        )
        data["scores"].append(topic.score * topic.mention_count)
        data["count"] += topic.mention_count
        candidate = (
            (topic.example, topic.example_score)
            if topic.example and topic.example_score is not None
            else None
        )
        data["example"] = better_example(data["example"], candidate)

    results: list[TopicSentiment] = []
    for topic_name, data in topic_data.items():
        count = data["count"]
        if count == 0:
            continue

        # Weighted mean over all contributions, clamped to the score range.
        average_score = sum(data["scores"]) / count
        normalized_score = max(-1.0, min(1.0, average_score))

        if normalized_score > settings.sentiment_positive_threshold:
            sentiment = SentimentType.POSITIVE
        elif normalized_score < settings.sentiment_negative_threshold:
            sentiment = SentimentType.NEGATIVE
        else:
            sentiment = SentimentType.NEUTRAL

        # Keep the stored example only when its polarity matches the
        # aggregate sentiment (neutral topics accept any example).
        best_example = None
        example_score = None
        example_data = data["example"]
        if example_data:
            example_text, candidate_score = example_data
            if (
                sentiment == SentimentType.NEUTRAL
                or (sentiment == SentimentType.POSITIVE and candidate_score > 0)
                or (sentiment == SentimentType.NEGATIVE and candidate_score < 0)
            ):
                best_example = example_text
                example_score = candidate_score

        results.append(
            TopicSentiment(
                topic=topic_name,
                sentiment=sentiment,
                score=round(normalized_score, 3),
                mention_count=count,
                example=best_example,
                example_score=example_score,
            )
        )

    results.sort(key=lambda item: item.mention_count, reverse=True)
    return results


def scale_topics(topics: list[TopicSentiment], factor: float) -> list[TopicSentiment]:
    """Scale mention counts for the approximate recent sliding window.

    Counts are floored at 1 so scaled-down topics are not silently dropped.
    """
    return [
        topic.model_copy(update={"mention_count": max(1, int(topic.mention_count * factor))})
        for topic in topics
    ]


def filter_topics_by_min_mentions(
    topics: list[TopicSentiment],
    min_mentions: int | None = None,
) -> list[TopicSentiment]:
    """Filter topics below the minimum mention threshold.

    Preserves existing sort order. Only filters — does not modify score or sentiment.
    Applied at the final aggregate level, never at the per-review level.
    """
    threshold = min_mentions if min_mentions is not None else settings.topic_min_mentions
    return [t for t in topics if t.mention_count >= threshold]


def compute_preferred_context(patch_timestamp: int | None) -> str:
    """Choose the default user-facing context tab.

    Returns 'current_patch' only when a recent major patch exists; otherwise
    returns 'general' so the UI defaults to the full-picture view.
    """
    if patch_timestamp is None:
        return "general"
    patch_age_days = (time.time() - patch_timestamp) / 86400
    if patch_age_days > settings.patch_context_max_age_days:
        return "general"
    return "current_patch"


# Mapping of legacy persisted field names to the current schema.
_LEGACY_FIELD_MAP = {
    "topics": "general_topics",
    "historical_topics": "general_topics",
    "post_update_topics": "current_patch_topics",
    "post_update_reviews_count": "current_patch_reviews_count",
    "post_update_highlights": "current_patch_highlights",
    "previous_update_topics": "last_patch_topics",
    "previous_update_reviews_count": "last_patch_reviews_count",
    "last_update_timestamp": "current_patch_timestamp",
}


def normalize_legacy_results(results: dict[str, Any]) -> dict[str, Any]:
    """Map legacy persisted result fields to the current schema.

    The first occurrence of each (renamed) key wins; the obsolete
    'is_incremental' flag is dropped entirely.
    """
    normalized: dict[str, Any] = {}
    for key, value in results.items():
        new_key = _LEGACY_FIELD_MAP.get(key, key)
        if key == "is_incremental":
            continue
        if new_key not in normalized:
            normalized[new_key] = value
    return normalized


def serialize_datetime(value: Any) -> str | Any:
    """Serialize datetimes in SSE payloads and persisted compatibility helpers."""
    if isinstance(value, datetime):
        return value.isoformat()
    return value


def coerce_utc_datetime(value: Any) -> datetime | None:
    """Coerce persisted datetime values into timezone-aware UTC datetimes.

    Returns None both for unsupported types and for strings that are not
    valid ISO-8601 timestamps — corrupt persisted data must not crash
    readers (previously a malformed string raised ValueError).
    """
    if isinstance(value, datetime):
        return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
    if isinstance(value, str):
        try:
            parsed = datetime.fromisoformat(value)
        except ValueError:
            return None
        return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
    return None


def datetime_from_timestamp(timestamp: int | None) -> datetime | None:
    """Convert a unix timestamp into UTC datetime."""
    if timestamp is None:
        return None
    return datetime.fromtimestamp(timestamp, tz=timezone.utc)
"""
Game Sync Service — fetches game data from SteamSpy and upserts to MongoDB.

Replaces the manual scripts/fetch_games_to_mongodb.py with an automated,
rate-limited sync that runs as part of the Worker cycle.
"""

import asyncio
import logging
from datetime import datetime, timezone
from typing import Any

import httpx

from app.core.config import settings
from app.db.mongodb import mongodb

logger = logging.getLogger(__name__)

# External endpoints queried by this service.
STEAMSPY_API_URL = "https://steamspy.com/api.php"
STEAM_STORE_API_URL = "https://store.steampowered.com/api"


class GameSyncService:
    """Syncs game data from SteamSpy into MongoDB.

    All network access goes through a single lazily-created httpx
    AsyncClient; an externally supplied client is never closed by this
    service.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When a client is injected, the caller owns its lifecycle; when we
        # create one lazily, close() is responsible for tearing it down.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=30.0)
        return self._client

    async def close(self) -> None:
        """Close the HTTP client, but only if this service created it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def sync_all_games(self) -> tuple[int, int]:
        """
        Fetch all games from SteamSpy (paginated, up to 90 pages).

        Stops early on the first empty page or on any HTTP/network error;
        partial progress is kept because each page is upserted as it arrives.

        Returns:
            (total_upserted, total_modified)
        """
        client = await self._get_client()
        total_upserted = 0
        total_modified = 0
        # One timestamp for the whole run so every upserted document shares
        # the same synced_at marker.
        now = datetime.now(timezone.utc)

        for page in range(90):
            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "all", "page": page},
                )
                resp.raise_for_status()
                data = resp.json()

                if not data:
                    logger.info(f"SteamSpy page {page} empty — sync complete")
                    break

                games = self._parse_all_response(data, now)
                if games:
                    upserted, modified = await mongodb.upsert_games_batch(games)
                    total_upserted += upserted
                    total_modified += modified

                logger.info(
                    f"SteamSpy page {page}: {len(games)} games "
                    f"(upserted={total_upserted}, modified={total_modified})"
                )

            except httpx.HTTPStatusError as e:
                logger.error(f"SteamSpy HTTP error on page {page}: {e}")
                break
            except httpx.RequestError as e:
                logger.error(f"SteamSpy request error on page {page}: {e}")
                break

            # Rate limit: SteamSpy allows ~1 request per minute
            if page < 89:
                await asyncio.sleep(settings.game_sync_steamspy_delay)

        logger.info(
            f"Game sync complete: upserted={total_upserted}, modified={total_modified}"
        )
        return (total_upserted, total_modified)

    async def sync_top_game_details(self, limit: int | None = None) -> int:
        """
        Enrich top N games with detailed info (tags, genre, ccu) from SteamSpy.

        Per-game failures are logged and skipped; the loop continues with the
        remaining games.

        Returns:
            Number of games enriched.
        """
        limit = limit or settings.game_sync_top_n_details
        client = await self._get_client()

        top_games = await mongodb.get_top_games_by_reviews(limit)
        enriched = 0

        for game in top_games:
            appid = game.get("appid", "")
            if not appid:
                continue

            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "appdetails", "appid": appid},
                )
                resp.raise_for_status()
                detail = resp.json()

                update = self._parse_detail_response(detail)
                if update:
                    await mongodb.upsert_game({"appid": appid, "name": game["name"], **update})
                    enriched += 1

            except httpx.HTTPStatusError as e:
                logger.warning(f"SteamSpy detail error for {appid}: {e}")
            except httpx.RequestError as e:
                logger.warning(f"SteamSpy detail request error for {appid}: {e}")

            # Throttle between appdetails calls regardless of outcome.
            await asyncio.sleep(settings.game_sync_details_delay)

        logger.info(f"Enriched {enriched}/{len(top_games)} games with details")
        return enriched

    async def enrich_cn_names(self, limit: int | None = None) -> int:
        """
        Enrich games with Chinese names from Steam Store API.

        Games are marked as checked even when no translation is found, so the
        same game is not re-queried every cycle; network errors intentionally
        leave the game unmarked for retry.

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_cn_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_without_cn_name(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            name_en = game.get("name")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                if app_data and app_data.get("success"):
                    info = app_data.get("data", {})
                    name_cn = info.get("name")

                    # If names are different, we found a translation
                    if name_cn and name_cn != name_en:
                        await mongodb.mark_cn_name_checked(appid, name_cn)
                    else:
                        await mongodb.mark_cn_name_checked(appid)
                else:
                    # Not found or error in API - still mark as checked
                    await mongodb.mark_cn_name_checked(appid)

                processed += 1

            except httpx.HTTPError as e:
                logger.warning(f"Error fetching CN name for {appid}: {e}")
                # Don't mark as checked on network error, try again next cycle

            # Respect rate limits
            await asyncio.sleep(settings.game_sync_cn_enrichment_delay)

        logger.info(f"Enriched CN names for {processed}/{len(games_to_check)} games")
        return processed

    async def enrich_app_types(self, limit: int | None = None) -> int:
        """
        Enrich app_type/parent_appid using Steam Store appdetails.

        Unknown or unavailable apps are still marked as checked (with
        app_type='unknown') so they are not re-queried forever.

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_app_type_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_missing_app_type(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                info = app_data.get("data", {}) if app_data and app_data.get("success") else {}

                parsed = self._parse_store_type_response(info)
                await mongodb.mark_app_type_checked(
                    appid,
                    app_type=parsed["app_type"],
                    parent_appid=parsed["parent_appid"],
                )
                processed += 1

            except httpx.HTTPError as e:
                logger.warning(f"Error fetching app type for {appid}: {e}")

            await asyncio.sleep(settings.game_sync_app_type_enrichment_delay)

        logger.info(f"Enriched app types for {processed}/{len(games_to_check)} games")
        return processed

    @staticmethod
    def _parse_all_response(
        data: dict[str, Any], synced_at: datetime
    ) -> list[dict[str, Any]]:
        """Parse SteamSpy 'all' response into list of game dicts.

        Entries without a name are dropped; appids are normalized to strings.
        """
        games: list[dict[str, Any]] = []
        for appid_str, info in data.items():
            name = info.get("name", "")
            if not name:
                continue

            games.append({
                "appid": str(appid_str),
                "name": name,
                "developer": info.get("developer", ""),
                "publisher": info.get("publisher", ""),
                "positive": info.get("positive", 0),
                "negative": info.get("negative", 0),
                "synced_at": synced_at,
            })
        return games

    @staticmethod
    def _parse_detail_response(detail: dict[str, Any]) -> dict[str, Any]:
        """Parse SteamSpy 'appdetails' response into enrichment fields.

        Returns an empty dict when no usable fields are present, which the
        caller treats as 'nothing to update'.
        """
        update: dict[str, Any] = {}

        tags = detail.get("tags")
        if isinstance(tags, dict) and tags:
            # Sort by vote count descending, keep top 20 tag names
            sorted_tags = sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]
            update["tags"] = [tag_name for tag_name, _ in sorted_tags]

        genre = detail.get("genre")
        if genre:
            update["genre"] = genre

        ccu = detail.get("ccu")
        if ccu is not None:
            update["ccu"] = ccu

        return update

    @staticmethod
    def _parse_store_type_response(info: dict[str, Any]) -> dict[str, Any]:
        """Extract app_type and (for DLC) the parent game's appid."""
        app_type = info.get("type") or "unknown"
        fullgame = info.get("fullgame")

        parent_appid = None
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])

        return {
            "app_type": str(app_type),
            "parent_appid": parent_appid,
        }

    @staticmethod
    async def _fetch_store_app_data(
        client: httpx.AsyncClient, appid: str
    ) -> dict[str, Any] | None:
        """Fetch one appdetails payload from Steam Store.

        Queried with Simplified Chinese localisation and CN region so the
        returned name is the localized one; returns None when the appid is
        absent from the response.
        """
        resp = await client.get(
            f"{STEAM_STORE_API_URL}/appdetails",
            params={
                "appids": appid,
                "l": "schinese",
                "cc": "CN",
            },
        )
        resp.raise_for_status()
        data = resp.json()
        return data.get(str(appid))
"""
Community Highlights extraction service for reviews.
Uses n-grams (2-5 tokens) + TF-IDF to identify the most frequently used phrases.
"""

import math
from collections import Counter, defaultdict
from typing import Any

import jieba

from app.core.config import settings
from app.core.stopwords_zh import is_stopword


class HighlightsCollector:
    """
    Stateful collector — accumulates data incrementally over the whole
    analysis cycle to keep RAM usage low. Highlights are computed once,
    at the end.
    """

    def __init__(self) -> None:
        # N-gram frequencies per topic and per category (e.g. "recent").
        self._topic_ngrams: dict[str, Counter] = defaultdict(Counter)
        self._category_ngrams: dict[str, Counter] = defaultdict(Counter)
        # Global n-gram counts plus document (per-review) frequency.
        self._global_counts: Counter = Counter()
        self._ngram_doc_freq: Counter = Counter()
        # Running sentiment sum/count per n-gram (for the average sentiment).
        self._ngram_sentiment_sum: dict[str, float] = defaultdict(float)
        self._ngram_sentiment_count: Counter = Counter()
        self._review_count = 0
        self._current_review_seen_ngrams: set[str] = set()

    def start_review(self) -> None:
        """Signal the start of a new review (used for Document Frequency)."""
        self._review_count += 1
        self._current_review_seen_ngrams = set()
        # Periodic memory reclaim. BUGFIX: this trigger previously lived in
        # add_sentence(), where `self._review_count % 500 == 0` fired on EVERY
        # sentence of each 500th review — and, because 0 % 500 == 0, on every
        # sentence when start_review() was never called, wiping singleton
        # counts mid-stream. Here it runs exactly once per 500 reviews.
        if self._review_count % 500 == 0:
            self._prune_singletons()

    def add_sentence(
        self,
        review_idx: int,  # Kept for compatibility; use start_review() for separation
        sentence: str,
        topics: list[str],
        sentiment_score: float,
        categories: list[str] | None = None,
    ) -> None:
        """Called per sentence during analyze_batch()."""
        # Simple ASCII detection for English phrases (avoids incorrect
        # segmentation by jieba).
        is_ascii = all(ord(c) < 128 for c in sentence)
        if is_ascii:
            words = [w for w in sentence.split() if not is_stopword(w) and len(w.strip()) > 0]
        else:
            words = [w for w in jieba.lcut(sentence) if not is_stopword(w) and len(w.strip()) > 0]

        if len(words) < 2:
            return

        for n in range(settings.highlights_ngram_min, settings.highlights_ngram_max + 1):
            for i in range(len(words) - n + 1):
                ngram = " ".join(words[i : i + n])

                # 1. Global counters
                self._global_counts[ngram] += 1
                self._ngram_sentiment_sum[ngram] += sentiment_score
                self._ngram_sentiment_count[ngram] += 1

                # 2. Incremental Document Frequency (once per review)
                if ngram not in self._current_review_seen_ngrams:
                    self._ngram_doc_freq[ngram] += 1
                    self._current_review_seen_ngrams.add(ngram)

                # 3. Topic and category counters
                for topic in topics:
                    self._topic_ngrams[topic][ngram] += 1
                if categories:
                    for category in categories:
                        self._category_ngrams[category][ngram] += 1

    def _prune_singletons(self) -> None:
        """Deep-clean n-grams with count <= 1 (memory saving)."""
        singletons = [k for k, v in self._global_counts.items() if v <= 1]
        for k in singletons:
            del self._global_counts[k]
            if k in self._ngram_sentiment_sum:
                del self._ngram_sentiment_sum[k]
                del self._ngram_sentiment_count[k]
                del self._ngram_doc_freq[k]

            # Clean up per-topic counters
            for topic in self._topic_ngrams:
                if k in self._topic_ngrams[topic]:
                    del self._topic_ngrams[topic][k]

            # Clean up per-category counters
            for cat in self._category_ngrams:
                if k in self._category_ngrams[cat]:
                    del self._category_ngrams[cat][k]

    def compute_highlights(self) -> dict[str, Any]:
        """
        Compute highlights once the analysis has finished.
        """
        if self._review_count == 0:
            return {
                "general": [],
                "recent": [],
                "current_patch": [],
                "topics": {}
            }

        results: dict[str, Any] = {
            "general": self._compute_tfidf_highlights(
                self._global_counts,
                top_n=settings.highlights_top_n_general,
            ),
            "recent": self._compute_tfidf_highlights(
                self._category_ngrams.get("recent", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "current_patch": self._compute_tfidf_highlights(
                self._category_ngrams.get("current_patch", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "topics": {}
        }

        for topic, counter in self._topic_ngrams.items():
            h = self._compute_tfidf_highlights(
                counter,
                top_n=settings.highlights_top_n_per_topic,
            )
            if h:
                results["topics"][topic] = h

        return results

    def _compute_tfidf_highlights(self, counter: Counter, top_n: int) -> list[dict]:
        """TF-IDF scoring + filtering + dedup."""
        candidates = []
        n = self._review_count
        total_count = sum(counter.values()) if counter.values() else 1

        for ngram, count in counter.items():
            df = self._ngram_doc_freq.get(ngram, 0)

            # Drop phrases that are too rare or near-ubiquitous.
            if df < settings.highlights_min_mentions:
                continue
            if df / n > settings.highlights_max_doc_freq_ratio:
                continue

            idf = math.log(n / df) if df > 0 else 0
            tf = count / total_count
            tfidf = tf * idf
            rank_score = count * tfidf

            # Average sentiment from the running sum and count
            s_sum = self._ngram_sentiment_sum.get(ngram, 0.0)
            s_count = self._ngram_sentiment_count.get(ngram, 0)
            avg_score = s_sum / s_count if s_count > 0 else 0.0

            candidates.append({
                "phrase": ngram,
                "mention_count": df,
                "score": round(avg_score, 3),
                "sentiment": (
                    "positive" if avg_score > settings.sentiment_positive_threshold
                    else "negative" if avg_score < settings.sentiment_negative_threshold
                    else "neutral"
                ),
                "ngram_size": len(ngram.split()),
                "_rank": rank_score,
            })

        candidates.sort(key=lambda x: x["_rank"], reverse=True)

        # Substring absorption: drop a phrase contained in a higher-ranked
        # one, but only when both sides have the same negation polarity.
        absorbed: set[int] = set()
        for i, c in enumerate(candidates):
            if i in absorbed:
                continue
            for j in range(i + 1, len(candidates)):
                if j in absorbed:
                    continue
                if candidates[j]["phrase"] in c["phrase"]:
                    parent_has_neg = any(neg in c["phrase"] for neg in ["不", "没", "无"])
                    child_has_neg = any(neg in candidates[j]["phrase"] for neg in ["不", "没", "无"])
                    if parent_has_neg == child_has_neg:
                        absorbed.add(j)

        results = [c for i, c in enumerate(candidates) if i not in absorbed]

        # Re-sort by mention_count descending for display order.
        # TF-IDF sort above selected the top candidates; this ensures the final
        # list the UI receives is ordered from most-mentioned to least-mentioned,
        # with score and phrase as stable tie-breakers.
        results.sort(key=lambda x: (-x["mention_count"], -x["score"], x["phrase"]))

        for r in results[:top_n]:
            r.pop("_rank", None)

        return results[:top_n]
"""
NLP service for sentiment analysis and topic modelling.

Architecture: local inference (CPU).
Runs a Transformer model (DistilBERT) directly inside the application,
which eliminates network latency and gives deterministic execution time.

Optimisations:
1. Pre-compiled regex patterns (O(1) matching).
2. Inference runs in an executor (does not block the event loop).
3. Batched model requests (exploits CPU vector instructions).
"""

from __future__ import annotations

import asyncio
import logging
import re
from collections import OrderedDict, defaultdict
from concurrent.futures import ThreadPoolExecutor
from typing import TYPE_CHECKING

from pathlib import Path

import jieba
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification
from zhconv import convert

from app.core.config import settings
from app.core.keywords import EXCLUSIONS, TOPIC_KEYWORDS
from app.models.schemas import SentimentType, TopicSentiment

if TYPE_CHECKING:
    from app.services.highlights_service import HighlightsCollector

logger = logging.getLogger(__name__)

# Token prefixes/predecessors used to disambiguate the single character "卡"
# ("lag" vs. "card") in NLPService._is_valid_single_char_token.
CARD_LAG_PREFIXES = frozenset({"不", "很", "好", "太", "真", "挺", "老", "总"})
CARD_STANDALONE_PREVIOUS_TOKENS = frozenset({"有点", "一直", "偶尔"})

# Unicode ranges for emoji and pictographic symbols.
# NOTE: the previous pattern "\U000024C2-\U0001F251" was far too broad and
# stripped Chinese characters! We now use precise, emoji-only ranges.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Misc Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U0000FE00-\U0000FE0F"  # Variation Selectors
    "]+",
    flags=re.UNICODE,
)

# Smart sentence splitting (supports English and Chinese).
# Chinese terminators: 。!?;
# English terminators: .!?
# Punctuation stripped during deduplication (EN + ZH):
DEDUP_PUNCTUATION = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~。!?,、;:""''【】()《》~…·]')

SENTENCE_SPLIT_PATTERN = re.compile(r"""
    (?<=[.!?。!?;])\s*  # Koniec zdania (EN + ZH punctuation)
    |                # LUB
    (?<=[a-z]),\s+   # Przecinek po literze + spacja...
    (?=but\b|however\b|although\b|though\b)  # ...przed spójnikiem przeciwstawnym (EN)
    |
    \s+(?=but\b|however\b|although\b|though\b)  # Spójnik bez przecinka (EN)
    |
    (?<=。|!|?|;)  # Po chińskiej interpunkcji (bez spacji)
    |
    (?=但是|然而|虽然|不过|可是)  # Przed chińskim spójnikiem przeciwstawnym
""", re.VERBOSE | re.IGNORECASE)
class NLPService:
    """
    NLP service implementing a hybrid analysis:
    1. Keywords (regex) -> topic detection.
    2. DistilBERT (local ONNX model) -> sentiment analysis.
    """

    def __init__(self) -> None:
        """
        Initialize the ML pipeline and compile the text patterns.
        The model is loaded once at application startup (singleton pattern).

        Raises:
            FileNotFoundError: when the pre-quantized ONNX model is missing.
            Exception: any model-loading failure is logged and re-raised.
        """
        logger.info("Inicjalizacja serwisu NLP (ONNX Optimized)...")

        # 0. Jieba user dictionary — gaming terms
        userdict_path = Path(__file__).parent.parent / "core" / "jieba_userdict.txt"
        if userdict_path.exists():
            jieba.load_userdict(str(userdict_path))
            logger.info(f"Załadowano jieba user dict: {userdict_path}")

        # 1. Regex compilation.
        # Keywords are merged into one efficient "automaton" (regex).
        # NOTE: \b does not work with Chinese characters, so different
        # patterns are used for ASCII words (with \b) and Chinese (without).
        self.topic_patterns = {}
        self.single_char_topic_keywords = {}
        self.exclusion_patterns = {}

        for topic, keyword_groups in TOPIC_KEYWORDS.items():
            ascii_keywords: list[str] = []
            chinese_keywords: list[str] = []
            chinese_single_char_keywords: list[str] = []

            for group_name, group in keyword_groups.items():
                for keyword in group:
                    if keyword.isascii():
                        ascii_keywords.append(keyword)
                    elif group_name == "single_char" and len(keyword) == 1:
                        # Single Chinese characters need token-level
                        # validation, not plain substring matching.
                        chinese_single_char_keywords.append(keyword)
                    else:
                        chinese_keywords.append(keyword)

            self.single_char_topic_keywords[topic] = chinese_single_char_keywords

            patterns = []
            if ascii_keywords:
                # Use word boundaries for ASCII keywords
                sorted_ascii = sorted(ascii_keywords, key=len, reverse=True)
                patterns.append(r'\b(' + '|'.join(re.escape(k) for k in sorted_ascii) + r')\b')
            if chinese_keywords:
                # No word boundaries for Chinese (they don't have spaces),
                # but prefer longer keywords so compounds win over partial overlaps.
                sorted_chinese = sorted(chinese_keywords, key=len, reverse=True)
                patterns.append('(' + '|'.join(re.escape(k) for k in sorted_chinese) + ')')

            if patterns:
                combined_pattern = '|'.join(patterns)
                self.topic_patterns[topic] = re.compile(combined_pattern, re.IGNORECASE)

        for keyword, exclusions in EXCLUSIONS.items():
            if exclusions:
                pattern_str = '|'.join(re.escape(e) for e in exclusions)
                self.exclusion_patterns[keyword] = re.compile(pattern_str, re.IGNORECASE)

        # 2. Load the ONNX model
        logger.info(f"Ładowanie modelu ONNX {settings.hf_sentiment_model}...")
        try:
            from onnxruntime import GraphOptimizationLevel, SessionOptions

            # OPTIMIZATION FOR HF SPACES (shared CPU): the free tier provides
            # 2 vCPUs. Capping thread counts avoids context switching and
            # contention for cores.
            session_options = SessionOptions()
            session_options.intra_op_num_threads = settings.nlp_onnx_intra_threads
            session_options.inter_op_num_threads = settings.nlp_onnx_inter_threads
            session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

            # Load pre-built quantized INT8 ONNX model (no PyTorch needed at runtime)
            quantized_path = Path(__file__).resolve().parent.parent.parent / "models" / "quantized"
            model_file = quantized_path / "model_quantized.onnx"
            if not model_file.exists():
                raise FileNotFoundError(
                    f"Quantized ONNX model not found at {model_file}. "
                    "Run 'python scripts/quantize_model.py' to generate it."
                )

            logger.info(f"Loading quantized INT8 model from {quantized_path}")
            model = ORTModelForSequenceClassification.from_pretrained(
                str(quantized_path),
                file_name="model_quantized.onnx",
                session_options=session_options,
            )
            tokenizer = AutoTokenizer.from_pretrained(str(quantized_path))

            self.classifier = pipeline(
                "sentiment-analysis",
                model=model,
                tokenizer=tokenizer,
                device="cpu",
            )

            logger.info("Model NLP ONNX ready: INT8 quantized, graph_optimization=ALL")
        except Exception as e:
            # Deliberate broad catch — model loading can fail with OSError, RuntimeError,
            # ONNX errors, HF Hub errors, etc. Always fatal, always re-raised.
            logger.error(f"Krytyczny błąd ładowania modelu ONNX: {e}")
            raise

        # Thread pool so heavy AI work does not block the server (event loop).
        self.executor = ThreadPoolExecutor(max_workers=1)

        # Sentiment cache: normalized_text -> (label_str, score)
        self._sentiment_cache: OrderedDict[str, tuple[str, float]] = OrderedDict()
        self._cache_maxsize = settings.dedup_cache_maxsize

    def clean_text(self, text: str) -> str:
        """Strip noise (emoji, excess whitespace) and normalize the text."""
        text = EMOJI_PATTERN.sub("", text)
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        max_len = settings.text_max_length
        return text[:max_len] if len(text) > max_len else text

    def _normalize_for_dedup(self, text: str) -> str:
        """Normalize a sentence into a deduplication key (keeps word order)."""
        text = DEDUP_PUNCTUATION.sub("", text).lower()
        text = re.sub(r"\s+", " ", text).strip()
        # Traditional -> Simplified so both variants share one cache key.
        return convert(text, 'zh-cn')

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split a review into logical units (sentences/clauses)."""
        parts = SENTENCE_SPLIT_PATTERN.split(text)
        return [p.strip() for p in parts if p and p.strip()]

    def _has_negation(self, text: str, position: int) -> bool:
        """
        Detect a negation preceding the keyword (within the window defined in
        the config). Used for more precise aspect analysis in Chinese.
        """
        window = settings.nlp_negation_window
        left_context = text[max(0, position-window):position]
        return any(neg in left_context for neg in ["不", "没", "别", "无"])

    @staticmethod
    def _is_valid_single_char_token(keyword: str, token: str, previous_token: str | None) -> bool:
        """Validate a single Chinese keyword in the context of the whole token."""
        if keyword != "卡":
            return True
        if token == keyword:
            return previous_token is None or previous_token in CARD_STANDALONE_PREVIOUS_TOKENS
        return token.endswith(keyword) and token[:-1] in CARD_LAG_PREFIXES

    def _find_single_char_keyword_match(self, sentence: str, keywords: list[str]) -> tuple[int, str] | None:
        """Return the first valid match for a single-char Chinese keyword."""
        if not keywords:
            return None

        keyword_set = set(keywords)
        tokenized_sentence = list(jieba.tokenize(sentence))
        for index, (token, start, _) in enumerate(tokenized_sentence):
            previous_token = tokenized_sentence[index - 1][0] if index > 0 else None
            for offset, char in enumerate(token):
                if char not in keyword_set:
                    continue
                if self._is_valid_single_char_token(char, token, previous_token):
                    return start + offset, char
        return None

    def _detect_topics_regex(self, sentence: str) -> dict[str, bool]:
        """
        Fast topic detection with pre-compiled regexes.
        Complexity: O(N) in sentence length, independent of keyword count.
        """
        detected = {}

        # TEMPORARY conversion to Simplified Chinese for matching only.
        # The original text (Traditional/Simplified) stays unchanged in the
        # database, while keywords.py can remain zh-cn.
        sentence_simp = convert(sentence, 'zh-cn')

        for topic in TOPIC_KEYWORDS:
            regex_match = None
            if topic in self.topic_patterns:
                regex_match = self.topic_patterns[topic].search(sentence_simp)

            single_char_match = self._find_single_char_keyword_match(
                sentence_simp,
                self.single_char_topic_keywords.get(topic, []),
            )

            matched_word: str | None = None
            match_start: int | None = None

            # Prefer whichever match occurs earliest in the sentence.
            if regex_match and single_char_match:
                if single_char_match[0] < regex_match.start():
                    match_start, matched_word = single_char_match
                else:
                    match_start = regex_match.start()
                    matched_word = regex_match.group(0).lower()
            elif regex_match:
                match_start = regex_match.start()
                matched_word = regex_match.group(0).lower()
            elif single_char_match:
                match_start, matched_word = single_char_match

            if matched_word is not None and match_start is not None:
                is_excluded = False

                if matched_word in self.exclusion_patterns:
                    if self.exclusion_patterns[matched_word].search(sentence_simp):
                        is_excluded = True

                if not is_excluded:
                    negated = self._has_negation(sentence_simp, match_start)
                    detected[topic] = negated

        return detected

    def _run_inference(self, texts: list[str]) -> list[dict]:
        """Wrapper around the Hugging Face pipeline, run in a worker thread."""
        # batch_size=16 optimizes matrix operations on CPU (AVX).
        # truncation=True, max_length=512 keeps inputs within the ONNX
        # position limit (max_position_embeddings=512); the pipeline accounts
        # for special tokens automatically.
        return self.classifier(texts, batch_size=16, truncation=True, max_length=512)

    @staticmethod
    def _map_label(label_str: str, score: float) -> tuple[SentimentType, float]:
        """Map a raw model label onto (SentimentType, signed score)."""
        label_lower = label_str.lower()
        if 'positive' in label_lower or 'label_1' in label_lower:
            return (SentimentType.POSITIVE, score)
        elif 'negative' in label_lower or 'label_0' in label_lower:
            return (SentimentType.NEGATIVE, -score)
        return (SentimentType.NEUTRAL, 0.0)

    def _cache_put(self, key: str, value: tuple[str, float]) -> None:
        """Add a result to the LRU cache, evicting the oldest past the limit."""
        self._sentiment_cache[key] = value
        self._sentiment_cache.move_to_end(key)
        while len(self._sentiment_cache) > self._cache_maxsize:
            self._sentiment_cache.popitem(last=False)

    async def analyze_sentiment_batch(
        self, texts: list[str]
    ) -> list[tuple[SentimentType, float]]:
        """
        Asynchronous interface for sentiment analysis.
        Offloads the computation to a separate thread so the API stays
        responsive, and uses the LRU cache to skip repeated sentences.
        """
        cleaned_texts = [self.clean_text(t) for t in texts]
        norm_keys = [self._normalize_for_dedup(t) for t in cleaned_texts]

        # Split into cache hits and misses
        final_sentiments: list[tuple[SentimentType, float]] = [(SentimentType.NEUTRAL, 0.0)] * len(texts)
        miss_indices: list[int] = []  # indices in cleaned_texts that must go to the model
        miss_texts: list[str] = []

        for i, (cleaned, key) in enumerate(zip(cleaned_texts, norm_keys)):
            if not cleaned:
                continue
            cached = self._sentiment_cache.get(key)
            if cached is not None:
                self._sentiment_cache.move_to_end(key)
                final_sentiments[i] = self._map_label(cached[0], cached[1])
            else:
                miss_indices.append(i)
                miss_texts.append(cleaned)

        cache_hits = len(texts) - len(miss_texts)
        logger.debug(f"Cache: {cache_hits} hits, {len(miss_texts)} misses (cache size: {len(self._sentiment_cache)})")

        if not miss_texts:
            return final_sentiments

        # Run the model ONLY on cache misses.
        # FIX: use get_running_loop() — the documented API inside a coroutine;
        # get_event_loop() is deprecated for this use since Python 3.10.
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(self.executor, self._run_inference, miss_texts)

        for j, res in enumerate(results):
            original_idx = miss_indices[j]
            label_str = res['label']
            score = res['score']

            # Store the raw model result in the cache
            self._cache_put(norm_keys[original_idx], (label_str, score))

            final_sentiments[original_idx] = self._map_label(label_str, score)

        return final_sentiments
[unique_results[unique_map[key]] for key in norm_keys] + + # Krok 3: Agregacja wyników + # review_id -> topic -> list of scores + review_topic_scores: dict[int, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list)) + # topic -> (sentence, score) - online selection najlepszego przykładu + topic_best_example: dict[str, tuple[str, float]] = {} + + for i, (review_idx, topic, sentence, is_negated) in enumerate(sentiment_tasks): + _, score = sentiment_results[i] + + # KULOODPORNY PIPELINE: Jeśli wykryto negację (np. "nie lubię gameplayu"), + # a model mimo to zwrócił dodatni sentyment, korygujemy go. + if is_negated and score > 0: + score = -score + + review_topic_scores[review_idx][topic].append(score) + + if highlights_collector: + highlights_collector.add_sentence( + review_idx=review_idx, + sentence=sentence, + topics=[topic], + sentiment_score=score, + categories=categories, + ) + + # Online selection - aktualizuj jeśli lepszy kandydat (wyższy |score|) + if len(sentence) > 20: + current = topic_best_example.get(topic) + if current is None or abs(score) > abs(current[1]): + topic_best_example[topic] = (sentence, score) + + # Agregacja globalna: Średnia per recenzja -> Suma globalna + global_topic_stats: dict[str, dict[str, float]] = defaultdict(lambda: {"sum_score": 0.0, "count": 0.0}) + + for review_idx, topics_data in review_topic_scores.items(): + for topic, scores in topics_data.items(): + avg_review_score = sum(scores) / len(scores) + global_topic_stats[topic]["sum_score"] += avg_review_score + global_topic_stats[topic]["count"] += 1.0 + + # Krok 4: Formatowanie końcowe + final_results: list[TopicSentiment] = [] + + for topic_name, stats in global_topic_stats.items(): + count = int(stats["count"]) + if count == 0: + continue + + avg_global_score = stats["sum_score"] / stats["count"] + normalized_score = max(-1.0, min(1.0, avg_global_score)) + + if normalized_score > settings.sentiment_positive_threshold: + sentiment = SentimentType.POSITIVE + elif 
normalized_score < settings.sentiment_negative_threshold: + sentiment = SentimentType.NEGATIVE + else: + sentiment = SentimentType.NEUTRAL + + # Pobierz najlepszy przykład i zwaliduj zgodność kierunku + best_example = None + example_score = None + candidate = topic_best_example.get(topic_name) + if candidate: + ex_sentence, ex_score = candidate + # Walidacja: przykład musi być zgodny z kierunkiem sentymentu + if sentiment == SentimentType.NEUTRAL or \ + (sentiment == SentimentType.POSITIVE and ex_score > 0) or \ + (sentiment == SentimentType.NEGATIVE and ex_score < 0): + best_example = ex_sentence + example_score = ex_score + + final_results.append( + TopicSentiment( + topic=topic_name, + sentiment=sentiment, + score=round(normalized_score, 3), + mention_count=count, + example=best_example, + example_score=example_score, + ) + ) + + final_results.sort(key=lambda x: x.mention_count, reverse=True) + return final_results, skipped_sentences + + +_nlp_service: "NLPService | None" = None + + +def get_nlp_service() -> "NLPService": + global _nlp_service + if _nlp_service is None: + _nlp_service = NLPService() + return _nlp_service diff --git a/backend/app/services/precache_service.py b/backend/app/services/precache_service.py new file mode 100644 index 0000000000000000000000000000000000000000..09f2714939d3475f24853070360912132a07bf03 --- /dev/null +++ b/backend/app/services/precache_service.py @@ -0,0 +1,199 @@ +""" +Pre-cache Service — schedules and executes background analyses for top games. + +Creates refresh schedules with checkpoints (e.g. 6h, 12h, 24h after update) +and processes due analyses each cycle, prioritized by game popularity. 
"""

import asyncio
import logging
from datetime import datetime, timedelta, timezone
from typing import Any

from app.core.config import settings
from app.db.mongodb import mongodb
from app.services.analysis_runner import run_full_analysis, run_incremental_analysis
from app.services.nlp_service import NLPService
from app.services.steam_service import SteamService

logger = logging.getLogger(__name__)


class PreCacheService:
    """Manages refresh schedules and triggers pre-cache analyses."""

    def __init__(
        self, steam_svc: SteamService, nlp_svc: NLPService
    ) -> None:
        # Injected collaborators; this service does not own their lifecycles.
        self._steam_svc = steam_svc
        self._nlp_svc = nlp_svc

    def create_schedule(
        self, app_id: str, game_name: str, update_at: datetime, *, is_release: bool = False
    ) -> dict[str, Any]:
        """Build a schedule document with checkpoints from config."""
        # One checkpoint per configured offset, measured from the update time.
        checkpoints = []
        for offset_hours in settings.precache_checkpoints_list:
            checkpoints.append({
                "offset_hours": offset_hours,
                "due_at": update_at + timedelta(hours=offset_hours),
                "completed": False,
            })

        return {
            "app_id": str(app_id),
            "game_name": game_name,
            "update_at": update_at,
            "checkpoints": checkpoints,
            "is_release": is_release,
            "status": "active",
            "created_at": datetime.now(timezone.utc),
        }

    def create_bootstrap_schedule(
        self, app_id: str, game_name: str
    ) -> dict[str, Any]:
        """Release schedule for a newly prioritized game, starting at 6h."""
        now = datetime.now(timezone.utc)
        return self.create_schedule(app_id, game_name, now, is_release=True)

    async def create_schedules_for_updates(
        self, updated_games: list[dict[str, Any]]
    ) -> int:
        """Bulk-create schedules for games that received updates."""
        active_schedules = await mongodb.get_active_schedules()
        active_by_app_id = {s["app_id"]: s for s in active_schedules}

        created = 0
        for game in updated_games:
            app_id = str(game.get("appid", ""))
            name = game.get("name", "")
            update_at = game.get("update_at", datetime.now(timezone.utc))

            existing = active_by_app_id.get(app_id)
            if existing:
                existing_update_at = existing.get("update_at")
                # NOTE(review): assumes update_at values are comparable (both
                # tz-aware or both naive) — confirm what the news sync stores.
                if existing_update_at and update_at <= existing_update_at:
                    continue  # Same or older patch — don't reset checkpoints

            schedule = self.create_schedule(app_id, name, update_at)
            await mongodb.upsert_refresh_schedule(schedule)
            created += 1

        logger.info(f"Created {created} refresh schedules for updated games")
        return created

    async def bootstrap_missing_analyses(
        self, top_games: list[dict[str, Any]]
    ) -> int:
        """For top games with no cached analysis, create release schedules."""
        # Pre-fetch active schedule app_ids for O(1) lookup
        active_schedules = await mongodb.get_active_schedules()
        scheduled_app_ids = {s["app_id"] for s in active_schedules}

        created = 0
        for game in top_games:
            app_id = str(game.get("appid", ""))
            if not app_id or app_id in scheduled_app_ids:
                continue

            # Check if analysis already cached
            cached = await mongodb.get_cached_analysis(app_id)
            if cached is not None:
                continue

            schedule = self.create_bootstrap_schedule(app_id, game.get("name", ""))
            await mongodb.upsert_refresh_schedule(schedule)
            # Track locally so duplicate appids in top_games are not re-scheduled.
            scheduled_app_ids.add(app_id)
            created += 1

        logger.info(f"Bootstrap: created {created} release schedules")
        return created

    async def process_due_analyses(self) -> int:
        """
        Main processing loop: find due checkpoints, prioritize, execute.

        Returns:
            Number of analyses executed.
        """
        now = datetime.now(timezone.utc)
        schedules = await mongodb.get_active_schedules()
        max_per_cycle = settings.precache_max_analyses_per_cycle
        delay = settings.precache_batch_delay_seconds

        # Find one due checkpoint per game
        due_items: list[dict[str, Any]] = []
        for schedule in schedules:
            for cp in schedule.get("checkpoints", []):
                if cp.get("completed"):
                    continue
                if cp["due_at"] <= now:
                    due_items.append({
                        "app_id": schedule["app_id"],
                        "game_name": schedule.get("game_name", ""),
                        "offset_hours": cp["offset_hours"],
                        "due_at": cp["due_at"],
                        # NOTE(review): positive/negative look like review counts
                        # copied onto the schedule doc — verify the sync writes them.
                        "positive": schedule.get("positive", 0),
                        "negative": schedule.get("negative", 0),
                    })
                    break  # Only first due checkpoint per game

        if not due_items:
            logger.info("Pre-cache: no due analyses")
            return 0

        # Sort by popularity DESC, then due_at ASC
        due_items.sort(
            key=lambda x: (-(x.get("positive", 0) + x.get("negative", 0)), x["due_at"])
        )

        # Execute up to max_per_cycle
        executed = 0
        for item in due_items[:max_per_cycle]:
            app_id = item["app_id"]
            game_name = item["game_name"]
            offset_hours = item["offset_hours"]

            logger.info(f"Pre-cache: analyzing {app_id} ({game_name}) — checkpoint {offset_hours}h")

            # Incremental analysis if we already have results, full otherwise.
            existing = await mongodb.get_analysis(app_id)
            if existing and existing.get("results"):
                result = await run_incremental_analysis(
                    app_id, game_name, self._steam_svc, self._nlp_svc
                )
            else:
                result = await run_full_analysis(
                    app_id, game_name, self._steam_svc, self._nlp_svc
                )

            if result is not None:
                executed += 1

            # Mark checkpoint completed regardless of success
            await mongodb.mark_checkpoint_completed(app_id, offset_hours)

            # Check if all checkpoints done → complete schedule
            await self._check_schedule_completion(app_id)

            # Throttle between items; skip the sleep after the last one.
            # (Compares by dict equality — identical duplicate items would
            # also skip the sleep, which is harmless.)
            if executed < max_per_cycle and item != due_items[-1]:
                await asyncio.sleep(delay)

        logger.info(f"Pre-cache: executed {executed}/{len(due_items)} due analyses")
        return executed

    @staticmethod
    async def _check_schedule_completion(app_id: str) -> None:
        """If all checkpoints completed, mark schedule as completed."""
        schedules = await mongodb.get_active_schedules()
        for schedule in schedules:
            if schedule["app_id"] != str(app_id):
                continue
            all_done = all(
                cp.get("completed", False)
                for cp in schedule.get("checkpoints", [])
            )
            if all_done:
                await mongodb.complete_schedule(app_id)
                logger.info(f"Schedule completed for {app_id}")
                break
diff --git a/backend/app/services/priority_refresh_service.py b/backend/app/services/priority_refresh_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76fc882eff4fd2d6c6364326b130422ab0fd1c6
--- /dev/null
+++ b/backend/app/services/priority_refresh_service.py
@@ -0,0 +1,387 @@
"""
Priority Refresh Service — maintains canonical priority game state in MongoDB.

Priority sources:
  - top500: top 500 games by review count (local DB)
  - top_sellers / new_releases / specials: Steam store featured categories

Priority state fields on games documents:
  is_priority bool
  priority_sources list[str]
  priority_grace_until datetime | None
  priority_last_confirmed_at datetime | None
"""

import asyncio
import logging
from datetime import datetime, timedelta, timezone
from typing import Any

import httpx

from app.core.config import settings
from app.db.mongodb import mongodb

logger = logging.getLogger(__name__)


class PriorityRefreshService:
    """Refreshes priority flags on the games collection each worker cycle."""

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # If no client is injected we lazily create one and own its lifecycle.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        # Lazy creation so tests can inject a client via __init__.
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=15.0)
        return self._client

    async def close(self) -> None:
        # Only close the client if we created it ourselves.
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def refresh_priorities(self) -> dict[str, Any]:
        """
        Recompute is_priority for all games and write changes to MongoDB.

        Returns a summary dict with counts.
        """
        now = datetime.now(timezone.utc)
        grace_deadline = now + timedelta(days=settings.steam_priority_grace_days)

        # 1. Build active sources map
        # NOTE(review): assumes get_top_games_by_reviews returns appid as str,
        # matching the str appids used for categories — confirm in mongodb.py.
        top500_ids: set[str] = {
            g["appid"]
            for g in await mongodb.get_top_games_by_reviews(500)
            if g.get("app_type") != "dlc"
        }

        category_ids: dict[str, set[str]] = await self._fetch_store_categories()

        active_sources: dict[str, list[str]] = {}
        for appid in top500_ids:
            active_sources.setdefault(appid, []).append("top500")
        for cat_name, ids in category_ids.items():
            for appid in ids:
                active_sources.setdefault(appid, []).append(cat_name)

        # 1b. Bootstrap category games that are missing from the local DB.
        # top500 appids are safe — they come from existing DB records.
        # Category appids may reference games not yet in our DB.
        all_category_appids: set[str] = set()
        for ids in category_ids.values():
            all_category_appids.update(ids)

        bootstrap_summary: dict[str, Any] = {}
        if all_category_appids:
            _, bootstrap_summary = await self._bootstrap_missing_games(all_category_appids)
            # After bootstrap, remove from active_sources any category appid that
            # still has no DB record (failed bootstrap / delisted / per-cycle limit).
            # This prevents bulk_update_priority_fields from silently no-oping.
            existing_in_db = await mongodb.get_existing_appids(all_category_appids)
            for appid in all_category_appids - existing_in_db:
                active_sources.pop(appid, None)

        # 2. Load current priority state (only games that already have is_priority field)
        existing_priority_docs: list[dict[str, Any]] = []
        if mongodb.db is not None:
            try:
                collection = mongodb.db[mongodb.COLLECTION_GAMES]
                cursor = collection.find(
                    {"is_priority": {"$exists": True}},
                    {
                        "_id": 0,
                        "appid": 1,
                        "app_type": 1,
                        "is_priority": 1,
                        "priority_grace_until": 1,
                        "priority_sources": 1,
                    },
                )
                # Hard cap at 10k docs; priority games are a small subset.
                existing_priority_docs = await cursor.to_list(length=10000)
            except Exception as e:
                logger.warning(f"Failed to load existing priority docs: {e}")

        existing_by_appid: dict[str, dict] = {
            str(d["appid"]): d for d in existing_priority_docs
        }

        # 2b. DLC inherits effective priority from its parent game.
        if settings.dlc_worker_analysis_enabled:
            priority_parent_ids: set[str] = set(active_sources.keys())
            for appid, doc in existing_by_appid.items():
                if doc.get("app_type") == "dlc":
                    continue
                if not doc.get("is_priority") or appid in active_sources:
                    continue

                # Parents still inside their grace window also keep their DLC.
                grace_until = doc.get("priority_grace_until")
                if grace_until is None or grace_until >= now:
                    priority_parent_ids.add(appid)

            for parent_appid in priority_parent_ids:
                dlcs = await mongodb.get_dlcs_by_parent_appid(parent_appid)
                for dlc in dlcs:
                    dlc_appid = str(dlc.get("appid", ""))
                    if dlc_appid:
                        active_sources[dlc_appid] = ["parent_priority"]

        # 2c. When DLC worker analysis is disabled, remove any DLC that entered
        # active_sources via other paths (e.g. Steam store categories).
        if not settings.dlc_worker_analysis_enabled:
            dlc_appids_to_remove = {
                appid
                for appid in active_sources
                if existing_by_appid.get(appid, {}).get("app_type") == "dlc"
            }
            for appid in dlc_appids_to_remove:
                del active_sources[appid]

        # 3. Compute updates
        updates: list[tuple[str, dict]] = []
        became_priority = 0
        entered_grace = 0
        expired_grace = 0
        reactivated = 0
        removed_parent_priority = 0

        # Active games — either new or confirming existing priority
        for appid, sources in active_sources.items():
            existing = existing_by_appid.get(appid)
            fields: dict[str, Any] = {
                "is_priority": True,
                "priority_sources": sources,
                "priority_grace_until": None,
                "priority_last_confirmed_at": now,
            }
            if existing is None or not existing.get("is_priority"):
                became_priority += 1
            elif existing.get("priority_grace_until") is not None:
                reactivated += 1
            updates.append((appid, fields))

        # Games that were priority but are no longer in any active source
        for appid, doc in existing_by_appid.items():
            if appid in active_sources:
                continue  # already handled above
            if not doc.get("is_priority"):
                continue  # already marked non-priority, skip

            # Inherited (parent_priority) flags get no grace period — dropped
            # immediately once the parent is no longer priority.
            if "parent_priority" in (doc.get("priority_sources") or []):
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                removed_parent_priority += 1
                continue

            grace_until = doc.get("priority_grace_until")

            if grace_until is None:
                # Just left all sources — start grace period
                updates.append((appid, {
                    "priority_grace_until": grace_deadline,
                    "priority_sources": [],
                }))
                entered_grace += 1
            elif grace_until < now:
                # Grace expired — remove priority
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                expired_grace += 1
            # else: still in grace and not expired — no update needed

        modified = await mongodb.bulk_update_priority_fields(updates)

        result = {
            "total_active": len(active_sources),
            "top500_count": len(top500_ids),
            "category_counts": {k: len(v) for k, v in category_ids.items()},
            "bootstrap": bootstrap_summary,
            "became_priority": became_priority,
            "reactivated": reactivated,
            "entered_grace": entered_grace,
            "expired_grace": expired_grace,
            "removed_parent_priority": removed_parent_priority,
            "db_modified": modified,
        }
        logger.info(f"Priority refresh complete: {result}")
        return result

    @staticmethod
    def _parse_app_type(data: dict[str, Any]) -> dict[str, Any]:
        """Parse app_type and parent_appid from an appdetails data block."""
        app_type = data.get("type") or "unknown"
        fullgame = data.get("fullgame")
        parent_appid = None
        # Only DLC entries carry a "fullgame" block pointing at the parent.
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])
        return {"app_type": str(app_type), "parent_appid": parent_appid}

    async def _fetch_app_details_bilingual(self, appid: str) -> dict[str, Any] | None:
        """
        Fetch appdetails for a single game in both english and schinese.

        Returns a minimal game dict (name, name_cn, app_type, parent_appid,
        header_image, cn_name_checked) or None on failure / not found.
        """
        client = await self._get_client()
        store_url = "https://store.steampowered.com/api/appdetails"

        async def _fetch_one(lang: str) -> dict[str, Any]:
            # Best-effort: any failure degrades to {} so the sibling language
            # fetch can still produce a usable record.
            try:
                resp = await client.get(
                    store_url,
                    params={"appids": appid, "l": lang, "cc": settings.steam_region},
                )
                if resp.status_code != 200:
                    return {}
                entry = resp.json().get(str(appid))
                if entry and entry.get("success"):
                    return entry.get("data") or {}
                return {}
            except Exception as e:
                logger.warning(f"appdetails error for {appid} (lang={lang}): {e}")
                return {}

        data_en, data_cn = await asyncio.gather(
            _fetch_one("english"),
            _fetch_one("schinese"),
        )

        if not data_en and not data_cn:
            logger.warning(f"No appdetails for {appid} — skipping bootstrap")
            return None

        name_en = data_en.get("name") or data_cn.get("name")
        if not name_en:
            logger.warning(f"No name in appdetails for {appid} — skipping bootstrap")
            return None

        name_cn = data_cn.get("name")
        base = data_en or data_cn
        type_info = self._parse_app_type(base)

        return {
            "appid": appid,
            "name": name_en,
            # Only keep the Chinese name when it differs from the English one.
            "name_cn": name_cn if name_cn and name_cn != name_en else None,
            "cn_name_checked": True,
            "app_type": type_info["app_type"],
            "parent_appid": type_info["parent_appid"],
            "header_image": base.get("header_image"),
        }

    async def _bootstrap_missing_games(
        self,
        category_appids: set[str],
    ) -> tuple[set[str], dict[str, Any]]:
        """
        Fetch Steam Store data and upsert games missing from the local DB.

        Returns:
            (bootstrapped_appids, summary_dict)
            bootstrapped_appids: set of appids that were newly upserted
        """
        existing = await mongodb.get_existing_appids(category_appids)
        missing = category_appids - existing

        if not missing:
            return set(), {"bootstrapped": 0, "failed": 0, "skipped_existing": len(existing)}

        # Per-cycle cap; leftovers are reported as missing_over_limit and
        # picked up on later cycles.
        limit = settings.steam_bootstrap_max_per_cycle
        appids_to_fetch = list(missing)[:limit]
        bootstrapped: set[str] = set()
        failed = 0

        for i, appid in enumerate(appids_to_fetch):
            game_data = await self._fetch_app_details_bilingual(appid)
            if game_data is None:
                failed += 1
            else:
                await mongodb.upsert_game(game_data)
                bootstrapped.add(appid)

            # Throttle between store calls; no sleep after the last item.
            if i < len(appids_to_fetch) - 1:
                await asyncio.sleep(settings.steam_bootstrap_delay)

        summary = {
            "bootstrapped": len(bootstrapped),
            "failed": failed,
            "skipped_existing": len(existing),
            "missing_over_limit": max(0, len(missing) - limit),
        }
        if bootstrapped or failed:
            logger.info(f"Bootstrap missing games: {summary}")
        return bootstrapped, summary

    async def _fetch_region_categories(self, region: str) -> dict[str, set[str]]:
        """
        Fetch featured categories for a single Steam region (cc=region).

        Returns dict mapping category name -> set of appid strings.
        On any failure, returns {} so the caller can continue with other regions.
        """
        try:
            client = await self._get_client()
            resp = await client.get(
                settings.steam_priority_categories_url,
                params={"cc": region, "l": "schinese"},
            )
            if resp.status_code != 200:
                logger.warning(
                    f"Steam featuredcategories [{region}] returned {resp.status_code} — skipping region"
                )
                return {}

            data = resp.json()
        except Exception as e:
            logger.warning(
                f"Failed to fetch Steam store categories [{region}]: {e} — skipping region"
            )
            return {}

        result: dict[str, set[str]] = {}
        for cat_name in settings.steam_priority_categories_list:
            cat_data = data.get(cat_name)
            if not cat_data:
                continue
            items = cat_data.get("items", [])
            # type == 0 appears to filter for games (vs. packages/bundles)
            # in the featuredcategories payload — TODO confirm against the API.
            appids: set[str] = {
                str(item["id"])
                for item in items
                if item.get("type") == 0 and item.get("id") is not None
            }
            result[cat_name] = appids

        return result

    async def _fetch_store_categories(self) -> dict[str, set[str]]:
        """
        Fetch game appids from Steam store featured categories across all configured regions.

        Iterates over steam_priority_regions_list (default: CN, US) and merges results.
        If one region fails, the other is still used. If all fail, returns {} (fallback
        to top-500 only).

        Returns dict mapping category name -> set of appid strings.
        """
        regions = settings.steam_priority_regions_list
        if not regions:
            logger.warning(
                "steam_priority_regions is empty — skipping store categories fetch (top500 only)"
            )
            return {}

        merged: dict[str, set[str]] = {}
        for region in regions:
            region_data = await self._fetch_region_categories(region)
            for cat_name, appids in region_data.items():
                merged.setdefault(cat_name, set()).update(appids)
        return merged
diff --git a/backend/app/services/steam_errors.py b/backend/app/services/steam_errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..3105a8c5898ab3df96cd9ebb60d34824f2609115
--- /dev/null
+++ b/backend/app/services/steam_errors.py
@@ -0,0 +1,22 @@
"""
Custom exceptions for Steam API errors.

Separate module to avoid circular imports between mongodb.py and steam_service.py.
"""


class SteamAPIError(Exception):
    """Raised when Steam API returns a non-retryable error (404, 403, other 4xx)."""

    def __init__(self, status_code: int, app_id: str, message: str = "") -> None:
        # Keep the raw status code and app id so callers can branch on them.
        self.status_code = status_code
        self.app_id = app_id
        self.message = message or f"Steam API error {status_code} for app {app_id}"
        super().__init__(self.message)


class SteamRateLimitError(SteamAPIError):
    """Raised when Steam API returns 429 after all retries are exhausted."""

    def __init__(self, app_id: str) -> None:
        # Status is fixed at 429; callers may still catch the SteamAPIError base.
        super().__init__(status_code=429, app_id=app_id, message=f"Steam API rate limited for app {app_id}")
diff --git a/backend/app/services/steam_service.py b/backend/app/services/steam_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c8d6404d185826d694659875ab7a5137e4947a7
--- /dev/null
+++ b/backend/app/services/steam_service.py
@@ -0,0 +1,499 @@
"""
Service for communicating with the Steam API.

Responsible for fetching game information and reviews.
Uses the public Steam API (no API key required).
Implements statistical review sampling (stratified sampling).
Retries with exponential backoff on 429/5xx/timeout.
"""

import asyncio
import logging
from dataclasses import dataclass
from typing import Any, AsyncGenerator

import httpx

from app.core.config import settings
from app.core.sampling import SamplePlan, create_sample_plan
from app.db.mongodb import mongodb
from app.models.schemas import GameInfo, ReviewBatch, ReviewItem
from app.services.steam_errors import SteamAPIError, SteamRateLimitError

logger = logging.getLogger(__name__)

# Status codes that should be retried
_RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}


@dataclass
class ReviewStats:
    """Review statistics for a game."""

    total: int
    positive: int
    negative: int


class SteamService:
    """
    Service for fetching data from the Steam API.
    """

    STORE_API_URL = "https://store.steampowered.com/api"
    REVIEW_API_URL = "https://store.steampowered.com/appreviews"
    SEARCH_API_URL = "https://store.steampowered.com/api/storesearch"

    def __init__(self, timeout: float = 30.0) -> None:
        # One shared AsyncClient for connection pooling; closed via close().
        self.timeout = timeout
        self.client = httpx.AsyncClient(timeout=self.timeout)

    async def close(self) -> None:
        """Close the shared HTTP client."""
        await self.client.aclose()

    async def _request_with_retry(
        self,
        client: httpx.AsyncClient,
        url: str,
        params: dict[str, Any],
        context: str = "",
    ) -> httpx.Response:
        """
        Perform a GET request with retries and exponential backoff.
        """
        max_attempts = settings.steam_retry_max_attempts
        base_delay = settings.steam_retry_base_delay
        max_delay = settings.steam_retry_max_delay
        last_exception: Exception | None = None

        for attempt in range(max_attempts):
            try:
                response = await client.get(url, params=params)
                status = response.status_code

                if status == 200:
                    return response

                # Non-retryable client errors
                if status == 404:
                    raise SteamAPIError(404, context, f"Not found: {url}")
                if status == 403:
                    raise SteamAPIError(403, context, f"Forbidden: {url}")
                if 400 <= status < 500 and status not in _RETRYABLE_STATUS_CODES:
                    raise SteamAPIError(status, context, f"Client error {status}: {url}")

                # Retryable errors (429, 5xx)
                if attempt < max_attempts - 1:
                    delay = min(base_delay * (2 ** attempt), max_delay)

                    # Respect Retry-After header for 429
                    if status == 429:
                        retry_after = response.headers.get("Retry-After")
                        if retry_after:
                            try:
                                # Retry-After may also be an HTTP date, which
                                # float() rejects — then the backoff delay stands.
                                delay = min(float(retry_after), max_delay)
                            except ValueError:
                                pass

                    logger.warning(
                        f"Steam API {status} for {context}, "
                        f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                    )
                    await asyncio.sleep(delay)
                else:
                    # Exhausted retries
                    if status == 429:
                        raise SteamRateLimitError(context)
                    raise SteamAPIError(status, context, f"Server error {status} after {max_attempts} attempts: {url}")

            except (httpx.TimeoutException, httpx.ConnectError) as e:
                last_exception = e
                if attempt < max_attempts - 1:
                    delay = min(base_delay * (2 ** attempt), max_delay)
                    logger.warning(
                        f"Steam API {type(e).__name__} for {context}, "
                        f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                    )
                    await asyncio.sleep(delay)
                else:
                    raise SteamAPIError(
                        0, context,
                        f"Connection failed after {max_attempts} attempts: {e}"
                    ) from e

        # Should not reach here, but just in case
        raise SteamAPIError(0, context, "Unexpected retry exhaustion") from last_exception

    async def search_game(self, query: str) -> GameInfo | None:
        """Search for a game by name using Steam's public storesearch API."""
        client = self.client
        params = {
            "term": query,
            "l": settings.steam_review_language,
            "cc": settings.steam_region,
        }

        try:
            response = await self._request_with_retry(
                client, self.SEARCH_API_URL, params, context=f"search:{query}"
            )
            data = response.json()
        except (SteamAPIError, SteamRateLimitError) as e:
            logger.error(f"Błąd wyszukiwania gry '{query}': {e}")
            return None

        items = data.get("items", [])
        if not items:
            logger.warning(f"Nie znaleziono gry: {query}")
            return None

        # Take the top search hit as the match.
        first_result = items[0]
        app_id = str(first_result.get("id"))

        game_info = await self.get_game_info(app_id)

        # Persist the resolved game so future lookups hit the local DB.
        if game_info:
            await mongodb.upsert_game({
                "appid": game_info.app_id,
                "name": game_info.name,
                "name_cn": game_info.name_cn,
                "cn_name_checked": True,
                "header_image": game_info.header_image,
                "total_reviews": game_info.total_reviews
            })

        return game_info

    async def get_game_info(self, app_id: str) -> GameInfo | None:
        """Fetch detailed game metadata (image, name) from appdetails."""
        # Negative cache: skip games with a recently recorded Steam error.
        cached_error = await mongodb.get_steam_error(app_id)
        if cached_error:
            logger.info(
                f"Skipping Steam API for app {app_id} — "
                f"cached error {cached_error.get('status_code')}"
            )
            return None

        client = self.client
        details_url = f"{self.STORE_API_URL}/appdetails"

        async def fetch_localized(lang: str):
            # NOTE(review): SteamRateLimitError subclasses SteamAPIError, so a
            # 429 is also swallowed here and returns {} — confirm intended.
            try:
                params = {"appids": app_id, "l": lang, "cc": settings.steam_region}
                resp = await self._request_with_retry(
                    client, details_url, params, context=app_id
                )
                return resp.json().get(app_id, {})
            except SteamAPIError as e:
                if e.status_code == 404:
                    await mongodb.cache_steam_error(
                        app_id, 404, settings.steam_error_cache_ttl_404
                    )
                return {}

        data_zh, data_en = await asyncio.gather(
            fetch_localized("schinese"),
            fetch_localized("english")
        )

        if not data_en.get("success") and not data_zh.get("success"):
            logger.warning(f"Nie znaleziono szczegółów gry: {app_id}")
            return None

        # Prefer English data for base fields; fall back to Chinese.
        base_data = data_en.get("data") or data_zh.get("data")
        name_en = data_en.get("data", {}).get("name") or base_data.get("name")
        name_zh = data_zh.get("data", {}).get("name")

        stats = await self.get_review_stats(app_id)

        return GameInfo(
            app_id=app_id,
            name=name_en,
            name_cn=name_zh if name_zh != name_en else None,
            header_image=base_data.get("header_image"),
            total_reviews=stats.total,
        )

    async def get_review_stats(self, app_id: str) -> ReviewStats:
        """Fetch aggregate review statistics needed to plan the sample."""
        cached_error = await mongodb.get_steam_error(app_id)
        if cached_error:
            logger.info(
                f"Skipping review stats for app {app_id} — "
                f"cached error {cached_error.get('status_code')}"
            )
            return ReviewStats(total=0, positive=0, negative=0)

        client = self.client
        url = f"{self.REVIEW_API_URL}/{app_id}"
        # num_per_page=0 returns only query_summary, no review bodies.
        params = {
            "json": "1",
            "filter": "all",
            "num_per_page": "0",
        }

        try:
            response = await self._request_with_retry(
                client, url, params, context=app_id
            )
            data = response.json()

            summary = data.get("query_summary", {})
            return ReviewStats(
                total=summary.get("total_reviews", 0),
                positive=summary.get("total_positive", 0),
                negative=summary.get("total_negative", 0),
            )
        except SteamAPIError as e:
            # Cache 404/429 so subsequent cycles skip this app for a while.
            if e.status_code in (404, 429):
                ttl = (
                    settings.steam_error_cache_ttl_429
                    if e.status_code == 429
                    else settings.steam_error_cache_ttl_404
                )
                await mongodb.cache_steam_error(app_id, e.status_code, ttl)
            logger.error(f"Błąd pobierania statystyk recenzji: {e}")
            return ReviewStats(total=0, positive=0, negative=0)

    async def _fetch_reviews_batch(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        review_type: str,
        filter_type: str,
        num_per_page: int,
        cursor: str | None,
    ) -> tuple[list[str], list[ReviewItem], str | None]:
        """Fetch a single batch of reviews (up to 100 items)."""
        url = f"{self.REVIEW_API_URL}/{app_id}"
        # "*" is Steam's initial cursor value for the first page.
        params: dict[str, Any] = {
            "json": "1",
            "filter": filter_type,
            "review_type": review_type,
            "language": settings.steam_review_language,
            "num_per_page": str(num_per_page),
            "cursor": cursor or "*",
            "purchase_type": "all",
        }

        try:
            response = await self._request_with_retry(
                client, url, params, context=app_id
            )
            data = response.json()
        except SteamRateLimitError:
            await mongodb.cache_steam_error(
                app_id, 429, settings.steam_error_cache_ttl_429
            )
            logger.error(f"Rate limited fetching reviews for {app_id}")
            return [], [], None
        except SteamAPIError as e:
            logger.error(f"Błąd pobierania recenzji: {e}")
            return [], [], None

        if not data.get("success"):
            return [], [], None

        reviews_data = data.get("reviews", [])
        review_texts: list[str] = []
        review_items: list[ReviewItem] = []

        for review in reviews_data:
            # Skip entries without review text.
            text = review.get("review")
            if not text:
                continue
            review_texts.append(text)
            review_items.append(ReviewItem(
                text=text,
                recommendation_id=str(review.get("recommendationid", "")),
                timestamp_created=review.get("timestamp_created", 0),
            ))

        # A None/unchanged cursor signals the end of pagination to the caller.
        new_cursor = data.get("cursor")
        return review_texts, review_items, new_cursor

    async def fetch_reviews_stratified(
        self,
        app_id: str,
        sample_plan: SamplePlan,
    ) -> AsyncGenerator[ReviewBatch, None]:
        """
        Główna logika pobierania danych. Działa w dwóch fazach.
+ """ + batch_size = settings.review_batch_size + all_reviews: set[str] = set() + seen_cursors: set[str] = set() + client = self.client + + # --- FAZA 1: TOP HELPFUL --- + cursor: str | None = "*" + fetched = 0 + + while fetched < sample_plan.top_helpful: + to_fetch = min(batch_size, sample_plan.top_helpful - fetched) + reviews, review_items, cursor = await self._fetch_reviews_batch( + client, app_id, "all", "all", to_fetch, cursor + ) + + if not reviews: + break + if cursor and cursor in seen_cursors: + logger.warning(f"Repeated cursor {cursor} for {app_id} (top_helpful). Shortfall: {sample_plan.top_helpful - fetched}") + break + if cursor: + seen_cursors.add(cursor) + + all_reviews.update(reviews) + fetched += len(reviews) + yield ReviewBatch(reviews=reviews, review_items=review_items, cursor=cursor) + + if not cursor or cursor == "*": + break + + # --- FAZA 2a: RECENT POSITIVE --- + positive_target = sample_plan.positive_count + if positive_target > 0: + cursor = "*" + fetched = 0 + seen_cursors_pos: set[str] = set() + + while fetched < positive_target: + to_fetch = min(batch_size, positive_target - fetched) + # Jeśli mamy dużo duplikatów, prosimy o więcej niż pozostało do targetu (ale max batch_size) + if fetched > 0: + to_fetch = batch_size + + reviews, review_items, cursor = await self._fetch_reviews_batch( + client, app_id, "positive", "recent", to_fetch, cursor or "*" + ) + if not reviews: + break + if cursor and cursor in seen_cursors_pos: + logger.warning(f"Repeated cursor {cursor} for {app_id} (positive). 
Shortfall: {positive_target - fetched}") + break + if cursor: + seen_cursors_pos.add(cursor) + + new_reviews = [r for r in reviews if r not in all_reviews] + new_texts_set = set(new_reviews) + new_items = [ri for ri in review_items if ri.text in new_texts_set] + all_reviews.update(new_reviews) + fetched += len(new_reviews) + + if new_reviews: + yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor) + if not cursor or cursor == "*": + break + + # --- FAZA 2b: RECENT NEGATIVE --- + negative_target = sample_plan.negative_count + if negative_target > 0: + cursor = "*" + fetched = 0 + seen_cursors_neg: set[str] = set() + + while fetched < negative_target: + to_fetch = min(batch_size, negative_target - fetched) + if fetched > 0: + to_fetch = batch_size + + reviews, review_items, cursor = await self._fetch_reviews_batch( + client, app_id, "negative", "recent", to_fetch, cursor or "*" + ) + if not reviews: + break + if cursor and cursor in seen_cursors_neg: + logger.warning(f"Repeated cursor {cursor} for {app_id} (negative). Shortfall: {negative_target - fetched}") + break + if cursor: + seen_cursors_neg.add(cursor) + + new_reviews = [r for r in reviews if r not in all_reviews] + new_texts_set = set(new_reviews) + new_items = [ri for ri in review_items if ri.text in new_texts_set] + all_reviews.update(new_reviews) + fetched += len(new_reviews) + + if new_reviews: + yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor) + if not cursor or cursor == "*": + break + + logger.info(f"Pobrano łącznie {len(all_reviews)} unikalnych recenzji") + + async def fetch_recent_reviews( + self, + app_id: str, + exclude_ids: set[str] | None = None, + ) -> list[ReviewItem]: + """ + Fetch recent reviews for incremental analysis. 
+ """ + is_new_game = not exclude_ids + exclude_ids = exclude_ids or set() + batch_size = settings.review_batch_size + + # Incremental Fetch limit for new games + if is_new_game: + stats = await self.get_review_stats(app_id) + max_total = min(stats.total, settings.recent_sample_limit, 500) + else: + max_total = settings.recent_sample_limit + + client = self.client + cursor: str | None = "*" + seen_cursors: set[str] = set() + new_items: list[ReviewItem] = [] + + while len(new_items) < max_total: + to_fetch = min(batch_size, max_total - len(new_items)) + _, review_items, cursor = await self._fetch_reviews_batch( + client, app_id, "all", "recent", to_fetch, cursor + ) + + if not review_items: + break + if cursor and cursor in seen_cursors: + logger.warning(f"Repeated cursor {cursor} for {app_id} (recent). Shortfall: {max_total - len(new_items)}") + break + if cursor: + seen_cursors.add(cursor) + + # Filter out already-known reviews + batch_new = [ri for ri in review_items if ri.recommendation_id not in exclude_ids] + + # Early exit: if >80% of batch is known, we've passed the boundary + known_ratio = 1 - (len(batch_new) / len(review_items)) if review_items else 0 + new_items.extend(batch_new) + + if not is_new_game and known_ratio > 0.8: + logger.info( + f"Early exit for {app_id}: {known_ratio:.0%} of batch already known" + ) + break + + if not cursor or cursor == "*": + break + + logger.info(f"Incremental fetch for {app_id}: {len(new_items)} new reviews") + return new_items[:max_total] + + async def fetch_reviews( + self, + app_id: str, + batch_size: int | None = None, + max_reviews: int | None = None, + ) -> AsyncGenerator[ReviewBatch, None]: + """Wrapper dla zachowania kompatybilności.""" + stats = await self.get_review_stats(app_id) + if stats.total == 0: + return + + sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative) + async for batch in self.fetch_reviews_stratified(app_id, sample_plan): + yield batch + + +# Globalna instancja serwisu 
(Singleton) +steam_service = SteamService() diff --git a/backend/app/services/update_detection_service.py b/backend/app/services/update_detection_service.py new file mode 100644 index 0000000000000000000000000000000000000000..30927183d49131b1baac49330afa30c7745ee23a --- /dev/null +++ b/backend/app/services/update_detection_service.py @@ -0,0 +1,453 @@ +""" +Update Detection Service — checks Steam News API for game updates. + +Compares the latest news/patch date with the stored `last_game_update_at` +to detect games that have been recently updated. +""" + +import logging +import re +from datetime import datetime, timezone +from typing import Any, NamedTuple, cast + +import httpx + +from app.core.config import settings +from app.db.mongodb import mongodb + +logger = logging.getLogger(__name__) + +STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/" + +# Matches two-segment versions: 1.2, v2.0, 0.6, 123.4 +# Excludes three-segment (0.6.1) via negative lookahead, 4-digit years via \d{1,3}, +# and sub-segments of longer versions (e.g. "6.1" within "0.6.1") via lookbehind. +VERSION_RE = re.compile(r'(? None: + self._client = client + self._owns_client = client is None + + async def _get_client(self) -> httpx.AsyncClient: + if self._client is None: + self._client = httpx.AsyncClient(timeout=15.0) + return self._client + + async def close(self) -> None: + if self._owns_client and self._client is not None: + await self._client.aclose() + self._client = None + + @staticmethod + def _is_update_related(item: dict) -> bool: + """Return True if news item is update-related. 
+ + Conditions (any one is sufficient): + A: 'patchnotes' in tags + B: feedlabel == 'Product Update' + C: title matches release-style phrases + D: title matches large content update phrases + E: title has a version number AND an action word + """ + tags = item.get("tags") + if isinstance(tags, list): + is_patch = "patchnotes" in tags + else: + is_patch = "patchnotes" in (tags or "") + feedlabel = item.get("feedlabel") or "" + if is_patch or feedlabel == "Product Update": + return True + + # Conditions C/D/E: title-based signals — restricted to developer feed only. + # Third-party news sites (GamingOnLinux etc.) can write about updates using + # the same language, so we only trust these signals from the developer's own feed. + if item.get("feedname") != "steam_community_announcements": + return False + + title = item.get("title", "") + if RELEASE_PHRASE_RE.search(title): + return True + if CONTENT_UPDATE_RE.search(title): + return True + if VERSION_RE.search(title) and ACTION_WORD_RE.search(title): + return True + # F: named version (V70) + "update" in title (developer feed only) + if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title): + return True + + return False + + @staticmethod + def _is_major_update(item: dict) -> bool: + """Return True if the news item represents a major update. 
+ + Negative signals (blockers) are checked first: + - hotfix keyword → not major + - experimental branch / public test branch → not major + + Positive signals (any one is sufficient): + - version number in title (VERSION_RE) + - release language (MAJOR_RELEASE_RE) + - standalone '1.0' (ONE_ZERO_RE) + - large content phrases (MAJOR_CONTENT_RE) + """ + title = item.get("title", "") + + if HOTFIX_RE.search(title): + return False + if BRANCH_RE.search(title): + return False + if EVENT_FESTIVAL_RE.search(title) and not UPDATE_OR_PATCH_RE.search(title): + return False + if PATCH_WORD_RE.search(title) and MAINT_LANGUAGE_RE.search(title): + return False + + if VERSION_RE.search(title): + return True + if MAJOR_RELEASE_RE.search(title): + return True + if ONE_ZERO_RE.search(title): + return True + if MAJOR_CONTENT_RE.search(title): + return True + if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title): + return True + + return False + + @staticmethod + def _collect_update_candidates( + news_items: list[dict], + ) -> tuple[datetime | None, datetime | None]: + """Scan all items, return (latest_update_date, major_date). 
+ + latest_update_date: max date of all update-related items (or None) + major_date: max date of major items (or None if no major found) + """ + latest_update_ts: int | None = None + major_ts: int | None = None + + for item in news_items: + if not UpdateDetectionService._is_update_related(item): + continue + ts = item.get("date") or 0 + if not ts: + continue + if latest_update_ts is None or ts > latest_update_ts: + latest_update_ts = ts + if UpdateDetectionService._is_major_update(item): + if major_ts is None or ts > major_ts: + major_ts = ts + + latest_update_date = ( + datetime.fromtimestamp(latest_update_ts, tz=timezone.utc) + if latest_update_ts is not None + else None + ) + major_date = ( + datetime.fromtimestamp(major_ts, tz=timezone.utc) + if major_ts is not None + else None + ) + return latest_update_date, major_date + + @staticmethod + async def _fetch_news_page( + client: httpx.AsyncClient, + app_id: str, + count: int, + enddate: int | None = None, + ) -> list[dict]: + """Fetch a single page of news items from Steam API. + + Returns [] on HTTP error or request failure. + """ + params: dict[str, Any] = { + "appid": app_id, + "count": count, + "maxlength": 0, + } + if enddate is not None: + params["enddate"] = enddate + + try: + resp = await client.get(STEAM_NEWS_API_URL, params=params) + if resp.status_code != 200: + return [] + data = resp.json() + return data.get("appnews", {}).get("newsitems", []) + except (httpx.RequestError, ValueError, KeyError) as e: + logger.debug(f"News page fetch failed for {app_id}: {e}") + return [] + + @staticmethod + def _scan_batch_with_stopping( + items: list[dict], + last_seen_gid: str | None, + last_seen_at_ts: int | None, + refresh_cutoff_ts: int | None, + ) -> tuple[list[dict], bool]: + """Scan items (newest→oldest), collecting until a stop condition is met. 
+ + Stop conditions (item is NOT included): + - gid matches last_seen_gid + - item date <= last_seen_at_ts + - item date < refresh_cutoff_ts + + Returns (accepted_items, hit_stop). + """ + accepted: list[dict] = [] + for item in items: + gid = str(item.get("gid", "")) + ts = item.get("date") or 0 + + if last_seen_gid and gid and gid == last_seen_gid: + return accepted, True + if last_seen_at_ts is not None and ts and ts <= last_seen_at_ts: + return accepted, True + if refresh_cutoff_ts is not None and ts and ts < refresh_cutoff_ts: + return accepted, True + + accepted.append(item) + + return accepted, False + + async def _get_latest_news_date( + self, + app_id: str, + last_seen_gid: str | None = None, + last_seen_at: datetime | None = None, + ) -> NewsCheckResult: + """Fetch and scan Steam news for update candidates. + + In initial mode (no cursor): fetches count=20, single page. + In incremental mode (cursor present): fetches count=5 with pagination, + stopping at the known cursor or the refresh window boundary. + """ + client = await self._get_client() + + is_incremental = last_seen_gid is not None or last_seen_at is not None + count = settings.news_incremental_count if is_incremental else settings.news_initial_count + + # Compute stop thresholds for incremental mode + last_seen_at_ts: int | None = None + refresh_cutoff_ts: int | None = None + if is_incremental: + last_seen_at_ts = int(last_seen_at.timestamp()) if last_seen_at else None + now_ts = int(datetime.now(timezone.utc).timestamp()) + cutoff_ts = now_ts - (settings.news_refresh_window_hours * 3600) + + # If cursor is older than the refresh window (worker was down), + # disable the time cutoff and scan to the cursor instead. + # _NEWS_MAX_PAGES protects against unbounded pagination. 
+ if last_seen_at_ts is not None and last_seen_at_ts < cutoff_ts: + refresh_cutoff_ts = None + else: + refresh_cutoff_ts = cutoff_ts + + all_accepted: list[dict] = [] + newest_gid: str | None = None + newest_ts: int = 0 + scan_complete = False + pages_fetched = 0 + enddate: int | None = None + + while True: + items = await self._fetch_news_page(client, app_id, count, enddate) + + if not items: + if pages_fetched == 0: + # First page empty (no news or HTTP error) — newest_gid stays None + pass + # Pagination page empty → incomplete scan → don't update cursor + break + + pages_fetched += 1 + + # Track newest item (from first page only) + if newest_gid is None: + for item in items: + gid = str(item.get("gid", "")) + ts = item.get("date") or 0 + if gid and ts: + newest_gid = gid + newest_ts = ts + break + + if is_incremental: + accepted, hit_stop = self._scan_batch_with_stopping( + items, last_seen_gid, last_seen_at_ts, refresh_cutoff_ts + ) + all_accepted.extend(accepted) + + if hit_stop: + scan_complete = True + break + if len(items) < count: + scan_complete = True # API has no more items + break + if pages_fetched >= _NEWS_MAX_PAGES: + scan_complete = True # page limit reached + break + oldest_ts = items[-1].get("date") or 0 + if not oldest_ts: + break # can't paginate → incomplete scan + enddate = oldest_ts - 1 + else: + # Initial mode: single fetch, always clean + all_accepted.extend(items) + scan_complete = True + break + + latest_update_date, major_date = self._collect_update_candidates(all_accepted) + + cursor_gid: str | None = None + cursor_at: datetime | None = None + if scan_complete and newest_gid: + cursor_gid = newest_gid + cursor_at = datetime.fromtimestamp(newest_ts, tz=timezone.utc) + + if latest_update_date is None: + return NewsCheckResult( + None, False, None, + newest_seen_gid=cursor_gid, + newest_seen_at=cursor_at, + ) + + return NewsCheckResult( + latest_update_date=latest_update_date, + is_major=major_date is not None, + major_date=major_date, + 
newest_seen_gid=cursor_gid, + newest_seen_at=cursor_at, + ) + + async def check_for_updates( + self, games: list[dict[str, Any]] + ) -> list[dict[str, Any]]: + """ + Check Steam News API for each game. Return games with confirmed major updates. + + Non-major patchnotes update last_game_update_at but do not trigger a schedule. + """ + updated_games: list[dict[str, Any]] = [] + dlcs_by_parent: dict[str, list[dict[str, Any]]] = {} + + for game in games: + if game.get("app_type") == "dlc" and game.get("parent_appid"): + dlcs_by_parent.setdefault(str(game["parent_appid"]), []).append(game) + + for game in games: + app_id = str(game.get("appid", "")) + if not app_id: + continue + + if game.get("app_type") == "dlc": + continue + + last_known = game.get("last_game_update_at") + # Normalize last_known to datetime if it's a timestamp + if last_known is not None and not isinstance(last_known, datetime): + try: + last_known = datetime.fromtimestamp(float(last_known), tz=timezone.utc) + except (ValueError, TypeError): + last_known = None + + result = await self._get_latest_news_date( + app_id, + last_seen_gid=game.get("last_seen_news_gid"), + last_seen_at=game.get("last_seen_news_at"), + ) + + # Persist cursor before any early-continue — even if no updates found + if result.newest_seen_gid: + await mongodb.update_news_cursor( + app_id, result.newest_seen_gid, cast(datetime, result.newest_seen_at) + ) + + if result.latest_update_date is None: + continue + + if last_known is None or result.latest_update_date > last_known: + await mongodb.update_game_update_date(app_id, result.latest_update_date) + + if result.is_major: + current_patch_at = game.get("current_patch_at") + patch_date = cast(datetime, result.major_date) # always not None when is_major=True + if current_patch_at is None or patch_date > current_patch_at: + await mongodb.update_game_patch_date(app_id, patch_date) + updated_games.append({**game, "update_at": patch_date}) + + for dlc in dlcs_by_parent.get(app_id, []): + 
dlc_appid = str(dlc.get("appid", "")) + if not dlc_appid: + continue + + await mongodb.update_game_patch_date(dlc_appid, patch_date) + updated_games.append({**dlc, "update_at": patch_date}) + + logger.info( + f"Update detection: {len(updated_games)}/{len(games)} games have new updates" + ) + return updated_games diff --git a/backend/pytest.ini b/backend/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..24ef57e23361978f45df45211559dbbb7720bfd1 --- /dev/null +++ b/backend/pytest.ini @@ -0,0 +1,6 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_functions = test_* +asyncio_mode = auto +addopts = -v --tb=short diff --git a/backend/requirements.txt b/backend/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ad95156ca9f129db7f9a56d6d448795172b7a98 --- /dev/null +++ b/backend/requirements.txt @@ -0,0 +1,42 @@ +# Web Framework +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +sse-starlette==1.8.2 + +# Database +motor==3.3.2 +pymongo==4.6.1 + +# Data Validation +pydantic==2.5.3 +pydantic-settings==2.1.0 + +# HTTP Client +httpx==0.26.0 + +# AI/ML - Local Inference (ONNX Runtime only, no PyTorch needed at runtime) +numpy<2.0.0 +transformers==4.37.2 +optimum[onnxruntime]==1.16.2 +huggingface-hub==0.20.3 + +# Rate Limiting +slowapi==0.1.9 + +# Utilities +python-dotenv==1.0.0 +jieba==0.42.1 + +# Keyword Expansion (FastText) +gensim==4.3.3 + +# Code Quality +ruff==0.1.14 +mypy==1.8.0 + +# Testing +pytest==7.4.4 +pytest-asyncio==0.23.3 +pytest-cov==4.1.0 +anyio==4.12.1 +zhconv==1.4.3 diff --git a/backend/scripts/smoke_news_cursor.py b/backend/scripts/smoke_news_cursor.py new file mode 100644 index 0000000000000000000000000000000000000000..3b84d52b1f2a59a8c1236130e4331e86da463730 --- /dev/null +++ b/backend/scripts/smoke_news_cursor.py @@ -0,0 +1,264 @@ +""" +Smoke Test: Incremental Steam News Cursor Flow + +Validates that UpdateDetectionService correctly uses cursor-based incremental +news fetching 
against the real Steam API. + +Test game: Factorio (427520) — stable, always has news, uses patchnotes tags. + +Usage: + cd /mnt/d/sentiment_summarizer/backend + ../venv/bin/python scripts/smoke_news_cursor.py +""" + +import asyncio +import sys +from datetime import datetime, timezone +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import httpx + +# Ensure backend/app is importable +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from app.services.update_detection_service import UpdateDetectionService # noqa: E402 + +TEST_APP_ID = "427520" # Factorio + + +# ── helpers ────────────────────────────────────────────────────────── + + +def _ts() -> str: + return datetime.now(timezone.utc).strftime("%H:%M:%S") + + +def _print(status: str, msg: str) -> None: + tag = { + "OK": "\033[32mOK\033[0m", + "FAIL": "\033[31mFAIL\033[0m", + "SKIP": "\033[33mSKIP\033[0m", + "INFO": "\033[36mINFO\033[0m", + } + print(f"[{_ts()}] [{tag.get(status, status)}] {msg}") + + +class RecordingTransport(httpx.AsyncBaseTransport): + """Forwards real HTTP requests but records URL + query params for inspection.""" + + def __init__(self) -> None: + self._inner = httpx.AsyncHTTPTransport() + self.recorded: list[dict] = [] + + async def handle_async_request(self, request: httpx.Request) -> httpx.Response: + params = dict(request.url.params) + self.recorded.append({"url": str(request.url), "params": params}) + return await self._inner.handle_async_request(request) + + async def aclose(self) -> None: + await self._inner.aclose() + + +# ── main ───────────────────────────────────────────────────────────── + + +async def run_smoke_test() -> int: + print(f"\nSteam News Cursor Smoke Test — Factorio ({TEST_APP_ID})") + print("=" * 60) + failures = 0 + + # ── Check 1: initial scan returns cursor fields ─────────────────── + _print("INFO", f"Check 1: initial scan for {TEST_APP_ID} (Factorio)") + result_initial = None + svc1 = UpdateDetectionService() + try: + 
result_initial = await svc1._get_latest_news_date(TEST_APP_ID) + finally: + await svc1.close() + + if result_initial.newest_seen_gid is None: + _print("SKIP", "No news items returned — Steam API may be rate-limiting or unreachable; skipping all checks") + return 0 + + c1_ok = True + if not isinstance(result_initial.newest_seen_gid, str) or not result_initial.newest_seen_gid: + _print("FAIL", f"newest_seen_gid is empty/non-string: {result_initial.newest_seen_gid!r}") + c1_ok = False + + now = datetime.now(timezone.utc) + if result_initial.newest_seen_at is None: + _print("FAIL", "newest_seen_at is None") + c1_ok = False + elif not ( + datetime(2020, 1, 1, tzinfo=timezone.utc) + <= result_initial.newest_seen_at + <= datetime(now.year + 1, 1, 1, tzinfo=timezone.utc) + ): + _print("FAIL", f"newest_seen_at out of expected range: {result_initial.newest_seen_at!r}") + c1_ok = False + + if c1_ok: + _print( + "OK", + f"cursor GID={result_initial.newest_seen_gid}, " + f"at={result_initial.newest_seen_at.isoformat()}", + ) + else: + failures += 1 + + cursor_gid = result_initial.newest_seen_gid + cursor_at = result_initial.newest_seen_at + + # ── Check 2: incremental scan uses count=5 ──────────────────────── + _print("INFO", "Check 2: incremental scan uses count=5") + transport = RecordingTransport() + client = httpx.AsyncClient(transport=transport, timeout=15.0) + svc2 = UpdateDetectionService(client=client) + result_inc = None + try: + result_inc = await svc2._get_latest_news_date( + TEST_APP_ID, last_seen_gid=cursor_gid, last_seen_at=cursor_at + ) + finally: + await client.aclose() + + if not transport.recorded: + _print("SKIP", "No requests recorded — Steam API may be unreachable") + else: + c2_ok = True + for i, req in enumerate(transport.recorded): + count_val = req["params"].get("count") + enddate_val = req["params"].get("enddate", "n/a") + if str(count_val) != "5": + _print("FAIL", f"Request {i + 1}: count={count_val!r}, expected '5'") + c2_ok = False + else: + 
_print("INFO", f" Request {i + 1}: count=5 ✓ enddate={enddate_val}") + if c2_ok: + _print("OK", f"All {len(transport.recorded)} request(s) used count=5") + else: + failures += 1 + + # ── Check 3: no items older than cursor boundary ────────────────── + _print("INFO", "Check 3: incremental result respects cursor boundary") + if result_inc is None: + _print("SKIP", "No incremental result available") + else: + c3_ok = True + if result_inc.latest_update_date is not None: + if result_inc.latest_update_date <= cursor_at: + _print( + "FAIL", + f"latest_update_date {result_inc.latest_update_date.isoformat()} " + f"is not strictly newer than cursor {cursor_at.isoformat()}", + ) + c3_ok = False + else: + _print( + "INFO", + f" latest_update_date={result_inc.latest_update_date.isoformat()} " + f"> cursor (new update found between scans)", + ) + else: + _print("INFO", " latest_update_date=None (no new updates since cursor) — expected") + if c3_ok: + _print("OK", "Cursor boundary respected") + else: + failures += 1 + + # ── Check 4: latest_update_date / major_date invariants ────────── + _print("INFO", "Check 4: structural invariants on initial scan result") + c4_ok = True + if result_initial.latest_update_date is None: + if result_initial.is_major or result_initial.major_date is not None: + _print( + "FAIL", + f"latest_update_date=None but is_major={result_initial.is_major}, " + f"major_date={result_initial.major_date!r}", + ) + c4_ok = False + elif result_initial.is_major: + if result_initial.major_date is None: + _print("FAIL", "is_major=True but major_date is None") + c4_ok = False + elif result_initial.major_date > result_initial.latest_update_date: + _print( + "FAIL", + f"major_date {result_initial.major_date.isoformat()} " + f"> latest_update_date {result_initial.latest_update_date.isoformat()}", + ) + c4_ok = False + else: + if result_initial.major_date is not None: + _print("FAIL", f"is_major=False but major_date={result_initial.major_date!r}") + c4_ok = False + if 
c4_ok: + _print( + "OK", + f"invariants hold: latest_update_date={result_initial.latest_update_date}, " + f"is_major={result_initial.is_major}, major_date={result_initial.major_date}", + ) + else: + failures += 1 + + # ── Check 5: check_for_updates end-to-end, mocked DB ───────────── + _print("INFO", "Check 5: check_for_updates end-to-end (mocked DB)") + mock_mongodb = AsyncMock() + svc5 = UpdateDetectionService() + updated = None + try: + with patch("app.services.update_detection_service.mongodb", mock_mongodb): + updated = await svc5.check_for_updates( + [{"appid": TEST_APP_ID, "name": "Factorio"}] + ) + finally: + await svc5.close() + + c5_ok = True + if not isinstance(updated, list): + _print("FAIL", f"check_for_updates returned {type(updated).__name__}, expected list") + c5_ok = False + + call_count = mock_mongodb.update_news_cursor.call_count + if call_count == 0: + # API may have failed between checks (swallowed internally by the service); + # treat as skip — not a hard failure per the plan. 
+ _print("SKIP", "update_news_cursor not called — Steam API may have been unreachable for this call") + elif call_count > 1: + _print("FAIL", f"update_news_cursor called {call_count} times, expected 1") + c5_ok = False + else: + args = mock_mongodb.update_news_cursor.call_args[0] + if not ( + isinstance(args[0], str) + and isinstance(args[1], str) + and isinstance(args[2], datetime) + ): + _print( + "FAIL", + f"update_news_cursor arg types wrong: " + f"{[type(a).__name__ for a in args]} — expected (str, str, datetime)", + ) + c5_ok = False + else: + _print( + "OK", + f"check_for_updates returned list; " + f"update_news_cursor({args[0]!r}, {args[1]!r}, {args[2].isoformat()!r})", + ) + if not c5_ok: + failures += 1 + + # ── Summary ─────────────────────────────────────────────────────── + print("=" * 60) + if failures == 0: + _print("OK", "All checks passed") + return 0 + else: + _print("FAIL", f"{failures} check(s) failed") + return 1 + + +if __name__ == "__main__": + sys.exit(asyncio.run(run_smoke_test())) diff --git a/backend/scripts/smoke_test.py b/backend/scripts/smoke_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8c1a5f9cbe3958dd8746c2ffcc61dc9cc754917a --- /dev/null +++ b/backend/scripts/smoke_test.py @@ -0,0 +1,185 @@ +""" +Smoke Test — local verification of worker cycle and analysis pipeline. 
+ +Usage: + cd backend + python scripts/smoke_test.py analyze # run full analysis for a game + python scripts/smoke_test.py cycle # mini worker cycle (1 game) +""" + +import argparse +import asyncio +import logging +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +# Ensure backend/app is importable +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from app.core.config import settings # noqa: E402 +from app.db.mongodb import mongodb # noqa: E402 +from app.services.nlp_service import NLPService # noqa: E402 +from app.services.steam_service import SteamService # noqa: E402 +from app.services.update_detection_service import UpdateDetectionService # noqa: E402 +from app.services.precache_service import PreCacheService # noqa: E402 +from app.services.analysis_runner import run_full_analysis # noqa: E402 + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) +logger = logging.getLogger("smoke_test") + + +def _ts() -> str: + return datetime.now(timezone.utc).strftime("%H:%M:%S") + + +def _print(status: str, msg: str) -> None: + tag = {"OK": "\033[32mOK\033[0m", "FAIL": "\033[31mFAIL\033[0m", "SKIP": "\033[33mSKIP\033[0m", "INFO": "\033[36mINFO\033[0m"} + print(f"[{_ts()}] [{tag.get(status, status)}] {msg}") + + +# ── analyze subcommand ────────────────────────────────────────────── + + +async def cmd_analyze(app_id: str) -> None: + _print("INFO", f"Starting analysis for app_id={app_id}") + _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... 
/ DB: {settings.mongodb_db_name}") + + await mongodb.connect() + + steam_svc = SteamService() + nlp_svc = NLPService() + + try: + t0 = time.monotonic() + result = await run_full_analysis(app_id, f"smoke-{app_id}", steam_svc, nlp_svc) + elapsed = time.monotonic() - t0 + + if result is None: + _print("FAIL", "run_full_analysis returned None") + return + + game = result.get("game", {}) + topics = result.get("topics", []) + analyzed = result.get("analyzed_reviews", 0) + highlights = result.get("general_highlights", []) + + _print("OK", f"Analysis complete in {elapsed:.1f}s") + _print("OK", f" Game: {game.get('name', '?')} (appid {game.get('app_id', '?')})") + _print("OK", f" Reviews analyzed: {analyzed}") + _print("OK", f" Topics found: {len(topics)}") + _print("OK", f" General highlights: {len(highlights)}") + + # Verify cache write + cached = await mongodb.get_cached_analysis(app_id) + if cached: + _print("OK", " Cache write verified — document found in MongoDB") + else: + _print("FAIL", " Cache write verification FAILED — no document in MongoDB") + + finally: + await steam_svc.close() + await mongodb.disconnect() + + +# ── cycle subcommand ───────────────────────────────────────────────── + + +async def cmd_cycle() -> None: + _print("INFO", "Starting mini worker cycle") + _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... 
/ DB: {settings.mongodb_db_name}") + + await mongodb.connect() + + steam_svc = SteamService() + nlp_svc = NLPService() + update_svc = UpdateDetectionService() + + try: + # Step 1: Get top 1 game + _print("INFO", "Step 1: Fetching top game by reviews...") + top_games = await mongodb.get_top_games_by_reviews(1) + if not top_games: + _print("SKIP", "No games in DB — run game sync first or use 'analyze' subcommand") + return + + game = top_games[0] + app_id = str(game.get("appid", "")) + name = game.get("name", "?") + _print("OK", f" Top game: {name} (appid {app_id})") + + # Step 2: Test datetime comparison (the bug this patch fixes) + _print("INFO", "Step 2: Testing synced_at datetime comparison...") + synced_at = game.get("synced_at") + if synced_at: + try: + delta = datetime.now(timezone.utc) - synced_at + hours = delta.total_seconds() / 3600 + _print("OK", f" synced_at delta: {hours:.1f}h (tz={synced_at.tzinfo})") + except TypeError as e: + _print("FAIL", f" datetime subtraction failed: {e}") + return + else: + _print("SKIP", " No synced_at field — game sync not run yet") + + # Step 3: Update detection (1 game) + _print("INFO", "Step 3: Update detection...") + t0 = time.monotonic() + updated = await update_svc.check_for_updates([game]) + elapsed = time.monotonic() - t0 + _print("OK", f" Updates detected: {len(updated)} in {elapsed:.1f}s") + + # Step 4: Bootstrap missing analyses + _print("INFO", "Step 4: Bootstrap missing analyses...") + precache_svc = PreCacheService(steam_svc, nlp_svc) + bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games) + _print("OK", f" Bootstrapped: {bootstrapped}") + + # Step 5: Process due analyses (max 1) + _print("INFO", "Step 5: Processing due analyses (max 1)...") + orig = settings.precache_max_analyses_per_cycle + # Temporarily limit to 1 + object.__setattr__(settings, "precache_max_analyses_per_cycle", 1) + try: + executed = await precache_svc.process_due_analyses() + _print("OK", f" Executed: {executed}") + 
finally: + object.__setattr__(settings, "precache_max_analyses_per_cycle", orig) + + _print("OK", "Mini cycle complete") + + finally: + await update_svc.close() + await steam_svc.close() + await mongodb.disconnect() + + +# ── main ───────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser(description="SentimentStream smoke test") + sub = parser.add_subparsers(dest="command") + + p_analyze = sub.add_parser("analyze", help="Run full analysis for a game") + p_analyze.add_argument("appid", help="Steam app ID (e.g. 730)") + + sub.add_parser("cycle", help="Run mini worker cycle (top 1 game)") + + args = parser.parse_args() + + if args.command == "analyze": + asyncio.run(cmd_analyze(args.appid)) + elif args.command == "cycle": + asyncio.run(cmd_cycle()) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/backend/worker_main.py b/backend/worker_main.py new file mode 100644 index 0000000000000000000000000000000000000000..fd26c4fc68e4dc2db5e92b389bba159b600f9495 --- /dev/null +++ b/backend/worker_main.py @@ -0,0 +1,244 @@ +""" +Worker Main App — lightweight FastAPI for background game sync and pre-cache. 
+ +Endpoints: + GET /health — MongoDB ping, last cycle summary, cycle_running flag + POST /trigger — token-protected, starts a worker cycle as background task + GET /logs — token-protected, read structured log tail +""" + +import asyncio +import logging +import os +import uuid +from contextlib import asynccontextmanager +from datetime import datetime, timezone +from typing import Any + +from fastapi import FastAPI, Query, Request +from fastapi.responses import JSONResponse + +from app.core.config import settings +from app.core.worker_logging import ( + AsyncTimingContext, + WORKER_LOG_WHITELIST, + log_structured, + read_log_tail, + resolve_log_path, + set_cycle_id, + setup_app_logging, + setup_structured_logger, +) +from app.db.mongodb import mongodb +from app.services.game_sync_service import GameSyncService +from app.services.nlp_service import NLPService +from app.services.precache_service import PreCacheService +from app.services.steam_service import SteamService +from app.services.priority_refresh_service import PriorityRefreshService +from app.services.update_detection_service import UpdateDetectionService + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", +) +logger = logging.getLogger(__name__) + +# Cycle state +_cycle_running = False +_last_cycle_summary: dict[str, Any] = {} + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Connect MongoDB on startup, disconnect on shutdown.""" + await mongodb.connect() + setup_structured_logger("worker") + setup_app_logging() + logger.info("Worker started — MongoDB connected, structured logging initialized") + yield + await mongodb.disconnect() + logger.info("Worker shutting down") + + +app = FastAPI(title="SentimentStream Worker", lifespan=lifespan) + + +@app.get("/health") +async def health(): + """Health check with cycle status.""" + mongo_ok = False + try: + if mongodb.client: + await mongodb.client.admin.command("ping") + mongo_ok = True + 
except Exception: + pass + + return { + "status": "ok" if mongo_ok else "degraded", + "mongodb": "connected" if mongo_ok else "disconnected", + "cycle_running": _cycle_running, + "last_cycle": _last_cycle_summary, + } + + +def _check_bearer_token(request: Request) -> bool: + """Validate Bearer token from Authorization header.""" + auth = request.headers.get("Authorization", "") + expected = settings.worker_trigger_token + return bool(expected and auth.startswith("Bearer ") and auth[7:] == expected) + + +@app.post("/trigger") +async def trigger(request: Request): + """Token-protected trigger to start a worker cycle.""" + global _cycle_running + + if not _check_bearer_token(request): + return JSONResponse(status_code=401, content={"detail": "Unauthorized"}) + + if _cycle_running: + return JSONResponse(status_code=503, content={"detail": "Cycle already running"}) + + asyncio.create_task(_run_cycle()) + return {"status": "started"} + + +@app.get("/logs") +async def get_logs( + request: Request, + lines: int = Query(default=100, ge=1, le=1000), + level: str | None = Query(default=None), + event: str | None = Query(default=None), + file: str = Query(default="worker"), +): + """Token-protected endpoint to read structured log tail.""" + if not _check_bearer_token(request): + return JSONResponse(status_code=401, content={"detail": "Unauthorized"}) + + log_path = resolve_log_path(file, WORKER_LOG_WHITELIST) + if log_path is None: + return JSONResponse( + status_code=400, + content={"detail": f"Unknown log file: '{file}'. 
Valid: {list(WORKER_LOG_WHITELIST.keys())}"}, + ) + + entries = read_log_tail(log_path, lines=lines, level=level, event=event) + return {"entries": entries, "count": len(entries)} + + +async def _run_cycle() -> None: + """Execute a full worker cycle.""" + global _cycle_running, _last_cycle_summary + _cycle_running = True + started = datetime.now(timezone.utc) + summary: dict[str, Any] = {"started_at": started.isoformat()} + + cycle_id = uuid.uuid4().hex[:8] + set_cycle_id(cycle_id) + log_structured("cycle_start", cycle_id=cycle_id) + + steam_svc = SteamService() + nlp_svc = NLPService() + game_sync_svc = GameSyncService() + priority_svc = PriorityRefreshService() + update_svc = UpdateDetectionService() + + try: + # 1. Game sync (if enabled and not synced recently) + if settings.game_sync_enabled: + top_games = await mongodb.get_top_games_by_reviews(1) + last_synced = top_games[0].get("synced_at") if top_games else None + hours_since_sync = None + if last_synced: + delta = datetime.now(timezone.utc) - last_synced + hours_since_sync = delta.total_seconds() / 3600 + + if hours_since_sync is None or hours_since_sync > 20: + async with AsyncTimingContext() as t_sync: + logger.info("Starting game sync...") + upserted, modified = await game_sync_svc.sync_all_games() + summary["game_sync"] = {"upserted": upserted, "modified": modified} + log_structured("game_sync", elapsed_s=t_sync.elapsed_s, + detail=summary["game_sync"]) + + async with AsyncTimingContext() as t_details: + enriched = await game_sync_svc.sync_top_game_details() + summary["game_details"] = {"enriched": enriched} + log_structured("game_details", elapsed_s=t_details.elapsed_s, + detail=summary["game_details"]) + else: + summary["game_sync"] = "skipped (recent)" + log_structured("game_sync", detail="skipped (recent)") + + # ALWAYS enrich CN names if sync is enabled, even if main sync skipped + async with AsyncTimingContext() as t_cn: + cn_processed = await game_sync_svc.enrich_cn_names() + 
summary["cn_enrichment"] = {"processed": cn_processed} + log_structured("cn_enrichment", elapsed_s=t_cn.elapsed_s, + detail=summary["cn_enrichment"]) + + async with AsyncTimingContext() as t_app_types: + app_types_processed = await game_sync_svc.enrich_app_types() + summary["app_type_enrichment"] = {"processed": app_types_processed} + log_structured("app_type_enrichment", elapsed_s=t_app_types.elapsed_s, + detail=summary["app_type_enrichment"]) + + # 1b. Priority refresh + async with AsyncTimingContext() as t_priority: + priority_result = await priority_svc.refresh_priorities() + summary["priority_refresh"] = priority_result + log_structured("priority_refresh", elapsed_s=t_priority.elapsed_s, detail=priority_result) + + # 2. Update detection + async with AsyncTimingContext() as t_update: + top_games = await mongodb.get_priority_games_for_analysis() + updated_games = await update_svc.check_for_updates(top_games) + summary["updates_detected"] = len(updated_games) + log_structured("update_detection", elapsed_s=t_update.elapsed_s, + detail={"updates_detected": len(updated_games)}) + + # 3. Create schedules for updated games + precache_svc = PreCacheService(steam_svc, nlp_svc) + + async with AsyncTimingContext() as t_sched: + if updated_games: + await precache_svc.create_schedules_for_updates(updated_games) + log_structured("create_schedules", elapsed_s=t_sched.elapsed_s, + detail={"updated_games": len(updated_games) if updated_games else 0}) + + # 4. Bootstrap missing analyses + async with AsyncTimingContext() as t_boot: + bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games) + summary["bootstrapped"] = bootstrapped + log_structured("bootstrap_missing", elapsed_s=t_boot.elapsed_s, + detail={"bootstrapped": bootstrapped}) + + # 5. 
Process due analyses + if settings.precache_enabled: + async with AsyncTimingContext() as t_analyses: + executed = await precache_svc.process_due_analyses() + summary["analyses_executed"] = executed + log_structured("process_due_analyses", elapsed_s=t_analyses.elapsed_s, + detail={"executed": executed}) + else: + summary["precache"] = "disabled" + + except Exception as e: + logger.error(f"Cycle error: {e}", exc_info=True) + summary["error"] = str(e) + log_structured("cycle_error", level=logging.ERROR, error=str(e)) + finally: + await game_sync_svc.close() + await priority_svc.close() + await update_svc.close() + await steam_svc.close() + + elapsed = (datetime.now(timezone.utc) - started).total_seconds() + summary["elapsed_seconds"] = round(elapsed, 1) + _last_cycle_summary = summary + _cycle_running = False + log_structured("cycle_end", elapsed_s=round(elapsed, 1), + detail=summary) + set_cycle_id(None) + logger.info(f"Cycle complete in {elapsed:.1f}s: {summary}") diff --git a/scripts/benchmark_major_update.py b/scripts/benchmark_major_update.py new file mode 100644 index 0000000000000000000000000000000000000000..0cd19b8f42bdb18c182d5cf695c71a43f39b2724 --- /dev/null +++ b/scripts/benchmark_major_update.py @@ -0,0 +1,848 @@ +#!/usr/bin/env python3 +""" +Benchmark script for the major update detection heuristic. + +Evaluates UpdateDetectionService._is_update_related, _collect_update_candidates, +and _is_major_update against a curated set of Steam games. + +Three modes: + --discover Fetch news for all games (count=20 by default, matches + production) and display all items with classification + details. Use this to identify ground truth. + --evaluate Item-level evaluation: for each ItemCase, find the item + by gid and check if _is_update_related / _is_major_update + match expectations. + --evaluate-service Service-level evaluation: for each ServiceCase, run the + full selection pipeline and compare the outcome. 

Both --evaluate and --evaluate-service run by default when no mode is specified.

Examples:
    python scripts/benchmark_major_update.py --discover
    python scripts/benchmark_major_update.py --discover --count 50
    python scripts/benchmark_major_update.py --evaluate
    python scripts/benchmark_major_update.py --evaluate-service
    python scripts/benchmark_major_update.py                   # runs both evaluate modes
"""

from __future__ import annotations

import argparse
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Literal

import httpx

# ── import project service ────────────────────────────────────────────────────
# Make backend/ importable so the production heuristic is benchmarked as-is.
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
from app.services.update_detection_service import UpdateDetectionService  # noqa: E402

# Steam Web API endpoint used by --discover and both evaluate modes.
STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"

# ── benchmark games ───────────────────────────────────────────────────────────
# (display name, Steam appid) pairs — a curated mix of Early Access graduates,
# live-service titles, and maintenance-heavy games, chosen to stress the
# heuristic with both true positives and true negatives.
GAMES: list[tuple[str, str]] = [
    ("Going Medieval", "1029780"),
    ("Timberborn", "1062090"),
    ("Hades II", "1145350"),
    ("Against the Storm", "1336490"),
    ("Valheim", "892970"),
    ("Manor Lords", "1363080"),
    ("Project Zomboid", "108600"),
    ("Dwarf Fortress", "975370"),
    ("Helldivers 2", "553850"),
    ("Deep Rock Galactic", "548430"),
    ("Lethal Company", "1966720"),
    ("Factorio", "427520"),
    ("Satisfactory", "526870"),
]

# ── ground truth structures ───────────────────────────────────────────────────

@dataclass
class ItemCase:
    """Per-item ground truth: is this specific event major?"""
    game_name: str  # display name, used in report output only
    appid: str  # Steam appid the news item belongs to
    gid: str  # Steam news item id — stable lookup key for --evaluate
    title: str  # for display
    expected: Literal["major", "not_major", "ambiguous"]  # ambiguous = excluded from metrics
    reasoning: str  # why the expectation holds (documents which heuristic phase applies)


@dataclass
class ServiceCase:
    """Per-game ground truth: what should the production code do?"""
    game_name: str  # display name, used in report output only
    appid: str  # Steam appid evaluated by --evaluate-service
    expected_major: bool | None  # True / False / None = ambiguous
    reasoning: str  # why the expectation holds for the current news window
+# ── item-level ground truth ─────────────────────────────────────────────────── +# Populated from --discover run on 2026-03-19. +ITEM_CASES: list[ItemCase] = [ + # ── Going Medieval ──────────────────────────────────────────────────────── + ItemCase( + game_name="Going Medieval", + appid="1029780", + gid="1826992588604105", + title="Going Medieval is out now in 1.0!", + expected="major", + reasoning=( + "1.0 full release out of Early Access — unambiguously major. " + "Phase 1: RELEASE_PHRASE_RE matches 'is out now' → update-related. " + "ONE_ZERO_RE matches '1.0' → major." + ), + ), + ItemCase( + game_name="Going Medieval", + appid="1029780", + gid="1827626365751261", + title="Experimental Branch Patch (1.0.48)", + expected="not_major", + reasoning=( + "Experimental branch incremental patch. Three-segment version (1.0.48) " + "excluded by VERSION_RE. BRANCH_RE blocks major classification." + ), + ), + ItemCase( + game_name="Going Medieval", + appid="1029780", + gid="1827626365750723", + title="Patch Notes (1.0.47)", + expected="not_major", + reasoning="Incremental stable patch, three-segment version. not_major is correct.", + ), + # ── Timberborn ──────────────────────────────────────────────────────────── + ItemCase( + game_name="Timberborn", + appid="1062090", + gid="1826992588592887", + title="Timberborn 1.0 is live!", + expected="major", + reasoning=( + "1.0 full release out of Early Access — unambiguously major. " + "Phase 1: RELEASE_PHRASE_RE matches 'is live' → update-related. " + "ONE_ZERO_RE matches '1.0' → major." + ), + ), + ItemCase( + game_name="Timberborn", + appid="1062090", + gid="1826992588603124", + title="Patch notes 2026-03-17 (experimental)", + expected="not_major", + reasoning="Experimental branch date-based patch notes. No version number. 
not_major is correct.", + ), + # ── Hades II ────────────────────────────────────────────────────────────── + ItemCase( + game_name="Hades II", + appid="1145350", + gid="1816215235360707", + title="Hades II v1.0 Hotfix 3", + expected="not_major", + reasoning=( + "A bugfix hotfix on top of the v1.0 launch — not a content update. " + "Phase 1: HOTFIX_RE blocks major classification. Correct: not_major." + ), + ), + ItemCase( + game_name="Hades II", + appid="1145350", + gid="1811772772516846", + title="Hades II v1.0 Hotfix 2", + expected="not_major", + reasoning="Same pattern: HOTFIX_RE blocks 'v1.0 Hotfix N' from being classified as major.", + ), + ItemCase( + game_name="Hades II", + appid="1145350", + gid="1811772772248738", + title="Hades II v1.0 Is Now Available!", + expected="major", + reasoning=( + "v1.0 full launch — unambiguously major. " + "Phase 1: RELEASE_PHRASE_RE matches 'Is Now Available' → update-related. " + "No hotfix/branch blocker. VERSION_RE matches 'v1.0' → major." + ), + ), + # ── Against the Storm ───────────────────────────────────────────────────── + ItemCase( + game_name="Against the Storm", + appid="1336490", + gid="1818752592135840", + title="Demo Update 1.9.6", + expected="not_major", + reasoning=( + "Demo game update, three-segment version 1.9.6. " + "Service correctly classifies as not_major." + ), + ), + ItemCase( + game_name="Against the Storm", + appid="1336490", + gid="1816849002010836", + title="Brineworks Update (1.9) available!", + expected="major", + reasoning=( + "Named major content update with version 1.9. " + "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' " + "→ update-related. VERSION_RE → major." + ), + ), + # ── Valheim ─────────────────────────────────────────────────────────────── + ItemCase( + game_name="Valheim", + appid="892970", + gid="1825093633184197", + title="Patch 0.221.12", + expected="not_major", + reasoning="Three-segment maintenance patch. 
Correctly classified as not_major.", + ), + ItemCase( + game_name="Valheim", + appid="892970", + gid="1809869179994587", + title="Patch 0.221.4 (Public Test)", + expected="not_major", + reasoning="Public test branch three-segment patch. Correctly classified as not_major.", + ), + # ── Manor Lords ─────────────────────────────────────────────────────────── + ItemCase( + game_name="Manor Lords", + appid="1363080", + gid="1827626365750540", + title="Major Update #6: Battlefield Changes, New Map, and Family Based Progression", + expected="major", + reasoning=( + "Developer-declared major content drop. " + "Phase 1: CONTENT_UPDATE_RE matches 'Major Update' → update-related and major." + ), + ), + ItemCase( + game_name="Manor Lords", + appid="1363080", + gid="1826992588603500", + title="New BETA version is available for testing (0.8.065)", + expected="not_major", + reasoning=( + "Beta/testing build announcement, not a production major update. " + "Current heuristic misses it entirely, which is acceptable for this benchmark case." + ), + ), + # ── Project Zomboid ─────────────────────────────────────────────────────── + ItemCase( + game_name="Project Zomboid", + appid="108600", + gid="1826992588590120", + title="42.15.2 UNSTABLE HOTFIX Released", + expected="not_major", + reasoning=( + "Unstable-branch hotfix. patchnotes tag makes it update-related, " + "but HOTFIX_RE correctly blocks major classification." + ), + ), + ItemCase( + game_name="Project Zomboid", + appid="108600", + gid="1826362059930323", + title="Build 42.15.0 Unstable Released", + expected="not_major", + reasoning=( + "Unstable build release, not a production major update. " + "Current heuristic does not classify it as update-related because the three-segment " + "build number fails VERSION_RE." 
+ ), + ), + # ── Dwarf Fortress ──────────────────────────────────────────────────────── + ItemCase( + game_name="Dwarf Fortress", + appid="975370", + gid="1826362059918689", + title="Food fixes, AMA, community spotlight and more! Dwarf Fortress Patch 53.11", + expected="not_major", + reasoning=( + "Maintenance patch with Dwarf Fortress' two-segment numbering scheme. " + "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fixes' " + "→ maintenance blocker fires before VERSION_RE → not_major." + ), + ), + ItemCase( + game_name="Dwarf Fortress", + appid="975370", + gid="1821288646585998", + title="Aquatic portraits, Naked dwarf fix and more Dwarf Fortress Patch 53.10", + expected="not_major", + reasoning=( + "Another maintenance patch under the same numbering scheme. " + "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fix' " + "→ maintenance blocker fires → not_major." + ), + ), + # ── Helldivers 2 ────────────────────────────────────────────────────────── + ItemCase( + game_name="Helldivers 2", + appid="553850", + gid="1826992588603352", + title="Machinery of Oppression: 6.1.0", + expected="major", + reasoning=( + "Named content drop with new missions/enemies. This should count as a major update. " + "Useful to test whether named major drops with three-segment versions are still found." + ), + ), + ItemCase( + game_name="Helldivers 2", + appid="553850", + gid="1826992588603981", + title="Revealing our Machinery of Oppression Content Roadmap!", + expected="not_major", + reasoning=( + "Roadmap/announcement post, not the update itself. Should not be treated as major." + ), + ), + # ── Deep Rock Galactic ──────────────────────────────────────────────────── + ItemCase( + game_name="Deep Rock Galactic", + appid="548430", + gid="1825727806720055", + title="'Eight Years in Orbit' Anniversary Event is live now!", + expected="not_major", + reasoning=( + "Live event announcement, not a game patch. 
" + "Phase 2: EVENT_FESTIVAL_RE matches 'anniversary event'; no 'update'/'patch' in title " + "→ UPDATE_OR_PATCH_RE guard fails → event blocker fires → not_major." + ), + ), + ItemCase( + game_name="Deep Rock Galactic", + appid="548430", + gid="1824644522847377", + title="Lunar Festival 2026 is now live!", + expected="not_major", + reasoning=( + "Seasonal event announcement, not a major patch/update. " + "Phase 2: EVENT_FESTIVAL_RE matches 'festival'; no 'update'/'patch' → event blocker fires → not_major." + ), + ), + # ── Lethal Company ──────────────────────────────────────────────────────── + ItemCase( + game_name="Lethal Company", + appid="1966720", + gid="1800991756395986", + title="V70 - The Incubating Update", + expected="major", + reasoning=( + "Named major content update. " + "Phase 2: NAMED_VERSION_RE matches 'V70'; UPDATE_WORD_RE matches 'Update' " + "→ condition F makes it update-related; named version positive signal → major." + ), + ), + ItemCase( + game_name="Lethal Company", + appid="1966720", + gid="1801617199407807", + title="V72 Bug fix patch", + expected="not_major", + reasoning=( + "Small bug-fix patch. patchnotes tag makes it update-related. " + "Phase 2: PATCH_WORD_RE matches 'patch'; MAINT_LANGUAGE_RE matches 'bug fix' " + "→ maintenance blocker fires → not_major." + ), + ), + # ── Factorio ────────────────────────────────────────────────────────────── + ItemCase( + game_name="Factorio", + appid="427520", + gid="1827626365752749", + title="Version 2.0.76 released as stable", + expected="not_major", + reasoning=( + "Stable maintenance patch under a three-segment versioning scheme. " + "Useful as a clean true negative." + ), + ), + # ── Satisfactory ────────────────────────────────────────────────────────── + ItemCase( + game_name="Satisfactory", + appid="526870", + gid="1826992588604352", + title="Update 1.2 is out now on Experimental!", + expected="not_major", + reasoning=( + "Experimental-branch release, not a production major update. 
" + "Phase 2: extended BRANCH_RE matches 'on Experimental' → branch blocker fires → not_major." + ), + ), + ItemCase( + game_name="Satisfactory", + appid="526870", + gid="1825093633185794", + title="Experimental Hotfix v1.1.3.1", + expected="not_major", + reasoning=( + "Experimental hotfix on a three-segment version. Correct behavior is not_major." + ), + ), +] + +# ── service-level ground truth ──────────────────────────────────────────────── +# What SHOULD the production code do for this game given the current news window? +# Populated from --discover run on 2026-03-19. +# Phase 1 semantics: verdict based on is_major (major_date is not None), not on selected item title. +SERVICE_CASES: list[ServiceCase] = [ + ServiceCase( + game_name="Going Medieval", + appid="1029780", + expected_major=True, + reasoning=( + "Game released 1.0 on 2026-03-17. Phase 1: 'is out now in 1.0!' matches " + "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. " + "Expected: major_date is not None (TP)." + ), + ), + ServiceCase( + game_name="Timberborn", + appid="1062090", + expected_major=True, + reasoning=( + "Game reached 1.0 on 2026-03-12. Phase 1: '1.0 is live!' matches " + "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. " + "Expected: major_date is not None (TP)." + ), + ), + ServiceCase( + game_name="Hades II", + appid="1145350", + expected_major=True, + reasoning=( + "Game launched v1.0 on 2025-09-25. Phase 1: 'v1.0 Is Now Available!' matches " + "RELEASE_PHRASE_RE → update-related (developer feed). VERSION_RE matches 'v1.0' → major. " + "Subsequent hotfixes (v1.0 Hotfix 2, 3) are correctly blocked by HOTFIX_RE. " + "major_date = v1.0 launch date, latest_update_date = most recent hotfix date. " + "Expected: major_date is not None (TP)." + ), + ), + ServiceCase( + game_name="Against the Storm", + appid="1336490", + expected_major=True, + reasoning=( + "'Brineworks Update (1.9) available!' is a named major content update. 
" + "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' " + "→ update-related (developer feed). VERSION_RE → major. " + "Expected: major_date is not None (TP)." + ), + ), + ServiceCase( + game_name="Valheim", + appid="892970", + expected_major=False, + reasoning=( + "Top items are three-segment maintenance patches. " + "Correctly classified as not_major. TN." + ), + ), + ServiceCase( + game_name="Manor Lords", + appid="1363080", + expected_major=True, + reasoning=( + "Current window contains a clearly labeled 'Major Update #6' post. " + "Expected: major_date is not None." + ), + ), + ServiceCase( + game_name="Project Zomboid", + appid="108600", + expected_major=False, + reasoning=( + "Current window is dominated by unstable builds and hotfixes. " + "These should update activity, but should not count as major releases." + ), + ), + ServiceCase( + game_name="Dwarf Fortress", + appid="975370", + expected_major=False, + reasoning=( + "Current window contains only maintenance patches (53.11/53.10/53.09 plus hotfixes). " + "Phase 2: maintenance blocker (patch + fix language) correctly blocks all of them → no major_date." + ), + ), + ServiceCase( + game_name="Helldivers 2", + appid="553850", + expected_major=True, + reasoning=( + "Current window contains 'Machinery of Oppression: 6.1.0', a named content update. " + "Expected: major_date is not None." + ), + ), + ServiceCase( + game_name="Lethal Company", + appid="1966720", + expected_major=True, + reasoning=( + "Current window contains 'V70 - The Incubating Update', a named major content drop, " + "plus newer bug-fix patches. Phase 2: NAMED_VERSION_RE + UPDATE_WORD_RE detects V70 → major_date set." + ), + ), + ServiceCase( + game_name="Factorio", + appid="427520", + expected_major=False, + reasoning=( + "Current window contains only three-segment stable maintenance releases (2.0.x). " + "Expected: not_major." 
+ ), + ), + ServiceCase( + game_name="Satisfactory", + appid="526870", + expected_major=False, + reasoning=( + "Current window contains an experimental 1.2 rollout and experimental hotfixes. " + "Phase 2: extended BRANCH_RE ('on Experimental') blocks the 1.2 rollout → no major_date." + ), + ), +] + + +# ── helpers ─────────────────────────────────────────────────────────────────── + +def _fmt_ts(ts: int | None) -> str: + if not ts: + return "—" + try: + return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d") + except (OSError, ValueError): + return "—" + + +def _fmt_dt(dt: datetime | None) -> str: + if dt is None: + return "—" + return dt.strftime("%Y-%m-%d") + + +def _trunc(s: str, n: int) -> str: + return (s[:n] + "…") if len(s) > n else s + + +def _fetch_news(client: httpx.Client, appid: str, count: int) -> list[dict]: + try: + resp = client.get( + STEAM_NEWS_API_URL, + params={"appid": appid, "count": count, "maxlength": 0}, + ) + if resp.status_code != 200: + print(f" [WARN] HTTP {resp.status_code} for appid {appid}", file=sys.stderr) + return [] + data = resp.json() + return data.get("appnews", {}).get("newsitems", []) or [] + except Exception as exc: + print(f" [WARN] Request failed for appid {appid}: {exc}", file=sys.stderr) + return [] + + +# ── Mode 1: discover ────────────────────────────────────────────────────────── + +def run_discover(count: int) -> None: + if count != 20: + print(f"NOTE: count={count} — beyond production window (prod uses count=20)\n") + + col_idx = 4 + col_gid = 20 + col_date = 10 + col_title = 40 + col_fl = 16 + col_tags = 24 + col_ur = 9 + col_maj = 7 + + header = ( + f"{'#':<{col_idx}} " + f"{'gid':<{col_gid}} " + f"{'date':<{col_date}} " + f"{'title':<{col_title}} " + f"{'feedlabel':<{col_fl}} " + f"{'tags':<{col_tags}} " + f"{'upd_rel?':<{col_ur}} " + f"{'major?':<{col_maj}}" + ) + sep = "-" * len(header) + + with httpx.Client(timeout=30.0) as client: + for game_name, appid in GAMES: + print(f"\n{'=' * 
len(header)}") + print(f" {game_name} (appid={appid})") + print(f"{'=' * len(header)}") + print(header) + print(sep) + + items = _fetch_news(client, appid, count) + if not items: + print(" (no items returned)") + continue + + for idx, item in enumerate(items, start=1): + gid = str(item.get("gid") or "")[:col_gid] + date_str = _fmt_ts(item.get("date")) + title = _trunc(item.get("title", ""), col_title) + feedlabel = _trunc(item.get("feedlabel") or "", col_fl) + tags = _trunc(str(item.get("tags") or ""), col_tags) + + is_ur = UpdateDetectionService._is_update_related(item) + is_maj = UpdateDetectionService._is_major_update(item) + + ur_str = "Yes" if is_ur else "No" + maj_str = "Yes" if is_maj else "No" + + print( + f"{idx:<{col_idx}} " + f"{gid:<{col_gid}} " + f"{date_str:<{col_date}} " + f"{title:<{col_title}} " + f"{feedlabel:<{col_fl}} " + f"{tags:<{col_tags}} " + f"{ur_str:<{col_ur}} " + f"{maj_str:<{col_maj}}" + ) + + latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items) + print(f"\n >> latest_update_date: {_fmt_dt(latest_update_date)} | major_date: {_fmt_dt(major_date)}") + verdict = "MAJOR" if major_date is not None else "not_major" + print(f" >> Service result: {verdict}") + + +# ── Mode 2: evaluate (item-level) ───────────────────────────────────────────── + +def run_evaluate() -> None: + if not ITEM_CASES: + print("[evaluate] No item-level ground truth defined yet.") + print(" Run --discover first, then populate ITEM_CASES in this script.") + return + + # Build lookup: appid → {gid → item} + gid_index: dict[str, dict[str, dict]] = {} + needed_appids = {case.appid for case in ITEM_CASES} + + with httpx.Client(timeout=30.0) as client: + for appid in needed_appids: + items = _fetch_news(client, appid, count=20) + gid_index[appid] = {str(item.get("gid", "")): item for item in items} + + tp = tn = fp = fn = amb = not_found = 0 + rows: list[tuple] = [] + + for case in ITEM_CASES: + item = gid_index.get(case.appid, 
{}).get(case.gid) + if item is None: + not_found += 1 + rows.append((case.game_name, case.title, "—", "—", "—", case.expected, "NOT FOUND")) + continue + + is_ur = UpdateDetectionService._is_update_related(item) + is_maj = UpdateDetectionService._is_major_update(item) + + predicted = "major" if (is_ur and is_maj) else "not_major" + expected = case.expected + + if expected == "ambiguous": + verdict = "ambiguous" + amb += 1 + elif predicted == expected: + verdict = "PASS" + if expected == "major": + tp += 1 + else: + tn += 1 + else: + if predicted == "major" and expected == "not_major": + verdict = "FAIL (FP)" + fp += 1 + else: + verdict = "FAIL (FN)" + fn += 1 + + rows.append(( + case.game_name, + _trunc(case.title, 30), + _fmt_ts(item.get("date")), + str(item.get("tags", ""))[:20], + item.get("feedlabel", "")[:16], + expected, + "Yes" if is_ur else "No", + "Yes" if is_maj else "No", + verdict, + )) + + # Print report + print("\n" + "=" * 110) + print("REPORT A — Item-level classification") + print("=" * 110) + hdr = f"{'Game':<18} {'Title':<30} {'Date':<10} {'Tags':<20} {'FeedLabel':<16} {'Expected':<10} {'UpdRel?':<8} {'Major?':<7} Verdict" + print(hdr) + print("-" * 110) + for row in rows: + if len(row) == 7: + print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {'—':<20} {'—':<16} {row[5]:<10} {'—':<8} {'—':<7} {row[6]}") + else: + print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {row[3]:<20} {row[4]:<16} {row[5]:<10} {row[6]:<8} {row[7]:<7} {row[8]}") + + total = tp + tn + fp + fn + print("\nSummary:") + print(f" Total cases : {len(ITEM_CASES)} | not found: {not_found} | ambiguous: {amb}") + print(f" TP={tp} TN={tn} FP={fp} FN={fn}") + if total > 0: + prec = tp / (tp + fp) if (tp + fp) else float("nan") + recall = tp / (tp + fn) if (tp + fn) else float("nan") + acc = (tp + tn) / total + print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}") + + fps = [c for c in ITEM_CASES if "FAIL (FP)" in str(rows[ITEM_CASES.index(c)])] + fns = [c for c in ITEM_CASES 
if "FAIL (FN)" in str(rows[ITEM_CASES.index(c)])] + if fps: + print("\nFalse Positives:") + for c in fps: + print(f" [{c.game_name}] {c.title!r} — {c.reasoning}") + if fns: + print("\nFalse Negatives:") + for c in fns: + print(f" [{c.game_name}] {c.title!r} — {c.reasoning}") + + +# ── Mode 3: evaluate-service (end-to-end) ───────────────────────────────────── + +def run_evaluate_service() -> None: + if not SERVICE_CASES: + print("[evaluate-service] No service-level ground truth defined yet.") + print(" Run --discover first, then populate SERVICE_CASES in this script.") + return + + tp = tn = fp = fn = amb = 0 + rows: list[tuple] = [] + + with httpx.Client(timeout=30.0) as client: + for case in SERVICE_CASES: + items = _fetch_news(client, case.appid, count=20) + latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items) + is_maj = major_date is not None + + latest_str = _fmt_dt(latest_update_date) + major_str = _fmt_dt(major_date) + maj_label = "Yes" if is_maj else "No" + + if case.expected_major is None: + verdict = "ambiguous" + amb += 1 + elif is_maj == case.expected_major: + verdict = "PASS" + if case.expected_major: + tp += 1 + else: + tn += 1 + else: + if is_maj and not case.expected_major: + verdict = "FAIL (FP)" + fp += 1 + else: + verdict = "FAIL (FN)" + fn += 1 + + rows.append(( + case.game_name, + latest_str, + major_str, + maj_label, + "True" if case.expected_major else ("None" if case.expected_major is None else "False"), + verdict, + )) + + print("\n" + "=" * 100) + print("REPORT B — Service-level (end-to-end)") + print("=" * 100) + hdr = f"{'Game':<18} {'LatestUpdate':<13} {'MajorDate':<11} {'Major?':<7} {'Expected':<9} Verdict" + print(hdr) + print("-" * 100) + for row in rows: + print(f"{row[0]:<18} {row[1]:<13} {row[2]:<11} {row[3]:<7} {row[4]:<9} {row[5]}") + + total = tp + tn + fp + fn + print("\nSummary:") + print(f" Total games : {len(SERVICE_CASES)} | ambiguous: {amb}") + print(f" TP={tp} TN={tn} FP={fp} 
FN={fn}") + if total > 0: + prec = tp / (tp + fp) if (tp + fp) else float("nan") + recall = tp / (tp + fn) if (tp + fn) else float("nan") + acc = (tp + tn) / total + print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}") + + for idx, case in enumerate(SERVICE_CASES): + verdict = rows[idx][5] + if verdict.startswith("FAIL"): + print(f"\n [{case.game_name}] {verdict} — {case.reasoning}") + + +# ── main ────────────────────────────────────────────────────────────────────── + +def _parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + description="Benchmark the major update detection heuristic against real Steam games." + ) + p.add_argument( + "--discover", + action="store_true", + help="Fetch news for all games and display per-item classification details.", + ) + p.add_argument( + "--evaluate", + action="store_true", + help="Run item-level evaluation against ITEM_CASES ground truth.", + ) + p.add_argument( + "--evaluate-service", + action="store_true", + dest="evaluate_service", + help="Run service-level end-to-end evaluation against SERVICE_CASES ground truth.", + ) + p.add_argument( + "--count", + type=int, + default=20, + help="Number of news items to fetch (default: 20, matches production). 
" + "Values > 20 are beyond the production window.", + ) + return p.parse_args() + + +def main() -> int: + args = _parse_args() + + discover = args.discover + evaluate = args.evaluate + eval_svc = args.evaluate_service + + # Default: run both evaluate modes when nothing is specified + if not discover and not evaluate and not eval_svc: + evaluate = True + eval_svc = True + + if discover: + run_discover(count=args.count) + + if evaluate: + run_evaluate() + + if eval_svc: + run_evaluate_service() + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/check_db_stats.py b/scripts/check_db_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b0d20485d638e26e472b07e85add1fcafd8760 --- /dev/null +++ b/scripts/check_db_stats.py @@ -0,0 +1,47 @@ +import asyncio +import os +from motor.motor_asyncio import AsyncIOMotorClient +from dotenv import load_dotenv + +# Załaduj .env z głównego katalogu lub katalogu backend +load_dotenv(".env") +load_dotenv("backend/.env") + +async def check_stats(): + # Pobranie parametrów z .env + mongo_url = os.getenv("MONGODB_URL") + db_name = os.getenv("MONGODB_DB_NAME", "sentimentSummary") + + if not mongo_url: + print("ERROR: MONGODB_URL not found in .env file!") + return + + print(f"Connecting to MongoDB: {mongo_url.split('@')[-1]}...") # Pokazuje tylko hosta dla bezpieczeństwa + + try: + client = AsyncIOMotorClient(mongo_url) + db = client[db_name] + collection = db["games"] + + total = await collection.count_documents({}) + with_cn = await collection.count_documents({ + "name_cn": {"$exists": True, "$ne": None, "$nin": ["", "null", "None"]} + }) + + print("\n" + "="*30) + print(f"DATABASE STATS") + print("="*30) + print(f"Total games: {total}") + print(f"With Chinese: {with_cn}") + + if total > 0: + percentage = (with_cn / total) * 100 + print(f"Coverage: {percentage:.2f}%") + print("="*30) + + client.close() + except Exception as e: + print(f"ERROR: Could not connect or query 
DB: {e}") + +if __name__ == "__main__": + asyncio.run(check_stats()) diff --git a/scripts/expand_keywords/__init__.py b/scripts/expand_keywords/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4bbe80f0e54cba9fb268bd4db88d7f34c0f4cfdc --- /dev/null +++ b/scripts/expand_keywords/__init__.py @@ -0,0 +1,8 @@ +""" +Keyword expansion toolkit using FastText. + +This package provides tools to: +1. Fetch reviews from Steam games +2. Train FastText models on review corpus +3. Expand existing keyword dictionary with semantically similar words +""" diff --git a/scripts/expand_keywords/__main__.py b/scripts/expand_keywords/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..8805391b4408f279790ef419f6aec473bf0968a2 --- /dev/null +++ b/scripts/expand_keywords/__main__.py @@ -0,0 +1,6 @@ +"""Allow running as: python -m scripts.expand_keywords""" + +from .main import main + +if __name__ == "__main__": + main() diff --git a/scripts/expand_keywords/config.py b/scripts/expand_keywords/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a10b1a00f22bc611e733c06868ea3f944b09c117 --- /dev/null +++ b/scripts/expand_keywords/config.py @@ -0,0 +1,106 @@ +""" +Configuration for keyword expansion: game list and settings. 
+""" + +from pathlib import Path + +# Base directories +BASE_DIR = Path(__file__).parent +DATA_DIR = BASE_DIR / "data" +REVIEWS_DIR = DATA_DIR / "reviews" +MODELS_DIR = DATA_DIR / "models" +OUTPUT_DIR = DATA_DIR / "output" + +# Ensure directories exist +for dir_path in [REVIEWS_DIR, MODELS_DIR, OUTPUT_DIR]: + dir_path.mkdir(parents=True, exist_ok=True) + +# Game list: (app_id, name, genre) +# Selected for variety across genres to get diverse vocabulary +GAMES: list[tuple[str, str, str]] = [ + # Action RPG + ("1245620", "Elden Ring", "action_rpg"), + ("374320", "Dark Souls III", "action_rpg"), + # CRPG + ("1086940", "Baldur's Gate 3", "crpg"), + ("435150", "Divinity: Original Sin 2", "crpg"), + ("1184370", "Pathfinder: Wrath of the Righteous", "crpg"), + # Open World RPG + ("292030", "The Witcher 3", "open_world_rpg"), + ("489830", "Skyrim Special Edition", "open_world_rpg"), + ("1091500", "Cyberpunk 2077", "open_world_rpg"), + # FPS + ("730", "Counter-Strike 2", "fps_competitive"), + ("782330", "DOOM Eternal", "fps_single"), + ("1237970", "Titanfall 2", "fps_single"), + # Survival + ("892970", "Valheim", "survival"), + ("252490", "Rust", "survival"), + ("264710", "Subnautica", "survival"), + ("242760", "The Forest", "survival"), + # Strategy + ("289070", "Civilization VI", "strategy"), + ("1142710", "Total War: Warhammer III", "strategy"), + ("1466860", "Age of Empires IV", "strategy"), + # Roguelike + ("1145360", "Hades", "roguelike"), + ("588650", "Dead Cells", "roguelike"), + ("646570", "Slay the Spire", "roguelike"), + # Metroidvania + ("367520", "Hollow Knight", "metroidvania"), + ("1057090", "Ori and the Will of the Wisps", "metroidvania"), + # Simulation + ("255710", "Cities: Skylines", "simulation"), + ("427520", "Factorio", "simulation"), + ("526870", "Satisfactory", "simulation"), + # Horror + ("1196590", "Resident Evil Village", "horror"), + ("739630", "Phasmophobia", "horror"), + ("381210", "Dead by Daylight", "horror"), + # Live Service + ("1085660", 
"Destiny 2", "live_service"), + ("230410", "Warframe", "live_service"), + ("238960", "Path of Exile", "live_service"), + # Racing + ("1551360", "Forza Horizon 5", "racing"), + # Story Driven + ("1174180", "Red Dead Redemption 2", "story_driven"), + # Casual + ("413150", "Stardew Valley", "casual"), + ("105600", "Terraria", "casual"), +] + +# Fetching settings +SETTINGS = { + # Review fetching + "reviews_per_game": 2700, # ~80k total across ~30 games + "batch_size": 100, # Steam API batch size + "sleep_between_batches": 1.5, # Seconds between API calls + "sleep_between_games": 5.0, # Longer pause between games + "min_review_length": 50, # Filter short reviews (chars) + "max_retries": 3, # Retry count on failure + "retry_base_delay": 10.0, # Base delay for exponential backoff + + # Preprocessing + "phrase_min_count": 10, # Min occurrences for phrase detection + "phrase_threshold": 10.0, # Phrase detection threshold + + # FastText training + "fasttext_vector_size": 150, + "fasttext_window": 5, + "fasttext_min_count": 5, + "fasttext_epochs": 10, + "fasttext_workers": 4, + + # Expansion + "similarity_threshold": 0.55, + "max_suggestions_per_seed": 20, + "min_frequency": 10, # Min word frequency in corpus + "auto_approve_threshold": 0.70, # Score threshold for auto-approval +} + +# Steam API endpoint +STEAM_REVIEWS_API = "https://store.steampowered.com/appreviews/{app_id}" + +# Steam language setting for reviews +STEAM_REVIEW_LANGUAGE = "schinese" # schinese, english, tchinese, etc. diff --git a/scripts/expand_keywords/expander.py b/scripts/expand_keywords/expander.py new file mode 100644 index 0000000000000000000000000000000000000000..d7ef26f94e55b0cf82aa2cb313722b53ce07a48f --- /dev/null +++ b/scripts/expand_keywords/expander.py @@ -0,0 +1,350 @@ +""" +Keyword dictionary expansion with exclusive category assignment. + +Key principle: Each word can only belong to ONE category. 
+This prevents cross-contamination where a word like "unplayable" +might be counted in both Bugs and Performance categories. + +Algorithm: +1. For each category: find candidate words similar to seed keywords +2. Collect ALL candidates in a global pool +3. Assign each word to the category with highest score +4. Filter by similarity threshold and frequency +""" + +import json +import logging +import math +from collections import defaultdict +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path + +from gensim.models import FastText + +from .config import OUTPUT_DIR, SETTINGS + +logger = logging.getLogger(__name__) + + +@dataclass +class Candidate: + """A candidate word for dictionary expansion.""" + + word: str + similarity: float + frequency: int + source_seeds: list[str] = field(default_factory=list) + + @property + def score(self) -> float: + """ + Combined score from similarity and frequency. + + Formula: 0.7 * similarity + 0.3 * normalized_log_frequency + Frequency factor normalized to ~0-1 range. + """ + freq_factor = math.log10(max(self.frequency, 1) + 1) / 5 + return self.similarity * 0.7 + freq_factor * 0.3 + + def to_dict(self) -> dict: + return { + "word": self.word.replace("_", " "), + "similarity": round(self.similarity, 3), + "frequency": self.frequency, + "score": round(self.score, 3), + "source_seeds": self.source_seeds, + } + + +class KeywordExpander: + """ + Expands keyword dictionary using trained FastText model. + + Uses exclusive category assignment to prevent words + appearing in multiple categories. + """ + + def __init__( + self, + model: FastText, + existing_keywords: dict[str, list[str]], + word_frequencies: dict[str, int], + similarity_threshold: float | None = None, + max_suggestions_per_seed: int | None = None, + min_frequency: int | None = None, + ): + """ + Initialize expander. 
+ + Args: + model: Trained FastText model + existing_keywords: Current TOPIC_KEYWORDS dictionary + word_frequencies: Word frequency counts from corpus + similarity_threshold: Minimum similarity for candidates + max_suggestions_per_seed: Max similar words per seed + min_frequency: Minimum corpus frequency + """ + self.model = model + self.existing = existing_keywords + self.word_freq = word_frequencies + + self.similarity_threshold = similarity_threshold or SETTINGS["similarity_threshold"] + self.max_suggestions = max_suggestions_per_seed or SETTINGS["max_suggestions_per_seed"] + self.min_frequency = min_frequency or SETTINGS["min_frequency"] + + # Build set of all existing words (normalized) + self.existing_words: set[str] = set() + for words in existing_keywords.values(): + for w in words: + self.existing_words.add(w.lower().replace(" ", "_")) + + logger.info(f"Expander initialized with {len(self.existing_words)} existing keywords") + + def _find_candidates_for_category( + self, + category: str, + seeds: list[str], + ) -> dict[str, Candidate]: + """ + Find candidate words for a single category. + + Returns dict[word -> Candidate] with best similarity per word. 
+ """ + candidates: dict[str, Candidate] = {} + + for seed in seeds: + # Normalize seed (e.g., "frame rate" -> "frame_rate") + seed_normalized = seed.lower().replace(" ", "_") + + # Skip if seed not in vocabulary + if seed_normalized not in self.model.wv: + continue + + # Get similar words + try: + similar = self.model.wv.most_similar( + seed_normalized, + topn=self.max_suggestions, + ) + except KeyError: + continue + + for word, similarity in similar: + # Skip existing words + if word in self.existing_words: + continue + + # Skip below threshold + if similarity < self.similarity_threshold: + continue + + # Check frequency + freq = self.word_freq.get(word, 0) + if freq < self.min_frequency: + continue + + # Update or add candidate + if word in candidates: + # Keep higher similarity + if similarity > candidates[word].similarity: + candidates[word].similarity = similarity + candidates[word].source_seeds.append(seed) + else: + candidates[word] = Candidate( + word=word, + similarity=similarity, + frequency=freq, + source_seeds=[seed], + ) + + return candidates + + def expand_all_exclusive(self) -> dict[str, list[Candidate]]: + """ + Expand all categories with exclusive assignment. + + Each word is assigned only to the category where it has + the highest score. + + Returns: + Dict mapping category -> list of Candidates (sorted by score) + """ + logger.info("Starting exclusive expansion...") + + # Step 1: Collect candidates from all categories + # Format: word -> [(category, Candidate), ...] 
+ all_candidates: dict[str, list[tuple[str, Candidate]]] = defaultdict(list) + + for category, seeds in self.existing.items(): + category_candidates = self._find_candidates_for_category(category, seeds) + for word, candidate in category_candidates.items(): + all_candidates[word].append((category, candidate)) + + logger.info(f"[{category}] Found {len(category_candidates)} raw candidates") + + # Step 2: Assign each word to category with highest score + final_assignments: dict[str, list[Candidate]] = defaultdict(list) + + for word, category_candidates in all_candidates.items(): + # Find category with highest score + best_category, best_candidate = max( + category_candidates, + key=lambda x: x[1].score, + ) + final_assignments[best_category].append(best_candidate) + + # Step 3: Sort candidates in each category by score + for category in final_assignments: + final_assignments[category].sort(key=lambda c: c.score, reverse=True) + + # Log results + total = sum(len(cands) for cands in final_assignments.values()) + logger.info(f"Exclusive assignment complete: {total} total candidates") + + for category, cands in sorted(final_assignments.items()): + logger.info(f" {category}: {len(cands)} candidates") + + return dict(final_assignments) + + def export_candidates( + self, + path: Path | str | None = None, + include_threshold_in_name: bool = False, + ) -> Path: + """ + Export candidates to JSON for manual review. 
+ + Args: + path: Output path (default: output/candidates.json) + include_threshold_in_name: Add threshold to filename for comparison + + Returns: + Path to exported file + """ + if path: + path = Path(path) + elif include_threshold_in_name: + path = OUTPUT_DIR / f"candidates_t{self.similarity_threshold:.2f}.json" + else: + path = OUTPUT_DIR / "candidates.json" + + results = self.expand_all_exclusive() + + export_data = { + "metadata": { + "generated_at": datetime.now().isoformat(), + "similarity_threshold": self.similarity_threshold, + "min_frequency": self.min_frequency, + "total_candidates": sum(len(c) for c in results.values()), + }, + "categories": {}, + } + + for category, candidates in sorted(results.items()): + export_data["categories"][category] = [c.to_dict() for c in candidates] + + with open(path, "w", encoding="utf-8") as f: + json.dump(export_data, f, indent=2, ensure_ascii=False) + + logger.info(f"Exported candidates to {path}") + return path + + def generate_keywords_py( + self, + output_path: Path | str | None = None, + auto_approve_threshold: float | None = None, + ) -> Path: + """ + Generate new keywords.py with expanded dictionary. + + Words with score >= auto_approve_threshold are added directly. + Words below threshold are added as comments for manual review. 
+ + Args: + output_path: Output path (default: output/keywords_expanded.py) + auto_approve_threshold: Score threshold for auto-approval + + Returns: + Path to generated file + """ + output_path = Path(output_path) if output_path else OUTPUT_DIR / "keywords_expanded.py" + auto_approve = auto_approve_threshold or SETTINGS["auto_approve_threshold"] + + results = self.expand_all_exclusive() + + lines = [ + '"""', + "Expanded keyword dictionary for game review topic detection.", + f"Generated: {datetime.now().isoformat()}", + f"Auto-approve threshold: {auto_approve}", + '"""', + "", + "TOPIC_KEYWORDS = {", + ] + + for category, seeds in self.existing.items(): + lines.append(f' "{category}": [') + + # Existing keywords + lines.append(" # Existing") + for seed in seeds: + lines.append(f' "{seed}",') + + # New candidates + candidates = results.get(category, []) + if candidates: + # Auto-approved + auto_approved = [c for c in candidates if c.score >= auto_approve] + if auto_approved: + lines.append(f" # NEW (auto-approved, score >= {auto_approve})") + for c in auto_approved: + word_display = c.word.replace("_", " ") + lines.append(f' "{word_display}", # score={c.score:.2f}') + + # Candidates requiring review + review_needed = [c for c in candidates if c.score < auto_approve] + if review_needed: + lines.append(f" # CANDIDATES (score < {auto_approve}, require review)") + for c in review_needed: + word_display = c.word.replace("_", " ") + lines.append(f' # "{word_display}", # score={c.score:.2f}') + + lines.append(" ],") + lines.append("") + + lines.append("}") + lines.append("") + + with open(output_path, "w", encoding="utf-8") as f: + f.write("\n".join(lines)) + + logger.info(f"Generated keywords file at {output_path}") + return output_path + + def get_expansion_stats(self) -> dict: + """Get statistics about the expansion.""" + results = self.expand_all_exclusive() + auto_threshold = SETTINGS["auto_approve_threshold"] + + stats = { + "total_candidates": 0, + "auto_approved": 
0, + "needs_review": 0, + "by_category": {}, + } + + for category, candidates in results.items(): + auto = sum(1 for c in candidates if c.score >= auto_threshold) + review = len(candidates) - auto + + stats["by_category"][category] = { + "total": len(candidates), + "auto_approved": auto, + "needs_review": review, + } + stats["total_candidates"] += len(candidates) + stats["auto_approved"] += auto + stats["needs_review"] += review + + return stats diff --git a/scripts/expand_keywords/fetcher.py b/scripts/expand_keywords/fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..74b3915f62c036bb5316445a6dba7f7fa1cc2030 --- /dev/null +++ b/scripts/expand_keywords/fetcher.py @@ -0,0 +1,355 @@ +""" +Review fetcher with rate limiting and progress tracking. + +Downloads reviews from Steam API with: +- Cursor-based pagination +- Sleep between requests to respect rate limits +- Progress persistence (JSONL per game + progress.json) +- Resume capability +""" + +import asyncio +import json +import logging +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any + +import httpx + +from .config import GAMES, REVIEWS_DIR, SETTINGS, STEAM_REVIEW_LANGUAGE, STEAM_REVIEWS_API + +logger = logging.getLogger(__name__) + + +@dataclass +class FetchProgress: + """Progress tracking for a single game.""" + app_id: str + name: str + target: int + fetched: int = 0 + cursor: str = "*" + completed: bool = False + last_updated: str = "" + + def to_dict(self) -> dict: + return { + "app_id": self.app_id, + "name": self.name, + "target": self.target, + "fetched": self.fetched, + "cursor": self.cursor, + "completed": self.completed, + "last_updated": self.last_updated, + } + + @classmethod + def from_dict(cls, data: dict) -> "FetchProgress": + return cls( + app_id=data["app_id"], + name=data["name"], + target=data["target"], + fetched=data.get("fetched", 0), + cursor=data.get("cursor", "*"), + 
completed=data.get("completed", False), + last_updated=data.get("last_updated", ""), + ) + + +@dataclass +class ReviewFetcher: + """ + Fetches reviews from Steam with rate limiting. + + Features: + - Async HTTP client with timeout + - Exponential backoff on rate limiting + - Progress persistence (resume capability) + - JSONL output per game + """ + + timeout: float = 30.0 + progress_file: Path = field(default_factory=lambda: REVIEWS_DIR / "progress.json") + + def __post_init__(self): + self._progress: dict[str, FetchProgress] = {} + self._load_progress() + + def _load_progress(self) -> None: + """Load progress from file if exists.""" + if self.progress_file.exists(): + try: + with open(self.progress_file, "r", encoding="utf-8") as f: + data = json.load(f) + for app_id, progress_data in data.items(): + self._progress[app_id] = FetchProgress.from_dict(progress_data) + logger.info(f"Loaded progress for {len(self._progress)} games") + except (json.JSONDecodeError, KeyError) as e: + logger.warning(f"Failed to load progress: {e}") + self._progress = {} + + def _save_progress(self) -> None: + """Save progress to file.""" + data = {app_id: prog.to_dict() for app_id, prog in self._progress.items()} + with open(self.progress_file, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2) + + def get_progress(self) -> dict[str, dict]: + """Get current progress for all games.""" + return {app_id: prog.to_dict() for app_id, prog in self._progress.items()} + + def _get_reviews_file(self, app_id: str) -> Path: + """Get path to reviews JSONL file for a game.""" + return REVIEWS_DIR / f"{app_id}.jsonl" + + def _append_reviews(self, app_id: str, reviews: list[str]) -> None: + """Append reviews to JSONL file.""" + reviews_file = self._get_reviews_file(app_id) + with open(reviews_file, "a", encoding="utf-8") as f: + for review in reviews: + f.write(json.dumps({"text": review}, ensure_ascii=False) + "\n") + + def load_reviews(self, app_id: str) -> list[str]: + """Load reviews from 
JSONL file.""" + reviews_file = self._get_reviews_file(app_id) + if not reviews_file.exists(): + return [] + + reviews = [] + with open(reviews_file, "r", encoding="utf-8") as f: + for line in f: + try: + data = json.loads(line.strip()) + reviews.append(data["text"]) + except (json.JSONDecodeError, KeyError): + continue + return reviews + + def load_all_reviews(self) -> list[str]: + """Load all reviews from all downloaded games.""" + all_reviews = [] + for app_id, _, _ in GAMES: + reviews = self.load_reviews(app_id) + all_reviews.extend(reviews) + logger.info(f"Loaded {len(all_reviews)} total reviews") + return all_reviews + + async def _fetch_batch( + self, + client: httpx.AsyncClient, + app_id: str, + cursor: str, + batch_size: int, + ) -> tuple[list[str], str | None]: + """Fetch a single batch of reviews.""" + url = STEAM_REVIEWS_API.format(app_id=app_id) + params: dict[str, Any] = { + "json": "1", + "filter": "recent", # "recent" has more reviews available than "all" + "review_type": "all", + "language": STEAM_REVIEW_LANGUAGE, + "num_per_page": str(batch_size), + "cursor": cursor, + "purchase_type": "all", + } + + try: + response = await client.get(url, params=params) + response.raise_for_status() + data = response.json() + except httpx.HTTPError as e: + logger.error(f"HTTP error fetching reviews for {app_id}: {e}") + return [], None + + if not data.get("success"): + logger.warning(f"API returned success=false for {app_id}") + return [], None + + reviews_data = data.get("reviews", []) + min_length = SETTINGS["min_review_length"] + + reviews = [ + review.get("review", "").strip() + for review in reviews_data + if review.get("review") and len(review.get("review", "").strip()) >= min_length + ] + + new_cursor = data.get("cursor") + return reviews, new_cursor + + async def _fetch_with_backoff( + self, + client: httpx.AsyncClient, + app_id: str, + cursor: str, + batch_size: int, + ) -> tuple[list[str], str | None]: + """Fetch with exponential backoff on failure.""" 
+ max_retries = SETTINGS["max_retries"] + base_delay = SETTINGS["retry_base_delay"] + + for attempt in range(max_retries): + reviews, new_cursor = await self._fetch_batch(client, app_id, cursor, batch_size) + + if reviews or new_cursor is None: + return reviews, new_cursor + + # Empty reviews with cursor - might be rate limited + delay = base_delay * (2 ** attempt) + logger.warning(f"Empty response, retrying in {delay}s (attempt {attempt + 1}/{max_retries})") + await asyncio.sleep(delay) + + return [], None + + async def fetch_game_reviews( + self, + app_id: str, + name: str, + target: int, + resume: bool = True, + ) -> int: + """ + Fetch reviews for a single game. + + Returns number of reviews fetched. + """ + # Check if already completed + if resume and app_id in self._progress: + progress = self._progress[app_id] + if progress.completed: + logger.info(f"[{name}] Already completed ({progress.fetched} reviews)") + return progress.fetched + cursor = progress.cursor + fetched = progress.fetched + else: + # Start fresh - clear existing file + reviews_file = self._get_reviews_file(app_id) + if reviews_file.exists(): + reviews_file.unlink() + cursor = "*" + fetched = 0 + + # Initialize progress + self._progress[app_id] = FetchProgress( + app_id=app_id, + name=name, + target=target, + fetched=fetched, + cursor=cursor, + ) + + batch_size = SETTINGS["batch_size"] + sleep_between = SETTINGS["sleep_between_batches"] + seen_cursors: set[str] = set() + + logger.info(f"[{name}] Starting fetch: target={target}, already={fetched}") + + async with httpx.AsyncClient(timeout=self.timeout) as client: + while fetched < target: + reviews, new_cursor = await self._fetch_with_backoff( + client, app_id, cursor, batch_size + ) + + if not reviews: + logger.warning(f"[{name}] No more reviews available") + break + + if new_cursor and new_cursor in seen_cursors: + logger.warning(f"[{name}] Cursor loop detected") + break + + if new_cursor: + seen_cursors.add(new_cursor) + + # Save reviews + 
self._append_reviews(app_id, reviews) + fetched += len(reviews) + + # Update progress + self._progress[app_id].fetched = fetched + self._progress[app_id].cursor = new_cursor or cursor + self._progress[app_id].last_updated = datetime.now().isoformat() + self._save_progress() + + logger.info(f"[{name}] Fetched {fetched}/{target} reviews") + + if not new_cursor or new_cursor == "*": + break + + cursor = new_cursor + await asyncio.sleep(sleep_between) + + # Mark as completed + self._progress[app_id].completed = True + self._progress[app_id].last_updated = datetime.now().isoformat() + self._save_progress() + + logger.info(f"[{name}] Completed with {fetched} reviews") + return fetched + + async def fetch_all( + self, + resume: bool = True, + limit_games: int | None = None, + ) -> dict[str, int]: + """ + Fetch reviews for all configured games. + + Args: + resume: Continue from previous progress + limit_games: Limit number of games (for testing) + + Returns: + Dict mapping app_id to number of reviews fetched + """ + results: dict[str, int] = {} + sleep_between_games = SETTINGS["sleep_between_games"] + reviews_per_game = SETTINGS["reviews_per_game"] + + games = GAMES[:limit_games] if limit_games else GAMES + + for i, (app_id, name, genre) in enumerate(games): + logger.info(f"Processing game {i + 1}/{len(games)}: {name} ({genre})") + + count = await self.fetch_game_reviews( + app_id=app_id, + name=name, + target=reviews_per_game, + resume=resume, + ) + results[app_id] = count + + # Sleep between games (except for last one) + if i < len(games) - 1: + logger.info(f"Sleeping {sleep_between_games}s before next game...") + await asyncio.sleep(sleep_between_games) + + total = sum(results.values()) + logger.info(f"Total reviews fetched: {total}") + return results + + def get_stats(self) -> dict: + """Get statistics about fetched reviews.""" + stats = { + "games_total": len(GAMES), + "games_completed": 0, + "games_in_progress": 0, + "reviews_total": 0, + "reviews_per_game": {}, + } 
+ + for app_id, name, _ in GAMES: + reviews_file = self._get_reviews_file(app_id) + if reviews_file.exists(): + count = sum(1 for _ in open(reviews_file, "r", encoding="utf-8")) + stats["reviews_per_game"][name] = count + stats["reviews_total"] += count + + if app_id in self._progress and self._progress[app_id].completed: + stats["games_completed"] += 1 + else: + stats["games_in_progress"] += 1 + + return stats diff --git a/scripts/expand_keywords/keywords_base.py b/scripts/expand_keywords/keywords_base.py new file mode 100644 index 0000000000000000000000000000000000000000..5bfd625a7967f7b5d4ee62292101fbed9435d12b --- /dev/null +++ b/scripts/expand_keywords/keywords_base.py @@ -0,0 +1,324 @@ +""" +Słowa kluczowe do wykrywania tematów w recenzjach gier. +Używane w podejściu hybrydowym (Keywords + ML Sentiment). + +Kategorie zostały dobrane na podstawie najczęstszych tematów +poruszanych w recenzjach gier na platformie Steam. +""" + +TOPIC_KEYWORDS = { + # ========================================================================= + # CORE GAMEPLAY + # ========================================================================= + "Gameplay": [ + # Podstawowe + "gameplay", "mechanics", "game mechanics", "core gameplay", "game loop", + "combat", "combat system", "fighting", "battle", "battles", + # Progresja + "progression", "leveling", "level up", "experience", "xp", "grind", "grinding", + "skill tree", "talent tree", "unlock", "unlocks", "unlockables", + # Misje i aktywności + "quests", "quest", "missions", "mission", "objectives", "side quests", + "main quest", "fetch quests", "puzzles", "puzzle", "exploration", + # Design + "game design", "level design", "map design", "pacing", + "balancing", "balanced", "unbalanced", "overpowered", "underpowered", "meta", + # Wrogowie + "enemies", "enemy", "bosses", "boss fights", "boss battle", "mobs", + # Ruch i umiejętności + "movement", "traversal", "parkour", "skills", "abilities", "powers", + "spells", "weapons", "weapon 
variety", "builds", "build variety", + ], + + "Fun": [ + # Pozytywne + "fun", "enjoyable", "entertaining", "addictive", "addicting", "engaging", + "exciting", "thrilling", "satisfying", "rewarding", "immersive", + "masterpiece", "gem", "hidden gem", "must play", "must buy", + # Negatywne + "boring", "tedious", "repetitive", "monotonous", "dull", "bland", + "frustrating", "annoying", "unfun", "not fun", "waste of time", + "disappointing", "letdown", "overhyped", "overrated", "underrated", + ], + + "Difficulty": [ + # Poziomy trudności + "difficulty", "easy", "normal", "hard", "very hard", "nightmare", + "easy mode", "hard mode", "difficulty settings", "difficulty options", + # Opisy trudności + "challenging", "too easy", "too hard", "too difficult", "punishing", + "forgiving", "casual", "hardcore", "souls-like", "soulslike", + "dark souls", "die a lot", "dying", "deaths", "unfair", "cheap deaths", + # Krzywa trudności + "learning curve", "steep learning curve", "skill ceiling", "skill floor", + "newcomer friendly", "beginner friendly", "accessible", + ], + + # ========================================================================= + # TECHNICAL + # ========================================================================= + "Performance": [ + # Wydajność + "performance", "optimize", "optimized", "optimization", "well optimized", + "poorly optimized", "unoptimized", "runs well", "runs smooth", "runs poorly", + # FPS + "fps", "framerate", "frame rate", "frames per second", "60fps", "30fps", + "fps drops", "frame drops", "drops", "dips", "stuttering", "stutter", + "hitching", "micro stutter", + # Zasoby + "cpu", "gpu", "ram", "vram", "memory", "memory leak", "memory usage", + # Ładowanie + "loading", "loading times", "load times", "loading screens", "long loading", + # Stabilność + "smooth", "stable", "unstable", "lag", "lagging", "input lag", + ], + + "Bugs": [ + # Ogólne + "bugs", "bug", "buggy", "glitch", "glitches", "glitchy", + "broken", "issues", "problems", 
"jank", "janky", + # Crashe + "crash", "crashes", "crashing", "crashed", "freeze", "freezing", "frozen", + "ctd", "crash to desktop", "black screen", "stuck", + # Konkretne bugi + "save bug", "save corruption", "corrupted save", "lost progress", + "clipping", "falling through", "invisible", "t-pose", + "softlock", "soft lock", "softlocked", "game breaking", + # Stan gry + "unplayable", "unfinished", "early access", "beta", "alpha", + "needs polish", "polished", "rough edges", + ], + + # ========================================================================= + # AUDIO-VISUAL + # ========================================================================= + "Graphics": [ + # Ogólne + "graphics", "visuals", "visual", "graphic", "graphically", + "looks", "look", "looking", "looks good", "looks bad", "looks great", + # Styl + "art style", "art direction", "artstyle", "aesthetic", "stylized", + "realistic", "photorealistic", "cartoony", "anime", "pixel art", "retro", + # Techniczne + "textures", "texture", "models", "model", "animations", "animation", + "lighting", "lights", "shadows", "shadow", "reflections", "ray tracing", + "rendering", "shaders", "particle effects", "particles", + # Rozdzielczość + "resolution", "4k", "1440p", "1080p", "720p", "upscaling", "dlss", "fsr", + # Środowisko + "environments", "environment", "scenery", "landscapes", "world design", + "level of detail", "lod", "draw distance", "pop in", "pop-in", + # Oceny + "beautiful", "gorgeous", "stunning", "breathtaking", "pretty", + "ugly", "dated", "outdated", "aged", "old looking", + ], + + "Sound": [ + # Muzyka + "music", "soundtrack", "ost", "score", "composer", "tracks", + "ambient", "ambient music", "battle music", "menu music", + # Głos + "voice", "voice acting", "voice actors", "voice over", "vo", + "voice lines", "dialogue", "dubbed", "dubbing", "lip sync", + # Efekty dźwiękowe + "sound", "sounds", "audio", "sfx", "sound effects", "sound design", + "footsteps", "gunshots", "explosions", + # 
Jakość + "atmosphere", "atmospheric", "immersive audio", "spatial audio", + "surround", "audio quality", "sound quality", + # Problemy + "audio bug", "audio glitch", "no sound", "sound cutting", "loud", "quiet", + ], + + # ========================================================================= + # CONTENT & VALUE + # ========================================================================= + "Content": [ + # Długość + "hours", "hour", "length", "long", "short", "playtime", "play time", + "how long", "game length", "campaign length", + # Ilość contentu + "content", "lots of content", "lack of content", "thin", "meaty", + "activities", "things to do", "side content", "endgame", "end game", + "post game", "new game plus", "ng+", + # Replayability + "replay", "replay value", "replayability", "replayable", + "multiple endings", "different endings", "choices matter", + "multiple playthroughs", "completionist", "100%", "100 percent", + ], + + "Monetization": [ + # Cena (ex-Price) + "price", "pricing", "cost", "costs", "priced", + "expensive", "overpriced", "cheap", "affordable", + "value", "worth", "worth it", "not worth", "bang for buck", + "value for money", "money well spent", + "sale", "discount", "on sale", "full price", "wait for sale", + "refund", "refunded", "steam sale", + "aaa price", "indie price", "budget", "premium", + "free to play", "f2p", "free", + # MTX (ex-Microtransactions) + "microtransactions", "microtransaction", "mtx", "monetization", + "in app purchases", "iap", "real money", "cash shop", "item shop", + "pay to win", "p2w", "pay2win", "paywall", "pay wall", + "pay to progress", "paying", "whale", "whales", + "loot box", "loot boxes", "lootbox", "gacha", "gambling", + "rng", "random", "chance", + "battle pass", "season pass", "battlepass", "seasons", + "premium currency", "gems", "coins", "points", + "cosmetics", "cosmetic", "skins", "skin", "outfits", + "dlc", "expansion", "expansions", "dlcs", + "cash grab", "money grab", "greedy", "predatory", 
"scam", + ], + + # ========================================================================= + # MULTIPLAYER & COMMUNITY + # ========================================================================= + "Multiplayer": [ + # Tryby + "multiplayer", "multi-player", "online", "offline", + "co-op", "coop", "co op", "cooperative", + "pvp", "pve", "pvpve", "versus", + "singleplayer", "single player", "solo", "solo play", + # Matchmaking + "matchmaking", "queue", "queue times", "waiting", + "servers", "server", "dedicated servers", "p2p", "peer to peer", + "ping", "latency", "connection", "disconnects", "desync", + # Gracze + "players", "teammates", "team", "squad", "party", + "randoms", "random teammates", "lobbies", "lobby", + # Problemy + "cheaters", "cheater", "hackers", "hacker", "hacking", "cheating", + "aimbots", "wallhacks", "anticheat", "anti cheat", + "toxic", "toxicity", "griefing", "griefers", + ], + + "Community": [ + # Społeczność + "community", "playerbase", "player base", "players", + "active", "dead game", "dead", "alive", "population", + # Modding + "mods", "mod", "modding", "mod support", "workshop", + "steam workshop", "nexus", "modders", "modded", + "custom content", "user generated", + # Deweloperzy (interakcja) + "devs", "developers", "dev team", "community manager", + "communication", "transparent", "listening", + # Społeczność graczy + "helpful", "friendly", "toxic community", "welcoming", + "guides", "wiki", "tutorials", "newbie friendly", + ], + + # ========================================================================= + # CONTROLS & UI + # ========================================================================= + "Controls": [ + # Sterowanie + "controls", "control", "controlling", "control scheme", + "keybinds", "keybind", "key bindings", "rebind", "remapping", + # Urządzenia + "keyboard", "mouse", "kb+m", "kbm", + "controller", "gamepad", "joystick", "controller support", + "xbox controller", "ps controller", "dualsense", + # Responsywność + 
"responsive", "unresponsive", "clunky", "sluggish", "tight", + "smooth controls", "floaty", "heavy", "weighty", + # Celowanie + "aiming", "aim", "aim assist", "auto aim", + "camera", "camera controls", "camera angle", + ], + + "UI": [ + # Interface + "ui", "user interface", "interface", "hud", + "menu", "menus", "main menu", "pause menu", + "ux", "user experience", + # Design UI + "clean ui", "cluttered", "minimalist", "intuitive", + "confusing", "overwhelming", "readable", "readable text", + # Elementy + "minimap", "map", "inventory", "crafting menu", + "skill menu", "quest log", "journal", + # Problemy + "font size", "text size", "too small", "can't read", + "navigation", "navigating", + ], + + # ========================================================================= + # STORY & NARRATIVE + # ========================================================================= + "Story": [ + # Narracja + "story", "storyline", "plot", "narrative", "storytelling", + "writing", "written", "well written", "poorly written", + # Elementy fabularne + "characters", "character", "protagonist", "main character", + "villain", "antagonist", "npcs", "npc", "companions", + "dialogue", "dialogues", "conversations", "choices", + # Świat + "lore", "world building", "worldbuilding", "universe", + "setting", "backstory", "history", + # Emocje + "emotional", "emotions", "feels", "touching", "heartwarming", + "dark", "mature", "gritty", "lighthearted", + # Zakończenie + "ending", "endings", "conclusion", "finale", + "twist", "plot twist", "predictable", "unpredictable", + # Cutscenki + "cutscenes", "cutscene", "cinematics", "cinematic", + "script", "scripted", "linear", "open ended", + ], + + # ========================================================================= + # DEVELOPER SUPPORT + # ========================================================================= + "Support": [ + # Aktualizacje + "updates", "update", "patch", "patches", "patched", + "hotfix", "hotfixes", "bug fixes", 
"fixed", + # Stan rozwoju + "abandoned", "dead", "no updates", "still updating", + "active development", "roadmap", "planned", + "early access", "full release", "1.0", "launch", + # Deweloperzy + "developer", "developers", "dev", "devs", "studio", + "indie dev", "indie developer", "aaa developer", + # Wsparcie + "support", "customer support", "response", "feedback", + "listening to feedback", "ignoring", "communication", + # Porty + "port", "ported", "console port", "pc port", "lazy port", + ], + + # ========================================================================= + # PREDICTION & INTENT (NEW!) + # ========================================================================= + "Retention": [ + # Pozytywne (High Retention) + "addictive", "addicted", "can't stop playing", "hooked", "drug", + "thousands of hours", "hundreds of hours", "worth it", "worth every penny", + "buy it", "must buy", "highly recommend", "masterpiece", "goty", + "game of the year", "10/10", "best game", "favorite game", + # Negatywne (Churn) + "refund", "refunded", "refunding", "uninstalled", "uninstall", "delete", + "waste of money", "waste of time", "don't buy", "do not buy", + "regret", "regretting", "boring", "bored", "sleep", "sleepy", + "wait for sale", "not worth it", "cash grab", "scam", + ], +} + +# ============================================================================= +# WYKLUCZENIA (Context-aware filtering) +# ============================================================================= +# Słowa wykluczające - jeśli występują w pobliżu słowa kluczowego, +# ignorujemy to słowo kluczowe w danym kontekście. 
+# Format: "słowo_kluczowe": ["słowo_obok", "inne_słowo"] + +EXCLUSIONS = { + # "fps" jako gatunek (FPS shooter) vs wydajność (60 fps) + "fps": ["genre", "shooter", "first person", "fps game", "fps genre"], + # "free" jako darmowy vs "free to play" model biznesowy + "free": ["drm free", "bug free", "free roam", "free world"], + # "controls" jako sterowanie vs "kontroluje" w narracji + "control": ["mind control", "control the world", "control freak"], +} diff --git a/scripts/expand_keywords/main.py b/scripts/expand_keywords/main.py new file mode 100644 index 0000000000000000000000000000000000000000..279594fe8b82b07686cd9f62aa207088445e4b21 --- /dev/null +++ b/scripts/expand_keywords/main.py @@ -0,0 +1,447 @@ +""" +CLI for keyword expansion toolkit. + +Usage: + # Fetch reviews from Steam (can be resumed) + python -m scripts.expand_keywords fetch --resume + + # Train FastText model + python -m scripts.expand_keywords train + + # Expand dictionary and export candidates + python -m scripts.expand_keywords expand --threshold 0.55 + + # Generate new keywords.py + python -m scripts.expand_keywords generate --auto-approve 0.7 + + # Run all steps + python -m scripts.expand_keywords run --resume + + # Show statistics + python -m scripts.expand_keywords stats +""" + +import argparse +import asyncio +import logging +import sys +from pathlib import Path + +# Add project root to path for imports +PROJECT_ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(PROJECT_ROOT)) + +from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS +from scripts.expand_keywords.expander import KeywordExpander +from scripts.expand_keywords.fetcher import ReviewFetcher +from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords +from scripts.expand_keywords.trainer import FastTextTrainer + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + 
datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger(__name__) + + +def load_existing_keywords() -> dict[str, list[str]]: + """Load existing TOPIC_KEYWORDS from keywords.py.""" + keywords_path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py" + + if not keywords_path.exists(): + raise FileNotFoundError(f"Keywords file not found: {keywords_path}") + + # Execute keywords.py to get TOPIC_KEYWORDS + namespace: dict = {} + exec(keywords_path.read_text(encoding="utf-8"), namespace) + + keywords = namespace.get("TOPIC_KEYWORDS") + if not keywords: + raise ValueError("TOPIC_KEYWORDS not found in keywords.py") + + return keywords + + +async def cmd_fetch(args: argparse.Namespace) -> None: + """Fetch reviews from Steam.""" + logger.info("Starting review fetch...") + + fetcher = ReviewFetcher() + + # Show current progress + stats = fetcher.get_stats() + logger.info(f"Current stats: {stats['reviews_total']} reviews from {stats['games_completed']} games") + + await fetcher.fetch_all( + resume=args.resume, + limit_games=args.limit, + ) + + # Show final stats + stats = fetcher.get_stats() + logger.info(f"Final stats: {stats['reviews_total']} reviews from {stats['games_completed']} games") + + +def cmd_train(args: argparse.Namespace) -> None: + """Train FastText model.""" + logger.info("Starting model training...") + + # Load existing keywords for frozen n-grams + keywords = load_existing_keywords() + existing_ngrams = extract_ngrams_from_keywords(keywords) + logger.info(f"Loaded {len(existing_ngrams)} n-grams from existing dictionary") + + # Load reviews + fetcher = ReviewFetcher() + reviews = fetcher.load_all_reviews() + + if not reviews: + logger.error("No reviews found. 
Run 'fetch' first.") + return + + logger.info(f"Loaded {len(reviews)} reviews") + + # Preprocess + preprocessor = Preprocessor(existing_ngrams=existing_ngrams) + sentences = preprocessor.preprocess_corpus(reviews) + preprocessor.save() + + # Train + trainer = FastTextTrainer() + trainer.train(sentences) + trainer.save() + + logger.info("Training complete!") + + +def cmd_expand(args: argparse.Namespace) -> None: + """Expand dictionary and export candidates.""" + logger.info("Starting dictionary expansion...") + + # Load components + keywords = load_existing_keywords() + + preprocessor = Preprocessor() + try: + preprocessor.load() + except FileNotFoundError: + logger.error("Preprocessor not found. Run 'train' first.") + return + + trainer = FastTextTrainer() + try: + model = trainer.load() + except FileNotFoundError: + logger.error("Model not found. Run 'train' first.") + return + + # Expand + expander = KeywordExpander( + model=model, + existing_keywords=keywords, + word_frequencies=preprocessor.get_word_frequencies(), + similarity_threshold=args.threshold, + ) + + # Export candidates (with threshold in filename if requested) + expander.export_candidates(include_threshold_in_name=args.compare) + + # Show stats + stats = expander.get_expansion_stats() + logger.info(f"Expansion complete: {stats['total_candidates']} candidates") + logger.info(f" Auto-approved: {stats['auto_approved']}") + logger.info(f" Needs review: {stats['needs_review']}") + + +def cmd_compare(args: argparse.Namespace) -> None: + """Compare multiple thresholds.""" + logger.info("Comparing thresholds...") + + # Load components + keywords = load_existing_keywords() + + preprocessor = Preprocessor() + try: + preprocessor.load() + except FileNotFoundError: + logger.error("Preprocessor not found. Run 'train' first.") + return + + trainer = FastTextTrainer() + try: + model = trainer.load() + except FileNotFoundError: + logger.error("Model not found. 
Run 'train' first.") + return + + thresholds = args.thresholds + results = [] + + for threshold in thresholds: + expander = KeywordExpander( + model=model, + existing_keywords=keywords, + word_frequencies=preprocessor.get_word_frequencies(), + similarity_threshold=threshold, + ) + + # Export with threshold in name + expander.export_candidates(include_threshold_in_name=True) + + stats = expander.get_expansion_stats() + results.append((threshold, stats)) + + # Print comparison table + print("\n" + "=" * 60) + print("THRESHOLD COMPARISON") + print("=" * 60) + print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}") + print("-" * 60) + + for threshold, stats in results: + print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}") + + print("-" * 60) + print(f"\nOutput files saved to: {OUTPUT_DIR}/") + print("Compare candidates_t*.json to see differences.") + + +def cmd_generate(args: argparse.Namespace) -> None: + """Generate new keywords.py.""" + logger.info("Generating expanded keywords.py...") + + # Load components + keywords = load_existing_keywords() + + preprocessor = Preprocessor() + try: + preprocessor.load() + except FileNotFoundError: + logger.error("Preprocessor not found. Run 'train' first.") + return + + trainer = FastTextTrainer() + try: + model = trainer.load() + except FileNotFoundError: + logger.error("Model not found. 
Run 'train' first.") + return + + # Generate + expander = KeywordExpander( + model=model, + existing_keywords=keywords, + word_frequencies=preprocessor.get_word_frequencies(), + ) + + output_path = expander.generate_keywords_py( + auto_approve_threshold=args.auto_approve, + ) + + logger.info(f"Generated: {output_path}") + + +async def cmd_run(args: argparse.Namespace) -> None: + """Run all steps: fetch, train, expand, generate.""" + logger.info("Running complete pipeline...") + + # Step 1: Fetch + await cmd_fetch(args) + + # Step 2: Train + cmd_train(args) + + # Step 3: Expand + cmd_expand(args) + + # Step 4: Generate + cmd_generate(args) + + logger.info("Pipeline complete!") + + +def cmd_stats(args: argparse.Namespace) -> None: + """Show statistics.""" + # Fetcher stats + fetcher = ReviewFetcher() + fetch_stats = fetcher.get_stats() + + print("\n=== Fetch Statistics ===") + print(f"Games configured: {fetch_stats['games_total']}") + print(f"Games completed: {fetch_stats['games_completed']}") + print(f"Games in progress: {fetch_stats['games_in_progress']}") + print(f"Total reviews: {fetch_stats['reviews_total']}") + + if fetch_stats["reviews_per_game"]: + print("\nReviews per game:") + for name, count in sorted(fetch_stats["reviews_per_game"].items()): + print(f" {name}: {count}") + + # Model stats + model_path = MODELS_DIR / "fasttext.model" + if model_path.exists(): + print("\n=== Model Statistics ===") + trainer = FastTextTrainer() + model = trainer.load() + print(f"Vocabulary size: {len(model.wv)}") + + # Expansion stats (if available) + candidates_path = OUTPUT_DIR / "candidates.json" + if candidates_path.exists(): + import json + with open(candidates_path, "r", encoding="utf-8") as f: + data = json.load(f) + print("\n=== Expansion Statistics ===") + print(f"Total candidates: {data['metadata']['total_candidates']}") + for cat, cands in data["categories"].items(): + print(f" {cat}: {len(cands)}") + + +def cmd_similar(args: argparse.Namespace) -> None: + """Find 
def main():
    """CLI entry point: build the argument parser and dispatch to a command."""
    parser = argparse.ArgumentParser(
        description="Keyword expansion toolkit using FastText",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="command", help="Available commands")

    # fetch command
    p = sub.add_parser("fetch", help="Fetch reviews from Steam")
    p.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume from previous progress",
    )
    p.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )

    # train command
    sub.add_parser("train", help="Train FastText model")

    # expand command
    p = sub.add_parser("expand", help="Expand dictionary")
    p.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    p.add_argument(
        "--compare", "-c",
        action="store_true",
        help="Include threshold in output filename (for comparison)",
    )

    # compare command
    p = sub.add_parser("compare", help="Compare multiple thresholds")
    p.add_argument(
        "--thresholds", "-t",
        type=float,
        nargs="+",
        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
    )

    # generate command
    p = sub.add_parser("generate", help="Generate keywords.py")
    p.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # run command (all steps)
    p = sub.add_parser("run", help="Run all steps")
    p.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume fetch from previous progress",
    )
    p.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )
    p.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    p.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # stats command
    sub.add_parser("stats", help="Show statistics")

    # similar command (for testing)
    p = sub.add_parser("similar", help="Find similar words")
    p.add_argument("word", help="Word to find similar words for")
    p.add_argument(
        "--topn", "-n",
        type=int,
        default=20,
        help="Number of results (default: 20)",
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Table-driven dispatch; async commands are wrapped in asyncio.run so the
    # mapping stays uniform (command name -> callable taking the namespace).
    handlers = {
        "fetch": lambda a: asyncio.run(cmd_fetch(a)),
        "train": cmd_train,
        "expand": cmd_expand,
        "compare": cmd_compare,
        "generate": cmd_generate,
        "run": lambda a: asyncio.run(cmd_run(a)),
        "stats": cmd_stats,
        "similar": cmd_similar,
    }
    handlers[args.command](args)


if __name__ == "__main__":
    main()
b/scripts/expand_keywords/preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..b99363aebcd841376b12298fa1975948ae4d3159 --- /dev/null +++ b/scripts/expand_keywords/preprocessor.py @@ -0,0 +1,282 @@ +""" +Text preprocessing with n-gram detection using gensim.Phrases. + +Pipeline: +1. Tokenization (jieba for Chinese, regex for English/mixed) +2. Build Phrases models (bigrams, trigrams) +3. Apply frozen n-grams from existing dictionary +4. Apply detected phrases + +This ensures that multi-word concepts like "帧率" or "加载画面" +are treated as single tokens during FastText training. + +For Chinese text: +- Uses jieba for word segmentation (Chinese has no spaces) +- Keeps English words intact (common in gaming reviews: fps, bug, dlc) +- Removes punctuation but preserves Chinese characters +""" + +import logging +import pickle +import re +from collections import Counter +from pathlib import Path + +import jieba +from gensim.models import Phrases +from gensim.models.phrases import Phraser + +from .config import MODELS_DIR, SETTINGS + +logger = logging.getLogger(__name__) + + +class Preprocessor: + """ + Text preprocessor with n-gram detection. + + Uses gensim Phrases for automatic phrase detection plus + frozen n-grams from the existing keyword dictionary. + """ + + def __init__(self, existing_ngrams: list[str] | None = None): + """ + Initialize preprocessor. 
+ + Args: + existing_ngrams: Multi-word phrases from existing keywords.py + (e.g., "frame rate", "loading screen") + """ + self.frozen_ngrams: set[tuple[str, ...]] = set() + if existing_ngrams: + self.frozen_ngrams = self._normalize_ngrams(existing_ngrams) + logger.info(f"Loaded {len(self.frozen_ngrams)} frozen n-grams") + + self.bigram_model: Phraser | None = None + self.trigram_model: Phraser | None = None + self.word_frequencies: Counter = Counter() + + def _normalize_ngrams(self, ngrams: list[str]) -> set[tuple[str, ...]]: + """Convert n-grams to lowercase tuple format for fast lookup.""" + result = set() + for ng in ngrams: + if " " in ng: + tokens = tuple(ng.lower().split()) + result.add(tokens) + return result + + def tokenize(self, text: str) -> list[str]: + """ + Tokenization for Chinese/mixed text using jieba. + + - Uses jieba for Chinese word segmentation + - Keeps English words intact (common in gaming: fps, bug, dlc) + - Removes punctuation (both Chinese and English) + - Lowercases English text + """ + # Remove URLs + text = re.sub(r'https?://\S+', ' ', text) + + # Remove punctuation (Chinese and English) but keep Chinese chars and alphanumeric + # Chinese punctuation: 。!?,、;:""''()【】《》 + text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbfa-zA-Z0-9\s]', ' ', text) + + # Lowercase English text + text = text.lower() + + # Use jieba to segment Chinese text + # jieba handles mixed Chinese/English text well + tokens = list(jieba.cut(text)) + + # Filter: remove empty strings and single spaces + tokens = [t.strip() for t in tokens if t.strip()] + + return tokens + + def build_phrase_models( + self, + corpus: list[list[str]], + min_count: int | None = None, + threshold: float | None = None, + ) -> None: + """ + Build Phrases models for automatic n-gram detection. 
+ + Args: + corpus: List of tokenized documents + min_count: Minimum phrase occurrences (default from settings) + threshold: Scoring threshold (higher = fewer phrases) + """ + min_count = min_count or SETTINGS["phrase_min_count"] + threshold = threshold or SETTINGS["phrase_threshold"] + + logger.info(f"Building phrase models (min_count={min_count}, threshold={threshold})") + + # Build bigram model: "frame rate" -> "frame_rate" + bigram_phrases = Phrases( + corpus, + min_count=min_count, + threshold=threshold, + delimiter="_", + ) + self.bigram_model = Phraser(bigram_phrases) + + # Apply bigramy to create input for trigram detection + bigram_corpus = [self.bigram_model[doc] for doc in corpus] + + # Build trigram model: "dark_souls like" -> "dark_souls_like" + trigram_phrases = Phrases( + bigram_corpus, + min_count=min_count, + threshold=threshold, + delimiter="_", + ) + self.trigram_model = Phraser(trigram_phrases) + + # Log detected phrases + bigram_count = len(bigram_phrases.export_phrases()) + trigram_count = len(trigram_phrases.export_phrases()) + logger.info(f"Detected {bigram_count} bigrams, {trigram_count} trigrams") + + def _apply_frozen_ngrams(self, tokens: list[str]) -> list[str]: + """ + Apply frozen n-grams from existing dictionary. + + These are always joined, even if not detected by Phrases. 
+ """ + result = [] + i = 0 + + while i < len(tokens): + matched = False + + # Try trigrams first (longer matches preferred) + if i + 2 < len(tokens): + trigram = (tokens[i], tokens[i + 1], tokens[i + 2]) + if trigram in self.frozen_ngrams: + result.append("_".join(trigram)) + i += 3 + matched = True + + # Try bigrams + if not matched and i + 1 < len(tokens): + bigram = (tokens[i], tokens[i + 1]) + if bigram in self.frozen_ngrams: + result.append("_".join(bigram)) + i += 2 + matched = True + + if not matched: + result.append(tokens[i]) + i += 1 + + return result + + def apply_phrases(self, tokens: list[str]) -> list[str]: + """ + Apply phrase models and frozen n-grams to tokens. + + Order: + 1. Frozen n-grams (from existing dictionary) + 2. Automatic Phrases (bigrams then trigrams) + """ + # Apply frozen n-grams first + tokens = self._apply_frozen_ngrams(tokens) + + # Apply automatic phrase models + if self.bigram_model: + tokens = list(self.bigram_model[tokens]) + if self.trigram_model: + tokens = list(self.trigram_model[tokens]) + + return tokens + + def preprocess_corpus( + self, + reviews: list[str], + build_phrases: bool = True, + ) -> list[list[str]]: + """ + Full preprocessing pipeline. 
+ + Args: + reviews: Raw review texts + build_phrases: Whether to build phrase models (skip if loading) + + Returns: + List of tokenized documents with phrases applied + """ + logger.info(f"Preprocessing {len(reviews)} reviews...") + + # Step 1: Tokenize all reviews + tokenized = [self.tokenize(review) for review in reviews] + logger.info("Tokenization complete") + + # Step 2: Build phrase models + if build_phrases: + self.build_phrase_models(tokenized) + + # Step 3: Apply phrases and count frequencies + processed = [] + for tokens in tokenized: + phrased = self.apply_phrases(tokens) + processed.append(phrased) + self.word_frequencies.update(phrased) + + logger.info(f"Vocabulary size: {len(self.word_frequencies)}") + return processed + + def get_word_frequencies(self) -> dict[str, int]: + """Get word frequency dictionary.""" + return dict(self.word_frequencies) + + def save(self, path: Path | None = None) -> None: + """Save preprocessor state (phrase models, frequencies).""" + path = path or MODELS_DIR / "preprocessor.pkl" + + data = { + "frozen_ngrams": self.frozen_ngrams, + "bigram_model": self.bigram_model, + "trigram_model": self.trigram_model, + "word_frequencies": self.word_frequencies, + } + + with open(path, "wb") as f: + pickle.dump(data, f) + + logger.info(f"Saved preprocessor to {path}") + + def load(self, path: Path | None = None) -> None: + """Load preprocessor state.""" + path = path or MODELS_DIR / "preprocessor.pkl" + + if not path.exists(): + raise FileNotFoundError(f"Preprocessor not found at {path}") + + with open(path, "rb") as f: + data = pickle.load(f) + + self.frozen_ngrams = data["frozen_ngrams"] + self.bigram_model = data["bigram_model"] + self.trigram_model = data["trigram_model"] + self.word_frequencies = data["word_frequencies"] + + logger.info(f"Loaded preprocessor from {path}") + + +def extract_ngrams_from_keywords(keywords: dict[str, list[str]]) -> list[str]: + """ + Extract multi-word phrases from keywords dictionary. 
def extract_ngrams_from_keywords(keywords: dict[str, list[str]]) -> list[str]:
    """
    Extract multi-word phrases from keywords dictionary.

    Args:
        keywords: TOPIC_KEYWORDS dictionary from keywords.py

    Returns:
        List of multi-word phrases (e.g., ["frame rate", "loading screen"])
    """
    # A phrase is any keyword containing a space; category/word order preserved.
    return [
        word
        for category_words in keywords.values()
        for word in category_words
        if " " in word
    ]


class FastTextTrainer:
    """
    Trains FastText word embeddings on review corpus.
    """

    def __init__(
        self,
        vector_size: int | None = None,
        window: int | None = None,
        min_count: int | None = None,
        epochs: int | None = None,
        workers: int | None = None,
    ):
        """
        Initialize trainer with hyperparameters.

        Args:
            vector_size: Dimensionality of word vectors
            window: Context window size
            min_count: Minimum word frequency
            epochs: Number of training iterations
            workers: Number of worker threads
        """
        # Explicit None-checks (not `x or default`) so a caller passing 0
        # gets 0 rather than silently falling back to the configured default.
        self.vector_size = vector_size if vector_size is not None else SETTINGS["fasttext_vector_size"]
        self.window = window if window is not None else SETTINGS["fasttext_window"]
        self.min_count = min_count if min_count is not None else SETTINGS["fasttext_min_count"]
        self.epochs = epochs if epochs is not None else SETTINGS["fasttext_epochs"]
        self.workers = workers if workers is not None else SETTINGS["fasttext_workers"]

        # Set by train() or load(); None until then.
        self.model: FastText | None = None

    def train(self, sentences: list[list[str]]) -> FastText:
        """
        Train FastText model on tokenized sentences.

        Args:
            sentences: List of tokenized documents (output from preprocessor)

        Returns:
            Trained FastText model
        """
        logger.info(
            f"Training FastText model: "
            f"vector_size={self.vector_size}, window={self.window}, "
            f"min_count={self.min_count}, epochs={self.epochs}"
        )
        logger.info(f"Training on {len(sentences)} documents")

        self.model = FastText(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
            workers=self.workers,
            sg=1,  # Skip-gram (better for semantic similarity)
            min_n=3,  # Minimum character n-gram length
            max_n=6,  # Maximum character n-gram length
        )

        vocab_size = len(self.model.wv)
        logger.info(f"Training complete. Vocabulary size: {vocab_size}")

        return self.model

    def save(self, path: Path | str | None = None) -> Path:
        """
        Save trained model.

        Args:
            path: Save path (default: models/fasttext.model)

        Returns:
            Path where model was saved

        Raises:
            ValueError: if no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model to save. Train first.")

        path = Path(path) if path else MODELS_DIR / "fasttext.model"
        self.model.save(str(path))
        logger.info(f"Saved model to {path}")
        return path

    def load(self, path: Path | str | None = None) -> FastText:
        """
        Load model from file.

        Args:
            path: Model path (default: models/fasttext.model)

        Returns:
            Loaded FastText model

        Raises:
            FileNotFoundError: if the model file does not exist.
        """
        path = Path(path) if path else MODELS_DIR / "fasttext.model"

        if not path.exists():
            raise FileNotFoundError(f"Model not found at {path}")

        self.model = FastText.load(str(path))
        vocab_size = len(self.model.wv)
        logger.info(f"Loaded model from {path}. Vocabulary size: {vocab_size}")
        return self.model

    def get_similar(
        self,
        word: str,
        topn: int = 10,
    ) -> list[tuple[str, float]]:
        """
        Get most similar words to a given word.

        Args:
            word: Query word
            topn: Number of results

        Returns:
            List of (word, similarity) tuples; empty list if word is unknown.

        Raises:
            ValueError: if no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        # Normalize word (space to underscore for phrases)
        word_normalized = word.lower().replace(" ", "_")

        try:
            return self.model.wv.most_similar(word_normalized, topn=topn)
        except KeyError:
            logger.warning(f"Word '{word}' not in vocabulary")
            return []

    def get_similarity(self, word1: str, word2: str) -> float:
        """
        Get similarity between two words.

        Args:
            word1: First word
            word2: Second word

        Returns:
            Cosine similarity (-1 to 1); 0.0 when either word is unknown.

        Raises:
            ValueError: if no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        w1 = word1.lower().replace(" ", "_")
        w2 = word2.lower().replace(" ", "_")

        try:
            return float(self.model.wv.similarity(w1, w2))
        except KeyError as e:
            logger.warning(f"Word not in vocabulary: {e}")
            return 0.0

    def word_in_vocab(self, word: str) -> bool:
        """Check if word is in vocabulary (False when no model is loaded)."""
        if self.model is None:
            return False

        word_normalized = word.lower().replace(" ", "_")
        return word_normalized in self.model.wv

    def get_vocab_words(self) -> list[str]:
        """Get all words in vocabulary (empty when no model is loaded)."""
        if self.model is None:
            return []
        # Iterating the key->index mapping yields keys; no .keys() call needed.
        return list(self.model.wv.key_to_index)
+""" + +import asyncio +import sys +import time +from pathlib import Path + +import httpx + +# Dodaj backend do path +sys.path.insert(0, str(Path(__file__).parent.parent / "backend")) + +from app.db.mongodb import mongodb + +STEAMSPY_API = "https://steamspy.com/api.php" +MAX_PAGES = 90 # ~90,000 gier +DELAY_SECONDS = 1.0 # Opóźnienie między zapytaniami +BATCH_SIZE = 1000 # Zapisuj do MongoDB co tyle gier + + +def fetch_page_sync(client: httpx.Client, page: int) -> list[dict]: + """Pobiera jedną stronę gier ze SteamSpy (synchronicznie).""" + params = {"request": "all", "page": page} + + try: + response = client.get(STEAMSPY_API, params=params, timeout=30) + response.raise_for_status() + data = response.json() + + if not data or not isinstance(data, dict): + return [] + + # Wyciągnij appid, name, developer, publisher + games = [] + for info in data.values(): + if not isinstance(info, dict) or not info.get("name"): + continue + + game = { + "appid": str(info["appid"]), + "name": info["name"], + } + + # Dodaj developer jeśli istnieje + if info.get("developer"): + game["developer"] = info["developer"] + + # Dodaj publisher jeśli istnieje i różny od developer + if info.get("publisher") and info.get("publisher") != info.get("developer"): + game["publisher"] = info["publisher"] + + games.append(game) + + return games + except Exception as e: + print(f" Błąd na stronie {page}: {e}") + return [] + + +async def main(): + """Główna funkcja - pobiera gry i zapisuje do MongoDB.""" + print("=" * 60) + print("Pobieranie listy gier ze SteamSpy do MongoDB") + print("=" * 60) + print() + + # Połącz z MongoDB + print("Łączenie z MongoDB...") + try: + await mongodb.connect() + except Exception as e: + print(f"Błąd połączenia z MongoDB: {e}") + print("Upewnij się, że MongoDB jest uruchomione i zmienne środowiskowe są ustawione.") + return + + # Sprawdź czy już są gry w bazie + existing_count = await mongodb.get_games_count() + if existing_count > 0: + print(f"W bazie jest już 
{existing_count} gier.") + response = input("Czy usunąć istniejące i pobrać nowe? (t/n): ").strip().lower() + if response != "t": + print("Anulowano.") + await mongodb.disconnect() + return + await mongodb.clear_games() + print("Usunięto istniejące gry.") + + print() + print(f"Pobieranie maksymalnie {MAX_PAGES} stron po ~1000 gier każda...") + print(f"Opóźnienie między zapytaniami: {DELAY_SECONDS}s") + print() + + all_games: list[dict] = [] + total_saved = 0 + + with httpx.Client() as client: + for page in range(MAX_PAGES): + print(f"Strona {page + 1}/{MAX_PAGES}...", end=" ", flush=True) + + games = fetch_page_sync(client, page) + + if not games: + print("Pusta - koniec danych") + break + + all_games.extend(games) + print(f"OK ({len(games)} gier, łącznie: {len(all_games)})") + + # Zapisuj do MongoDB co BATCH_SIZE gier + if len(all_games) >= BATCH_SIZE: + saved = await mongodb.save_games_batch(all_games) + total_saved += saved + all_games = [] + print(f" -> Zapisano do MongoDB (łącznie: {total_saved})") + + # Opóźnienie + if page < MAX_PAGES - 1: + time.sleep(DELAY_SECONDS) + + # Zapisz pozostałe gry + if all_games: + saved = await mongodb.save_games_batch(all_games) + total_saved += saved + print(f" -> Zapisano pozostałe do MongoDB") + + # Podsumowanie + final_count = await mongodb.get_games_count() + print() + print("=" * 60) + print(f"Zakończono!") + print(f"Gier w bazie: {final_count}") + print("=" * 60) + + await mongodb.disconnect() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scripts/fetch_steam_news.py b/scripts/fetch_steam_news.py new file mode 100644 index 0000000000000000000000000000000000000000..f59ec153ca4c964e9bd6d9793af0b6d0371b6165 --- /dev/null +++ b/scripts/fetch_steam_news.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Diagnostic helper for inspecting public Steam News API payloads. + +Defaults to Going Medieval (appid 1029780), which recently had a SteamDB patch +note marked as "Major". 
The goal is to inspect what fields are actually exposed +by the public ISteamNews/GetNewsForApp endpoint. + +Examples: + python scripts/fetch_steam_news.py + python scripts/fetch_steam_news.py --appid 1029780 --count 20 --raw + python scripts/fetch_steam_news.py --appid 1029780 --contains "1.0" +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter +from datetime import datetime, timezone +from pathlib import Path + +import httpx + + +STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/" +DEFAULT_APP_ID = 1029780 +DEFAULT_COUNT = 20 + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Inspect recent Steam News API payloads for a game." + ) + parser.add_argument( + "--appid", + default=str(DEFAULT_APP_ID), + help=f"Steam appid to inspect (default: {DEFAULT_APP_ID} for Going Medieval).", + ) + parser.add_argument( + "--count", + type=int, + default=DEFAULT_COUNT, + help=f"Number of news items to fetch (default: {DEFAULT_COUNT}).", + ) + parser.add_argument( + "--maxlength", + type=int, + default=0, + help="Steam API maxlength parameter (default: 0).", + ) + parser.add_argument( + "--contains", + default=None, + help="Only print items whose title or contents contain this substring.", + ) + parser.add_argument( + "--raw", + action="store_true", + help="Dump full raw JSON response after the human-readable summary.", + ) + parser.add_argument( + "--output", + default=None, + help="Optional path to save the raw JSON response.", + ) + return parser.parse_args() + + +def _format_timestamp(value: object) -> str | None: + if not isinstance(value, int) or value <= 0: + return None + return datetime.fromtimestamp(value, tz=timezone.utc).isoformat() + + +def _matches_filter(item: dict, needle: str | None) -> bool: + if not needle: + return True + haystacks = [ + str(item.get("title", "")), + str(item.get("contents", "")), + 
str(item.get("feedlabel", "")), + str(item.get("feedname", "")), + ] + lowered = needle.lower() + return any(lowered in text.lower() for text in haystacks) + + +def _print_summary(app_id: str, news_items: list[dict]) -> None: + print("=" * 80) + print(f"Steam News API inspection for appid {app_id}") + print(f"Items returned: {len(news_items)}") + + all_keys = sorted({key for item in news_items for key in item.keys()}) + print(f"Observed keys: {', '.join(all_keys) if all_keys else '(none)'}") + + feedlabels = Counter(str(item.get("feedlabel", "")) for item in news_items) + feednames = Counter(str(item.get("feedname", "")) for item in news_items) + print(f"feedlabel values: {dict(feedlabels)}") + print(f"feedname values: {dict(feednames)}") + print("=" * 80) + + +def _print_items(news_items: list[dict]) -> None: + for index, item in enumerate(news_items, start=1): + title = item.get("title", "") + date_iso = _format_timestamp(item.get("date")) + print(f"[{index}] {title}") + print(f" date: {date_iso}") + print(f" gid: {item.get('gid')}") + print(f" url: {item.get('url')}") + print(f" author: {item.get('author')}") + print(f" feedlabel: {item.get('feedlabel')}") + print(f" feedname: {item.get('feedname')}") + print(f" tags: {item.get('tags')}") + if "feed_type" in item: + print(f" feed_type: {item.get('feed_type')}") + if "announcement_body" in item: + print(" announcement_body: present") + contents = str(item.get("contents", "")) + preview = contents[:220].replace("\n", " ").strip() + print(f" contents_preview: {preview}") + print(f" contents_length: {len(contents)}") + print(f" keys: {sorted(item.keys())}") + print() + + +def main() -> int: + args = _parse_args() + params = { + "appid": args.appid, + "count": args.count, + "maxlength": args.maxlength, + } + + try: + with httpx.Client(timeout=30.0) as client: + response = client.get(STEAM_NEWS_API_URL, params=params) + response.raise_for_status() + payload = response.json() + except Exception as exc: # deliberate 
broad catch for diagnostics + print(f"Steam News request failed: {exc}", file=sys.stderr) + return 1 + + appnews = payload.get("appnews", {}) + news_items = appnews.get("newsitems", []) + filtered_items = [item for item in news_items if _matches_filter(item, args.contains)] + + _print_summary(str(args.appid), news_items) + + if args.contains: + print(f"Filter substring: {args.contains!r}") + print(f"Filtered items: {len(filtered_items)}") + print("-" * 80) + + _print_items(filtered_items) + + if args.output: + output_path = Path(args.output) + output_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8") + print(f"Raw JSON saved to: {output_path}") + + if args.raw: + print("=" * 80) + print("Raw JSON") + print("=" * 80) + print(json.dumps(payload, indent=2, ensure_ascii=False)) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/prewarm_cache.py b/scripts/prewarm_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..787611e8d987299351edd7923afc6399050672b4 --- /dev/null +++ b/scripts/prewarm_cache.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +""" +Skrypt do "rozgrzewania" cache (Pre-warming). + +Wykonuje pełną analizę dla zdefiniowanej listy popularnych gier, +aby podczas prezentacji wyniki były dostępne natychmiast (z cache MongoDB). 
# Games to pre-analyze (popular Steam hits)
TARGET_GAMES = [
    "Cyberpunk 2077",
    "Baldur's Gate 3",
    "Elden Ring",
    "Starfield",
    "Hades",
    "Stardew Valley"
]


async def analyze_game_headless(game_name: str):
    """Run the full analysis for one game without streaming (headless)."""
    print(f"\n[START] Analiza: {game_name}")

    # Resolve the game by name; bail out early when not found.
    game = await steam_service.search_game(game_name)
    if not game:
        print(f"[ERROR] Nie znaleziono gry: {game_name}")
        return

    print(f" -> Znaleziono: {game.name} (ID: {game.app_id})")

    # Nothing to do if a cached analysis already exists.
    cached = await mongodb.get_cached_analysis(game.app_id)
    if cached:
        print(f" -> [SKIP] Wyniki już są w cache (zaktualizowane: {cached.get('timestamp', 'unknown')})")
        return

    # Build the sampling plan from overall review statistics.
    stats = await steam_service.get_review_stats(game.app_id)
    sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)

    print(f" -> Plan: {sample_plan.total} recenzji (Statystyczna: {sample_plan.statistical_sample})")

    # Record the planned review count on the game object.
    game = game.model_copy(update={"target_count": sample_plan.total})

    reviews_done = 0
    skipped_total = 0
    topic_totals: list[TopicSentiment] = []

    print(" -> Pobieranie i analiza w toku...", end="", flush=True)

    # Stream review batches, analyze each, and fold results together.
    async for batch in steam_service.fetch_reviews_stratified(game.app_id, sample_plan):
        if not batch.reviews:
            continue

        print(".", end="", flush=True)
        batch_results, batch_skipped = await nlp_service.analyze_batch(batch.reviews)

        if batch_results:
            topic_totals = aggregate_topics(topic_totals, batch_results)

        skipped_total += batch_skipped
        reviews_done += len(batch.reviews)

        # Brief pause so we do not hammer the HF inference API.
        await asyncio.sleep(0.5)

    print(" OK")

    # Persist the aggregated result to the MongoDB cache.
    result = AnalysisResult(
        game=game,
        topics=topic_totals,
        analyzed_reviews=reviews_done,
        skipped_count=skipped_total
    )

    await mongodb.save_analysis(game.app_id, result.model_dump())
    print(f"[DONE] Zapisano wyniki dla {game.name}!")


async def main():
    """Warm the cache for every game in TARGET_GAMES."""
    print("=" * 60)
    print("CACHE PRE-WARMER - Przygotowanie do prezentacji")
    print("=" * 60)

    try:
        await mongodb.connect()
        for game_name in TARGET_GAMES:
            await analyze_game_headless(game_name)
    except Exception as e:
        print(f"\n[FATAL ERROR] {e}")
    finally:
        await mongodb.disconnect()
        print("\nZakończono.")


if __name__ == "__main__":
    asyncio.run(main())
@dataclass
class ProfileMetrics:
    """Stores timing metrics (seconds) and call counts for each pipeline step."""
    # Steam API
    search_game: float = 0.0
    fetch_reviews: float = 0.0

    # NLP Pre-processing
    clean_text: float = 0.0
    sentence_split: float = 0.0
    topic_detection: float = 0.0

    # NLP Model Inference
    model_inference: float = 0.0

    # Post-processing
    aggregation: float = 0.0

    # Counts
    reviews_count: int = 0
    sentences_count: int = 0
    sentences_with_topics: int = 0
    model_calls: int = 0

    # Per-item tracking
    clean_text_calls: int = 0
    topic_detection_calls: int = 0


def profile_analyze_batch(nlp_svc, reviews: list[str], metrics: ProfileMetrics):
    """
    Profiled version of nlp_service.analyze_batch().

    Breaks timing down per sub-step, accumulating into `metrics`.

    Args:
        nlp_svc: Object exposing clean_text, _split_into_sentences,
            _detect_topics_regex and _run_inference (the NLP service).
        reviews: Raw review texts for this batch.
        metrics: Accumulator mutated in place.

    Returns:
        (topic_counts, skipped) where topic_counts is a list of
        (topic, count) pairs and skipped is the number of sentences
        without any detected topic.
    """
    if not reviews:
        return [], 0

    metrics.reviews_count += len(reviews)

    # =====================================================================
    # Step 1: Text Cleaning
    # =====================================================================
    t0 = time.perf_counter()
    cleaned_reviews = []
    for review in reviews:
        cleaned_reviews.append(nlp_svc.clean_text(review))
        metrics.clean_text_calls += 1
    metrics.clean_text += time.perf_counter() - t0

    # =====================================================================
    # Step 2: Sentence Splitting
    # =====================================================================
    t0 = time.perf_counter()
    review_sentence_map = []  # (review_idx, sentence)

    for review_idx, cleaned in enumerate(cleaned_reviews):
        # Very short texts carry no analyzable sentence; skip silently.
        if not cleaned or len(cleaned) < 5:
            continue
        sentences = nlp_svc._split_into_sentences(cleaned)
        metrics.sentences_count += len(sentences)
        for sentence in sentences:
            review_sentence_map.append((review_idx, sentence))

    metrics.sentence_split += time.perf_counter() - t0

    # =====================================================================
    # Step 3: Topic Detection (Regex)
    # =====================================================================
    t0 = time.perf_counter()
    sentiment_tasks = []  # (review_idx, topic, sentence)
    skipped = 0

    for review_idx, sentence in review_sentence_map:
        topics = nlp_svc._detect_topics_regex(sentence)
        metrics.topic_detection_calls += 1
        if topics:
            metrics.sentences_with_topics += 1
            for topic in topics:
                sentiment_tasks.append((review_idx, topic, sentence))
        else:
            skipped += 1

    metrics.topic_detection += time.perf_counter() - t0

    if not sentiment_tasks:
        return [], skipped

    # =====================================================================
    # Step 4: Model Inference (THE HEAVY PART)
    # =====================================================================
    t0 = time.perf_counter()

    texts_to_analyze = [task[2] for task in sentiment_tasks]

    # Run synchronously for accurate timing (bypass executor).
    # The inference output is deliberately discarded — we profile cost only.
    nlp_svc._run_inference(texts_to_analyze)
    metrics.model_calls += len(texts_to_analyze)

    metrics.model_inference += time.perf_counter() - t0

    # =====================================================================
    # Step 5: Aggregation
    # =====================================================================
    t0 = time.perf_counter()

    # Simplified aggregation (just count per topic, no full result objects).
    topic_counts = defaultdict(int)
    for _review_idx, topic, _sentence in sentiment_tasks:
        topic_counts[topic] += 1

    metrics.aggregation += time.perf_counter() - t0

    return list(topic_counts.items()), skipped
async def run_profiler(game_name: str, target_count: int):
    """Run the full pipeline end-to-end for one game and print a timing report.

    Args:
        game_name: Name passed to the Steam search endpoint.
        target_count: If > 0, overrides the computed sample plan so exactly
            this many reviews are fetched and analyzed.
    """
    print("=" * 70)
    print("🔬 NLP PIPELINE PROFILER - Detailed Timing Analysis")
    print("=" * 70)
    print(f"Game: {game_name}")
    print(f"Target reviews: {target_count}")
    print("-" * 70)

    metrics = ProfileMetrics()

    # =========================================================================
    # Steam API: Search Game
    # =========================================================================
    print("\n📡 Steam API...")
    t0 = time.perf_counter()
    game = await steam_service.search_game(game_name)
    metrics.search_game = time.perf_counter() - t0

    if not game:
        print(f"❌ Game not found: {game_name}")
        return

    print(f" Found: {game.name} (ID: {game.app_id})")

    stats = await steam_service.get_review_stats(game.app_id)
    sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)

    # Force a fixed-size sample when the caller requested a specific count.
    if target_count > 0:
        sample_plan.statistical_sample = target_count
        sample_plan.top_helpful = 0
        sample_plan.total = target_count

    # =========================================================================
    # Steam API: Fetch Reviews
    # =========================================================================
    print(f"\n📥 Fetching {sample_plan.total} reviews...")
    t0 = time.perf_counter()

    all_reviews = []
    async for batch in steam_service.fetch_reviews_stratified(game.app_id, sample_plan):
        all_reviews.extend(batch.reviews)
        # Stop as soon as enough reviews are collected.
        if len(all_reviews) >= target_count:
            break

    metrics.fetch_reviews = time.perf_counter() - t0
    print(f" Fetched: {len(all_reviews)} reviews")

    # =========================================================================
    # NLP Analysis (Profiled)
    # =========================================================================
    print(f"\n🧠 Running NLP analysis...")

    # Process in batches like the real system
    batch_size = settings.review_batch_size
    total_topics = []
    total_skipped = 0

    for i in range(0, len(all_reviews), batch_size):
        batch = all_reviews[i:i + batch_size]
        topics, skipped = profile_analyze_batch(nlp_service, batch, metrics)
        total_topics.extend(topics)
        total_skipped += skipped

    # =========================================================================
    # Results
    # =========================================================================
    print("\n" + "=" * 70)
    print("📊 PROFILING RESULTS")
    print("=" * 70)

    # Total NLP time is the sum of the five profiled sub-steps.
    total_nlp = (metrics.clean_text + metrics.sentence_split +
                 metrics.topic_detection + metrics.model_inference +
                 metrics.aggregation)
    total_time = metrics.search_game + metrics.fetch_reviews + total_nlp

    print(f"\n{'STEP':<30} {'TIME':>10} {'%':>8} {'CALLS':>10} {'PER CALL':>12}")
    print("-" * 70)

    # (label, accumulated seconds, call count, unused placeholder)
    steps = [
        ("Steam: Search Game", metrics.search_game, 1, None),
        ("Steam: Fetch Reviews", metrics.fetch_reviews, 1, None),
        ("NLP: Text Cleaning", metrics.clean_text, metrics.clean_text_calls, None),
        ("NLP: Sentence Splitting", metrics.sentence_split, metrics.reviews_count, None),
        ("NLP: Topic Detection", metrics.topic_detection, metrics.topic_detection_calls, None),
        ("NLP: Model Inference ⚡", metrics.model_inference, metrics.model_calls, None),
        ("NLP: Aggregation", metrics.aggregation, 1, None),
    ]

    for name, duration, calls, _ in steps:
        pct = (duration / total_time * 100) if total_time > 0 else 0
        per_call = (duration / calls * 1000) if calls > 0 else 0  # ms
        calls_str = str(calls) if calls else "-"
        per_call_str = f"{per_call:.3f}ms" if calls > 0 else "-"

        # Highlight the bottleneck
        marker = "🔥" if pct > 50 else ""
        print(f"{name:<30} {duration:>9.3f}s {pct:>7.1f}% {calls_str:>10} {per_call_str:>12} {marker}")

    print("-" * 70)
    print(f"{'TOTAL':<30} {total_time:>9.3f}s {100.0:>7.1f}%")

    # Summary stats
    print("\n" + "=" * 70)
    print("📈 SUMMARY STATISTICS")
    print("=" * 70)
    print(f"Reviews processed: {metrics.reviews_count}")
    print(f"Sentences extracted: {metrics.sentences_count}")
    print(f"Sentences with topics: {metrics.sentences_with_topics} ({metrics.sentences_with_topics/max(1,metrics.sentences_count)*100:.1f}%)")
    print(f"Model inference calls: {metrics.model_calls}")
    print(f"Skipped (no topic): {total_skipped}")

    print("\n" + "=" * 70)
    print("⏱️ PERFORMANCE METRICS")
    print("=" * 70)

    # Guard against division by zero when nothing was processed.
    reviews_per_sec = metrics.reviews_count / total_nlp if total_nlp > 0 else 0
    ms_per_review = (total_nlp / metrics.reviews_count * 1000) if metrics.reviews_count > 0 else 0
    ms_per_inference = (metrics.model_inference / metrics.model_calls * 1000) if metrics.model_calls > 0 else 0

    print(f"NLP throughput: {reviews_per_sec:.1f} reviews/sec")
    print(f"Time per review: {ms_per_review:.2f}ms")
    print(f"Time per model call: {ms_per_inference:.3f}ms")
    print(f"Model batch size: 16 (hardcoded)")

    # Bottleneck analysis
    print("\n" + "=" * 70)
    print("🎯 BOTTLENECK ANALYSIS")
    print("=" * 70)

    nlp_breakdown = {
        "Text Cleaning": metrics.clean_text,
        "Sentence Splitting": metrics.sentence_split,
        "Topic Detection": metrics.topic_detection,
        "Model Inference": metrics.model_inference,
        "Aggregation": metrics.aggregation,
    }

    # The sub-step with the largest accumulated time is the primary bottleneck.
    bottleneck = max(nlp_breakdown, key=nlp_breakdown.get)
    bottleneck_pct = nlp_breakdown[bottleneck] / total_nlp * 100 if total_nlp > 0 else 0

    print(f"Primary bottleneck: {bottleneck} ({bottleneck_pct:.1f}% of NLP time)")

    if bottleneck == "Model Inference":
        print("\nOptimization suggestions:")
        print(" • Increase batch_size in _run_inference() (currently 16)")
        print(" • Use a smaller/faster model (DistilBERT vs RoBERTa)")
        print(" • Apply more aggressive quantization (INT4)")
        print(" • Filter more sentences before inference (stricter topic matching)")
    elif bottleneck == "Topic Detection":
        print("\nOptimization suggestions:")
        print(" • Reduce number of keyword patterns")
        print(" • Use Aho-Corasick algorithm for multi-pattern matching")
        print(" • Pre-filter by sentence length")
    elif bottleneck == "Text Cleaning":
        print("\nOptimization suggestions:")
        print(" • Simplify regex patterns")
        print(" • Skip cleaning for very short texts")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Profile NLP pipeline performance")
    parser.add_argument("--game", type=str, default="Cyberpunk 2077", help="Game name")
    parser.add_argument("--count", type=int, default=100, help="Number of reviews")

    args = parser.parse_args()

    asyncio.run(run_profiler(args.game, args.count))
MODEL_ID = "uer/roberta-base-finetuned-jd-binary-chinese"
OUTPUT_DIR = Path(__file__).resolve().parent.parent / "backend" / "models" / "quantized"


def main() -> None:
    """Export the sentiment model to ONNX, INT8-quantize it, and bundle the tokenizer.

    Writes model_quantized.onnx plus tokenizer files into OUTPUT_DIR.
    Network access is required (downloads MODEL_ID from the Hugging Face Hub).
    """
    print(f"[1/4] Exporting {MODEL_ID} to ONNX...")
    # TemporaryDirectory guarantees cleanup on any exit path — replaces the
    # manual mkdtemp + try/finally shutil.rmtree dance.
    with tempfile.TemporaryDirectory(prefix="onnx_export_") as tmp_dir:
        # PyTorch -> ONNX export (torch is only needed for this step).
        model = ORTModelForSequenceClassification.from_pretrained(
            MODEL_ID, export=True
        )
        model.save_pretrained(tmp_dir)

        print("[2/4] Applying INT8 dynamic quantization (AVX2)...")
        quantizer = ORTQuantizer.from_pretrained(tmp_dir)
        # Dynamic (is_static=False) quantization needs no calibration dataset.
        qconfig = AutoQuantizationConfig.avx2(is_static=False)

        OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        quantizer.quantize(save_dir=OUTPUT_DIR, quantization_config=qconfig)

        print("[3/4] Copying tokenizer files...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        tokenizer.save_pretrained(str(OUTPUT_DIR))

        print(f"[4/4] Done! Quantized model saved to: {OUTPUT_DIR}")
        print("Files:")
        for f in sorted(OUTPUT_DIR.iterdir()):
            size_mb = f.stat().st_size / (1024 * 1024)
            print(f" {f.name} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()