GitHub Action committed on
Commit
8ff1b66
·
0 Parent(s):

deploy: worker release from GitHub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +23 -0
  2. Dockerfile +47 -0
  3. README.md +14 -0
  4. backend/.env.example +38 -0
  5. backend/app/__init__.py +8 -0
  6. backend/app/core/__init__.py +1 -0
  7. backend/app/core/config.py +174 -0
  8. backend/app/core/freshness.py +71 -0
  9. backend/app/core/jieba_userdict.txt +14 -0
  10. backend/app/core/keywords.py +273 -0
  11. backend/app/core/rate_limit.py +6 -0
  12. backend/app/core/sampling.py +135 -0
  13. backend/app/core/stopwords_zh.py +39 -0
  14. backend/app/core/ttl_tiers.py +19 -0
  15. backend/app/core/worker_logging.py +316 -0
  16. backend/app/db/__init__.py +1 -0
  17. backend/app/db/mongodb.py +1152 -0
  18. backend/app/main.py +159 -0
  19. backend/app/models/__init__.py +19 -0
  20. backend/app/models/schemas.py +210 -0
  21. backend/app/routers/__init__.py +5 -0
  22. backend/app/routers/analyze.py +597 -0
  23. backend/app/routers/games.py +68 -0
  24. backend/app/services/__init__.py +6 -0
  25. backend/app/services/analysis_runner.py +643 -0
  26. backend/app/services/analysis_utils.py +259 -0
  27. backend/app/services/game_sync_service.py +290 -0
  28. backend/app/services/highlights_service.py +202 -0
  29. backend/app/services/nlp_service.py +524 -0
  30. backend/app/services/precache_service.py +199 -0
  31. backend/app/services/priority_refresh_service.py +387 -0
  32. backend/app/services/steam_errors.py +22 -0
  33. backend/app/services/steam_service.py +499 -0
  34. backend/app/services/update_detection_service.py +453 -0
  35. backend/pytest.ini +6 -0
  36. backend/requirements.txt +42 -0
  37. backend/scripts/smoke_news_cursor.py +264 -0
  38. backend/scripts/smoke_test.py +185 -0
  39. backend/worker_main.py +244 -0
  40. scripts/benchmark_major_update.py +848 -0
  41. scripts/check_db_stats.py +47 -0
  42. scripts/expand_keywords/__init__.py +8 -0
  43. scripts/expand_keywords/__main__.py +6 -0
  44. scripts/expand_keywords/config.py +106 -0
  45. scripts/expand_keywords/expander.py +350 -0
  46. scripts/expand_keywords/fetcher.py +355 -0
  47. scripts/expand_keywords/keywords_base.py +324 -0
  48. scripts/expand_keywords/main.py +447 -0
  49. scripts/expand_keywords/preprocessor.py +282 -0
  50. scripts/expand_keywords/trainer.py +185 -0
.dockerignore ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore everything by default
2
+ *
3
+
4
+ # Allow only what is needed for Docker build
5
+ !backend/
6
+ !frontend/
7
+ !scripts/
8
+ !Dockerfile
9
+ !README.md
10
+ !requirements.txt
11
+ !.gitignore
12
+
13
+ # Exclude unnecessary subfolders
14
+ backend/tests/
15
+ backend/__pycache__/
16
+ backend/.pytest_cache/
17
+ frontend/node_modules/
18
+ frontend/dist/
19
+
20
+ # Exclude specific files
21
+ *.pdf
22
+ .env
23
+ *.log
Dockerfile ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------
2
+ # Stage 1: Quantize NLP model (torch needed ONLY here for PyTorch -> ONNX export)
3
+ # ------------------------------------------------------------------------------
4
+ FROM python:3.11-slim AS model-quantizer
5
+
6
+ WORKDIR /app
7
+
8
+ RUN pip install --no-cache-dir \
9
+ --extra-index-url https://download.pytorch.org/whl/cpu \
10
+ "torch==2.2.0" \
11
+ "optimum[onnxruntime]==1.16.2" \
12
+ "transformers==4.37.2" \
13
+ "huggingface-hub==0.20.3" \
14
+ "numpy<2.0.0"
15
+
16
+ COPY scripts/quantize_model.py scripts/quantize_model.py
17
+ RUN python3 scripts/quantize_model.py
18
+
19
+
20
+ # ------------------------------------------------------------------------------
21
+ # Stage 2: Runtime (Python FastAPI Worker — no torch, no frontend)
22
+ # ------------------------------------------------------------------------------
23
+ FROM python:3.11-slim
24
+
25
+ WORKDIR /app
26
+
27
+ # Create non-root user for security
28
+ RUN useradd -m -u 1000 user
29
+ USER user
30
+ ENV HOME=/home/user \
31
+ PATH=/home/user/.local/bin:$PATH
32
+
33
+ # Install Python dependencies (no torch — ~700MB RAM saved)
34
+ COPY --chown=user:user backend/requirements.txt backend/requirements.txt
35
+ RUN pip install --no-cache-dir --upgrade -r backend/requirements.txt
36
+
37
+ # Copy Backend code
38
+ COPY --chown=user:user backend backend
39
+
40
+ # Copy pre-quantized ONNX model from Stage 1
41
+ COPY --chown=user:user --from=model-quantizer /app/backend/models/quantized backend/models/quantized
42
+
43
+ WORKDIR /app/backend
44
+
45
+ EXPOSE 7860
46
+
47
+ CMD ["uvicorn", "worker_main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SentimentStream Worker
3
+ emoji: ⚙️
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: agpl-3.0
10
+ ---
11
+
12
+ # SentimentStream Worker
13
+
14
+ Background worker for SentimentStream. Syncs games from SteamSpy, detects updates via Steam News API, and pre-caches sentiment analyses.
backend/.env.example ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MongoDB
2
+ MONGODB_URL=mongodb://admin:password@localhost:27017
3
+ MONGODB_DB_NAME=sentimentSummary
4
+
5
+ # App Settings
6
+ DEBUG=true
7
+ CORS_ORIGINS=http://localhost:5173,http://localhost:3000
8
+
9
+ # Cache Settings
10
+ CACHE_TTL_HOURS=24
11
+
12
+ # Steam API Settings
13
+ REVIEW_BATCH_SIZE=500
14
+ STEAM_REVIEW_LANGUAGE=schinese
15
+ STEAM_REGION=CN
16
+
17
+ # Steam API Retry
18
+ STEAM_RETRY_MAX_ATTEMPTS=3
19
+ STEAM_RETRY_BASE_DELAY=1.0
20
+ STEAM_RETRY_MAX_DELAY=10.0
21
+
22
+ # Sampling Settings - Statistical sampling parameters
23
+ SAMPLE_TOP_HELPFUL=50
24
+ SAMPLE_CONFIDENCE_LEVEL=0.95
25
+ SAMPLE_MARGIN_OF_ERROR=0.01
26
+ SAMPLE_MAX_REVIEWS=3000
27
+
28
+ # NLP Settings - Hugging Face Models
29
+ HF_SENTIMENT_MODEL=uer/roberta-base-finetuned-jd-binary-chinese
30
+
31
+ # NLP Settings - Analysis Parameters
32
+ TEXT_MAX_LENGTH=512
33
+ SENTIMENT_POSITIVE_THRESHOLD=0.1
34
+ SENTIMENT_NEGATIVE_THRESHOLD=-0.1
35
+ TOPIC_MIN_MENTIONS=5
36
+
37
+ # Deduplication Cache
38
+ DEDUP_CACHE_MAXSIZE=10000
backend/app/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SentimentStream Backend Application.
3
+
4
+ Narzędzie do analizy sentymentu i modelowania tematów
5
+ w recenzjach gier Steam w czasie rzeczywistym.
6
+ """
7
+
8
+ __version__ = "0.1.0"
backend/app/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Moduł konfiguracji aplikacji."""
backend/app/core/config.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Konfiguracja aplikacji.
3
+
4
+ Wykorzystuje Pydantic Settings do zarządzania zmiennymi środowiskowymi.
5
+ """
6
+
7
+ from functools import lru_cache
8
+
9
+ from pydantic_settings import BaseSettings, SettingsConfigDict
10
+
11
+
12
class Settings(BaseSettings):
    """
    Application settings loaded from environment variables.

    Values come from the process environment or from the first readable
    ``.env`` file; unknown variables are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file=(".env", "backend/.env"),
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    # MongoDB
    mongodb_url: str = ""
    mongodb_db_name: str = "sentimentSummary"

    # App Mode
    app_mode: str = "full"  # "full" = monolith, "api" = API-only (no frontend)

    # App Settings
    debug: bool = False
    cors_origins: str = "http://localhost:5173,http://localhost:3000"

    # Cache Settings
    cache_ttl_hours: int = 24
    cache_ttl_short_hours: int = 12  # frequently updated games
    cache_ttl_long_hours: int = 168  # stable games (7 days)
    cache_ttl_worker_managed_hours: int = 1440  # 60 days to preserve stale fallback results
    cache_ttl_on_demand_hours: int = 1440  # 60 days to preserve stale fallback results

    # Incremental Analysis
    incremental_enabled: bool = True
    incremental_max_stored_ids: int = 5000
    incremental_max_gap_days: int = 90  # fall back to full analysis after this many days without reviews
    recent_sample_limit: int = 1000
    niche_cache_max_age_days: int = 60
    analysis_freshness_max_age_days: int = 60
    patch_context_max_age_days: int = 90
    dlc_min_reviews_for_analysis: int = 50
    dlc_visible_in_search: bool = False  # Temporary policy: hide DLC from autocomplete/suggestions
    dlc_worker_analysis_enabled: bool = False  # Temporary policy: exclude DLC from worker-managed analysis

    # Steam API Settings
    review_batch_size: int = 100
    steam_review_language: str = "schinese"  # Review fetch scope; product analyzes Simplified Chinese Steam reviews.
    steam_region: str = "CN"  # CN, US, etc.

    # Steam API Retry
    steam_retry_max_attempts: int = 3
    steam_retry_base_delay: float = 1.0  # doubles each retry
    steam_retry_max_delay: float = 10.0  # cap

    # Steam API Error Cache TTL (seconds)
    steam_error_cache_ttl_404: int = 3600  # 1h
    steam_error_cache_ttl_429: int = 300  # 5min

    # Sampling Settings - Statistical sampling parameters
    sample_top_helpful: int = 50
    sample_confidence_level: float = 0.95
    sample_margin_of_error: float = 0.02
    sample_max_reviews: int = 3000
    sample_minority_min: int = 100

    # NLP Settings - Analysis Parameters
    text_max_length: int = 512
    sentiment_positive_threshold: float = 0.1
    sentiment_negative_threshold: float = -0.1
    topic_min_mentions: int = 5

    # NLP Settings - Deduplication Cache
    dedup_cache_maxsize: int = 10000

    # NLP Settings - Performance & Logic
    nlp_onnx_intra_threads: int = 2
    nlp_onnx_inter_threads: int = 2
    nlp_negation_window: int = 3

    # Prediction Settings
    prediction_retention_threshold_pos: float = 0.2
    prediction_retention_threshold_neg: float = -0.2

    # Community Highlights
    highlights_ngram_min: int = 2
    highlights_ngram_max: int = 5
    highlights_min_mentions: int = 3
    highlights_max_doc_freq_ratio: float = 0.4
    highlights_top_n_general: int = 15
    highlights_top_n_per_topic: int = 5

    # Worker — Pre-cache
    worker_trigger_token: str = ""
    precache_enabled: bool = False
    precache_top_n_games: int = 500
    precache_batch_delay_seconds: int = 10
    precache_checkpoints_hours: str = "6,12,24,72,168,336"
    precache_max_analyses_per_cycle: int = 50

    # Worker — Priority Games
    steam_priority_categories: str = "top_sellers,new_releases,specials"
    steam_priority_regions: str = "CN,US"
    steam_priority_grace_days: int = 3
    steam_priority_categories_url: str = "https://store.steampowered.com/api/featuredcategories"
    steam_bootstrap_max_per_cycle: int = 20
    steam_bootstrap_delay: float = 1.5

    # Worker — News Scan
    news_refresh_window_hours: int = 6
    news_initial_count: int = 20
    news_incremental_count: int = 5

    # Worker — Game Sync
    game_sync_enabled: bool = False
    game_sync_steamspy_delay: float = 61.0
    game_sync_details_delay: float = 1.1
    game_sync_top_n_details: int = 500
    game_sync_cn_enrichment_delay: float = 1.5
    game_sync_cn_enrichment_limit: int = 200
    game_sync_app_type_enrichment_delay: float = 1.5
    game_sync_app_type_enrichment_limit: int = 200

    # Logging (both Live API and Worker)
    worker_log_dir: str = "/data/worker_logs"
    worker_log_fallback_dir: str = "/tmp/worker_logs"
    worker_log_max_bytes: int = 5_000_000  # 5 MB per file
    worker_log_backup_count: int = 3  # 3 rotated files = 20 MB max
    nlp_verbose_logging: bool = False  # re-enable NLP debug logs to stdout
    nlp_debug_log_max_bytes: int = 2_000_000  # 2 MB per file
    errors_log_max_bytes: int = 2_000_000  # 2 MB per file

    # Rate Limiting
    rate_limit_analyze: str = "10/minute"
    rate_limit_default: str = "30/minute"

    # NLP Settings - Hugging Face Models
    # Using specialized Chinese model (RoBERTa-JD) - 90% accuracy on product reviews
    hf_sentiment_model: str = "uer/roberta-base-finetuned-jd-binary-chinese"

    @property
    def cors_origins_list(self) -> list[str]:
        """Allowed CORS origins; blank segments (e.g. a trailing comma) are dropped."""
        return [origin.strip() for origin in self.cors_origins.split(",") if origin.strip()]

    @property
    def precache_checkpoints_list(self) -> list[int]:
        """Parse checkpoint hours from the comma-separated string, sorted ascending.

        Blank segments are skipped so a stray trailing comma in the env var
        does not raise ``ValueError`` from ``int("")``.
        """
        return sorted(
            int(h.strip()) for h in self.precache_checkpoints_hours.split(",") if h.strip()
        )

    @property
    def steam_priority_categories_list(self) -> list[str]:
        """Featured-store categories scanned by the priority worker."""
        return [c.strip() for c in self.steam_priority_categories.split(",") if c.strip()]

    @property
    def steam_priority_regions_list(self) -> list[str]:
        """Store regions used for priority game discovery."""
        return [r.strip() for r in self.steam_priority_regions.split(",") if r.strip()]
166
+
167
+
168
@lru_cache
def get_settings() -> Settings:
    """Return the process-wide singleton ``Settings`` instance (cached)."""
    return Settings()


# Shared settings object imported throughout the application.
settings = get_settings()
backend/app/core/freshness.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Product-level analysis freshness rules.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from datetime import datetime, timezone
8
+ from enum import Enum
9
+ from typing import Any, cast
10
+
11
+ from app.core.config import settings
12
+
13
+
14
+ class FreshnessStatus(str, Enum):
15
+ """Product freshness state for an existing analysis."""
16
+
17
+ FRESH = "fresh"
18
+ STALE_BY_AGE = "stale_by_age"
19
+ STALE_BY_PATCH = "stale_by_patch"
20
+
21
+
22
+ def _as_utc_datetime(value: Any) -> datetime | None:
23
+ if value is None:
24
+ return None
25
+ if isinstance(value, datetime):
26
+ return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
27
+ if isinstance(value, str):
28
+ parsed = datetime.fromisoformat(value)
29
+ return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
30
+ return None
31
+
32
+
33
+ def get_analysis_reference_at(document: dict[str, Any]) -> datetime | None:
34
+ """Return the best available execution timestamp for freshness checks."""
35
+ raw = document.get("results")
36
+ results: dict[str, Any] = cast(dict[str, Any], raw) if isinstance(raw, dict) else {}
37
+ return (
38
+ _as_utc_datetime(results.get("analysis_date"))
39
+ or _as_utc_datetime(document.get("analyzed_at"))
40
+ or _as_utc_datetime(document.get("cached_at"))
41
+ )
42
+
43
+
44
+ def evaluate_freshness(
45
+ document: dict[str, Any],
46
+ current_patch_at: datetime | None,
47
+ ) -> FreshnessStatus:
48
+ """
49
+ Evaluate analysis freshness using product rules:
50
+ patch recency first, then max age.
51
+ """
52
+ analysis_at = get_analysis_reference_at(document)
53
+ if analysis_at is None:
54
+ return FreshnessStatus.STALE_BY_AGE
55
+
56
+ if current_patch_at is not None and analysis_at < current_patch_at:
57
+ return FreshnessStatus.STALE_BY_PATCH
58
+
59
+ age_days = (datetime.now(timezone.utc) - analysis_at).days
60
+ if age_days > settings.analysis_freshness_max_age_days:
61
+ return FreshnessStatus.STALE_BY_AGE
62
+
63
+ return FreshnessStatus.FRESH
64
+
65
+
66
+ def get_staleness_reason(status: FreshnessStatus) -> str | None:
67
+ if status == FreshnessStatus.STALE_BY_AGE:
68
+ return "STALE_REASON_AGE"
69
+ if status == FreshnessStatus.STALE_BY_PATCH:
70
+ return "STALE_REASON_PATCH"
71
+ return None
backend/app/core/jieba_userdict.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ boss战 5 n
2
+ 开放世界 5 n
3
+ 大逃杀 5 n
4
+ 战斗通行证 5 n
5
+ 皮肤系统 5 n
6
+ 氪金 10 v
7
+ 开箱 5 v
8
+ 人机对战 5 n
9
+ 帧数不稳 5 n
10
+ 内存泄漏 5 n
11
+ 手感好 5 a
12
+ 手感差 5 a
13
+ 上手简单 5 a
14
+ 劝退新手 5 v
backend/app/core/keywords.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chinese keywords for game review topic detection.
3
+ Used in hybrid approach (Keywords + ML Sentiment).
4
+
5
+ Categories based on common topics in Steam game reviews.
6
+ Seed keywords will be expanded using the expand_keywords pipeline.
7
+
8
+ Structure: topic -> {single_char, compound, phrase}
9
+ - single_char: standalone Chinese characters (1 char, prone to false positives)
10
+ - compound: multi-char Chinese words or short English words
11
+ - phrase: multi-word phrases (EN or ZH)
12
+ """
13
+
14
+ TOPIC_KEYWORDS: dict[str, dict[str, list[str]]] = {
15
+ # =========================================================================
16
+ # CORE GAMEPLAY - 核心玩法
17
+ # =========================================================================
18
+ "Gameplay": {
19
+ "single_char": ["刷", "肝"],
20
+ "compound": [
21
+ "玩法", "游戏性", "机制", "战斗", "任务", "关卡",
22
+ "探索", "技能", "装备", "gameplay",
23
+ ],
24
+ "phrase": ["战斗系统"],
25
+ },
26
+
27
+ "Fun": {
28
+ "single_char": ["爽", "烂"],
29
+ "compound": [
30
+ # Positive
31
+ "好玩", "有趣", "上瘾", "神作", "佳作", "精品",
32
+ "沉浸", "过瘾", "带感", "回血", "爽游",
33
+ "解压", "杀时间",
34
+ # Negative
35
+ "无聊", "枯燥", "乏味", "垃圾", "辣鸡", "粪作",
36
+ "失望", "无趣",
37
+ ],
38
+ "phrase": [
39
+ "电子伟哥", "治好了", "精神时光屋", "时光屋",
40
+ "电子阳痿", "电子ed",
41
+ ],
42
+ },
43
+
44
+ "Difficulty": {
45
+ "single_char": [],
46
+ "compound": [
47
+ "难度", "简单", "困难", "硬核",
48
+ "劝退", "手残", "新手", "上手",
49
+ "souls", "魂类",
50
+ ],
51
+ "phrase": ["太难", "太简单"],
52
+ },
53
+
54
+ # =========================================================================
55
+ # TECHNICAL - 技术
56
+ # =========================================================================
57
+ "Performance": {
58
+ "single_char": ["卡"],
59
+ "compound": [
60
+ "优化", "卡顿", "帧率", "帧数", "流畅", "掉帧",
61
+ "丝滑", "显卡", "显存", "延迟",
62
+ "fps", "cpu", "gpu",
63
+ ],
64
+ "phrase": [
65
+ "稳60", "锁60", "解锁帧率", "吃配置", "带不动",
66
+ "PPT效果", "幻灯片", "帧生成", "输入延迟", "帧数不稳",
67
+ ],
68
+ },
69
+
70
+ "Bugs": {
71
+ "single_char": [],
72
+ "compound": [
73
+ "闪退", "崩溃", "卡死", "报错", "存档",
74
+ "黑屏", "进不去", "打不开", "未响应", "无响应",
75
+ "弹窗", "坏档", "掉线",
76
+ "bug", "bugs",
77
+ ],
78
+ "phrase": [
79
+ "存档损坏", "无法保存", "卡加载",
80
+ "加载失败", "连不上",
81
+ ],
82
+ },
83
+
84
+ # =========================================================================
85
+ # AUDIO-VISUAL - 视听
86
+ # =========================================================================
87
+ "Graphics": {
88
+ "single_char": [],
89
+ "compound": [
90
+ "画面", "画质", "特效", "建模", "贴图",
91
+ "美术", "风格", "场景", "光影",
92
+ "4k", "hdr",
93
+ ],
94
+ "phrase": [],
95
+ },
96
+
97
+ "Sound": {
98
+ "single_char": [],
99
+ "compound": [
100
+ "音乐", "音效", "配音", "配乐", "声音",
101
+ "原声",
102
+ "bgm", "ost",
103
+ ],
104
+ "phrase": ["中文配音"],
105
+ },
106
+
107
+ # =========================================================================
108
+ # CONTENT & VALUE - 内容与价值
109
+ # =========================================================================
110
+ "Content": {
111
+ "single_char": [],
112
+ "compound": [
113
+ "内容", "时长", "流程", "耐玩", "通关",
114
+ "主线", "支线", "收集", "小时", "体量",
115
+ "注水", "重复", "换皮", "多周目",
116
+ "dlc",
117
+ ],
118
+ "phrase": [
119
+ "素材复用", "拖时长", "强行延长", "通关后",
120
+ ],
121
+ },
122
+
123
+ "Monetization": {
124
+ "single_char": [],
125
+ "compound": [
126
+ # ex-Price
127
+ "价格", "定价", "值得", "不值", "贵", "便宜",
128
+ "打折", "史低", "入手", "白嫖", "性价比",
129
+ # ex-Microtransactions
130
+ "氪金", "内购", "充值", "抽卡", "648",
131
+ "课金", "首充", "月卡", "战令", "季票",
132
+ "开箱", "箱子", "钥匙", "保底", "抽奖",
133
+ "p2w",
134
+ ],
135
+ "phrase": [
136
+ "通行证", "pay to win",
137
+ ],
138
+ },
139
+
140
+ # =========================================================================
141
+ # MULTIPLAYER & COMMUNITY - 多人与社区
142
+ # =========================================================================
143
+ "Multiplayer": {
144
+ "single_char": [],
145
+ "compound": [
146
+ "联机", "多人", "匹��", "服务器", "延迟",
147
+ "掉线", "开黑", "组队", "单机", "野排", "车队",
148
+ "单排", "组排", "路人", "挂机",
149
+ "pvp", "pve", "coop",
150
+ ],
151
+ "phrase": [
152
+ "坑比", "猪队友", "送人头",
153
+ ],
154
+ },
155
+
156
+ "Community": {
157
+ "single_char": [],
158
+ "compound": [
159
+ "社区", "玩家", "汉化",
160
+ "官方", "民间",
161
+ "mod", "mods",
162
+ ],
163
+ "phrase": ["创意工坊"],
164
+ },
165
+
166
+ # =========================================================================
167
+ # CONTROLS & UI - 操控与界面
168
+ # =========================================================================
169
+ "Controls": {
170
+ "single_char": [],
171
+ "compound": [
172
+ "操作", "手感", "手柄", "键鼠", "键盘",
173
+ "摇杆", "触发", "键位", "改键",
174
+ "死区", "陀螺仪", "扳机", "震动",
175
+ ],
176
+ "phrase": [
177
+ "自定义键位", "辅助瞄准", "触觉反馈", "自适应扳机",
178
+ ],
179
+ },
180
+
181
+ "UI": {
182
+ "single_char": [],
183
+ "compound": [
184
+ "界面", "菜单", "字幕", "字体",
185
+ "中文", "汉化",
186
+ "ui", "hud",
187
+ ],
188
+ "phrase": [],
189
+ },
190
+
191
+ # =========================================================================
192
+ # STORY & NARRATIVE - 剧情
193
+ # =========================================================================
194
+ "Story": {
195
+ "single_char": [],
196
+ "compound": [
197
+ "剧情", "故事", "人物", "角色", "结局",
198
+ "剧本", "叙事", "世界观", "背景", "喂屎",
199
+ "烂尾", "降智", "工具人", "脸谱化",
200
+ "剧情杀", "都合主义",
201
+ "npc",
202
+ ],
203
+ "phrase": ["逻辑硬伤"],
204
+ },
205
+
206
+ # =========================================================================
207
+ # DEVELOPER SUPPORT - 开发支持
208
+ # =========================================================================
209
+ "Support": {
210
+ "single_char": [],
211
+ "compound": [
212
+ "更新", "修复", "维护", "开发商", "官方",
213
+ "补丁", "版本",
214
+ ],
215
+ "phrase": [],
216
+ },
217
+
218
+ "Localization": {
219
+ "single_char": [],
220
+ "compound": [
221
+ "本地化", "汉化", "翻译", "机翻", "缺字", "乱码",
222
+ "繁体", "简体",
223
+ ],
224
+ "phrase": [
225
+ "语言支持", "中文支持", "无中文", "不支援中文",
226
+ "文本质量", "字幕翻译", "界面翻译",
227
+ ],
228
+ },
229
+
230
+ # =========================================================================
231
+ # REFINEMENT - 打磨
232
+ # =========================================================================
233
+ "Polish": {
234
+ "single_char": [],
235
+ "compound": [
236
+ "打磨", "精致", "粗糙", "用心", "敷衍", "细节",
237
+ "诚意", "偷懒", "不用心", "精良", "精美",
238
+ ],
239
+ "phrase": ["粗制滥造"],
240
+ },
241
+
242
+ # =========================================================================
243
+ # RETENTION - 留存
244
+ # =========================================================================
245
+ "Retention": {
246
+ "single_char": [],
247
+ "compound": [
248
+ # Positive (High Retention)
249
+ "推荐", "安利", "入正", "入坑", "必玩",
250
+ "神作", "年度", "满分",
251
+ # Negative (Churn)
252
+ "退款", "卸载", "弃坑", "劝退", "不推荐",
253
+ "避雷", "踩雷", "退坑",
254
+ "回坑", "出坑", "已弃",
255
+ ],
256
+ "phrase": [
257
+ "坚持玩", "每天玩", "停不下来", "刷了",
258
+ "已退", "退款了",
259
+ ],
260
+ },
261
+ }
262
+
263
+ # =============================================================================
264
+ # EXCLUSIONS (Context-aware filtering)
265
+ # =============================================================================
266
+ # Words to exclude when they appear in certain contexts.
267
+ # Format: "keyword": ["context_word1", "context_word2"]
268
+
269
# Maps an ambiguous keyword to context words that disqualify a match.
EXCLUSIONS: dict[str, list[str]] = {
    # "fps" as genre (FPS shooter) vs performance (60 fps)
    "fps": ["射击", "枪战", "第一人称"],
    # Empty for now - will be expanded based on false positives
}
backend/app/core/rate_limit.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Shared rate limiter instance for the application."""
2
+
3
+ from slowapi import Limiter
4
+ from slowapi.util import get_remote_address
5
+
6
+ limiter = Limiter(key_func=get_remote_address)
backend/app/core/sampling.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Moduł do obliczania statystycznej wielkości próbki.
3
+
4
+ Implementuje wzory statystyczne dla próbkowania populacji.
5
+ """
6
+
7
+ import math
8
+ from dataclasses import dataclass
9
+
10
+ from app.core.config import settings
11
+
12
+
13
+ # Wartości Z dla poziomów ufności
14
+ Z_SCORES = {
15
+ 0.90: 1.645,
16
+ 0.95: 1.96,
17
+ 0.99: 2.576,
18
+ }
19
+
20
+
21
+ @dataclass
22
+ class SamplePlan:
23
+ """
24
+ Plan próbkowania dla gry.
25
+
26
+ Attributes:
27
+ top_helpful: Liczba najprzydatniejszych recenzji.
28
+ statistical_sample: Wielkość próbki statystycznej.
29
+ positive_count: Ile pobrać pozytywnych (stratified).
30
+ negative_count: Ile pobrać negatywnych (stratified).
31
+ total: Łączna liczba recenzji do pobrania.
32
+ """
33
+
34
+ top_helpful: int
35
+ statistical_sample: int
36
+ positive_count: int
37
+ negative_count: int
38
+ total: int
39
+
40
+
41
+ def calculate_sample_size(
42
+ population: int,
43
+ confidence_level: float | None = None,
44
+ margin_of_error: float | None = None,
45
+ ) -> int:
46
+ """
47
+ Oblicza minimalną wielkość próbki dla danej populacji.
48
+ Wykorzystuje wzór Cochrana z korektą dla populacji skończonej.
49
+ """
50
+ if confidence_level is None:
51
+ confidence_level = settings.sample_confidence_level
52
+ if margin_of_error is None:
53
+ margin_of_error = settings.sample_margin_of_error
54
+
55
+ # 1. Pobieramy Z-score (np. 1.96 dla 95% ufności).
56
+ # Mówi on, jak bardzo wynik może odbiegać od średniej w jednostkach odchylenia standardowego.
57
+ z = Z_SCORES.get(confidence_level, 1.96)
58
+
59
+ # 2. Zakładamy p=0.5 (maksymalna zmienność).
60
+ # To daje nam najbezpieczniejszą (największą) wielkość próbki.
61
+ p = 0.5
62
+
63
+ # 3. Wzór Cochrana dla nieskończonej populacji:
64
+ # n0 = (Z^2 * p * (1-p)) / e^2
65
+ # Wyjaśnienie: Z kwadrat razy zmienność, podzielone przez kwadrat błędu.
66
+ n_0 = (z ** 2 * p * (1 - p)) / (margin_of_error ** 2)
67
+
68
+ # 4. Korekta dla populacji skończonej (Steam ma policzalną liczbę recenzji):
69
+ # n = n0 / (1 + (n0 - 1) / N)
70
+ # Wyjaśnienie: Zmniejszamy próbkę, bo wiemy dokładnie, ile osób (recenzji) jest w "całym świecie" tej gry.
71
+ n = n_0 / (1 + (n_0 - 1) / population)
72
+
73
+ # Zaokrąglamy w górę do pełnej recenzji
74
+ return math.ceil(n)
75
+
76
+
77
def create_sample_plan(
    total_reviews: int,
    positive_reviews: int,
    negative_reviews: int,
) -> SamplePlan:
    """
    Build a sampling plan combining two approaches: a fixed top-helpful
    bucket plus a stratified statistical sample.

    Args:
        total_reviews: Total review count for the game.
        positive_reviews: Number of positive reviews available.
        negative_reviews: Number of negative reviews available.

    Returns:
        SamplePlan with per-bucket fetch targets capped by availability.
    """
    top_helpful = settings.sample_top_helpful
    max_reviews = settings.sample_max_reviews

    # How many reviews we need for a statistically reliable result.
    statistical_sample = calculate_sample_size(total_reviews)

    # Stay within the configured hard limit (e.g. 3000 minus top-helpful).
    statistical_sample = min(statistical_sample, max_reviews - top_helpful)

    # Share of positives / negatives in the whole population.
    if total_reviews > 0:
        pos_ratio = positive_reviews / total_reviews
        neg_ratio = negative_reviews / total_reviews
    else:
        pos_ratio = 0.5
        neg_ratio = 0.5

    # Split the sample proportionally to those shares (stratified sampling).
    pos_target = math.ceil(statistical_sample * pos_ratio)
    neg_target = math.ceil(statistical_sample * neg_ratio)

    # Minority protection: boost the smaller group to minority_min if possible
    minority_min = settings.sample_minority_min

    if pos_target < minority_min and positive_reviews > pos_target:
        pos_target = min(minority_min, positive_reviews)

    if neg_target < minority_min and negative_reviews > neg_target:
        neg_target = min(minority_min, negative_reviews)

    # Final adjustment to stay within statistical_sample limit
    # NOTE(review): the shrunken side is floored at minority_min, so the
    # pair can still slightly exceed statistical_sample after the boost —
    # presumably intentional (minority protection wins); confirm.
    if pos_target + neg_target > statistical_sample:
        if pos_target > neg_target:
            pos_target = max(pos_target - (pos_target + neg_target - statistical_sample), minority_min)
        else:
            neg_target = max(neg_target - (pos_target + neg_target - statistical_sample), minority_min)

    # Final cap by actual availability
    positive_count = min(pos_target, positive_reviews)
    negative_count = min(neg_target, negative_reviews)

    # Grand total to fetch (top helpful + statistical sample).
    total = top_helpful + positive_count + negative_count

    return SamplePlan(
        top_helpful=top_helpful,
        statistical_sample=statistical_sample,
        positive_count=positive_count,
        negative_count=negative_count,
        total=total,
    )
backend/app/core/stopwords_zh.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Chinskie stop words dla NLP pipeline.
3
+ Uzywane przez Community Highlights (n-gram extraction) i potencjalnie inne moduly.
4
+ """
5
+
6
+ # Jednooznakowe tokeny do odfiltrowania (WYJATKI ponizej)
7
+ SINGLE_CHAR_EXCEPTIONS = {"卡", "肝", "爽", "氪", "菜", "毒"}
8
+
9
+ # Stop words — czeste slowa bez wartosci informacyjnej
10
+ STOPWORDS_ZH = {
11
+ # Zaimki
12
+ "我", "你", "他", "她", "它", "我们", "你们", "他们",
13
+ # Czastki i spojniki
14
+ "的", "了", "是", "在", "不", "有", "和", "就",
15
+ "都", "也", "很", "要", "会", "可以", "这", "那",
16
+ "还", "没", "着", "被", "把", "让", "给", "从",
17
+ "到", "对", "但", "而", "或", "与",
18
+ # Czastki modalne
19
+ "吗", "呢", "啊", "吧", "呀", "嘛", "哦", "哈",
20
+ # Przysliwki
21
+ "比较", "非常", "真的", "确实", "其实", "可能",
22
+ "已经", "一直", "马上", "刚刚",
23
+ # Czasowniki ogolne
24
+ "觉得", "感觉", "知道", "看到", "说",
25
+ # Liczebniki i okreslniki
26
+ "一个", "一些", "这个", "那个", "什么", "怎么",
27
+ "多少", "几个",
28
+ # Filler w recenzjach gier
29
+ "这游戏", "这个游戏", "游戏", "玩家",
30
+ }
31
+
32
+
33
+ def is_stopword(token: str) -> bool:
34
+ """Sprawdza czy token jest stop wordem lub jednooznakowym tokenem bez wartosci."""
35
+ if token in STOPWORDS_ZH:
36
+ return True
37
+ if len(token) == 1 and token not in SINGLE_CHAR_EXCEPTIONS:
38
+ return True
39
+ return False
backend/app/core/ttl_tiers.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tiered TTL configuration for game cache expiry.
3
+
4
+ Popular games (worker-managed top N) get longer cache,
5
+ niche games (on-demand) get shorter cache.
6
+ """
7
+
8
+ from app.core.config import settings
9
+
10
+
11
async def get_ttl_hours(app_id: str) -> int:
    """Return TTL in hours based on whether the game is a priority game."""
    # Deferred import: the DB layer is only needed at call time.
    from app.db.mongodb import mongodb

    priority_ids = await mongodb.get_priority_game_ids_for_analysis()
    is_priority = app_id in priority_ids
    # Both tiers currently default to 1440h (60 days) in Settings.
    return (
        settings.cache_ttl_worker_managed_hours
        if is_priority
        else settings.cache_ttl_on_demand_hours
    )
backend/app/core/worker_logging.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Structured logging infrastructure for Worker and Live API.
3
+
4
+ Provides JSON-line file logging with rotation, timing context managers,
5
+ and module-level accessors for use across the codebase.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import logging.handlers
11
+ import os
12
+ import time
13
+ from typing import Any
14
+
15
+ from app.core.config import settings
16
+
17
+ # Module-level state
18
+ _structured_logger: logging.Logger | None = None
19
+ _cycle_id: str | None = None
20
+ _app_logging_initialized: bool = False
21
+
22
+ # Per-process log file whitelists (key → filename)
23
+ LIVE_LOG_WHITELIST: dict[str, str] = {
24
+ "live": "live.jsonl",
25
+ "errors": "errors.log",
26
+ "nlp_debug": "nlp_debug.log",
27
+ }
28
+ WORKER_LOG_WHITELIST: dict[str, str] = {
29
+ "worker": "worker.jsonl",
30
+ "errors": "errors.log",
31
+ "nlp_debug": "nlp_debug.log",
32
+ }
33
+
34
+
35
class DebugOnlyFilter(logging.Filter):
    """Pass only DEBUG-level records (blocks INFO and above)."""

    def filter(self, record: logging.LogRecord) -> bool:
        # Exact level match — anything other than DEBUG is rejected.
        is_debug = record.levelno == logging.DEBUG
        return is_debug
40
+
41
+
42
def _get_writable_log_dir() -> str:
    """Return the first writable log directory (primary or fallback)."""
    primary = settings.worker_log_dir
    try:
        # Probe writability by actually creating and removing a marker file.
        os.makedirs(primary, exist_ok=True)
        probe = os.path.join(primary, ".write_test")
        with open(probe, "w") as fh:
            fh.write("ok")
        os.remove(probe)
        return primary
    except (OSError, PermissionError):
        # Primary is not writable — switch to the fallback location.
        fallback = settings.worker_log_fallback_dir
        os.makedirs(fallback, exist_ok=True)
        return fallback
55
+
56
+
57
class JsonLineFormatter(logging.Formatter):
    """Formats log records as single-line JSON (JSONL)."""

    # Optional structured attributes copied from the record when present.
    _OPTIONAL_FIELDS = ("detail", "elapsed_s", "breakdown", "app_id",
                        "game_name", "source", "reviews_processed",
                        "topics_found", "analysis_type", "cycle_id", "error")

    def format(self, record: logging.LogRecord) -> str:
        entry: dict[str, Any] = {
            "ts": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            "event": getattr(record, "event", record.getMessage()),
        }

        for key in self._OPTIONAL_FIELDS:
            value = getattr(record, key, None)
            if value is not None:
                entry[key] = value

        # Fall back to the module-level cycle id when the record carries none.
        if entry.get("cycle_id") is None:
            cid = get_cycle_id()
            if cid:
                entry["cycle_id"] = cid

        # Strip any remaining None values before serializing.
        cleaned = {k: v for k, v in entry.items() if v is not None}
        return json.dumps(cleaned, default=str, ensure_ascii=False)
85
+
86
+
87
def setup_structured_logger(name: str) -> logging.Logger:
    """
    Create a rotating JSON-line file logger.

    Tries settings.worker_log_dir first, falls back to
    settings.worker_log_fallback_dir if the primary is not writable.

    Args:
        name: Logger name and file prefix (e.g. "worker" or "live").

    Returns:
        Configured logger instance.
    """
    logger = logging.getLogger(f"structured.{name}")
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        # First initialization only — avoid duplicate handlers on re-init.
        target = os.path.join(_get_writable_log_dir(), f"{name}.jsonl")
        file_handler = logging.handlers.RotatingFileHandler(
            target,
            maxBytes=settings.worker_log_max_bytes,
            backupCount=settings.worker_log_backup_count,
            encoding="utf-8",
        )
        file_handler.setFormatter(JsonLineFormatter())
        logger.addHandler(file_handler)

        # Publish as the module-level default for log_structured().
        set_structured_logger(logger)

    return logger
122
+
123
+
124
class TimingContext:
    """Measure the wall-clock duration of a ``with`` body via time.monotonic()."""

    def __init__(self) -> None:
        # Duration in seconds, rounded to milliseconds once the block exits.
        self.elapsed_s: float = 0.0
        self._start: float = 0.0

    def __enter__(self) -> "TimingContext":
        self._start = time.monotonic()
        return self

    def __exit__(self, *exc: Any) -> None:
        delta = time.monotonic() - self._start
        self.elapsed_s = round(delta, 3)
137
+
138
+
139
class AsyncTimingContext:
    """Measure the wall-clock duration of an ``async with`` body via time.monotonic()."""

    def __init__(self) -> None:
        # Duration in seconds, rounded to milliseconds once the block exits.
        self.elapsed_s: float = 0.0
        self._start: float = 0.0

    async def __aenter__(self) -> "AsyncTimingContext":
        self._start = time.monotonic()
        return self

    async def __aexit__(self, *exc: Any) -> None:
        delta = time.monotonic() - self._start
        self.elapsed_s = round(delta, 3)
152
+
153
+
154
+ def read_log_tail(
155
+ path: str,
156
+ lines: int = 100,
157
+ level: str | None = None,
158
+ event: str | None = None,
159
+ ) -> list[dict[str, Any]]:
160
+ """
161
+ Read last N JSON lines from a log file, with optional filtering.
162
+
163
+ Args:
164
+ path: Path to .jsonl log file.
165
+ lines: Max number of lines to return.
166
+ level: Filter by log level (e.g. "ERROR").
167
+ event: Filter by event name substring.
168
+
169
+ Returns:
170
+ List of parsed JSON dicts, newest last.
171
+ """
172
+ if not os.path.exists(path):
173
+ return []
174
+
175
+ # Read all lines, take last N (simple approach for small-ish files)
176
+ with open(path, "r", encoding="utf-8") as f:
177
+ all_lines = f.readlines()
178
+
179
+ # Parse from the end, collect up to `lines` matching entries
180
+ results: list[dict[str, Any]] = []
181
+ for raw in reversed(all_lines):
182
+ raw = raw.strip()
183
+ if not raw:
184
+ continue
185
+ try:
186
+ entry = json.loads(raw)
187
+ except json.JSONDecodeError:
188
+ continue
189
+
190
+ if level and entry.get("level") != level:
191
+ continue
192
+ if event and event not in entry.get("event", ""):
193
+ continue
194
+
195
+ results.append(entry)
196
+ if len(results) >= lines:
197
+ break
198
+
199
+ results.reverse() # Restore chronological order
200
+ return results
201
+
202
+
203
def resolve_log_path(file_key: str, whitelist: dict[str, str]) -> str | None:
    """
    Resolve a whitelisted log file key to its absolute path.

    Returns the expected path if the key is in the whitelist, None otherwise.
    The file may not exist yet (read_log_tail handles that gracefully).

    Args:
        file_key: Logical name for the log file (e.g. "live", "errors").
        whitelist: Mapping of allowed keys to filenames for this process.

    Returns:
        Absolute path to the log file, or None if key is not whitelisted.
    """
    filename = whitelist.get(file_key)
    if not filename:
        return None

    # Prefer the primary directory whenever it exists on disk.
    base = (
        settings.worker_log_dir
        if os.path.isdir(settings.worker_log_dir)
        else settings.worker_log_fallback_dir
    )
    return os.path.join(base, filename)
226
+
227
+
228
def setup_app_logging() -> None:
    """
    Set up application-wide file logging handlers. Idempotent.

    Creates:
        - errors.log: WARNING+ from all loggers (attached to root logger)
        - nlp_debug.log: DEBUG-only NLP trace from app.services.nlp_service

    Call once during app lifespan startup, after setup_structured_logger().
    """
    global _app_logging_initialized
    if _app_logging_initialized:
        return
    # NOTE(review): the flag is set before the handlers are attached, so a
    # failure below leaves logging permanently half-configured for this
    # process — confirm this is acceptable for the startup path.
    _app_logging_initialized = True

    log_dir = _get_writable_log_dir()

    # 1. errors.log — WARNING+ from root (catches all loggers via propagation)
    errors_handler = logging.handlers.RotatingFileHandler(
        os.path.join(log_dir, "errors.log"),
        maxBytes=settings.errors_log_max_bytes,
        backupCount=settings.worker_log_backup_count,
        encoding="utf-8",
    )
    errors_handler.setLevel(logging.WARNING)
    errors_handler.setFormatter(JsonLineFormatter())
    logging.getLogger().addHandler(errors_handler)

    # 2. nlp_debug.log — DEBUG-only NLP trace (Dedup/Cache messages)
    nlp_handler = logging.handlers.RotatingFileHandler(
        os.path.join(log_dir, "nlp_debug.log"),
        maxBytes=settings.nlp_debug_log_max_bytes,
        backupCount=settings.worker_log_backup_count,
        encoding="utf-8",
    )
    nlp_handler.setLevel(logging.DEBUG)
    nlp_handler.addFilter(DebugOnlyFilter())  # keep INFO+ out of the debug file
    nlp_handler.setFormatter(JsonLineFormatter())

    nlp_logger = logging.getLogger("app.services.nlp_service")
    nlp_logger.setLevel(logging.DEBUG)
    nlp_logger.addHandler(nlp_handler)

    # 3. Optional: re-enable NLP debug to stdout
    if settings.nlp_verbose_logging:
        verbose_handler = logging.StreamHandler()
        verbose_handler.setLevel(logging.DEBUG)
        verbose_handler.setFormatter(logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        ))
        nlp_logger.addHandler(verbose_handler)
280
+
281
+ def get_structured_logger() -> logging.Logger | None:
282
+ """Get the module-level structured logger (if initialized)."""
283
+ return _structured_logger
284
+
285
+
286
+ def set_structured_logger(logger: logging.Logger) -> None:
287
+ """Set the module-level structured logger."""
288
+ global _structured_logger
289
+ _structured_logger = logger
290
+
291
+
292
+ def get_cycle_id() -> str | None:
293
+ """Get the current worker cycle ID."""
294
+ return _cycle_id
295
+
296
+
297
+ def set_cycle_id(cycle_id: str | None) -> None:
298
+ """Set the current worker cycle ID."""
299
+ global _cycle_id
300
+ _cycle_id = cycle_id
301
+
302
+
303
+ def log_structured(
304
+ event: str,
305
+ level: int = logging.INFO,
306
+ **kwargs: Any,
307
+ ) -> None:
308
+ """
309
+ Emit a structured log entry via the module-level logger.
310
+
311
+ No-op if no structured logger has been initialized (e.g. in tests).
312
+ """
313
+ slog = get_structured_logger()
314
+ if not slog:
315
+ return
316
+ slog.log(level, event, extra={"event": event, **kwargs})
backend/app/db/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Moduł bazy danych."""
backend/app/db/mongodb.py ADDED
@@ -0,0 +1,1152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Moduł połączenia z bazą danych MongoDB.
3
+
4
+ Wykorzystuje Motor (async driver) do asynchronicznej komunikacji z MongoDB.
5
+ Implementuje cache wyników analizy z TTL 24h.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import re
11
+ from datetime import datetime, timedelta, timezone
12
+ from typing import Any
13
+
14
+ from bson.codec_options import CodecOptions
15
+ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
16
+ from pymongo import ASCENDING, DESCENDING, UpdateOne
17
+ from pymongo.errors import (
18
+ BulkWriteError,
19
+ ConnectionFailure,
20
+ OperationFailure,
21
+ PyMongoError,
22
+ )
23
+
24
+ from app.core.config import settings
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class MongoDB:
    """
    Manages the MongoDB connection.

    Implements the Singleton pattern via a global instance.
    Handles the analysis-results cache with automatic TTL validation.
    Stores the Steam games list for autocompletion.

    Attributes:
        client: MongoDB client (Motor).
        db: Database reference.
    """

    # Collection names used throughout this class
    COLLECTION_ANALYSES = "analyses"
    COLLECTION_GAMES = "games"
    COLLECTION_STEAM_ERRORS = "steam_errors"
    COLLECTION_REFRESH_SCHEDULES = "refresh_schedules"
46
+
47
    def __init__(self) -> None:
        """Initialize the instance without an active connection."""
        # Both are populated by connect() and reset by disconnect().
        self.client: AsyncIOMotorClient | None = None  # type: ignore
        self.db: AsyncIOMotorDatabase | None = None  # type: ignore
51
+
52
+ async def connect(self, max_retries: int = 3) -> None:
53
+ """
54
+ Nawiązuje połączenie z MongoDB z exponential backoff.
55
+
56
+ Tworzy indeksy dla optymalnej wydajności zapytań.
57
+
58
+ Args:
59
+ max_retries: Maksymalna liczba prób połączenia.
60
+
61
+ Raises:
62
+ ConnectionError: Gdy nie można połączyć się z bazą po wszystkich próbach.
63
+ """
64
+ for attempt in range(1, max_retries + 1):
65
+ try:
66
+ self.client = AsyncIOMotorClient(settings.mongodb_url, tz_aware=True)
67
+ codec_options: CodecOptions = CodecOptions(tz_aware=True)
68
+ self.db = self.client.get_database(
69
+ settings.mongodb_db_name, codec_options=codec_options
70
+ )
71
+
72
+ # Weryfikacja połączenia
73
+ await self.client.admin.command("ping")
74
+ logger.info(f"Połączono z MongoDB: {settings.mongodb_db_name}")
75
+
76
+ # Utwórz indeksy
77
+ await self._create_indexes()
78
+ return
79
+
80
+ except (ConnectionFailure, PyMongoError) as e:
81
+ if attempt < max_retries:
82
+ delay = 2 ** (attempt - 1) # 1s, 2s, 4s
83
+ logger.warning(
84
+ f"MongoDB connection attempt {attempt}/{max_retries} failed: {e}. "
85
+ f"Retrying in {delay}s..."
86
+ )
87
+ await asyncio.sleep(delay)
88
+ else:
89
+ logger.error(f"MongoDB connection failed after {max_retries} attempts: {e}")
90
+ raise ConnectionError(
91
+ f"Nie można połączyć się z MongoDB po {max_retries} próbach: {e}"
92
+ )
93
+
94
    async def _create_indexes(self) -> None:
        """Create indexes for all collections (safe to call repeatedly)."""
        if self.db is None:
            return

        # Indexes for analyses
        analyses = self.db[self.COLLECTION_ANALYSES]
        await analyses.create_index("game_id", unique=True)

        # Migrate from old global TTL index (cached_at) to per-document TTL (expires_at)
        try:
            existing_indexes = await analyses.index_information()
            for idx_name, idx_info in existing_indexes.items():
                if idx_info.get("expireAfterSeconds") is not None and "cached_at" in str(idx_info.get("key")):
                    await analyses.drop_index(idx_name)
                    logger.info(f"Dropped old TTL index: {idx_name}")
                    break
        except OperationFailure:
            pass  # Old index may not exist

        # expireAfterSeconds=0 → expiry instant comes from each document's expires_at
        await analyses.create_index("expires_at", expireAfterSeconds=0)

        # Indexes for the games list
        games = self.db[self.COLLECTION_GAMES]
        await games.create_index("appid", unique=True)
        # Index for regex search (case-insensitive)
        await games.create_index("name_lower")
        await games.create_index("name_cn")
        # Sparse index for the check flag (saves space, speeds up the query)
        await games.create_index("cn_name_checked", sparse=True)
        await games.create_index("parent_appid", sparse=True)

        # Compound index for sorting games by review count (worker game sync)
        await games.create_index(
            [("positive", DESCENDING), ("negative", DESCENDING)],
            sparse=True,
        )
        await games.create_index(
            [
                ("name_lower", ASCENDING),
                ("app_type", ASCENDING),
                ("positive", DESCENDING),
                ("negative", DESCENDING),
            ]
        )
        await games.create_index("is_priority", sparse=True)

        # Indexes for the Steam API error cache
        steam_errors = self.db[self.COLLECTION_STEAM_ERRORS]
        await steam_errors.create_index("app_id", unique=True)
        await steam_errors.create_index("expires_at", expireAfterSeconds=0)

        # Indexes for refresh schedules (worker pre-cache)
        schedules = self.db[self.COLLECTION_REFRESH_SCHEDULES]
        await schedules.create_index("app_id", unique=True)
        await schedules.create_index("status")

        logger.debug("Utworzono indeksy MongoDB")
152
+
153
+ async def disconnect(self) -> None:
154
+ """Zamyka połączenie z MongoDB."""
155
+ if self.client:
156
+ self.client.close()
157
+ logger.info("Rozłączono z MongoDB")
158
+
159
+ def _is_document_expired(self, document: dict[str, Any]) -> bool:
160
+ """Check if a cache document is expired using expires_at or cached_at fallback.
161
+
162
+ With tz_aware=True on the Motor client, all datetimes from MongoDB are
163
+ already timezone-aware, so no manual .replace(tzinfo=...) is needed.
164
+ """
165
+ now = datetime.now(timezone.utc)
166
+
167
+ # New-format: per-document expires_at
168
+ expires_at = document.get("expires_at")
169
+ if expires_at:
170
+ if isinstance(expires_at, str):
171
+ expires_at = datetime.fromisoformat(expires_at)
172
+ return now >= expires_at
173
+
174
+ # Old-format fallback: cached_at + default TTL
175
+ cached_at = document.get("cached_at")
176
+ if cached_at:
177
+ if isinstance(cached_at, str):
178
+ cached_at = datetime.fromisoformat(cached_at)
179
+ ttl_hours = document.get("ttl_hours", settings.cache_ttl_hours)
180
+ return now - cached_at > timedelta(hours=ttl_hours)
181
+
182
+ return True # No timestamp info = treat as expired
183
+
184
+ async def get_cached_analysis_full(self, game_id: str) -> dict[str, Any] | None:
185
+ """
186
+ Returns full cache document (with review IDs, TTL info) or None if expired/missing.
187
+ """
188
+ if self.db is None:
189
+ return None
190
+
191
+ collection = self.db[self.COLLECTION_ANALYSES]
192
+
193
+ try:
194
+ document = await collection.find_one({"game_id": game_id})
195
+ if not document:
196
+ return None
197
+
198
+ if self._is_document_expired(document):
199
+ logger.info(f"Cache expired for game {game_id}")
200
+ return None
201
+
202
+ document.pop("_id", None)
203
+ return document
204
+
205
+ except PyMongoError as e:
206
+ logger.error(f"Error reading cache: {e}")
207
+ return None
208
+
209
+ async def get_stale_analysis(self, game_id: str) -> dict[str, Any] | None:
210
+ """
211
+ Returns cache document even if expired. Used by incremental path
212
+ to retrieve old review IDs. Returns None only if no document exists.
213
+ """
214
+ return await self.get_analysis(game_id)
215
+
216
+ async def get_analysis(self, game_id: str) -> dict[str, Any] | None:
217
+ """
218
+ Returns an analysis document regardless of TTL.
219
+
220
+ Product freshness is evaluated outside MongoDB, so this method is the
221
+ canonical read path for "show stale result + refresh" behavior.
222
+ """
223
+ if self.db is None:
224
+ return None
225
+
226
+ collection = self.db[self.COLLECTION_ANALYSES]
227
+
228
+ try:
229
+ document = await collection.find_one({"game_id": game_id})
230
+ if not document:
231
+ return None
232
+
233
+ document.pop("_id", None)
234
+ return document
235
+
236
+ except PyMongoError as e:
237
+ logger.error(f"Error reading stale cache: {e}")
238
+ return None
239
+
240
+ async def get_cached_analysis(self, game_id: str) -> dict[str, Any] | None:
241
+ """
242
+ Returns cached analysis results or None if expired/missing.
243
+ Backward-compatible wrapper around get_cached_analysis_full.
244
+ """
245
+ doc = await self.get_cached_analysis_full(game_id)
246
+ if doc is None:
247
+ return None
248
+ results = doc.get("results")
249
+ if isinstance(results, dict) and results.get("cached_at") is None and doc.get("cached_at") is not None:
250
+ results = {**results, "cached_at": doc["cached_at"]}
251
+ return results
252
+
253
    async def save_analysis(
        self,
        game_id: str,
        results: dict[str, Any],
        analyzed_review_ids: list[str] | None = None,
        latest_review_timestamp: int = 0,
        ttl_hours: int | None = None,
        analyzed_at: datetime | None = None,
    ) -> None:
        """
        Save analysis results to the cache with a per-document TTL.

        Purges review IDs to keep only the most recent ones (space efficiency).

        Args:
            game_id: Steam game identifier.
            results: Analysis payload to cache.
            analyzed_review_ids: IDs of reviews covered by this analysis;
                trimmed to the most recent ``incremental_max_stored_ids``.
            latest_review_timestamp: Unix timestamp of the newest review seen.
            ttl_hours: Cache lifetime override; None (or 0 — falsy) falls back
                to ``settings.cache_ttl_hours``.
            analyzed_at: Explicit analysis timestamp; when None it is derived
                from ``results`` or defaults to now.
        """
        if self.db is None:
            logger.warning("Brak połączenia z MongoDB - nie zapisano cache")
            return

        collection = self.db[self.COLLECTION_ANALYSES]

        effective_ttl = ttl_hours or settings.cache_ttl_hours
        now = datetime.now(timezone.utc)
        analysis_date = analyzed_at
        if analysis_date is None:
            # Derive the analysis date from the results payload when possible.
            raw_value = results.get("analysis_date") or results.get("cached_at")
            if isinstance(raw_value, str):
                analysis_date = datetime.fromisoformat(raw_value)
            elif isinstance(raw_value, datetime):
                analysis_date = raw_value
        if analysis_date is None:
            analysis_date = now

        if results.get("analysis_date") is None:
            # Copy instead of mutating the caller's dict.
            results = {**results, "analysis_date": analysis_date}

        # Purge old IDs — keep only the most recent N
        if analyzed_review_ids:
            analyzed_review_ids = analyzed_review_ids[-settings.incremental_max_stored_ids:]

        document: dict[str, Any] = {
            "game_id": game_id,
            "results": results,
            "analyzed_review_ids": analyzed_review_ids or [],
            "latest_review_timestamp": latest_review_timestamp,
            "cached_at": now,
            "analyzed_at": analysis_date,
            "ttl_hours": effective_ttl,
            "expires_at": now + timedelta(hours=effective_ttl),
        }

        try:
            await collection.update_one(
                {"game_id": game_id},
                {"$set": document},
                upsert=True,
            )
            logger.info(f"Saved cache for game {game_id} (TTL: {effective_ttl}h)")

        except PyMongoError as e:
            logger.error(f"Error saving cache: {e}")
312
+
313
+ async def delete_cached_analysis(self, game_id: str) -> bool:
314
+ """
315
+ Usuwa cache dla danej gry.
316
+
317
+ Args:
318
+ game_id: Identyfikator gry Steam.
319
+
320
+ Returns:
321
+ True jeśli usunięto, False w przeciwnym razie.
322
+ """
323
+ if self.db is None:
324
+ return False
325
+
326
+ collection = self.db[self.COLLECTION_ANALYSES]
327
+
328
+ try:
329
+ result = await collection.delete_one({"game_id": game_id})
330
+ return result.deleted_count > 0
331
+ except PyMongoError as e:
332
+ logger.error(f"Błąd usuwania cache: {e}")
333
+ return False
334
+
335
+ # ========== Steam API Error Cache ==========
336
+
337
+ async def get_steam_error(self, app_id: str) -> dict[str, Any] | None:
338
+ """
339
+ Sprawdza czy app_id ma cached error.
340
+
341
+ Returns:
342
+ Dict z polami app_id, status_code, expires_at lub None.
343
+ """
344
+ if self.db is None:
345
+ return None
346
+
347
+ collection = self.db[self.COLLECTION_STEAM_ERRORS]
348
+
349
+ try:
350
+ document = await collection.find_one({"app_id": app_id})
351
+ if not document:
352
+ return None
353
+
354
+ document.pop("_id", None)
355
+ return document
356
+
357
+ except PyMongoError as e:
358
+ logger.error(f"Błąd odczytu steam error cache: {e}")
359
+ return None
360
+
361
+ async def cache_steam_error(
362
+ self, app_id: str, status_code: int, ttl_seconds: int
363
+ ) -> None:
364
+ """
365
+ Cachuje błąd Steam API z automatycznym TTL.
366
+
367
+ MongoDB TTL index automatycznie usunie dokument po expires_at.
368
+ """
369
+ if self.db is None:
370
+ return
371
+
372
+ collection = self.db[self.COLLECTION_STEAM_ERRORS]
373
+
374
+ document = {
375
+ "app_id": app_id,
376
+ "status_code": status_code,
377
+ "cached_at": datetime.now(timezone.utc),
378
+ "expires_at": datetime.now(timezone.utc) + timedelta(seconds=ttl_seconds),
379
+ }
380
+
381
+ try:
382
+ await collection.update_one(
383
+ {"app_id": app_id},
384
+ {"$set": document},
385
+ upsert=True,
386
+ )
387
+ logger.info(
388
+ f"Cached Steam error {status_code} for app {app_id} (TTL: {ttl_seconds}s)"
389
+ )
390
+ except PyMongoError as e:
391
+ logger.error(f"Błąd zapisu steam error cache: {e}")
392
+
393
+ # ========== Metody dla listy gier (autouzupełnianie) ==========
394
+
395
+ async def get_games_count(self) -> int:
396
+ """Zwraca liczbę gier w bazie."""
397
+ if self.db is None:
398
+ return 0
399
+
400
+ collection = self.db[self.COLLECTION_GAMES]
401
+ return await collection.count_documents({})
402
+
403
+ async def save_games_batch(self, games: list[dict[str, str]]) -> int:
404
+ """
405
+ Zapisuje partię gier do bazy (bulk insert).
406
+
407
+ Args:
408
+ games: Lista słowników z kluczami 'appid', 'name', opcjonalnie 'developer', 'publisher'.
409
+
410
+ Returns:
411
+ Liczba zapisanych gier.
412
+ """
413
+ if self.db is None or not games:
414
+ return 0
415
+
416
+ collection = self.db[self.COLLECTION_GAMES]
417
+
418
+ # Dodaj pole name_lower dla wyszukiwania case-insensitive
419
+ documents = []
420
+ for game in games:
421
+ if not game.get("name"):
422
+ continue
423
+
424
+ doc = {
425
+ "appid": game["appid"],
426
+ "name": game["name"],
427
+ "name_lower": game["name"].lower(),
428
+ }
429
+
430
+ # Dodaj opcjonalne pola
431
+ if game.get("developer"):
432
+ doc["developer"] = game["developer"]
433
+ if game.get("publisher"):
434
+ doc["publisher"] = game["publisher"]
435
+
436
+ documents.append(doc)
437
+
438
+ try:
439
+ # Użyj ordered=False żeby kontynuować mimo duplikatów
440
+ result = await collection.insert_many(documents, ordered=False)
441
+ return len(result.inserted_ids)
442
+ except BulkWriteError as e:
443
+ # Duplicates are expected with ordered=False — count successful inserts
444
+ inserted = e.details.get("nInserted", 0)
445
+ logger.debug(f"Pominięto duplikaty podczas zapisu gier ({inserted} inserted)")
446
+ return inserted
447
+ except PyMongoError as e:
448
+ logger.error(f"Błąd zapisu gier: {e}")
449
+ return 0
450
+
451
+ async def clear_games(self) -> None:
452
+ """Usuwa wszystkie gry z bazy."""
453
+ if self.db is None:
454
+ return
455
+
456
+ collection = self.db[self.COLLECTION_GAMES]
457
+ await collection.delete_many({})
458
+ logger.info("Usunięto wszystkie gry z bazy")
459
+
460
+ async def upsert_game(self, game_data: dict[str, Any]) -> None:
461
+ """
462
+ Dodaje lub aktualizuje pojedynczą grę w bazie danych.
463
+ Używane głównie przez mechanizm Fallback Search.
464
+ """
465
+ if self.db is None:
466
+ return
467
+
468
+ collection = self.db[self.COLLECTION_GAMES]
469
+ appid = str(game_data["appid"])
470
+
471
+ # Przygotuj dokument
472
+ update_doc = {
473
+ "appid": appid,
474
+ "name": game_data["name"],
475
+ "name_lower": game_data["name"].lower(),
476
+ }
477
+
478
+ if game_data.get("name_cn"):
479
+ update_doc["name_cn"] = game_data["name_cn"]
480
+ update_doc["cn_name_checked"] = True
481
+ elif game_data.get("cn_name_checked"):
482
+ update_doc["cn_name_checked"] = True
483
+
484
+ if game_data.get("header_image") is not None:
485
+ update_doc["header_image"] = game_data["header_image"]
486
+ if game_data.get("total_reviews") is not None:
487
+ update_doc["total_reviews"] = game_data["total_reviews"]
488
+
489
+ # Worker-supplied fields
490
+ for field in (
491
+ "positive", "negative", "tags", "genre", "ccu",
492
+ "last_game_update_at", "synced_at", "developer", "publisher",
493
+ "app_type", "parent_appid", "dlc_checked_at",
494
+ ):
495
+ if game_data.get(field) is not None:
496
+ update_doc[field] = game_data[field]
497
+
498
+ try:
499
+ await collection.update_one(
500
+ {"appid": appid},
501
+ {"$set": update_doc},
502
+ upsert=True
503
+ )
504
+ logger.debug(f"Zsynchronizowano grę {appid} w MongoDB")
505
+ except PyMongoError as e:
506
+ logger.error(f"Błąd upsert gry {appid}: {e}")
507
+
508
    async def search_games(self, query: str, limit: int = 10) -> list[dict[str, Any]]:
        """
        Search games by name (EN or CN).

        Uses case-insensitive substring matching. Results are ranked:
        exact match first, then prefix match, then other substring matches;
        within a rank, base games outrank DLC/demos and higher review
        counts come first.

        Args:
            query: Text to search for (min. 2 characters after trimming).
            limit: Maximum number of results.

        Returns:
            Matching games (appid, name, name_cn, developer, publisher,
            app_type, parent_appid).
        """
        normalized_query = query.strip()
        if self.db is None or not normalized_query or len(normalized_query) < 2:
            return []

        collection = self.db[self.COLLECTION_GAMES]

        try:
            # Escape the query so user input is matched literally, and
            # prebuild exact/prefix anchored variants for ranking.
            query_lower = normalized_query.lower()
            name_pattern = re.escape(query_lower)
            name_prefix_pattern = f"^{name_pattern}"
            name_exact_pattern = f"^{name_pattern}$"
            cn_pattern = re.escape(normalized_query)
            cn_prefix_pattern = f"^{cn_pattern}"
            cn_exact_pattern = f"^{cn_pattern}$"

            match_filter: dict[str, Any] = {
                "$or": [
                    {"name_lower": {"$regex": name_pattern}},
                    {"name_cn": {"$regex": cn_pattern, "$options": "i"}},
                ]
            }
            if not settings.dlc_visible_in_search:
                match_filter["app_type"] = {"$ne": "dlc"}

            pipeline = [
                {"$match": match_filter},
                {
                    "$addFields": {
                        # match_rank: 0 = exact, 1 = prefix, 2 = substring only
                        "match_rank": {
                            "$switch": {
                                "branches": [
                                    {
                                        "case": {
                                            "$or": [
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_lower", ""]},
                                                        "regex": name_exact_pattern,
                                                    }
                                                },
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_cn", ""]},
                                                        "regex": cn_exact_pattern,
                                                        "options": "i",
                                                    }
                                                },
                                            ]
                                        },
                                        "then": 0,
                                    },
                                    {
                                        "case": {
                                            "$or": [
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_lower", ""]},
                                                        "regex": name_prefix_pattern,
                                                    }
                                                },
                                                {
                                                    "$regexMatch": {
                                                        "input": {"$ifNull": ["$name_cn", ""]},
                                                        "regex": cn_prefix_pattern,
                                                        "options": "i",
                                                    }
                                                },
                                            ]
                                        },
                                        "then": 1,
                                    },
                                ],
                                "default": 2,
                            }
                        },
                        # type_rank: base games/unknown first, then DLC, then demos
                        "type_rank": {
                            "$switch": {
                                "branches": [
                                    {
                                        "case": {
                                            "$in": [
                                                {"$ifNull": ["$app_type", "unknown"]},
                                                ["game", "unknown"],
                                            ]
                                        },
                                        "then": 0,
                                    },
                                    {"case": {"$eq": ["$app_type", "dlc"]}, "then": 1},
                                    {"case": {"$eq": ["$app_type", "demo"]}, "then": 2},
                                ],
                                "default": 1,
                            }
                        },
                        # Popularity proxy: positive + negative review counts
                        "review_count": {
                            "$add": [
                                {"$ifNull": ["$positive", 0]},
                                {"$ifNull": ["$negative", 0]},
                            ]
                        },
                    }
                },
                {
                    "$sort": {
                        "match_rank": 1,
                        "type_rank": 1,
                        "review_count": -1,
                        "name": 1,
                    }
                },
                {"$limit": limit},
                {
                    "$project": {
                        "_id": 0,
                        "appid": 1,
                        "name": 1,
                        "name_cn": 1,
                        "developer": 1,
                        "publisher": 1,
                        "app_type": 1,
                        "parent_appid": 1,
                    }
                },
            ]

            cursor = collection.aggregate(pipeline)
            results = await cursor.to_list(length=limit)
            return results

        except PyMongoError as e:
            logger.error(f"Błąd wyszukiwania gier: {e}")
            return []
652
+
653
+
654
+ async def get_game_update_date(self, app_id: str) -> datetime | None:
655
+ """Get the last game update timestamp for a game."""
656
+ if self.db is None:
657
+ return None
658
+
659
+ collection = self.db[self.COLLECTION_GAMES]
660
+ try:
661
+ doc = await collection.find_one(
662
+ {"appid": str(app_id)},
663
+ {"_id": 0, "last_game_update_at": 1},
664
+ )
665
+ if doc and doc.get("last_game_update_at"):
666
+ val = doc["last_game_update_at"]
667
+ if isinstance(val, datetime):
668
+ return val
669
+ return None
670
+ return None
671
+ except PyMongoError as e:
672
+ logger.error(f"Error getting game update date for {app_id}: {e}")
673
+ return None
674
+
675
+ async def get_games_without_cn_name(self, limit: int = 200) -> list[dict[str, Any]]:
676
+ """
677
+ Pobiera gry, które nie mają jeszcze nazwy chińskiej i nie były sprawdzane.
678
+ Sortuje po liczbie pozytywnych recenzji (jeśli dostępne, dla priorytetyzacji).
679
+ """
680
+ if self.db is None:
681
+ return []
682
+
683
+ collection = self.db[self.COLLECTION_GAMES]
684
+ try:
685
+ pipeline = [
686
+ {"$match": {
687
+ "name_cn": {"$exists": False},
688
+ "cn_name_checked": {"$ne": True}, # Pomiń już sprawdzone
689
+ }},
690
+ # Sortowanie po positive (DESC), ale gry bez tego pola trafią na koniec (sparse index handling)
691
+ {"$sort": {"positive": -1}},
692
+ {"$limit": limit},
693
+ {"$project": {"_id": 0, "appid": 1, "name": 1}},
694
+ ]
695
+ cursor = collection.aggregate(pipeline)
696
+ return await cursor.to_list(length=limit)
697
+ except PyMongoError as e:
698
+ logger.error(f"Error getting games without CN name: {e}")
699
+ return []
700
+
701
+ async def mark_cn_name_checked(self, app_id: str, name_cn: str | None = None) -> None:
702
+ """
703
+ Oznacza grę jako sprawdzoną pod kątem chińskiej nazwy.
704
+ Opcjonalnie zapisuje znalezioną nazwę.
705
+ """
706
+ if self.db is None:
707
+ return
708
+
709
+ collection = self.db[self.COLLECTION_GAMES]
710
+ update_doc: dict[str, Any] = {"cn_name_checked": True}
711
+ if name_cn:
712
+ update_doc["name_cn"] = name_cn
713
+
714
+ try:
715
+ await collection.update_one(
716
+ {"appid": str(app_id)},
717
+ {"$set": update_doc}
718
+ )
719
+ except PyMongoError as e:
720
+ logger.error(f"Error marking CN name checked for {app_id}: {e}")
721
+
722
    async def get_games_missing_app_type(self, limit: int = 200) -> list[dict[str, Any]]:
        """
        Return high-signal games that still need Steam Store type enrichment.

        We prioritize already-priority games first, then any app with enough reviews
        to qualify a DLC for worker-managed analysis.

        Args:
            limit: Maximum number of candidates to return.

        Returns:
            List of ``{"appid", "name"}`` projections; empty on error or no DB.
        """
        if self.db is None:
            return []

        collection = self.db[self.COLLECTION_GAMES]
        try:
            pipeline = [
                # Compute review volume up front so both $match and $sort can use it.
                {
                    "$addFields": {
                        "total_reviews_sum": {
                            "$add": [
                                {"$ifNull": ["$positive", 0]},
                                {"$ifNull": ["$negative", 0]},
                            ]
                        }
                    }
                },
                # Candidates: never enriched, and either priority or popular
                # enough to clear the DLC analysis threshold.
                {
                    "$match": {
                        "dlc_checked_at": {"$exists": False},
                        "$or": [
                            {"is_priority": True},
                            {
                                "total_reviews_sum": {
                                    "$gte": settings.dlc_min_reviews_for_analysis
                                }
                            },
                        ],
                    }
                },
                # Priority games first, then by popularity.
                {"$sort": {"is_priority": -1, "total_reviews_sum": -1}},
                {"$limit": limit},
                {"$project": {"_id": 0, "appid": 1, "name": 1}},
            ]
            cursor = collection.aggregate(pipeline)
            return await cursor.to_list(length=limit)
        except PyMongoError as e:
            logger.error(f"Error getting games missing app type: {e}")
            return []
767
+
768
+ async def mark_app_type_checked(
769
+ self,
770
+ app_id: str,
771
+ *,
772
+ app_type: str,
773
+ parent_appid: str | None = None,
774
+ ) -> None:
775
+ """Persist Steam Store app type metadata."""
776
+ if self.db is None:
777
+ return
778
+
779
+ collection = self.db[self.COLLECTION_GAMES]
780
+ update_doc: dict[str, Any] = {
781
+ "app_type": app_type,
782
+ "parent_appid": str(parent_appid) if parent_appid else None,
783
+ "dlc_checked_at": datetime.now(timezone.utc),
784
+ }
785
+
786
+ try:
787
+ await collection.update_one(
788
+ {"appid": str(app_id)},
789
+ {"$set": update_doc},
790
+ )
791
+ except PyMongoError as e:
792
+ logger.error(f"Error marking app type checked for {app_id}: {e}")
793
+
794
+ # ========== Worker Methods ==========
795
+
796
+ async def upsert_games_batch(self, games: list[dict[str, Any]]) -> tuple[int, int]:
797
+ """
798
+ Bulk upsert games via UpdateOne operations.
799
+
800
+ Returns:
801
+ (upserted_count, modified_count)
802
+ """
803
+ if self.db is None or not games:
804
+ return (0, 0)
805
+
806
+ collection = self.db[self.COLLECTION_GAMES]
807
+ operations = []
808
+
809
+ for game in games:
810
+ appid = str(game.get("appid", ""))
811
+ name = game.get("name", "")
812
+ if not appid or not name:
813
+ continue
814
+
815
+ update_doc: dict[str, Any] = {
816
+ "appid": appid,
817
+ "name": name,
818
+ "name_lower": name.lower(),
819
+ }
820
+ for field in (
821
+ "developer", "publisher", "positive", "negative",
822
+ "tags", "genre", "ccu", "synced_at",
823
+ "app_type", "parent_appid", "dlc_checked_at",
824
+ ):
825
+ if game.get(field) is not None:
826
+ update_doc[field] = game[field]
827
+
828
+ operations.append(
829
+ UpdateOne({"appid": appid}, {"$set": update_doc}, upsert=True)
830
+ )
831
+
832
+ if not operations:
833
+ return (0, 0)
834
+
835
+ try:
836
+ result = await collection.bulk_write(operations, ordered=False)
837
+ return (result.upserted_count, result.modified_count)
838
+ except BulkWriteError as e:
839
+ details = e.details or {}
840
+ return (details.get("nUpserted", 0), details.get("nModified", 0))
841
+ except PyMongoError as e:
842
+ logger.error(f"Error in upsert_games_batch: {e}")
843
+ return (0, 0)
844
+
845
+ async def get_top_games_by_reviews(self, limit: int = 500) -> list[dict[str, Any]]:
846
+ """Top N games sorted by total review count (positive + negative) DESC."""
847
+ if self.db is None:
848
+ return []
849
+
850
+ collection = self.db[self.COLLECTION_GAMES]
851
+ try:
852
+ pipeline = [
853
+ {"$match": {"positive": {"$exists": True}, "negative": {"$exists": True}}},
854
+ {"$addFields": {"total_reviews_sum": {"$add": ["$positive", "$negative"]}}},
855
+ {"$sort": {"total_reviews_sum": -1}},
856
+ {"$limit": limit},
857
+ {"$project": {"_id": 0}},
858
+ ]
859
+ cursor = collection.aggregate(pipeline)
860
+ return await cursor.to_list(length=limit)
861
+ except PyMongoError as e:
862
+ logger.error(f"Error getting top games: {e}")
863
+ return []
864
+
865
+ async def update_game_update_date(self, app_id: str, update_at: datetime) -> None:
866
+ """Store the latest game update timestamp."""
867
+ if self.db is None:
868
+ return
869
+
870
+ collection = self.db[self.COLLECTION_GAMES]
871
+ try:
872
+ await collection.update_one(
873
+ {"appid": str(app_id)},
874
+ {"$set": {"last_game_update_at": update_at}},
875
+ )
876
+ except PyMongoError as e:
877
+ logger.error(f"Error updating game update date for {app_id}: {e}")
878
+
879
+ async def update_game_patch_date(self, app_id: str, patch_date: datetime) -> None:
880
+ """Store the latest confirmed major-update timestamp."""
881
+ if self.db is None:
882
+ return
883
+
884
+ collection = self.db[self.COLLECTION_GAMES]
885
+ try:
886
+ await collection.update_one(
887
+ {"appid": str(app_id)},
888
+ {"$set": {"current_patch_at": patch_date}},
889
+ )
890
+ except PyMongoError as e:
891
+ logger.error(f"Error updating game patch date for {app_id}: {e}")
892
+
893
+ async def update_news_cursor(self, app_id: str, gid: str, date: datetime) -> None:
894
+ """Store the latest seen news GID and its date as an incremental scan cursor."""
895
+ if self.db is None:
896
+ return
897
+
898
+ collection = self.db[self.COLLECTION_GAMES]
899
+ try:
900
+ await collection.update_one(
901
+ {"appid": str(app_id)},
902
+ {"$set": {"last_seen_news_gid": gid, "last_seen_news_at": date}},
903
+ )
904
+ except PyMongoError as e:
905
+ logger.error(f"Error updating news cursor for {app_id}: {e}")
906
+
907
+ async def get_game_patch_date(self, app_id: str) -> datetime | None:
908
+ """Get the latest confirmed major-update timestamp for a game."""
909
+ if self.db is None:
910
+ return None
911
+
912
+ collection = self.db[self.COLLECTION_GAMES]
913
+ try:
914
+ doc = await collection.find_one(
915
+ {"appid": str(app_id)},
916
+ {"_id": 0, "current_patch_at": 1},
917
+ )
918
+ if doc and doc.get("current_patch_at"):
919
+ val = doc["current_patch_at"]
920
+ if isinstance(val, datetime):
921
+ return val
922
+ return None
923
+ return None
924
+ except PyMongoError as e:
925
+ logger.error(f"Error getting game patch date for {app_id}: {e}")
926
+ return None
927
+
928
+ async def upsert_refresh_schedule(self, schedule: dict[str, Any]) -> None:
929
+ """Create or replace a refresh schedule document."""
930
+ if self.db is None:
931
+ return
932
+
933
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
934
+ try:
935
+ await collection.update_one(
936
+ {"app_id": schedule["app_id"]},
937
+ {"$set": schedule},
938
+ upsert=True,
939
+ )
940
+ except PyMongoError as e:
941
+ logger.error(f"Error upserting refresh schedule for {schedule.get('app_id')}: {e}")
942
+
943
+ async def get_active_schedules(self) -> list[dict[str, Any]]:
944
+ """All schedules with status: 'active'."""
945
+ if self.db is None:
946
+ return []
947
+
948
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
949
+ try:
950
+ cursor = collection.find({"status": "active"}, {"_id": 0})
951
+ return await cursor.to_list(length=10000)
952
+ except PyMongoError as e:
953
+ logger.error(f"Error getting active schedules: {e}")
954
+ return []
955
+
956
+ async def has_due_refresh_schedule(self, app_id: str) -> bool:
957
+ """True when an active schedule has at least one due, incomplete checkpoint."""
958
+ if self.db is None:
959
+ return False
960
+
961
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
962
+ now = datetime.now(timezone.utc)
963
+ try:
964
+ document = await collection.find_one(
965
+ {
966
+ "app_id": str(app_id),
967
+ "status": "active",
968
+ "checkpoints": {
969
+ "$elemMatch": {
970
+ "completed": False,
971
+ "due_at": {"$lte": now},
972
+ }
973
+ },
974
+ },
975
+ {"_id": 0, "app_id": 1},
976
+ )
977
+ return document is not None
978
+ except PyMongoError as e:
979
+ logger.error(f"Error checking due refresh schedule for {app_id}: {e}")
980
+ return False
981
+
982
+ async def mark_checkpoint_completed(self, app_id: str, offset_hours: int) -> None:
983
+ """Mark a specific checkpoint as completed using positional $ update."""
984
+ if self.db is None:
985
+ return
986
+
987
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
988
+ try:
989
+ await collection.update_one(
990
+ {"app_id": str(app_id), "checkpoints.offset_hours": offset_hours},
991
+ {"$set": {"checkpoints.$.completed": True}},
992
+ )
993
+ except PyMongoError as e:
994
+ logger.error(f"Error marking checkpoint for {app_id}/{offset_hours}h: {e}")
995
+
996
+ async def complete_schedule(self, app_id: str) -> None:
997
+ """Set schedule status to 'completed'."""
998
+ if self.db is None:
999
+ return
1000
+
1001
+ collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
1002
+ try:
1003
+ await collection.update_one(
1004
+ {"app_id": str(app_id)},
1005
+ {"$set": {"status": "completed"}},
1006
+ )
1007
+ except PyMongoError as e:
1008
+ logger.error(f"Error completing schedule for {app_id}: {e}")
1009
+
1010
+ # ========== Priority Games Methods ==========
1011
+
1012
+ async def get_priority_games(self) -> list[dict[str, Any]]:
1013
+ """All games with is_priority == True, all fields except _id."""
1014
+ if self.db is None:
1015
+ return []
1016
+
1017
+ collection = self.db[self.COLLECTION_GAMES]
1018
+ try:
1019
+ cursor = collection.find({"is_priority": True}, {"_id": 0})
1020
+ return await cursor.to_list(length=10000)
1021
+ except PyMongoError as e:
1022
+ logger.error(f"Error getting priority games: {e}")
1023
+ return []
1024
+
1025
    async def get_priority_games_for_analysis(self) -> list[dict[str, Any]]:
        """
        Priority games eligible for worker-managed analysis.

        DLC stays linked to the priority universe via is_priority, but low-review DLC
        falls back to on-demand mode instead of occupying worker capacity.

        Returns:
            Matching game documents (all fields except ``_id``); empty on error
            or when no database connection exists.
        """
        if self.db is None:
            return []

        collection = self.db[self.COLLECTION_GAMES]
        if settings.dlc_worker_analysis_enabled:
            # Non-DLC always qualifies; DLC qualifies only when its review
            # volume ($expr computes positive + negative, missing -> 0) clears
            # the configured threshold.
            query: dict[str, Any] = {
                "is_priority": True,
                "$or": [
                    {"app_type": {"$ne": "dlc"}},
                    {
                        "$expr": {
                            "$gte": [
                                {
                                    "$add": [
                                        {"$ifNull": ["$positive", 0]},
                                        {"$ifNull": ["$negative", 0]},
                                    ]
                                },
                                settings.dlc_min_reviews_for_analysis,
                            ]
                        }
                    },
                ],
            }
        else:
            # DLC worker analysis disabled: restrict to priority non-DLC apps.
            query = {
                "is_priority": True,
                "app_type": {"$ne": "dlc"},
            }

        try:
            cursor = collection.find(query, {"_id": 0})
            return await cursor.to_list(length=10000)
        except PyMongoError as e:
            logger.error(f"Error getting priority games for analysis: {e}")
            return []
1068
+
1069
+ async def get_priority_game_ids(self) -> set[str]:
1070
+ """Lightweight set of appids for is_priority == True games."""
1071
+ if self.db is None:
1072
+ return set()
1073
+
1074
+ collection = self.db[self.COLLECTION_GAMES]
1075
+ try:
1076
+ cursor = collection.find({"is_priority": True}, {"_id": 0, "appid": 1})
1077
+ docs = await cursor.to_list(length=10000)
1078
+ return {str(d["appid"]) for d in docs if d.get("appid")}
1079
+ except PyMongoError as e:
1080
+ logger.error(f"Error getting priority game ids: {e}")
1081
+ return set()
1082
+
1083
+ async def get_priority_game_ids_for_analysis(self) -> set[str]:
1084
+ """App IDs that should behave as worker-managed in runtime decisions."""
1085
+ docs = await self.get_priority_games_for_analysis()
1086
+ return {str(d["appid"]) for d in docs if d.get("appid")}
1087
+
1088
+ async def get_dlcs_by_parent_appid(self, parent_appid: str) -> list[dict[str, Any]]:
1089
+ """Return DLC documents linked to a given base game."""
1090
+ if self.db is None:
1091
+ return []
1092
+
1093
+ collection = self.db[self.COLLECTION_GAMES]
1094
+ try:
1095
+ cursor = collection.find(
1096
+ {"app_type": "dlc", "parent_appid": str(parent_appid)},
1097
+ {"_id": 0},
1098
+ )
1099
+ return await cursor.to_list(length=1000)
1100
+ except PyMongoError as e:
1101
+ logger.error(f"Error getting DLCs for parent {parent_appid}: {e}")
1102
+ return []
1103
+
1104
+ async def get_existing_appids(self, appids: set[str]) -> set[str]:
1105
+ """Return the subset of the given appids that have a document in games."""
1106
+ if self.db is None or not appids:
1107
+ return set()
1108
+
1109
+ collection = self.db[self.COLLECTION_GAMES]
1110
+ try:
1111
+ cursor = collection.find(
1112
+ {"appid": {"$in": list(appids)}},
1113
+ {"_id": 0, "appid": 1},
1114
+ )
1115
+ docs = await cursor.to_list(length=len(appids) + 1)
1116
+ return {str(d["appid"]) for d in docs if d.get("appid")}
1117
+ except PyMongoError as e:
1118
+ logger.error(f"Error in get_existing_appids: {e}")
1119
+ return set()
1120
+
1121
+ async def bulk_update_priority_fields(self, updates: list[tuple[str, dict]]) -> int:
1122
+ """
1123
+ Batch UpdateOne operations for priority fields.
1124
+
1125
+ Args:
1126
+ updates: List of (appid, fields_dict) tuples.
1127
+
1128
+ Returns:
1129
+ modified_count
1130
+ """
1131
+ if self.db is None or not updates:
1132
+ return 0
1133
+
1134
+ collection = self.db[self.COLLECTION_GAMES]
1135
+ operations = [
1136
+ UpdateOne({"appid": appid}, {"$set": fields})
1137
+ for appid, fields in updates
1138
+ ]
1139
+
1140
+ try:
1141
+ result = await collection.bulk_write(operations, ordered=False)
1142
+ return result.modified_count
1143
+ except BulkWriteError as e:
1144
+ details = e.details or {}
1145
+ return details.get("nModified", 0)
1146
+ except PyMongoError as e:
1147
+ logger.error(f"Error in bulk_update_priority_fields: {e}")
1148
+ return 0
1149
+
1150
+
1151
# Global module-level instance (singleton) shared by the app and the worker.
mongodb = MongoDB()
backend/app/main.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from contextlib import asynccontextmanager
4
+ from typing import AsyncGenerator
5
+
6
+ from fastapi import FastAPI, Query, Request, Response
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.staticfiles import StaticFiles
9
+ from fastapi.responses import FileResponse, JSONResponse
10
+ from slowapi import _rate_limit_exceeded_handler
11
+ from slowapi.errors import RateLimitExceeded
12
+ from starlette.middleware.base import BaseHTTPMiddleware
13
+
14
+ from app.core.config import settings
15
+ from app.core.rate_limit import limiter
16
+ from app.core.worker_logging import (
17
+ LIVE_LOG_WHITELIST,
18
+ read_log_tail,
19
+ resolve_log_path,
20
+ setup_app_logging,
21
+ setup_structured_logger,
22
+ )
23
+ from app.db.mongodb import mongodb
24
+ from app.routers import analyze, games
25
+ from app.services.nlp_service import get_nlp_service
26
+ from app.services.steam_service import steam_service
27
+
28
+
29
# Logging configuration (applied at import time for the whole process).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
34
+
35
+
36
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Attach conservative browser-security headers to every response."""

    _HEADERS = {
        "X-Content-Type-Options": "nosniff",
        "X-Frame-Options": "SAMEORIGIN",
        "Referrer-Policy": "strict-origin-when-cross-origin",
    }

    async def dispatch(self, request: Request, call_next):
        response: Response = await call_next(request)
        for header_name, header_value in self._HEADERS.items():
            response.headers[header_name] = header_value
        return response
43
+
44
+
45
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """
    Manage the application lifecycle.

    Establishes the MongoDB connection on startup and closes it
    (together with the Steam HTTP client) on shutdown.
    """
    # Fail fast: the app cannot function without a database URL.
    if not settings.mongodb_url:
        raise RuntimeError(
            "MONGODB_URL is not set. Please configure it in .env or environment variables."
        )
    await mongodb.connect()
    # The structured "live" log is set up before app loggers attach to it.
    setup_structured_logger("live")
    setup_app_logging()
    yield
    await steam_service.close()
    await mongodb.disconnect()
63
+
64
+
65
app = FastAPI(
    title="SentimentStream API",
    description="API do analizy sentymentu recenzji gier Steam w czasie rzeczywistym",
    version="1.0.0",
    lifespan=lifespan,
)

# Rate limiter (slowapi) wired into app state plus its 429 handler.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)  # type: ignore[arg-type]

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins_list,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["Content-Type", "Accept"],
)

# Security headers
app.add_middleware(SecurityHeadersMiddleware)

# Router registration
app.include_router(analyze.router, prefix="/api", tags=["analyze"])
app.include_router(games.router, prefix="/api", tags=["games"])
91
+
92
+
93
@app.get("/api/logs")
async def get_logs(
    request: Request,
    lines: int = Query(default=100, ge=1, le=1000),
    level: str | None = Query(default=None),
    event: str | None = Query(default=None),
    file: str = Query(default="live"),
):
    """Token-protected endpoint to read structured log tail.

    Auth: ``Authorization: Bearer <worker_trigger_token>`` when a token is
    configured; without a configured token the endpoint is open (dev mode).

    Fix: the token is now compared with ``secrets.compare_digest`` instead of
    ``!=`` to avoid leaking token contents via response-timing differences.
    """
    import secrets  # local stdlib import: only needed for the auth comparison

    expected = settings.worker_trigger_token
    if expected:
        auth = request.headers.get("Authorization", "")
        if not auth.startswith("Bearer ") or not secrets.compare_digest(auth[7:], expected):
            return JSONResponse(status_code=401, content={"detail": "Unauthorized"})

    log_path = resolve_log_path(file, LIVE_LOG_WHITELIST)
    if log_path is None:
        return JSONResponse(
            status_code=400,
            content={"detail": f"Unknown log file: '{file}'. Valid: {list(LIVE_LOG_WHITELIST.keys())}"},
        )

    entries = read_log_tail(log_path, lines=lines, level=level, event=event)
    return {"entries": entries, "count": len(entries)}
117
+
118
+
119
@app.get("/health")
async def health_check() -> dict:
    """Liveness/readiness probe that actually pings dependencies.

    Returns status "healthy" only when MongoDB answers a ping AND the NLP
    model is loaded; "degraded" otherwise. Always responds HTTP 200.

    Fix: the MongoDB ping failure is no longer silently swallowed — it is
    logged at warning level (probe stays best-effort and never raises).
    """
    mongo_ok = False
    if mongodb.client is not None:
        try:
            await mongodb.client.admin.command("ping")
            mongo_ok = True
        except Exception as exc:  # best-effort probe: report, never crash
            logging.getLogger(__name__).warning("MongoDB health ping failed: %s", exc)

    nlp_svc = get_nlp_service()
    model_ok = hasattr(nlp_svc, "classifier") and nlp_svc.classifier is not None

    overall = "healthy" if (mongo_ok and model_ok) else "degraded"
    return {
        "status": overall,
        "mongodb": "connected" if mongo_ok else "disconnected",
        "model": "loaded" if model_ok else "not_loaded",
    }
139
+
140
+
141
# Static frontend handling — only when the build exists (e.g. in Docker,
# where the path is /app/frontend/dist). Locally the vite dev server serves
# the frontend, so this section is skipped.
static_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "frontend", "dist")

if settings.app_mode != "api" and os.path.exists(static_dir):
    app.mount("/assets", StaticFiles(directory=os.path.join(static_dir, "assets")), name="assets")

    # Catch-all for the SPA (React Router handles client-side routes).
    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        if full_path.startswith("api"):
            return {"error": "API route not found"}

        # Fix: resolve the path and require it to stay inside static_dir —
        # the previous os.path.join allowed "../" traversal out of the root.
        root = os.path.realpath(static_dir)
        file_path = os.path.realpath(os.path.join(static_dir, full_path))
        if file_path.startswith(root + os.sep) and os.path.isfile(file_path):
            return FileResponse(file_path)

        # Fall back to the SPA entry point for client-side routes.
        return FileResponse(os.path.join(static_dir, "index.html"))
backend/app/models/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Modele danych Pydantic."""
2
+
3
+ from app.models.schemas import (
4
+ AnalysisProgress,
5
+ AnalysisResult,
6
+ GameInfo,
7
+ ReviewBatch,
8
+ SentimentType,
9
+ TopicSentiment,
10
+ )
11
+
12
+ __all__ = [
13
+ "AnalysisProgress",
14
+ "AnalysisResult",
15
+ "GameInfo",
16
+ "ReviewBatch",
17
+ "SentimentType",
18
+ "TopicSentiment",
19
+ ]
backend/app/models/schemas.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Modele danych Pydantic.
3
+
4
+ Definiuje struktury danych używane w API oraz do walidacji.
5
+ """
6
+
7
+ from datetime import datetime
8
+ from enum import Enum
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+
13
+ class SentimentType(str, Enum):
14
+ """Typ sentymentu dla tematu."""
15
+
16
+ POSITIVE = "positive"
17
+ NEGATIVE = "negative"
18
+ NEUTRAL = "neutral"
19
+
20
+
21
+ class PredictionType(str, Enum):
22
+ """Typ przewidywanego trendu liczby graczy."""
23
+
24
+ INCREASING = "increasing"
25
+ DECREASING = "decreasing"
26
+ STABLE = "stable"
27
+ UNCERTAIN = "uncertain"
28
+
29
+
30
+ class UserCountPrediction(BaseModel):
31
+ """
32
+ Przewidywanie trendu liczby graczy.
33
+
34
+ Attributes:
35
+ trend: Przewidywany kierunek (wzrost/spadek).
36
+ confidence: Pewność predykcji (0.0 - 1.0).
37
+ reasoning: Krótkie uzasadnienie.
38
+ """
39
+
40
+ trend: PredictionType
41
+ confidence: float
42
+ reasoning: str
43
+
44
+
45
+ class GameInfo(BaseModel):
46
+ """
47
+ Informacje o grze ze Steam.
48
+
49
+ Attributes:
50
+ app_id: Unikalny identyfikator gry na Steam.
51
+ name: Nazwa gry.
52
+ name_cn: Chińska nazwa gry (jeśli dostępna).
53
+ header_image: URL obrazka nagłówkowego.
54
+ total_reviews: Całkowita liczba recenzji.
55
+ target_count: Docelowa liczba recenzji do analizy (sample size).
56
+ """
57
+
58
+ app_id: str
59
+ name: str
60
+ name_cn: str | None = None
61
+ header_image: str | None = None
62
+ total_reviews: int = 0
63
+ target_count: int | None = None
64
+ last_game_update_at: int | None = None
65
+
66
+
67
+ class TopicSentiment(BaseModel):
68
+ """
69
+ Sentyment dla pojedynczego tematu.
70
+
71
+ Attributes:
72
+ topic: Nazwa tematu (np. "Grafika", "Gameplay").
73
+ sentiment: Typ sentymentu.
74
+ score: Wynik sentymentu (-1.0 do 1.0).
75
+ mention_count: Liczba wzmianek o temacie.
76
+ example: Przykładowe zdanie z recenzji.
77
+ """
78
+
79
+ topic: str
80
+ sentiment: SentimentType
81
+ score: float = Field(ge=-1.0, le=1.0)
82
+ mention_count: int = 0
83
+ example: str | None = None
84
+ example_score: float | None = None # score przykładu do porównań przy agregacji
85
+
86
+
87
+ class Highlight(BaseModel):
88
+ """Czesto powtarzana fraza z recenzji."""
89
+
90
+ phrase: str
91
+ mention_count: int
92
+ sentiment: SentimentType
93
+ score: float
94
+ ngram_size: int
95
+
96
+
97
+ class TopicHighlights(BaseModel):
98
+ """Highlights dla konkretnego tematu."""
99
+
100
+ topic: str
101
+ highlights: list[Highlight]
102
+
103
+
104
+ class AnalysisProgress(BaseModel):
105
+ """
106
+ Postęp analizy (wysyłany przez SSE).
107
+
108
+ Attributes:
109
+ processed: Liczba przetworzonych recenzji.
110
+ total: Całkowita liczba recenzji do przetworzenia.
111
+ current_topics: Aktualne wyniki tematów.
112
+ skipped_count: Liczba zdań pominiętych (brak słów kluczowych).
113
+ """
114
+
115
+ processed: int
116
+ total: int
117
+ current_topics: list[TopicSentiment] = []
118
+ skipped_count: int = 0
119
+
120
+
121
+ class AnalysisResult(BaseModel):
122
+ """
123
+ Końcowy wynik analizy.
124
+
125
+ Attributes:
126
+ game: Informacje o grze.
127
+ general_topics: Lista tematów z sentymentem (pełny agregat).
128
+ prediction: Przewidywanie trendu liczby graczy.
129
+ analyzed_reviews: Liczba przeanalizowanych recenzji.
130
+ skipped_count: Łączna liczba pominiętych zdań.
131
+ cached_at: Data zapisania w cache.
132
+ """
133
+
134
+ game: GameInfo
135
+ general_topics: list[TopicSentiment]
136
+ prediction: UserCountPrediction | None = None
137
+ analyzed_reviews: int
138
+ skipped_count: int = 0
139
+ general_highlights: list[Highlight] = []
140
+ recent_highlights: list[Highlight] | None = None
141
+ current_patch_highlights: list[Highlight] | None = None
142
+ topic_highlights: list[TopicHighlights] = []
143
+ cached_at: datetime | None = None
144
+ recent_topics: list[TopicSentiment] | None = None
145
+ recent_reviews_count: int = 0
146
+ current_patch_topics: list[TopicSentiment] | None = None
147
+ current_patch_reviews_count: int = 0
148
+ last_patch_topics: list[TopicSentiment] | None = None
149
+ last_patch_reviews_count: int = 0
150
+ current_patch_timestamp: int | None = None
151
+ analysis_date: datetime | None = None
152
+ current_patch_date: datetime | None = None
153
+ preferred_context: str | None = None
154
+ freshness_status: str | None = None
155
+ staleness_reason: str | None = None
156
+ is_refreshing: bool = False
157
+
158
+
159
+ class ReviewItem(BaseModel):
160
+ """Single review with metadata for incremental tracking."""
161
+
162
+ text: str
163
+ recommendation_id: str
164
+ timestamp_created: int
165
+
166
+
167
+ class ReviewBatch(BaseModel):
168
+ """
169
+ Partia recenzji do przetworzenia.
170
+
171
+ Attributes:
172
+ reviews: Lista tekstów recenzji.
173
+ review_items: Recenzje z metadanymi (do incremental analysis).
174
+ cursor: Kursor do paginacji Steam API.
175
+ """
176
+
177
+ reviews: list[str]
178
+ review_items: list[ReviewItem] = []
179
+ cursor: str | None = None
180
+
181
+
182
+ class SSEEvent(BaseModel):
183
+ """
184
+ Wydarzenie Server-Sent Events.
185
+
186
+ Attributes:
187
+ event: Typ wydarzenia (progress/complete/error).
188
+ data: Dane wydarzenia.
189
+ """
190
+
191
+ event: str
192
+ data: AnalysisProgress | AnalysisResult | dict
193
+
194
+
195
+ class CachedAnalysis(BaseModel):
196
+ """
197
+ Dokument cache w MongoDB.
198
+
199
+ Przechowuje wyniki analizy z timestampem dla walidacji TTL.
200
+
201
+ Attributes:
202
+ game_id: Identyfikator gry Steam (klucz cache).
203
+ results: Wyniki analizy sentymentu.
204
+ cached_at: Data i czas zapisania do cache.
205
+ """
206
+
207
+ game_id: str
208
+ results: AnalysisResult
209
+ cached_at: datetime
210
+ analyzed_at: datetime | None = None
backend/app/routers/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Routery API."""
2
+
3
+ from app.routers import analyze
4
+
5
+ __all__ = ["analyze"]
backend/app/routers/analyze.py ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Router API do analizy sentymentu.
3
+
4
+ Zawiera endpoint do streamowania wyników analizy przez SSE.
5
+ """
6
+
7
+ import asyncio
8
+ import contextlib
9
+ import json
10
+ import logging
11
+ import time
12
+ from datetime import datetime, timezone
13
+ from typing import Any, AsyncGenerator
14
+
15
+ from fastapi import APIRouter, HTTPException, Depends, Path, Query, Request
16
+ from sse_starlette.sse import EventSourceResponse
17
+
18
+ from app.core.config import settings
19
+ from app.core.freshness import (
20
+ FreshnessStatus,
21
+ evaluate_freshness,
22
+ get_staleness_reason,
23
+ )
24
+ from app.core.sampling import SamplePlan, create_sample_plan
25
+ from app.core.ttl_tiers import get_ttl_hours
26
+ from app.core.worker_logging import get_structured_logger, log_structured
27
+ from app.db.mongodb import mongodb
28
+ from app.core.rate_limit import limiter
29
+ from app.models.schemas import (
30
+ AnalysisProgress,
31
+ AnalysisResult,
32
+ GameInfo,
33
+ Highlight,
34
+ TopicHighlights,
35
+ TopicSentiment,
36
+ )
37
+ from app.services.analysis_utils import (
38
+ aggregate_topics,
39
+ calculate_prediction,
40
+ coerce_utc_datetime,
41
+ compute_preferred_context,
42
+ datetime_from_timestamp,
43
+ filter_topics_by_min_mentions,
44
+ normalize_legacy_results,
45
+ serialize_datetime,
46
+ )
47
+ from app.services.highlights_service import HighlightsCollector
48
+ from app.services.analysis_runner import iter_incremental_analysis_events
49
+ from app.services.nlp_service import NLPService
50
+ from app.services.nlp_service import get_nlp_service as _get_nlp_service_instance
51
+ from app.services.steam_errors import SteamAPIError, SteamRateLimitError
52
+ from app.services.steam_service import SteamService, steam_service
53
+
54
# Module-level logger for this router.
logger = logging.getLogger(__name__)

router = APIRouter()

# Background refresh concurrency control.
# NOTE(review): within this module _refreshing_app_ids is only ever
# discarded (in _background_refresh) — no add() is visible here; confirm
# where entries are registered before relying on it as an in-flight guard.
_refreshing_app_ids: set[str] = set()
_refresh_semaphore = asyncio.Semaphore(3)  # max 3 concurrent background refreshes
61
+
62
# Helper functions for FastAPI dependency injection.
def get_nlp_service() -> NLPService:
    """Provide the shared NLP service instance for FastAPI dependencies."""
    service = _get_nlp_service_instance()
    return service
65
+
66
def get_steam_service() -> SteamService:
    """Provide the module-level Steam service singleton for dependencies."""
    return steam_service
68
+
69
+
70
@router.get("/health")
async def health_check():
    """Application health-check endpoint.

    NOTE(review): the per-service states below are hard-coded, not probed
    live — confirm whether real connectivity checks are intended here.
    """
    service_states = {
        "mongodb": "connected",
        "nlp": "ready",
        "steam_api": "reachable",
    }
    return {"status": "ok", "services": service_states}
83
+
84
+
85
+
86
def _build_analysis_payload(
    document: dict[str, Any],
    freshness_status: FreshnessStatus,
    *,
    current_patch_at: datetime | None = None,
    is_refreshing: bool = False,
) -> dict[str, Any]:
    """Build a JSON-ready analysis payload from a cached Mongo document.

    Normalizes legacy result shapes, resolves the analysis date from the
    first available source, serializes datetimes, and stamps freshness
    metadata. When no confirmed patch date is supplied, all current-patch
    fields are nulled so stale cached values never surface in the UI.
    """
    payload = dict(normalize_legacy_results(document.get("results", {})))

    # First non-None candidate wins as the analysis date.
    analysis_date = next(
        (
            candidate
            for candidate in (
                coerce_utc_datetime(payload.get("analysis_date")),
                coerce_utc_datetime(document.get("analyzed_at")),
                coerce_utc_datetime(payload.get("cached_at")),
                coerce_utc_datetime(document.get("cached_at")),
            )
            if candidate is not None
        ),
        None,
    )

    current_patch_date: datetime | None = current_patch_at
    if current_patch_date is None:
        # No confirmed major update in DB — wipe legacy current-patch fields
        # so cached values don't appear as a valid Current Patch tab.
        payload.update(
            current_patch_topics=None,
            current_patch_reviews_count=0,
            current_patch_highlights=None,
            current_patch_timestamp=None,
        )

    # Prefer the payload's own cached_at; fall back to the document's.
    own_cached_at = payload.get("cached_at")
    if own_cached_at is not None:
        payload["cached_at"] = serialize_datetime(own_cached_at)
    elif document.get("cached_at") is not None:
        payload["cached_at"] = serialize_datetime(document["cached_at"])

    payload["analysis_date"] = serialize_datetime(analysis_date)
    payload["current_patch_date"] = serialize_datetime(current_patch_date)
    payload["freshness_status"] = freshness_status.value
    payload["staleness_reason"] = get_staleness_reason(freshness_status)
    payload["is_refreshing"] = is_refreshing
    # Always recompute preferred_context from the current patch date so cached
    # documents with a stale stored value get the correct tab on read.
    patch_ts_for_context = (
        int(current_patch_date.timestamp()) if current_patch_date else None
    )
    payload["preferred_context"] = compute_preferred_context(patch_ts_for_context)
    return payload
127
+
128
+
129
async def _full_analysis(
    game: GameInfo,
    sample_plan: SamplePlan,
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
    stale_doc: dict[str, Any] | None = None,
) -> AsyncGenerator[dict, None]:
    """Full analysis path — Producer-Consumer queue pattern.

    Yields SSE-style dicts: ``progress`` events while review batches are
    analyzed, then a single ``complete`` event carrying the final
    ``AnalysisResult``. The result is persisted via ``mongodb.save_analysis``
    before ``complete`` is yielded.

    Args:
        game: Resolved game metadata.
        sample_plan: Stratified sampling plan; ``sample_plan.total`` is the
            progress denominator.
        steam_svc: Source of stratified review batches.
        nlp_svc: Topic/sentiment analyzer.
        patch_timestamp: Unix timestamp of the confirmed major update; when
            set, reviews at/after it also feed "current patch" aggregates.
        stale_doc: Previously cached analysis document, used only to carry
            over ``last_patch_*`` archives when the patch changed.
    """
    total_target = sample_plan.total
    ttl_hours = await get_ttl_hours(game.app_id)
    nlp_cumulative_s: float = 0.0  # wall-clock time spent inside NLP calls

    # Producer-Consumer queue (max 5 batches in flight) — bounds memory while
    # letting Steam fetches overlap with NLP processing.
    queue: asyncio.Queue = asyncio.Queue(maxsize=5)

    async def fetch_worker():
        # Producer: pushes batches; errors are relayed as queue items; a
        # trailing None always signals end-of-stream to the consumer.
        try:
            async for batch in steam_svc.fetch_reviews_stratified(game.app_id, sample_plan):
                await queue.put(batch)
        except Exception as e:
            # Relay all exceptions to consumer via queue — they'll be re-raised
            # and caught by the SSE generator's specific exception handlers.
            await queue.put(e)
        finally:
            await queue.put(None)

    fetch_task = asyncio.create_task(fetch_worker())

    processed = 0
    total_skipped = 0
    aggregated_topics: list[TopicSentiment] = []
    recent_processed = 0
    recent_limit = settings.recent_sample_limit
    all_review_ids: list[str] = []       # for the incremental-analysis cache
    latest_timestamp = 0                 # newest review timestamp seen
    highlights_collector = HighlightsCollector()
    current_patch_topics: list[TopicSentiment] = []
    current_patch_count = 0
    # (timestamp, topics) pairs — later sorted to pick the newest reviews
    # for the "recent" aggregate.
    review_topic_results: list[tuple[int, list[TopicSentiment]]] = []

    try:
        while True:
            item = await queue.get()

            if item is None:
                break  # producer finished
            if isinstance(item, Exception):
                raise item  # relayed producer failure

            batch = item
            if not batch.reviews:
                continue

            # Collect review IDs for incremental cache
            for ri in batch.review_items:
                all_review_ids.append(ri.recommendation_id)
                if ri.timestamp_created > latest_timestamp:
                    latest_timestamp = ri.timestamp_created

            # Split by patch timestamp when available and we have review_items
            batch_skipped = 0
            if patch_timestamp and batch.review_items:
                # Per-review NLP calls so each review can carry its own
                # category set (recent / current_patch).
                for ri, text in zip(batch.review_items, batch.reviews):
                    is_recent = recent_processed < recent_limit
                    cat = []
                    if is_recent:
                        cat.append("recent")

                    if ri.timestamp_created >= patch_timestamp:
                        cat.append("current_patch")
                        nlp_start = time.monotonic()
                        res, skipped = await nlp_svc.analyze_batch(
                            [text], highlights_collector=highlights_collector, categories=cat
                        )
                        nlp_cumulative_s += time.monotonic() - nlp_start
                        batch_skipped += skipped
                        if res:
                            aggregated_topics = aggregate_topics(aggregated_topics, res)
                            current_patch_topics = aggregate_topics(current_patch_topics, res)
                            review_topic_results.append((ri.timestamp_created, res))
                            current_patch_count += 1
                    else:
                        nlp_start = time.monotonic()
                        res, skipped = await nlp_svc.analyze_batch(
                            [text], highlights_collector=highlights_collector, categories=cat
                        )
                        nlp_cumulative_s += time.monotonic() - nlp_start
                        batch_skipped += skipped
                        if res:
                            aggregated_topics = aggregate_topics(aggregated_topics, res)
                            review_topic_results.append((ri.timestamp_created, res))

                    recent_processed += 1
            else:
                # No patch split. When review_items is missing, fall back to
                # enumerate() — ri is then an int index and ts defaults to 0.
                for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews):
                    is_recent = recent_processed < recent_limit
                    cat = ["recent"] if is_recent else []

                    nlp_start = time.monotonic()
                    res, skipped = await nlp_svc.analyze_batch(
                        [text], highlights_collector=highlights_collector, categories=cat
                    )
                    nlp_cumulative_s += time.monotonic() - nlp_start
                    batch_skipped += skipped
                    ts = ri.timestamp_created if batch.review_items else 0
                    if res:
                        aggregated_topics = aggregate_topics(aggregated_topics, res)
                        review_topic_results.append((ts, res))
                    recent_processed += 1

            total_skipped += batch_skipped
            processed += len(batch.reviews)

            progress = AnalysisProgress(
                processed=processed,
                total=total_target,
                current_topics=aggregated_topics,
                skipped_count=total_skipped,
            )
            yield {"event": "progress", "data": progress.model_dump_json()}

        # Surface any late producer failure (e.g. after the None sentinel).
        await fetch_task
    except BaseException:
        # BaseException on purpose: also covers GeneratorExit when the SSE
        # client disconnects — the producer must not be left running.
        fetch_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await fetch_task
        raise

    # Build recent_topics from highest-timestamp reviews
    review_topic_results.sort(key=lambda x: x[0], reverse=True)
    recent_entries = review_topic_results[:recent_limit]
    recent_topics: list[TopicSentiment] = []
    for _, topics_batch in recent_entries:
        for ts in topics_batch:
            recent_topics = aggregate_topics(recent_topics, [ts])
    recent_reviews_count = len(recent_entries)

    # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
    aggregated_topics = filter_topics_by_min_mentions(aggregated_topics)
    recent_topics = filter_topics_by_min_mentions(recent_topics)
    current_patch_topics = filter_topics_by_min_mentions(current_patch_topics)

    prediction = calculate_prediction(aggregated_topics)

    highlights_data = highlights_collector.compute_highlights()
    general_highlights = highlights_data["general"]
    recent_highlights = highlights_data["recent"]
    current_patch_highlights = highlights_data["current_patch"]
    topic_highlights_dict = highlights_data["topics"]

    # Restrict topic highlights to topics that survived the min-mentions filter,
    # so the topic_highlights set is always consistent with general_topics.
    _surviving_topics = {t.topic for t in aggregated_topics}
    topic_highlights_list = [
        TopicHighlights(
            topic=topic,
            highlights=[Highlight(**h) for h in highlights],
        )
        for topic, highlights in topic_highlights_dict.items()
        if topic in _surviving_topics
    ]

    # Show recent tab if we have enough reviews to make the split meaningful
    has_recent_split = processed > recent_limit

    has_current_patch = patch_timestamp is not None and current_patch_count > 0
    analysis_generated_at = datetime.now(timezone.utc)
    current_patch_date = datetime_from_timestamp(patch_timestamp)

    # Archive last_patch_topics when this full analysis replaces a doc with a different patch.
    last_patch_topics: list[TopicSentiment] | None = None
    last_patch_reviews_count = 0
    if stale_doc:
        old_r = normalize_legacy_results(stale_doc.get("results", {}))
        old_patch_ts = old_r.get("current_patch_timestamp")
        if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
            # Patch changed: yesterday's current patch becomes last patch.
            raw_cp = old_r.get("current_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None
            last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0)
        else:
            # Same patch: carry the existing last-patch archive forward.
            raw_lp = old_r.get("last_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None
            last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0)

    result = AnalysisResult(
        game=game,
        general_topics=aggregated_topics,
        recent_topics=recent_topics if has_recent_split else None,
        recent_reviews_count=recent_reviews_count if has_recent_split else 0,
        current_patch_topics=current_patch_topics if has_current_patch else None,
        current_patch_reviews_count=current_patch_count if has_current_patch else 0,
        last_patch_topics=last_patch_topics,
        last_patch_reviews_count=last_patch_reviews_count,
        current_patch_timestamp=patch_timestamp,
        analysis_date=analysis_generated_at,
        current_patch_date=current_patch_date,
        prediction=prediction,
        analyzed_reviews=processed,
        skipped_count=total_skipped,
        general_highlights=[Highlight(**h) for h in general_highlights],
        recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None,
        current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None,
        topic_highlights=topic_highlights_list,
        cached_at=analysis_generated_at,
        preferred_context=compute_preferred_context(patch_timestamp),
        freshness_status=FreshnessStatus.FRESH.value,
        is_refreshing=False,
    )
    # Persist before emitting "complete" so a reconnecting client sees the cache.
    await mongodb.save_analysis(
        game.app_id,
        result.model_dump(),
        analyzed_review_ids=all_review_ids,
        latest_review_timestamp=latest_timestamp,
        ttl_hours=ttl_hours,
        analyzed_at=analysis_generated_at,
    )

    # Log structured timing for full analysis
    if get_structured_logger():
        log_structured(
            "full_analysis_complete",
            app_id=game.app_id,
            game_name=game.name if hasattr(game, "name") else str(game.app_id),
            source="live",
            reviews_processed=processed,
            topics_found=len(aggregated_topics),
            detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)},
        )

    yield {"event": "complete", "data": result.model_dump_json()}
360
+
361
+
362
async def _incremental_analysis(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
) -> AsyncGenerator[dict, None]:
    """Delegate incremental analysis to the shared runner, re-yielding its events."""
    event_stream = iter_incremental_analysis_events(
        game,
        stale_doc,
        steam_svc,
        nlp_svc,
        patch_timestamp=patch_timestamp,
        source="live",
    )
    async for sse_event in event_stream:
        yield sse_event
379
+
380
+
381
async def _background_refresh(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_ts: int | None,
) -> None:
    """Fire-and-forget incremental refresh for stale niche caches.

    Bounded by the module-level semaphore; always clears the in-flight
    marker for this app_id when finished, success or failure.
    """
    async with _refresh_semaphore:
        try:
            refresh_events = _incremental_analysis(
                game, stale_doc, steam_svc, nlp_svc, patch_timestamp=patch_ts
            )
            # Drain the generator — events are not forwarded anywhere.
            async for _ in refresh_events:
                pass
        except Exception as exc:
            logger.error(f"Background refresh failed for {game.app_id}: {exc}")
        else:
            logger.info(f"Background refresh completed for {game.app_id}")
        finally:
            _refreshing_app_ids.discard(game.app_id)
400
+
401
+
402
async def analysis_event_generator(
    game_name: str,
    steam_service: SteamService,
    nlp_service: NLPService,
    *,
    appid: str | None = None,
) -> AsyncGenerator[dict, None]:
    """
    Main SSE event generator. Decides between full and incremental analysis paths.

    Event flow:
      * fresh cache  -> single ``result`` event, then return;
      * stale cache  -> ``result`` (stale payload) immediately, then — for
        non-priority games — an inline refresh streaming ``progress``/``complete``;
      * no cache     -> ``game_found`` (plus ``state`` for niche games), then a
        live full analysis streaming ``progress``/``complete``.
    All failure modes are converted to a single ``analysis_error`` event.
    A structured timing record is emitted in ``finally`` regardless of outcome.
    """
    t_start = time.monotonic()
    analysis_type = "unknown"   # refined below; reported in the timing log
    app_id = ""
    resolved_game_name = game_name
    reviews_processed = 0

    try:
        # 1. Resolve game — use appid directly if provided, otherwise search by name
        if appid:
            game = await steam_service.get_game_info(appid)
        else:
            game = await steam_service.search_game(game_name)
        if not game:
            yield {
                "event": "analysis_error",
                "data": json.dumps({"message": "ERROR_GAME_NOT_FOUND"}),
            }
            return

        app_id = game.app_id
        resolved_game_name = game.name if hasattr(game, "name") else game_name

        # 1b. Fetch game patch date for current_patch tab / freshness evaluation
        patch_date = await mongodb.get_game_patch_date(game.app_id)
        patch_ts = int(patch_date.timestamp()) if patch_date else None
        if patch_ts:
            game = game.model_copy(update={"last_game_update_at": patch_ts})

        # 2. Load any existing analysis and evaluate product freshness.
        analysis_doc = await mongodb.get_analysis(game.app_id)
        priority_ids = await mongodb.get_priority_game_ids_for_analysis()
        is_priority = game.app_id in priority_ids
        is_niche = not is_priority

        if analysis_doc and analysis_doc.get("results"):
            freshness_status = evaluate_freshness(analysis_doc, patch_date)

            if freshness_status == FreshnessStatus.FRESH:
                # Fresh cache: serve it and stop — no live work needed.
                analysis_type = "cached"
                payload = _build_analysis_payload(
                    analysis_doc,
                    freshness_status,
                    current_patch_at=patch_date,
                )
                yield {"event": "result", "data": json.dumps(payload)}
                return

            # Stale cache: serve it immediately so the UI has data, then
            # refresh inline (non-priority games only).
            analysis_type = "stale_result"
            is_refreshing = (
                await mongodb.has_due_refresh_schedule(game.app_id)
                if is_priority
                else True
            )
            stale_payload = _build_analysis_payload(
                analysis_doc,
                freshness_status,
                current_patch_at=patch_date,
                is_refreshing=is_refreshing,
            )
            yield {"event": "result", "data": json.dumps(stale_payload)}

            # Priority games are refreshed by the scheduled worker, not here.
            if is_priority:
                return

            try:
                # Incremental refresh when enabled and we know which reviews
                # were already analyzed; otherwise fall back to a full run.
                if settings.incremental_enabled and analysis_doc.get("analyzed_review_ids"):
                    refresh_generator = _incremental_analysis(
                        game, analysis_doc, steam_service, nlp_service, patch_timestamp=patch_ts
                    )
                else:
                    stats = await steam_service.get_review_stats(game.app_id)
                    sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
                    game = game.model_copy(update={"target_count": sample_plan.total})
                    refresh_generator = _full_analysis(
                        game,
                        sample_plan,
                        steam_service,
                        nlp_service,
                        patch_timestamp=patch_ts,
                        stale_doc=analysis_doc,
                    )
                async for event in refresh_generator:
                    # Capture analyzed_reviews from the final event for the
                    # timing log; forward every event unchanged.
                    if event.get("event") == "complete":
                        try:
                            data = json.loads(event["data"])
                            reviews_processed = data.get("analyzed_reviews", 0)
                        except (json.JSONDecodeError, KeyError):
                            pass
                    yield event
                return
            except Exception as e:
                # Stale data was already delivered above, so a failed refresh
                # is logged but not surfaced as an error event.
                logger.error(f"Refresh failed for {game.app_id}: {e}")
                return

        # 3. No cache at all — live analysis

        analysis_type = "full"
        stats = await steam_service.get_review_stats(game.app_id)
        sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
        total_target = sample_plan.total
        game = game.model_copy(update={"target_count": total_target})

        yield {"event": "game_found", "data": game.model_dump_json()}

        if is_niche:
            # Tell the UI this may take a while — first analysis of this game.
            yield {
                "event": "state",
                "data": json.dumps({"type": "first_live_analysis"}),
            }

        async for event in _full_analysis(game, sample_plan, steam_service, nlp_service, patch_timestamp=patch_ts):
            if event.get("event") == "complete":
                try:
                    data = json.loads(event["data"])
                    reviews_processed = data.get("analyzed_reviews", 0)
                except (json.JSONDecodeError, KeyError):
                    pass
            yield event

    except SteamRateLimitError as e:
        logger.warning(f"Steam rate limit: {e}")
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_STEAM_RATE_LIMIT"}),
        }
    except SteamAPIError as e:
        logger.error(f"Steam API error: {e}")
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_STEAM_API"}),
        }
    except Exception as e:
        # Safety net — SSE generator must always send an error event, never crash silently.
        logger.error(f"Analysis error: {e}", exc_info=True)
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_INTERNAL"}),
        }
    finally:
        # Timing record emitted on every exit path (success, error, disconnect).
        elapsed = round(time.monotonic() - t_start, 3)
        if get_structured_logger():
            log_structured(
                "live_analysis",
                app_id=app_id,
                game_name=resolved_game_name,
                analysis_type=analysis_type,
                elapsed_s=elapsed,
                reviews_processed=reviews_processed,
                source="live",
            )
563
+
564
@router.get("/analyze/{game_name}")
@limiter.limit(settings.rate_limit_analyze)
async def analyze_game(
    request: Request,
    game_name: str = Path(..., min_length=1, max_length=200),
    appid: str | None = Query(None, min_length=1, max_length=20),
    steam_service: SteamService = Depends(get_steam_service),
    nlp_service: NLPService = Depends(get_nlp_service),
) -> EventSourceResponse:
    """Stream game sentiment-analysis events to the client over SSE."""
    event_stream = analysis_event_generator(
        game_name, steam_service, nlp_service, appid=appid
    )
    return EventSourceResponse(event_stream)
579
+
580
+
581
@router.get("/game/{game_name}")
@limiter.limit(settings.rate_limit_default)
async def get_game_info(
    request: Request,
    game_name: str = Path(..., min_length=1, max_length=200),
    steam_service: SteamService = Depends(get_steam_service),
) -> dict:
    """Return basic game information, or 404 when no game matches."""
    game = await steam_service.search_game(game_name)
    if not game:
        raise HTTPException(status_code=404, detail="ERROR_GAME_NOT_FOUND")
    return game.model_dump()
backend/app/routers/games.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Router API do wyszukiwania gier.
3
+
4
+ Zawiera endpoint do pobierania sugestii gier dla autouzupełniania.
5
+ """
6
+
7
+ from fastapi import APIRouter, Query, Request
8
+
9
+ from app.core.config import settings
10
+ from app.db.mongodb import mongodb
11
+ from app.core.rate_limit import limiter
12
+
13
+ router = APIRouter()
14
+
15
+
16
@router.get("/games/suggestions")
@limiter.limit(settings.rate_limit_default)
async def get_game_suggestions(
    request: Request,
    q: str = Query(..., min_length=2, max_length=100, description="Tekst do wyszukania"),
    limit: int = Query(10, ge=1, le=20, description="Maksymalna liczba wyników"),
) -> list[dict[str, str]]:
    """Autocomplete endpoint: case-insensitive game-name search.

    Requires at least 2 characters in ``q`` and returns up to ``limit``
    matches.

    Args:
        q: Text to match against game names.
        limit: Maximum number of results (1-20).

    Returns:
        List of matching games, e.g.
        ``[{"appid": "1091500", "name": "Cyberpunk 2077"}, ...]``.
    """
    return await mongodb.search_games(q, limit)
49
+
50
+
51
@router.get("/games/count")
@limiter.limit(settings.rate_limit_default)
async def get_games_count(request: Request) -> dict[str, int]:
    """Return the number of games stored in the database.

    Returns:
        ``{"count": <number of games>}``, e.g. ``{"count": 85432}``.
    """
    total = await mongodb.get_games_count()
    return {"count": total}
backend/app/services/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Serwisy biznesowe aplikacji."""
2
+
3
+ from app.services.nlp_service import get_nlp_service
4
+ from app.services.steam_service import steam_service
5
+
6
+ __all__ = ["get_nlp_service", "steam_service"]
backend/app/services/analysis_runner.py ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analysis Runner — programmatic (non-SSE) analysis for the Worker.
3
+
4
+ Extracts the core full-analysis logic from analyze.py without SSE wrapping.
5
+ Used by the pre-cache service to run analyses in the background.
6
+ """
7
+
8
+ import asyncio
9
+ import contextlib
10
+ import json
11
+ import logging
12
+ import time
13
+ from datetime import datetime, timezone
14
+ from typing import Any, AsyncGenerator
15
+
16
+ from app.core.config import settings
17
+ from app.core.freshness import FreshnessStatus
18
+ from app.core.sampling import create_sample_plan
19
+ from app.core.ttl_tiers import get_ttl_hours
20
+ from app.core.worker_logging import AsyncTimingContext, get_structured_logger, log_structured
21
+ from app.db.mongodb import mongodb
22
+ from app.models.schemas import (
23
+ AnalysisProgress,
24
+ AnalysisResult,
25
+ GameInfo,
26
+ Highlight,
27
+ TopicHighlights,
28
+ TopicSentiment,
29
+ )
30
+ from app.services.highlights_service import HighlightsCollector
31
+ from app.services.analysis_utils import (
32
+ aggregate_topics,
33
+ calculate_prediction,
34
+ compute_preferred_context,
35
+ datetime_from_timestamp,
36
+ filter_topics_by_min_mentions,
37
+ normalize_legacy_results,
38
+ scale_topics,
39
+ serialize_datetime,
40
+ )
41
+ from app.services.nlp_service import NLPService
42
+ from app.services.steam_service import SteamService
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ async def iter_incremental_analysis_events(
48
+ game: GameInfo,
49
+ stale_doc: dict[str, Any],
50
+ steam_svc: SteamService,
51
+ nlp_svc: NLPService,
52
+ patch_timestamp: int | None = None,
53
+ *,
54
+ source: str = "live",
55
+ ) -> AsyncGenerator[dict[str, str], None]:
56
+ """Yield incremental-analysis progress and final result events."""
57
+ ttl_hours = await get_ttl_hours(game.app_id)
58
+ old_results = normalize_legacy_results(stale_doc.get("results", {}))
59
+ old_review_ids: list[str] = stale_doc.get("analyzed_review_ids", [])
60
+ old_review_ids_set = set(old_review_ids)
61
+ nlp_cumulative_s: float = 0.0
62
+
63
+ old_general = [TopicSentiment(**topic) for topic in old_results.get("general_topics", [])]
64
+ old_recent = (
65
+ [TopicSentiment(**topic) for topic in old_results.get("recent_topics", [])]
66
+ if old_results.get("recent_topics")
67
+ else []
68
+ )
69
+ old_current_patch = (
70
+ [TopicSentiment(**topic) for topic in old_results.get("current_patch_topics", [])]
71
+ if old_results.get("current_patch_topics")
72
+ else []
73
+ )
74
+ old_last_patch = (
75
+ [TopicSentiment(**topic) for topic in old_results.get("last_patch_topics", [])]
76
+ if old_results.get("last_patch_topics")
77
+ else None
78
+ )
79
+ old_last_patch_count = old_results.get("last_patch_reviews_count", 0)
80
+ old_patch_ts = old_results.get("current_patch_timestamp")
81
+
82
+ new_items = await steam_svc.fetch_recent_reviews(
83
+ game.app_id,
84
+ exclude_ids=old_review_ids_set,
85
+ )
86
+
87
+ if not new_items:
88
+ refreshed_at = datetime.now(timezone.utc)
89
+ refreshed_results = {
90
+ **old_results,
91
+ "cached_at": refreshed_at,
92
+ "analysis_date": refreshed_at,
93
+ "current_patch_date": datetime_from_timestamp(
94
+ patch_timestamp if patch_timestamp is not None else old_results.get("current_patch_timestamp")
95
+ ),
96
+ "freshness_status": FreshnessStatus.FRESH.value,
97
+ "staleness_reason": None,
98
+ "is_refreshing": False,
99
+ }
100
+ await mongodb.save_analysis(
101
+ game.app_id,
102
+ refreshed_results,
103
+ analyzed_review_ids=old_review_ids,
104
+ latest_review_timestamp=stale_doc.get("latest_review_timestamp", 0),
105
+ ttl_hours=ttl_hours,
106
+ analyzed_at=refreshed_at,
107
+ )
108
+ yield {
109
+ "event": "complete",
110
+ "data": json.dumps(refreshed_results, default=serialize_datetime),
111
+ }
112
+ return
113
+
114
+ new_texts = [item.text for item in new_items]
115
+ new_review_ids = [item.recommendation_id for item in new_items]
116
+ latest_timestamp = max(
117
+ (item.timestamp_created for item in new_items),
118
+ default=stale_doc.get("latest_review_timestamp", 0),
119
+ )
120
+
121
+ batch_size = settings.review_batch_size
122
+ delta_topics: list[TopicSentiment] = []
123
+ delta_current_patch_topics: list[TopicSentiment] = []
124
+ delta_current_patch_count = 0
125
+ highlights_collector = HighlightsCollector()
126
+ processed = 0
127
+ total_skipped = 0
128
+
129
+ for i in range(0, len(new_texts), batch_size):
130
+ chunk_texts = new_texts[i:i + batch_size]
131
+ chunk_items = new_items[i:i + batch_size]
132
+
133
+ batch_skipped = 0
134
+ if patch_timestamp:
135
+ for review_item, text in zip(chunk_items, chunk_texts):
136
+ categories = ["recent"]
137
+ if review_item.timestamp_created >= patch_timestamp:
138
+ categories.append("current_patch")
139
+
140
+ nlp_start = time.monotonic()
141
+ result_topics, skipped = await nlp_svc.analyze_batch(
142
+ [text],
143
+ highlights_collector=highlights_collector,
144
+ categories=categories,
145
+ )
146
+ nlp_cumulative_s += time.monotonic() - nlp_start
147
+ batch_skipped += skipped
148
+ if result_topics:
149
+ delta_topics = aggregate_topics(delta_topics, result_topics)
150
+ if review_item.timestamp_created >= patch_timestamp:
151
+ delta_current_patch_topics = aggregate_topics(
152
+ delta_current_patch_topics,
153
+ result_topics,
154
+ )
155
+ delta_current_patch_count += 1
156
+ total_skipped += batch_skipped
157
+ else:
158
+ nlp_start = time.monotonic()
159
+ batch_results, batch_skipped = await nlp_svc.analyze_batch(
160
+ chunk_texts,
161
+ highlights_collector=highlights_collector,
162
+ categories=["recent"],
163
+ )
164
+ nlp_cumulative_s += time.monotonic() - nlp_start
165
+ if batch_results:
166
+ delta_topics = aggregate_topics(delta_topics, batch_results)
167
+ total_skipped += batch_skipped
168
+
169
+ processed += len(chunk_texts)
170
+
171
+ progress = AnalysisProgress(
172
+ processed=processed,
173
+ total=len(new_texts),
174
+ current_topics=delta_topics,
175
+ skipped_count=total_skipped,
176
+ )
177
+ yield {"event": "progress", "data": progress.model_dump_json()}
178
+
179
+ new_general = aggregate_topics(old_general, delta_topics)
180
+
181
+ old_recent_count = old_results.get("recent_reviews_count", 0)
182
+ new_count = len(new_texts)
183
+
184
+ if (
185
+ old_recent_count + new_count > settings.recent_sample_limit
186
+ and old_recent
187
+ and old_recent_count > 0
188
+ ):
189
+ overflow = old_recent_count + new_count - settings.recent_sample_limit
190
+ retain_ratio = max(0.2, 1.0 - overflow / old_recent_count)
191
+ scaled_old = scale_topics(old_recent, retain_ratio)
192
+ new_recent = aggregate_topics(scaled_old, delta_topics)
193
+ recent_count = int(old_recent_count * retain_ratio) + new_count
194
+ else:
195
+ new_recent = aggregate_topics(old_recent, delta_topics) if old_recent else delta_topics
196
+ recent_count = old_recent_count + new_count
197
+
198
+ last_patch_topics = old_last_patch
199
+ last_patch_count = old_last_patch_count
200
+
201
+ if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
202
+ last_patch_topics = old_current_patch if old_current_patch else None
203
+ last_patch_count = old_results.get("current_patch_reviews_count", 0)
204
+ old_current_patch = []
205
+
206
+ new_current_patch = (
207
+ aggregate_topics(old_current_patch, delta_current_patch_topics)
208
+ if old_current_patch
209
+ else (delta_current_patch_topics if delta_current_patch_topics else [])
210
+ )
211
+ base_current_patch_count = (
212
+ 0
213
+ if (patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts)
214
+ else old_results.get("current_patch_reviews_count", 0)
215
+ )
216
+ new_current_patch_count = base_current_patch_count + delta_current_patch_count
217
+ has_current_patch = patch_timestamp is not None and (
218
+ new_current_patch_count > 0 or bool(old_current_patch)
219
+ )
220
+
221
+ # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
222
+ new_general = filter_topics_by_min_mentions(new_general)
223
+ new_recent = filter_topics_by_min_mentions(new_recent)
224
+ new_current_patch = filter_topics_by_min_mentions(new_current_patch)
225
+
226
+ prediction = calculate_prediction(new_general)
227
+
228
+ highlights_data = highlights_collector.compute_highlights()
229
+ general_highlights = highlights_data["general"]
230
+ recent_highlights = highlights_data["recent"]
231
+ current_patch_highlights = highlights_data["current_patch"]
232
+ topic_highlights_dict = highlights_data["topics"]
233
+
234
+ # Restrict topic highlights to topics that survived the min-mentions filter,
235
+ # so the topic_highlights set is always consistent with general_topics.
236
+ _surviving_topics = {t.topic for t in new_general}
237
+ topic_highlights_list = [
238
+ TopicHighlights(
239
+ topic=topic,
240
+ highlights=[Highlight(**highlight) for highlight in highlights],
241
+ )
242
+ for topic, highlights in topic_highlights_dict.items()
243
+ if topic in _surviving_topics
244
+ ]
245
+
246
+ merged_review_ids = old_review_ids + new_review_ids
247
+
248
+ analysis_generated_at = datetime.now(timezone.utc)
249
+ result = AnalysisResult(
250
+ game=game,
251
+ general_topics=new_general,
252
+ recent_topics=new_recent,
253
+ recent_reviews_count=recent_count,
254
+ current_patch_topics=new_current_patch if has_current_patch else None,
255
+ current_patch_reviews_count=new_current_patch_count if has_current_patch else 0,
256
+ last_patch_topics=last_patch_topics,
257
+ last_patch_reviews_count=last_patch_count,
258
+ current_patch_timestamp=patch_timestamp,
259
+ analysis_date=analysis_generated_at,
260
+ current_patch_date=datetime_from_timestamp(patch_timestamp),
261
+ prediction=prediction,
262
+ analyzed_reviews=old_results.get("analyzed_reviews", 0) + processed,
263
+ skipped_count=old_results.get("skipped_count", 0) + total_skipped,
264
+ general_highlights=[Highlight(**highlight) for highlight in general_highlights],
265
+ recent_highlights=[Highlight(**highlight) for highlight in recent_highlights] if recent_highlights else None,
266
+ current_patch_highlights=[Highlight(**highlight) for highlight in current_patch_highlights] if current_patch_highlights else None,
267
+ topic_highlights=topic_highlights_list,
268
+ cached_at=analysis_generated_at,
269
+ preferred_context=compute_preferred_context(patch_timestamp),
270
+ freshness_status=FreshnessStatus.FRESH.value,
271
+ is_refreshing=False,
272
+ )
273
+ await mongodb.save_analysis(
274
+ game.app_id,
275
+ result.model_dump(),
276
+ analyzed_review_ids=merged_review_ids,
277
+ latest_review_timestamp=latest_timestamp,
278
+ ttl_hours=ttl_hours,
279
+ analyzed_at=analysis_generated_at,
280
+ )
281
+
282
+ if get_structured_logger():
283
+ log_structured(
284
+ "incremental_analysis_complete",
285
+ app_id=game.app_id,
286
+ game_name=game.name if hasattr(game, "name") else str(game.app_id),
287
+ source=source,
288
+ reviews_processed=processed,
289
+ topics_found=len(new_general),
290
+ detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)},
291
+ )
292
+
293
+ yield {"event": "complete", "data": result.model_dump_json()}
294
+
295
+
296
async def run_incremental_analysis(
    app_id: str,
    game_name: str,
    steam_svc: SteamService,
    nlp_svc: NLPService,
) -> dict[str, Any] | None:
    """Run a non-SSE incremental analysis for worker jobs.

    Drives iter_incremental_analysis_events() and keeps only the final
    "complete" event payload. Falls back to run_full_analysis() when no
    usable prior analysis exists or the review gap is too large.

    Args:
        app_id: Steam application id of the game to analyze.
        game_name: Human-readable game name (used for logging and stub fallback).
        steam_svc: Steam API service used for game info and review fetching.
        nlp_svc: NLP service used for review analysis.

    Returns:
        The parsed "complete" event payload dict, or None on error (or if the
        event stream never emitted a "complete" event).
    """
    slog = get_structured_logger()

    try:
        stale_doc = await mongodb.get_analysis(app_id)
        # Incremental analysis requires a prior result plus the set of review
        # ids already analyzed; without both, run the full pipeline instead.
        if not stale_doc or not stale_doc.get("results") or not stale_doc.get("analyzed_review_ids"):
            return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc)

        # Long gap guard: if the most recent review we have is too old, Steam's cursor-based
        # API may not reliably surface all reviews since then. Fall back to full analysis.
        latest_ts = stale_doc.get("latest_review_timestamp", 0)
        if latest_ts > 0:
            gap_days = (time.time() - latest_ts) / 86400
            if gap_days > settings.incremental_max_gap_days:
                logger.info(
                    f"Incremental gap {gap_days:.0f}d > {settings.incremental_max_gap_days}d "
                    f"for {app_id} ({game_name}) — falling back to full analysis"
                )
                return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc)

        game = await steam_svc.get_game_info(app_id)
        if not game:
            # Steam lookup failed: reuse the cached game blob if present,
            # otherwise construct a minimal stub from the known id/name.
            cached_game = stale_doc.get("results", {}).get("game")
            if isinstance(cached_game, dict):
                game = GameInfo(**cached_game)
            else:
                game = GameInfo(app_id=app_id, name=game_name)

        patch_date = await mongodb.get_game_patch_date(app_id)
        patch_timestamp = int(patch_date.timestamp()) if patch_date else None
        if patch_timestamp:
            game = game.model_copy(update={"last_game_update_at": patch_timestamp})

        # Drain the SSE-style event stream; only the "complete" event carries
        # the final result payload. Progress events are discarded here.
        final_payload: dict[str, Any] | None = None
        async for event in iter_incremental_analysis_events(
            game,
            stale_doc,
            steam_svc,
            nlp_svc,
            patch_timestamp=patch_timestamp,
            source="worker",
        ):
            if event.get("event") == "complete":
                final_payload = json.loads(event["data"])

        return final_payload
    except Exception as e:
        logger.error(f"Incremental analysis runner error for {app_id} ({game_name}): {e}", exc_info=True)
        if slog:
            log_structured(
                "analysis_error",
                level=logging.ERROR,
                app_id=app_id,
                game_name=game_name,
                source="worker",
                error=str(e),
            )
        return None
360
+
361
+
362
async def run_full_analysis(
    app_id: str,
    game_name: str,
    steam_svc: SteamService,
    nlp_svc: NLPService,
    stale_doc: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
    """
    Run a full analysis for a game (no SSE, no streaming).

    Three timed phases:
      1. Setup: game info, review stats, sampling plan, patch timestamp.
      2. Fetch + analyze: producer-consumer loop (fetch task feeds a queue,
         this coroutine consumes batches and runs NLP per review).
      3. Save: aggregate filtering, highlights, result assembly, MongoDB save.

    Args:
        app_id: Steam application id.
        game_name: Human-readable game name (logging only).
        steam_svc: Steam API service.
        nlp_svc: NLP analysis service.
        stale_doc: Previously persisted analysis doc, if any — used only to
            carry forward last_patch_topics across a patch change.

    Returns:
        Analysis result dict, or None on error.
    """
    slog = get_structured_logger()

    try:
        # Phase 1: Setup — game info + review stats + sample plan
        async with AsyncTimingContext() as t_setup:
            # 1. Get game info
            game = await steam_svc.get_game_info(app_id)
            if not game:
                logger.warning(f"Analysis runner: game info not found for {app_id}")
                return None

            # 2. Get review stats
            stats = await steam_svc.get_review_stats(app_id)
            if stats.total == 0:
                logger.warning(f"Analysis runner: no reviews for {app_id}")
                return None

            # 3. Create sample plan
            sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
            ttl_hours = await get_ttl_hours(app_id)

            # 3b. Fetch game patch date for current_patch splitting
            patch_date = await mongodb.get_game_patch_date(app_id)
            patch_timestamp = int(patch_date.timestamp()) if patch_date else None
            if patch_timestamp and isinstance(game, GameInfo):
                game = game.model_copy(update={"last_game_update_at": patch_timestamp})

        # Phase 2: Fetch + Analyze — producer-consumer loop
        nlp_cumulative_s: float = 0.0

        async with AsyncTimingContext() as t_fetch_analyze:
            # 4. Producer-consumer fetch + analyze.
            # Queue protocol: review batches, then an Exception on fetch error,
            # then a None sentinel (always enqueued, even after an error).
            queue: asyncio.Queue = asyncio.Queue(maxsize=5)

            async def fetch_worker():
                try:
                    async for batch in steam_svc.fetch_reviews_stratified(app_id, sample_plan):
                        await queue.put(batch)
                except Exception as e:
                    await queue.put(e)
                finally:
                    await queue.put(None)

            fetch_task = asyncio.create_task(fetch_worker())

            processed = 0
            total_skipped = 0
            aggregated_topics: list[TopicSentiment] = []
            recent_processed = 0
            recent_limit = settings.recent_sample_limit
            all_review_ids: list[str] = []
            latest_timestamp = 0
            highlights_collector = HighlightsCollector()
            current_patch_topics: list[TopicSentiment] = []
            current_patch_count = 0
            # (review timestamp, per-review topic results) — later sorted to
            # rebuild the "recent" window from the newest reviews.
            review_topic_results: list[tuple[int, list[TopicSentiment]]] = []

            try:
                while True:
                    item = await queue.get()

                    if item is None:
                        break
                    if isinstance(item, Exception):
                        raise item

                    batch = item
                    if not batch.reviews:
                        continue

                    # Track review ids and the newest review timestamp seen.
                    for ri in batch.review_items:
                        all_review_ids.append(ri.recommendation_id)
                        if ri.timestamp_created > latest_timestamp:
                            latest_timestamp = ri.timestamp_created

                    batch_skipped = 0
                    if patch_timestamp and batch.review_items:
                        # Patch-aware path: reviews created on/after the patch
                        # additionally feed the current_patch aggregate.
                        for ri, text in zip(batch.review_items, batch.reviews):
                            is_recent = recent_processed < recent_limit
                            cat = []
                            if is_recent:
                                cat.append("recent")

                            if ri.timestamp_created >= patch_timestamp:
                                cat.append("current_patch")
                                nlp_start = time.monotonic()
                                res, skipped = await nlp_svc.analyze_batch(
                                    [text], highlights_collector=highlights_collector, categories=cat
                                )
                                nlp_cumulative_s += time.monotonic() - nlp_start
                                batch_skipped += skipped
                                if res:
                                    aggregated_topics = aggregate_topics(aggregated_topics, res)
                                    current_patch_topics = aggregate_topics(current_patch_topics, res)
                                    review_topic_results.append((ri.timestamp_created, res))
                                current_patch_count += 1
                            else:
                                nlp_start = time.monotonic()
                                res, skipped = await nlp_svc.analyze_batch(
                                    [text], highlights_collector=highlights_collector, categories=cat
                                )
                                nlp_cumulative_s += time.monotonic() - nlp_start
                                batch_skipped += skipped
                                if res:
                                    aggregated_topics = aggregate_topics(aggregated_topics, res)
                                    review_topic_results.append((ri.timestamp_created, res))
                            recent_processed += 1
                    else:
                        # No patch timestamp (or no review metadata): single path.
                        # NOTE(review): when review_items is absent, enumerate()
                        # yields (index, text) and timestamps fall back to 0.
                        for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews):
                            is_recent = recent_processed < recent_limit
                            cat = ["recent"] if is_recent else []

                            nlp_start = time.monotonic()
                            res, skipped = await nlp_svc.analyze_batch(
                                [text], highlights_collector=highlights_collector, categories=cat
                            )
                            nlp_cumulative_s += time.monotonic() - nlp_start
                            batch_skipped += skipped
                            ts = ri.timestamp_created if batch.review_items else 0
                            if res:
                                aggregated_topics = aggregate_topics(aggregated_topics, res)
                                review_topic_results.append((ts, res))
                            recent_processed += 1

                    total_skipped += batch_skipped
                    processed += len(batch.reviews)

                await fetch_task
            except BaseException:
                # Also covers CancelledError: stop the producer before re-raising.
                fetch_task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await fetch_task
                raise

        # Phase 3: Save — highlights + MongoDB save
        async with AsyncTimingContext() as t_save:
            # 5. Compute prediction + highlights

            # Build recent_topics from highest-timestamp reviews
            review_topic_results.sort(key=lambda x: x[0], reverse=True)
            recent_entries = review_topic_results[:recent_limit]
            recent_topics: list[TopicSentiment] = []
            for _, topics_batch in recent_entries:
                for ts in topics_batch:
                    recent_topics = aggregate_topics(recent_topics, [ts])
            recent_reviews_count = len(recent_entries)

            # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
            aggregated_topics = filter_topics_by_min_mentions(aggregated_topics)
            recent_topics = filter_topics_by_min_mentions(recent_topics)
            current_patch_topics = filter_topics_by_min_mentions(current_patch_topics)

            prediction = calculate_prediction(aggregated_topics)

            highlights_data = highlights_collector.compute_highlights()
            general_highlights = highlights_data["general"]
            recent_highlights = highlights_data["recent"]
            current_patch_highlights = highlights_data["current_patch"]
            topic_highlights_dict = highlights_data["topics"]

            # Restrict topic highlights to topics that survived the min-mentions filter,
            # so the topic_highlights set is always consistent with general_topics.
            _surviving_topics = {t.topic for t in aggregated_topics}
            topic_highlights_list = [
                TopicHighlights(
                    topic=topic,
                    highlights=[Highlight(**h) for h in highlights],
                )
                for topic, highlights in topic_highlights_dict.items()
                if topic in _surviving_topics
            ]

            has_recent_split = processed > recent_limit
            has_current_patch = patch_timestamp is not None and current_patch_count > 0
            analysis_generated_at = datetime.now(timezone.utc)
            current_patch_date = (
                datetime.fromtimestamp(patch_timestamp, tz=timezone.utc)
                if patch_timestamp is not None
                else None
            )

            # Archive last_patch_topics when full analysis replaces a doc with a different patch.
            last_patch_topics: list[TopicSentiment] | None = None
            last_patch_reviews_count = 0
            if stale_doc:
                old_r = normalize_legacy_results(stale_doc.get("results", {}))
                old_patch_ts = old_r.get("current_patch_timestamp")
                if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
                    # Patch changed: yesterday's current_patch becomes last_patch.
                    raw_cp = old_r.get("current_patch_topics")
                    last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None
                    last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0)
                else:
                    # Same patch: carry the old last_patch data forward unchanged.
                    raw_lp = old_r.get("last_patch_topics")
                    last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None
                    last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0)

            result = AnalysisResult(
                game=game,
                general_topics=aggregated_topics,
                recent_topics=recent_topics if has_recent_split else None,
                recent_reviews_count=recent_reviews_count if has_recent_split else 0,
                current_patch_topics=current_patch_topics if has_current_patch else None,
                current_patch_reviews_count=current_patch_count if has_current_patch else 0,
                last_patch_topics=last_patch_topics,
                last_patch_reviews_count=last_patch_reviews_count,
                current_patch_timestamp=patch_timestamp,
                analysis_date=analysis_generated_at,
                current_patch_date=current_patch_date,
                prediction=prediction,
                analyzed_reviews=processed,
                skipped_count=total_skipped,
                general_highlights=[Highlight(**h) for h in general_highlights],
                recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None,
                current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None,
                topic_highlights=topic_highlights_list,
                cached_at=analysis_generated_at,
                preferred_context=compute_preferred_context(patch_timestamp),
                freshness_status=FreshnessStatus.FRESH.value,
                is_refreshing=False,
            )

            # 6. Save to cache
            await mongodb.save_analysis(
                game.app_id,
                result.model_dump(),
                analyzed_review_ids=all_review_ids,
                latest_review_timestamp=latest_timestamp,
                ttl_hours=ttl_hours,
                analyzed_at=analysis_generated_at,
            )

        total_elapsed = t_setup.elapsed_s + t_fetch_analyze.elapsed_s + t_save.elapsed_s

        logger.info(
            f"Analysis runner: completed {app_id} ({game_name}) — "
            f"{processed} reviews, {len(aggregated_topics)} topics"
        )

        if slog:
            log_structured(
                "analysis_complete",
                app_id=app_id,
                game_name=game_name,
                elapsed_s=round(total_elapsed, 3),
                source="worker",
                breakdown={
                    "setup_s": t_setup.elapsed_s,
                    "fetch_analyze_s": t_fetch_analyze.elapsed_s,
                    "nlp_cumulative_s": round(nlp_cumulative_s, 3),
                    "save_s": t_save.elapsed_s,
                },
                reviews_processed=processed,
                topics_found=len(aggregated_topics),
            )

        return result.model_dump()

    except Exception as e:
        logger.error(f"Analysis runner error for {app_id} ({game_name}): {e}", exc_info=True)
        if slog:
            log_structured(
                "analysis_error",
                level=logging.ERROR,
                app_id=app_id,
                game_name=game_name,
                source="worker",
                error=str(e),
            )
        return None
backend/app/services/analysis_utils.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared analysis helpers used by both live and worker paths."""
2
+
3
+ import time
4
+ from datetime import datetime, timezone
5
+ from typing import Any
6
+
7
+ from app.core.config import settings
8
+ from app.models.schemas import (
9
+ PredictionType,
10
+ SentimentType,
11
+ TopicSentiment,
12
+ UserCountPrediction,
13
+ )
14
+
15
+
16
def calculate_prediction(topics: list[TopicSentiment]) -> UserCountPrediction:
    """Compute the player-count trend prediction from aggregated topics.

    Priority order: a strong Retention signal wins outright; next, poor
    technical sentiment (Bugs/Performance); next, Gameplay/Fun sentiment;
    otherwise the prediction is a low-confidence STABLE.
    """
    by_name = {entry.topic: entry for entry in topics}

    # 1. Retention dominates when it has enough mentions to be meaningful.
    retention = by_name.get("Retention")
    if retention is not None and retention.mention_count > 5:
        # Confidence grows with mention volume, capped at 0.95.
        retention_conf = min(0.95, 0.5 + (retention.mention_count / 100))
        if retention.score > settings.prediction_retention_threshold_pos:
            return UserCountPrediction(
                trend=PredictionType.INCREASING,
                confidence=retention_conf,
                reasoning="PREDICTION_REASONING_RETENTION_HIGH",
            )
        if retention.score < settings.prediction_retention_threshold_neg:
            return UserCountPrediction(
                trend=PredictionType.DECREASING,
                confidence=retention_conf,
                reasoning="PREDICTION_REASONING_RETENTION_LOW",
            )

    # 2. Average technical sentiment (Bugs + Performance, whichever exist).
    tech_topics = [t for t in (by_name.get("Bugs"), by_name.get("Performance")) if t is not None]
    if tech_topics and (sum(t.score for t in tech_topics) / len(tech_topics)) < -0.3:
        return UserCountPrediction(
            trend=PredictionType.DECREASING,
            confidence=0.75,
            reasoning="PREDICTION_REASONING_TECH_ISSUES",
        )

    # 3. Average enjoyment sentiment (Gameplay + Fun, whichever exist).
    enjoyment_topics = [t for t in (by_name.get("Gameplay"), by_name.get("Fun")) if t is not None]
    if enjoyment_topics:
        enjoyment_avg = sum(t.score for t in enjoyment_topics) / len(enjoyment_topics)
        if enjoyment_avg > 0.4:
            return UserCountPrediction(
                trend=PredictionType.INCREASING,
                confidence=0.8,
                reasoning="PREDICTION_REASONING_GAMEPLAY_HIGH",
            )
        if enjoyment_avg < -0.2:
            return UserCountPrediction(
                trend=PredictionType.DECREASING,
                confidence=0.6,
                reasoning="PREDICTION_REASONING_GAMEPLAY_LOW",
            )

    # 4. Nothing decisive: default to a low-confidence stable trend.
    return UserCountPrediction(
        trend=PredictionType.STABLE,
        confidence=0.5,
        reasoning="PREDICTION_REASONING_STABLE",
    )
86
+
87
+
88
def aggregate_topics(
    existing: list[TopicSentiment],
    new_batch: list[TopicSentiment],
) -> list[TopicSentiment]:
    """Merge topic aggregates using weighted mention counts.

    Scores are combined as a mention-weighted average, clamped to [-1, 1].
    Each topic keeps the example with the largest absolute example_score,
    but only surfaces it if it agrees with the final sentiment direction.
    Result is sorted by mention_count descending.
    """
    buckets: dict[str, dict[str, Any]] = {}

    def pick_example(
        current: tuple[str, float] | None,
        candidate: tuple[str, float] | None,
    ) -> tuple[str, float] | None:
        # Keep whichever example has the stronger absolute score; ties keep current.
        if candidate is None:
            return current
        if current is None:
            return candidate
        return candidate if abs(candidate[1]) > abs(current[1]) else current

    # Fold both input lists (existing first, then new) into per-topic buckets.
    for entry in (*existing, *new_batch):
        bucket = buckets.setdefault(entry.topic, {"weighted": 0.0, "count": 0, "example": None})
        bucket["weighted"] += entry.score * entry.mention_count
        bucket["count"] += entry.mention_count
        candidate = (
            (entry.example, entry.example_score)
            if entry.example and entry.example_score is not None
            else None
        )
        bucket["example"] = pick_example(bucket["example"], candidate)

    merged: list[TopicSentiment] = []
    for topic_name, bucket in buckets.items():
        mention_total = bucket["count"]
        if mention_total == 0:
            continue

        clamped_score = max(-1.0, min(1.0, bucket["weighted"] / mention_total))

        if clamped_score > settings.sentiment_positive_threshold:
            sentiment = SentimentType.POSITIVE
        elif clamped_score < settings.sentiment_negative_threshold:
            sentiment = SentimentType.NEGATIVE
        else:
            sentiment = SentimentType.NEUTRAL

        chosen_example = None
        chosen_example_score = None
        if bucket["example"]:
            example_text, candidate_score = bucket["example"]
            # Only attach the example when its polarity matches the aggregate
            # sentiment (neutral accepts anything).
            sign_matches = (
                sentiment == SentimentType.NEUTRAL
                or (sentiment == SentimentType.POSITIVE and candidate_score > 0)
                or (sentiment == SentimentType.NEGATIVE and candidate_score < 0)
            )
            if sign_matches:
                chosen_example = example_text
                chosen_example_score = candidate_score

        merged.append(
            TopicSentiment(
                topic=topic_name,
                sentiment=sentiment,
                score=round(clamped_score, 3),
                mention_count=mention_total,
                example=chosen_example,
                example_score=chosen_example_score,
            )
        )

    merged.sort(key=lambda item: item.mention_count, reverse=True)
    return merged
177
+
178
+
179
def scale_topics(topics: list[TopicSentiment], factor: float) -> list[TopicSentiment]:
    """Scale mention counts for the approximate recent sliding window.

    Each topic's mention_count is multiplied by `factor` (truncated toward
    zero) and floored at 1 so no topic disappears from the window.
    """
    scaled: list[TopicSentiment] = []
    for topic in topics:
        adjusted_count = max(1, int(topic.mention_count * factor))
        scaled.append(topic.model_copy(update={"mention_count": adjusted_count}))
    return scaled
185
+
186
+
187
def filter_topics_by_min_mentions(
    topics: list[TopicSentiment],
    min_mentions: int | None = None,
) -> list[TopicSentiment]:
    """Filter topics below the minimum mention threshold.

    Preserves existing sort order. Only filters — does not modify score or
    sentiment. Applied at the final aggregate level, never per-review.

    Args:
        topics: Aggregated topic list to filter.
        min_mentions: Explicit threshold; defaults to settings.topic_min_mentions.
    """
    if min_mentions is None:
        min_mentions = settings.topic_min_mentions
    return [topic for topic in topics if topic.mention_count >= min_mentions]
198
+
199
+
200
def compute_preferred_context(patch_timestamp: int | None) -> str:
    """Choose the default user-facing context tab.

    Returns 'current_patch' only when a recent enough major patch exists
    (age within settings.patch_context_max_age_days); otherwise 'general'
    so the UI defaults to the full-picture view.
    """
    if patch_timestamp is None:
        return "general"
    age_days = (time.time() - patch_timestamp) / 86400
    return "general" if age_days > settings.patch_context_max_age_days else "current_patch"
212
+
213
+
214
# Renames applied to documents persisted under the pre-"patch" schema.
_LEGACY_FIELD_MAP = {
    "topics": "general_topics",
    "historical_topics": "general_topics",
    "post_update_topics": "current_patch_topics",
    "post_update_reviews_count": "current_patch_reviews_count",
    "post_update_highlights": "current_patch_highlights",
    "previous_update_topics": "last_patch_topics",
    "previous_update_reviews_count": "last_patch_reviews_count",
    "last_update_timestamp": "current_patch_timestamp",
}


def normalize_legacy_results(results: dict[str, Any]) -> dict[str, Any]:
    """Map legacy persisted result fields to the current schema.

    Drops the retired "is_incremental" flag. When several legacy keys map to
    the same target, the first one encountered (in input order) wins.
    """
    normalized: dict[str, Any] = {}
    for key, value in results.items():
        if key == "is_incremental":
            continue
        target_key = _LEGACY_FIELD_MAP.get(key, key)
        normalized.setdefault(target_key, value)
    return normalized
235
+ return normalized
236
+
237
+
238
+ def serialize_datetime(value: Any) -> str | Any:
239
+ """Serialize datetimes in SSE payloads and persisted compatibility helpers."""
240
+ if isinstance(value, datetime):
241
+ return value.isoformat()
242
+ return value
243
+
244
+
245
+ def coerce_utc_datetime(value: Any) -> datetime | None:
246
+ """Coerce persisted datetime values into timezone-aware UTC datetimes."""
247
+ if isinstance(value, datetime):
248
+ return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
249
+ if isinstance(value, str):
250
+ parsed = datetime.fromisoformat(value)
251
+ return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
252
+ return None
253
+
254
+
255
+ def datetime_from_timestamp(timestamp: int | None) -> datetime | None:
256
+ """Convert a unix timestamp into UTC datetime."""
257
+ if timestamp is None:
258
+ return None
259
+ return datetime.fromtimestamp(timestamp, tz=timezone.utc)
backend/app/services/game_sync_service.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Game Sync Service — fetches game data from SteamSpy and upserts to MongoDB.
3
+
4
+ Replaces the manual scripts/fetch_games_to_mongodb.py with an automated,
5
+ rate-limited sync that runs as part of the Worker cycle.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ from datetime import datetime, timezone
11
+ from typing import Any
12
+
13
+ import httpx
14
+
15
+ from app.core.config import settings
16
+ from app.db.mongodb import mongodb
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ STEAMSPY_API_URL = "https://steamspy.com/api.php"
21
+ STEAM_STORE_API_URL = "https://store.steampowered.com/api"
22
+
23
+
24
class GameSyncService:
    """Syncs game data from SteamSpy into MongoDB.

    All network calls go through a single shared httpx.AsyncClient. The client
    may be injected (caller owns its lifetime) or lazily created here (closed
    by close()). Methods are sequential and sleep between requests to respect
    SteamSpy / Steam Store rate limits.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When no client is injected we create our own lazily and own its lifetime.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=30.0)
        return self._client

    async def close(self) -> None:
        """Close the HTTP client, but only if this service created it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def sync_all_games(self) -> tuple[int, int]:
        """
        Fetch all games from SteamSpy (paginated, up to 90 pages).

        Stops early on an empty page or any HTTP/request error; partial
        progress is kept since each page is upserted as it arrives.

        Returns:
            (total_upserted, total_modified)
        """
        client = await self._get_client()
        total_upserted = 0
        total_modified = 0
        now = datetime.now(timezone.utc)

        for page in range(90):
            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "all", "page": page},
                )
                resp.raise_for_status()
                data = resp.json()

                # An empty page means we've paged past the end of the catalog.
                if not data:
                    logger.info(f"SteamSpy page {page} empty — sync complete")
                    break

                games = self._parse_all_response(data, now)
                if games:
                    upserted, modified = await mongodb.upsert_games_batch(games)
                    total_upserted += upserted
                    total_modified += modified

                logger.info(
                    f"SteamSpy page {page}: {len(games)} games "
                    f"(upserted={total_upserted}, modified={total_modified})"
                )

            except httpx.HTTPStatusError as e:
                logger.error(f"SteamSpy HTTP error on page {page}: {e}")
                break
            except httpx.RequestError as e:
                logger.error(f"SteamSpy request error on page {page}: {e}")
                break

            # Rate limit: SteamSpy allows ~1 request per minute
            if page < 89:
                await asyncio.sleep(settings.game_sync_steamspy_delay)

        logger.info(
            f"Game sync complete: upserted={total_upserted}, modified={total_modified}"
        )
        return (total_upserted, total_modified)

    async def sync_top_game_details(self, limit: int | None = None) -> int:
        """
        Enrich top N games with detailed info (tags, genre, ccu) from SteamSpy.

        Args:
            limit: Number of games to enrich; defaults to settings.game_sync_top_n_details.

        Returns:
            Number of games enriched.
        """
        limit = limit or settings.game_sync_top_n_details
        client = await self._get_client()

        top_games = await mongodb.get_top_games_by_reviews(limit)
        enriched = 0

        for game in top_games:
            appid = game.get("appid", "")
            if not appid:
                continue

            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "appdetails", "appid": appid},
                )
                resp.raise_for_status()
                detail = resp.json()

                update = self._parse_detail_response(detail)
                if update:
                    # NOTE(review): assumes each top-games doc has a "name"
                    # field — a missing key would raise KeyError here; verify
                    # against get_top_games_by_reviews().
                    await mongodb.upsert_game({"appid": appid, "name": game["name"], **update})
                    enriched += 1

            except httpx.HTTPStatusError as e:
                logger.warning(f"SteamSpy detail error for {appid}: {e}")
            except httpx.RequestError as e:
                logger.warning(f"SteamSpy detail request error for {appid}: {e}")

            await asyncio.sleep(settings.game_sync_details_delay)

        logger.info(f"Enriched {enriched}/{len(top_games)} games with details")
        return enriched

    async def enrich_cn_names(self, limit: int | None = None) -> int:
        """
        Enrich games with Chinese names from Steam Store API.

        Marks each game as checked whether or not a translation was found, so
        the same game is not re-queried every cycle; network errors skip the
        mark so the game is retried next cycle.

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_cn_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_without_cn_name(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            name_en = game.get("name")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                if app_data and app_data.get("success"):
                    info = app_data.get("data", {})
                    name_cn = info.get("name")

                    # If names are different, we found a translation
                    if name_cn and name_cn != name_en:
                        await mongodb.mark_cn_name_checked(appid, name_cn)
                    else:
                        await mongodb.mark_cn_name_checked(appid)
                else:
                    # Not found or error in API - still mark as checked
                    await mongodb.mark_cn_name_checked(appid)

                processed += 1

            except httpx.HTTPError as e:
                logger.warning(f"Error fetching CN name for {appid}: {e}")
                # Don't mark as checked on network error, try again next cycle

            # Respect rate limits
            await asyncio.sleep(settings.game_sync_cn_enrichment_delay)

        logger.info(f"Enriched CN names for {processed}/{len(games_to_check)} games")
        return processed

    async def enrich_app_types(self, limit: int | None = None) -> int:
        """
        Enrich app_type/parent_appid using Steam Store appdetails.

        Games are marked checked even when the Store lookup fails logically
        (non-success payload), falling back to app_type="unknown".

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_app_type_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_missing_app_type(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                info = app_data.get("data", {}) if app_data and app_data.get("success") else {}

                parsed = self._parse_store_type_response(info)
                await mongodb.mark_app_type_checked(
                    appid,
                    app_type=parsed["app_type"],
                    parent_appid=parsed["parent_appid"],
                )
                processed += 1

            except httpx.HTTPError as e:
                logger.warning(f"Error fetching app type for {appid}: {e}")

            await asyncio.sleep(settings.game_sync_app_type_enrichment_delay)

        logger.info(f"Enriched app types for {processed}/{len(games_to_check)} games")
        return processed

    @staticmethod
    def _parse_all_response(
        data: dict[str, Any], synced_at: datetime
    ) -> list[dict[str, Any]]:
        """Parse SteamSpy 'all' response into list of game dicts.

        Entries without a name are dropped; appids are normalized to strings.
        """
        games: list[dict[str, Any]] = []
        for appid_str, info in data.items():
            name = info.get("name", "")
            if not name:
                continue

            games.append({
                "appid": str(appid_str),
                "name": name,
                "developer": info.get("developer", ""),
                "publisher": info.get("publisher", ""),
                "positive": info.get("positive", 0),
                "negative": info.get("negative", 0),
                "synced_at": synced_at,
            })
        return games

    @staticmethod
    def _parse_detail_response(detail: dict[str, Any]) -> dict[str, Any]:
        """Parse SteamSpy 'appdetails' response into enrichment fields.

        Returns only the fields actually present in the payload; an empty
        dict means there is nothing to enrich.
        """
        update: dict[str, Any] = {}

        tags = detail.get("tags")
        if isinstance(tags, dict) and tags:
            # Sort by vote count descending, keep top 20 tag names
            sorted_tags = sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]
            update["tags"] = [tag_name for tag_name, _ in sorted_tags]

        genre = detail.get("genre")
        if genre:
            update["genre"] = genre

        ccu = detail.get("ccu")
        if ccu is not None:
            update["ccu"] = ccu

        return update

    @staticmethod
    def _parse_store_type_response(info: dict[str, Any]) -> dict[str, Any]:
        """Extract app_type and (for DLC) the parent game's appid from a Store payload."""
        app_type = info.get("type") or "unknown"
        fullgame = info.get("fullgame")

        # Only DLC entries carry a meaningful parent reference ("fullgame").
        parent_appid = None
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])

        return {
            "app_type": str(app_type),
            "parent_appid": parent_appid,
        }

    @staticmethod
    async def _fetch_store_app_data(
        client: httpx.AsyncClient, appid: str
    ) -> dict[str, Any] | None:
        """Fetch one appdetails payload from Steam Store.

        Requests simplified-Chinese localization (l=schinese, cc=CN); the
        Store response is keyed by appid, so we return just that entry.
        Raises httpx.HTTPStatusError on non-2xx responses.
        """
        resp = await client.get(
            f"{STEAM_STORE_API_URL}/appdetails",
            params={
                "appids": appid,
                "l": "schinese",
                "cc": "CN",
            },
        )
        resp.raise_for_status()
        data = resp.json()
        return data.get(str(appid))
backend/app/services/highlights_service.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Serwis ekstrakcji Community Highlights z recenzji.
3
+ Uzywa n-gramow (2-5 tokenow) + TF-IDF do identyfikacji najczesciej uzywanych fraz.
4
+ """
5
+
6
+ import math
7
+ from collections import Counter, defaultdict
8
+ from typing import Any
9
+
10
+ import jieba
11
+
12
+ from app.core.config import settings
13
+ from app.core.stopwords_zh import is_stopword
14
+
15
+
16
class HighlightsCollector:
    """
    Stateful collector — incrementally accumulates n-gram statistics across the
    whole analysis cycle to conserve RAM. Highlights are computed once, at the
    end, via compute_highlights().
    """

    def __init__(self) -> None:
        # Per-topic and per-category n-gram frequency counters.
        self._topic_ngrams: dict[str, Counter] = defaultdict(Counter)
        self._category_ngrams: dict[str, Counter] = defaultdict(Counter)
        # Global term frequency of each n-gram across all sentences.
        self._global_counts: Counter = Counter()
        # Document frequency: number of distinct reviews containing the n-gram.
        self._ngram_doc_freq: Counter = Counter()
        # Running sum and count of sentence sentiment scores per n-gram.
        self._ngram_sentiment_sum: dict[str, float] = defaultdict(float)
        self._ngram_sentiment_count: Counter = Counter()
        self._review_count = 0
        # N-grams already seen in the review currently being processed, so
        # document frequency is incremented at most once per review.
        self._current_review_seen_ngrams: set[str] = set()

    def start_review(self) -> None:
        """Signal the start of a new review (used for Document Frequency)."""
        self._review_count += 1
        self._current_review_seen_ngrams = set()

    def add_sentence(
        self,
        review_idx: int,  # Kept for compatibility; use start_review() to separate reviews
        sentence: str,
        topics: list[str],
        sentiment_score: float,
        categories: list[str] | None = None,
    ) -> None:
        """Called once per sentence during analyze_batch()."""
        # Simple ASCII detection for English phrases (avoids jieba mis-segmentation)
        is_ascii = all(ord(c) < 128 for c in sentence)
        if is_ascii:
            words = [w for w in sentence.split() if not is_stopword(w) and len(w.strip()) > 0]
        else:
            words = [w for w in jieba.lcut(sentence) if not is_stopword(w) and len(w.strip()) > 0]

        if len(words) < 2:
            return

        for n in range(settings.highlights_ngram_min, settings.highlights_ngram_max + 1):
            for i in range(len(words) - n + 1):
                ngram = " ".join(words[i : i + n])

                # 1. Global counters
                self._global_counts[ngram] += 1
                self._ngram_sentiment_sum[ngram] += sentiment_score
                self._ngram_sentiment_count[ngram] += 1

                # 2. Incremental Document Frequency (once per review)
                if ngram not in self._current_review_seen_ngrams:
                    self._ngram_doc_freq[ngram] += 1
                    self._current_review_seen_ngrams.add(ngram)

                # 3. Topic and category counters
                for topic in topics:
                    self._topic_ngrams[topic][ngram] += 1
                if categories:
                    for category in categories:
                        self._category_ngrams[category][ngram] += 1

        # NOTE(review): this fires for every sentence of each 500th review, so
        # pruning may run several times within that review — confirm intended.
        if self._review_count % 500 == 0:
            self._prune_singletons()

    def _prune_singletons(self) -> None:
        """Deep-prune n-grams with count<=1 (memory saving)."""
        singletons = [k for k, v in self._global_counts.items() if v <= 1]
        for k in singletons:
            del self._global_counts[k]
            if k in self._ngram_sentiment_sum:
                del self._ngram_sentiment_sum[k]
                del self._ngram_sentiment_count[k]
                del self._ngram_doc_freq[k]

            # Prune from topic counters
            for topic in self._topic_ngrams:
                if k in self._topic_ngrams[topic]:
                    del self._topic_ngrams[topic][k]

            # Prune from category counters
            for cat in self._category_ngrams:
                if k in self._category_ngrams[cat]:
                    del self._category_ngrams[cat][k]

    def compute_highlights(self) -> dict[str, Any]:
        """
        Compute the final highlights once the analysis cycle has finished.
        """
        if self._review_count == 0:
            return {
                "general": [],
                "recent": [],
                "current_patch": [],
                "topics": {}
            }

        results: dict[str, Any] = {
            "general": self._compute_tfidf_highlights(
                self._global_counts,
                top_n=settings.highlights_top_n_general,
            ),
            "recent": self._compute_tfidf_highlights(
                self._category_ngrams.get("recent", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "current_patch": self._compute_tfidf_highlights(
                self._category_ngrams.get("current_patch", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "topics": {}
        }

        for topic, counter in self._topic_ngrams.items():
            h = self._compute_tfidf_highlights(
                counter,
                top_n=settings.highlights_top_n_per_topic,
            )
            if h:
                results["topics"][topic] = h

        return results

    def _compute_tfidf_highlights(self, counter: Counter, top_n: int) -> list[dict]:
        """TF-IDF scoring + filtering + dedup."""
        candidates = []
        n = self._review_count
        total_count = sum(counter.values()) if counter.values() else 1

        for ngram, count in counter.items():
            # NOTE: df is the GLOBAL document frequency, even for per-topic counters.
            df = self._ngram_doc_freq.get(ngram, 0)

            # Drop rare phrases and near-ubiquitous ones (low signal).
            if df < settings.highlights_min_mentions:
                continue
            if df / n > settings.highlights_max_doc_freq_ratio:
                continue

            idf = math.log(n / df) if df > 0 else 0
            tf = count / total_count
            tfidf = tf * idf
            rank_score = count * tfidf

            # Compute mean sentiment from the running sum and count
            s_sum = self._ngram_sentiment_sum.get(ngram, 0.0)
            s_count = self._ngram_sentiment_count.get(ngram, 0)
            avg_score = s_sum / s_count if s_count > 0 else 0.0

            candidates.append({
                "phrase": ngram,
                "mention_count": df,
                "score": round(avg_score, 3),
                "sentiment": (
                    "positive" if avg_score > settings.sentiment_positive_threshold
                    else "negative" if avg_score < settings.sentiment_negative_threshold
                    else "neutral"
                ),
                "ngram_size": len(ngram.split()),
                "_rank": rank_score,
            })

        candidates.sort(key=lambda x: x["_rank"], reverse=True)

        # Substring absorption: a shorter phrase contained in a higher-ranked
        # phrase is dropped, unless their negation markers differ.
        absorbed: set[int] = set()
        for i, c in enumerate(candidates):
            if i in absorbed:
                continue
            for j in range(i + 1, len(candidates)):
                if j in absorbed:
                    continue
                if candidates[j]["phrase"] in c["phrase"]:
                    parent_has_neg = any(neg in c["phrase"] for neg in ["不", "没", "无"])
                    child_has_neg = any(neg in candidates[j]["phrase"] for neg in ["不", "没", "无"])
                    if parent_has_neg == child_has_neg:
                        absorbed.add(j)

        results = [c for i, c in enumerate(candidates) if i not in absorbed]

        # Re-sort by mention_count descending for display order.
        # TF-IDF sort above selected the top candidates; this ensures the final
        # list the UI receives is ordered from most-mentioned to least-mentioned,
        # with score and phrase as stable tie-breakers.
        results.sort(key=lambda x: (-x["mention_count"], -x["score"], x["phrase"]))

        for r in results[:top_n]:
            r.pop("_rank", None)

        return results[:top_n]
backend/app/services/nlp_service.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Serwis NLP do analizy sentymentu i modelowania tematów.
3
+
4
+ Architektura: Local Inference (CPU).
5
+ Wykorzystuje model Transformer (DistilBERT) uruchamiany bezpośrednio w aplikacji,
6
+ co eliminuje opóźnienia sieciowe i zapewnia deterministyczny czas wykonania.
7
+
8
+ Optymalizacje:
9
+ 1. Pre-kompilacja wzorców Regex (O(1) matching).
10
+ 2. Wykonywanie inferencji w Executorze (nie blokuje Event Loop).
11
+ 3. Batching zapytań do modelu (wykorzystanie instrukcji wektorowych CPU).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import asyncio
17
+ import logging
18
+ import re
19
+ from collections import OrderedDict, defaultdict
20
+ from concurrent.futures import ThreadPoolExecutor
21
+ from typing import TYPE_CHECKING
22
+
23
+ from pathlib import Path
24
+
25
+ import jieba
26
+ from transformers import AutoTokenizer, pipeline
27
+ from optimum.onnxruntime import ORTModelForSequenceClassification
28
+ from zhconv import convert
29
+
30
+ from app.core.config import settings
31
+ from app.core.keywords import EXCLUSIONS, TOPIC_KEYWORDS
32
+ from app.models.schemas import SentimentType, TopicSentiment
33
+
34
+ if TYPE_CHECKING:
35
+ from app.services.highlights_service import HighlightsCollector
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
# Token-context whitelists for the ambiguous single character "卡" ("lag"/"card"):
# prefixes that may precede it inside a token, and standalone previous tokens
# that validate it as a complaint about lag.
CARD_LAG_PREFIXES = frozenset({"不", "很", "好", "太", "真", "挺", "老", "总"})
CARD_STANDALONE_PREVIOUS_TOKENS = frozenset({"有点", "一直", "偶尔"})

# Unicode ranges for emoji and pictographic symbols.
# NOTE: the previous pattern "\U000024C2-\U0001F251" was far too broad and
# stripped Chinese characters! We now use precise emoji-only ranges.
EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Misc Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map
    "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U0000FE00-\U0000FE0F"  # Variation Selectors
    "]+",
    flags=re.UNICODE,
)

# Smart sentence splitting (supports English and Chinese)
# Chinese terminators: 。!?;
# English terminators: .!?
# Punctuation stripped when building deduplication keys (EN + ZH)
DEDUP_PUNCTUATION = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~。!?,、;:""''【】()《》~…·]')

# Splits on sentence-final punctuation and before adversative conjunctions
# (EN "but/however/..." and ZH "但是/然而/..."), so opposing clauses are scored
# separately. The inline annotations below are part of the re.VERBOSE pattern
# string itself and are ignored by the regex engine.
SENTENCE_SPLIT_PATTERN = re.compile(r"""
    (?<=[.!?。!?;])\s*  # Koniec zdania (EN + ZH punctuation)
    |  # LUB
    (?<=[a-z]),\s+  # Przecinek po literze + spacja...
    (?=but\b|however\b|although\b|though\b)  # ...przed spójnikiem przeciwstawnym (EN)
    |
    \s+(?=but\b|however\b|although\b|though\b)  # Spójnik bez przecinka (EN)
    |
    (?<=。|!|?|;)  # Po chińskiej interpunkcji (bez spacji)
    |
    (?=但是|然而|虽然|不过|可是)  # Przed chińskim spójnikiem przeciwstawnym
    """, re.VERBOSE | re.IGNORECASE)
78
+
79
+
80
class NLPService:
    """
    Hybrid NLP service:
    1. Keyword matching (regex) -> topic detection.
    2. DistilBERT (local ONNX model) -> sentiment analysis.
    """

    def __init__(self) -> None:
        """
        Initialise the ML pipeline and compile text patterns.
        The model is loaded once at application start (singleton pattern).

        Raises:
            FileNotFoundError: if the quantized ONNX model is missing.
            Exception: any model-loading failure is logged and re-raised.
        """
        logger.info("Inicjalizacja serwisu NLP (ONNX Optimized)...")

        # 0. Jieba user dict — gaming terminology
        userdict_path = Path(__file__).parent.parent / "core" / "jieba_userdict.txt"
        if userdict_path.exists():
            jieba.load_userdict(str(userdict_path))
            logger.info(f"Załadowano jieba user dict: {userdict_path}")

        # 1. Regex compilation
        # Keywords are merged into one efficient "automaton" (regex).
        # NOTE: \b does not work with Chinese characters, so ASCII keywords
        # use \b boundaries while Chinese keywords do not.
        self.topic_patterns = {}
        self.single_char_topic_keywords = {}
        self.exclusion_patterns = {}

        for topic, keyword_groups in TOPIC_KEYWORDS.items():
            ascii_keywords: list[str] = []
            chinese_keywords: list[str] = []
            chinese_single_char_keywords: list[str] = []

            for group_name, group in keyword_groups.items():
                for keyword in group:
                    if keyword.isascii():
                        ascii_keywords.append(keyword)
                    elif group_name == "single_char" and len(keyword) == 1:
                        # Ambiguous single Chinese characters get token-level
                        # validation instead of plain regex matching.
                        chinese_single_char_keywords.append(keyword)
                    else:
                        chinese_keywords.append(keyword)

            self.single_char_topic_keywords[topic] = chinese_single_char_keywords

            patterns = []
            if ascii_keywords:
                # Use word boundaries for ASCII keywords
                sorted_ascii = sorted(ascii_keywords, key=len, reverse=True)
                patterns.append(r'\b(' + '|'.join(re.escape(k) for k in sorted_ascii) + r')\b')
            if chinese_keywords:
                # No word boundaries for Chinese (they don't have spaces),
                # but prefer longer keywords so compounds win over partial overlaps.
                sorted_chinese = sorted(chinese_keywords, key=len, reverse=True)
                patterns.append('(' + '|'.join(re.escape(k) for k in sorted_chinese) + ')')

            if patterns:
                combined_pattern = '|'.join(patterns)
                self.topic_patterns[topic] = re.compile(combined_pattern, re.IGNORECASE)

        for keyword, exclusions in EXCLUSIONS.items():
            if exclusions:
                pattern_str = '|'.join(re.escape(e) for e in exclusions)
                self.exclusion_patterns[keyword] = re.compile(pattern_str, re.IGNORECASE)

        # 2. Load the ONNX model
        logger.info(f"Ładowanie modelu ONNX {settings.hf_sentiment_model}...")
        try:
            from onnxruntime import GraphOptimizationLevel, SessionOptions

            # OPTIMISATION FOR HF SPACES (shared CPU)
            # The free tier provides 2 vCPUs. Capping thread counts prevents
            # context switching and resource contention.
            session_options = SessionOptions()
            session_options.intra_op_num_threads = settings.nlp_onnx_intra_threads
            session_options.inter_op_num_threads = settings.nlp_onnx_inter_threads
            session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

            # Load pre-built quantized INT8 ONNX model (no PyTorch needed at runtime)
            quantized_path = Path(__file__).resolve().parent.parent.parent / "models" / "quantized"
            model_file = quantized_path / "model_quantized.onnx"
            if not model_file.exists():
                raise FileNotFoundError(
                    f"Quantized ONNX model not found at {model_file}. "
                    "Run 'python scripts/quantize_model.py' to generate it."
                )

            logger.info(f"Loading quantized INT8 model from {quantized_path}")
            model = ORTModelForSequenceClassification.from_pretrained(
                str(quantized_path),
                file_name="model_quantized.onnx",
                session_options=session_options,
            )
            tokenizer = AutoTokenizer.from_pretrained(str(quantized_path))

            self.classifier = pipeline(
                "sentiment-analysis",
                model=model,
                tokenizer=tokenizer,
                device="cpu",
            )

            logger.info("Model NLP ONNX ready: INT8 quantized, graph_optimization=ALL")
        except Exception as e:
            # Deliberate broad catch — model loading can fail with OSError, RuntimeError,
            # ONNX errors, HF Hub errors, etc. Always fatal, always re-raised.
            logger.error(f"Krytyczny błąd ładowania modelu ONNX: {e}")
            raise

        # Thread pool so heavy AI inference does not block the event loop
        self.executor = ThreadPoolExecutor(max_workers=1)

        # Sentiment cache: normalized_text -> (label_str, score)
        self._sentiment_cache: OrderedDict[str, tuple[str, float]] = OrderedDict()
        self._cache_maxsize = settings.dedup_cache_maxsize

    def clean_text(self, text: str) -> str:
        """Remove noise (emoji, excess whitespace) and normalise the text."""
        text = EMOJI_PATTERN.sub("", text)
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        max_len = settings.text_max_length
        return text[:max_len] if len(text) > max_len else text

    def _normalize_for_dedup(self, text: str) -> str:
        """Normalise a sentence into a deduplication key (word order preserved)."""
        text = DEDUP_PUNCTUATION.sub("", text).lower()
        text = re.sub(r"\s+", " ", text).strip()
        # Fold Traditional Chinese into Simplified so variants dedupe together.
        return convert(text, 'zh-cn')

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split a review into logical units (sentences/clauses)."""
        parts = SENTENCE_SPLIT_PATTERN.split(text)
        return [p.strip() for p in parts if p and p.strip()]

    def _has_negation(self, text: str, position: int) -> bool:
        """
        Detect a negation marker before a keyword (within the configured window).
        Useful for more precise aspect-level analysis in Chinese.
        """
        window = settings.nlp_negation_window
        left_context = text[max(0, position-window):position]
        return any(neg in left_context for neg in ["不", "没", "别", "无"])

    @staticmethod
    def _is_valid_single_char_token(keyword: str, token: str, previous_token: str | None) -> bool:
        """Validate a single-character Chinese keyword in its full-token context."""
        if keyword != "卡":
            return True
        if token == keyword:
            # Bare "卡" only counts after adverbs like "有点" ("a bit laggy").
            return previous_token is None or previous_token in CARD_STANDALONE_PREVIOUS_TOKENS
        return token.endswith(keyword) and token[:-1] in CARD_LAG_PREFIXES

    def _find_single_char_keyword_match(self, sentence: str, keywords: list[str]) -> tuple[int, str] | None:
        """Return the first valid match for a single-character Chinese keyword."""
        if not keywords:
            return None

        keyword_set = set(keywords)
        tokenized_sentence = list(jieba.tokenize(sentence))
        for index, (token, start, _) in enumerate(tokenized_sentence):
            previous_token = tokenized_sentence[index - 1][0] if index > 0 else None
            for offset, char in enumerate(token):
                if char not in keyword_set:
                    continue
                if self._is_valid_single_char_token(char, token, previous_token):
                    return start + offset, char
        return None

    def _detect_topics_regex(self, sentence: str) -> dict[str, bool]:
        """
        Fast topic detection using precompiled regexes.
        Complexity: O(N) in sentence length, independent of keyword count.

        Returns:
            Mapping of topic -> whether the matched keyword was negated.
        """
        detected = {}

        # TEMPORARY conversion to Simplified Chinese for matching only.
        # The original text (Traditional/Simplified) is preserved in storage,
        # while the keywords.py dictionary can stay in zh-cn.
        sentence_simp = convert(sentence, 'zh-cn')

        for topic in TOPIC_KEYWORDS:
            regex_match = None
            if topic in self.topic_patterns:
                regex_match = self.topic_patterns[topic].search(sentence_simp)

            single_char_match = self._find_single_char_keyword_match(
                sentence_simp,
                self.single_char_topic_keywords.get(topic, []),
            )

            matched_word: str | None = None
            match_start: int | None = None

            # Prefer whichever match occurs earlier in the sentence.
            if regex_match and single_char_match:
                if single_char_match[0] < regex_match.start():
                    match_start, matched_word = single_char_match
                else:
                    match_start = regex_match.start()
                    matched_word = regex_match.group(0).lower()
            elif regex_match:
                match_start = regex_match.start()
                matched_word = regex_match.group(0).lower()
            elif single_char_match:
                match_start, matched_word = single_char_match

            if matched_word is not None and match_start is not None:
                is_excluded = False

                # Drop the match when an exclusion phrase co-occurs.
                if matched_word in self.exclusion_patterns:
                    if self.exclusion_patterns[matched_word].search(sentence_simp):
                        is_excluded = True

                if not is_excluded:
                    negated = self._has_negation(sentence_simp, match_start)
                    detected[topic] = negated

        return detected

    def _run_inference(self, texts: list[str]) -> list[dict]:
        """Wrapper around the Hugging Face pipeline, executed in a worker thread."""
        # batch_size=16 optimises matrix ops on CPU (AVX)
        # truncation=True, max_length=512 keeps inputs within the ONNX position
        # limit (max_position_embeddings=512); the pipeline accounts for special
        # tokens automatically
        return self.classifier(texts, batch_size=16, truncation=True, max_length=512)

    @staticmethod
    def _map_label(label_str: str, score: float) -> tuple[SentimentType, float]:
        """Map a raw model label onto (SentimentType, signed score)."""
        label_lower = label_str.lower()
        if 'positive' in label_lower or 'label_1' in label_lower:
            return (SentimentType.POSITIVE, score)
        elif 'negative' in label_lower or 'label_0' in label_lower:
            return (SentimentType.NEGATIVE, -score)
        return (SentimentType.NEUTRAL, 0.0)

    def _cache_put(self, key: str, value: tuple[str, float]) -> None:
        """Insert into the LRU cache, evicting the oldest entries past the limit."""
        self._sentiment_cache[key] = value
        self._sentiment_cache.move_to_end(key)
        while len(self._sentiment_cache) > self._cache_maxsize:
            self._sentiment_cache.popitem(last=False)

    async def analyze_sentiment_batch(
        self, texts: list[str]
    ) -> list[tuple[SentimentType, float]]:
        """
        Async interface to sentiment analysis.
        Offloads computation to a worker thread without blocking the API.
        Uses an LRU cache to skip previously seen sentences.
        """
        cleaned_texts = [self.clean_text(t) for t in texts]
        norm_keys = [self._normalize_for_dedup(t) for t in cleaned_texts]

        # Split into cache hits and misses
        final_sentiments: list[tuple[SentimentType, float]] = [(SentimentType.NEUTRAL, 0.0)] * len(texts)
        miss_indices: list[int] = []  # indices in cleaned_texts that must go to the model
        miss_texts: list[str] = []

        for i, (cleaned, key) in enumerate(zip(cleaned_texts, norm_keys)):
            if not cleaned:
                continue
            cached = self._sentiment_cache.get(key)
            if cached is not None:
                self._sentiment_cache.move_to_end(key)
                final_sentiments[i] = self._map_label(cached[0], cached[1])
            else:
                miss_indices.append(i)
                miss_texts.append(cleaned)

        cache_hits = len(texts) - len(miss_texts)
        logger.debug(f"Cache: {cache_hits} hits, {len(miss_texts)} misses (cache size: {len(self._sentiment_cache)})")

        if not miss_texts:
            return final_sentiments

        # Run the model ONLY on cache misses.
        # FIX: get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() here is deprecated since Python 3.10.
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(self.executor, self._run_inference, miss_texts)

        for j, res in enumerate(results):
            original_idx = miss_indices[j]
            label_str = res['label']
            score = res['score']

            # Store the raw result in the cache
            self._cache_put(norm_keys[original_idx], (label_str, score))

            final_sentiments[original_idx] = self._map_label(label_str, score)

        return final_sentiments

    async def analyze_batch(
        self,
        reviews: list[str],
        highlights_collector: HighlightsCollector | None = None,
        categories: list[str] | None = None,
    ) -> tuple[list[TopicSentiment], int]:
        """
        Main entry point for processing a batch of reviews.
        Combines segmentation, topic detection, and sentiment analysis.

        Returns:
            (per-topic sentiment aggregates, number of skipped sentences).
        """
        if not reviews:
            return [], 0

        # Step 1: Pre-processing and selection of sentences to analyse
        sentiment_tasks = []
        skipped_sentences = 0

        for review_idx, review in enumerate(reviews):
            if highlights_collector:
                highlights_collector.start_review()

            cleaned = self.clean_text(review)
            if not cleaned or len(cleaned) < 5:
                continue

            sentences = self._split_into_sentences(cleaned)
            for sentence in sentences:
                topics_map = self._detect_topics_regex(sentence)
                if topics_map:
                    for topic, is_negated in topics_map.items():
                        sentiment_tasks.append((review_idx, topic, sentence, is_negated))
                else:
                    skipped_sentences += 1

        if not sentiment_tasks:
            return [], skipped_sentences

        # Step 2: Deduplication + sentiment analysis
        all_sentences = [task[2] for task in sentiment_tasks]

        # Dedup: normalise -> find unique -> run inference only on unique texts
        norm_keys = [self._normalize_for_dedup(s) for s in all_sentences]
        unique_map: dict[str, int] = {}  # normalized_key -> index in unique_texts
        unique_texts: list[str] = []

        for i, key in enumerate(norm_keys):
            if key not in unique_map:
                unique_map[key] = len(unique_texts)
                unique_texts.append(all_sentences[i])

        dedup_total = len(all_sentences)
        dedup_unique = len(unique_texts)
        dedup_pct = round((1 - dedup_unique / dedup_total) * 100) if dedup_total else 0
        logger.debug(f"Dedup: {dedup_total} -> {dedup_unique} sentences ({dedup_pct}% reduced)")

        unique_results = await self.analyze_sentiment_batch(unique_texts)

        # Map results from unique texts back to all sentences
        sentiment_results = [unique_results[unique_map[key]] for key in norm_keys]

        # Step 3: Aggregation
        # review_id -> topic -> list of scores
        review_topic_scores: dict[int, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
        # topic -> (sentence, score) - online selection of the best example
        topic_best_example: dict[str, tuple[str, float]] = {}

        for i, (review_idx, topic, sentence, is_negated) in enumerate(sentiment_tasks):
            _, score = sentiment_results[i]

            # BULLETPROOF PIPELINE: if a negation was detected (e.g. "don't like
            # the gameplay") but the model still returned a positive score,
            # flip the sign.
            if is_negated and score > 0:
                score = -score

            review_topic_scores[review_idx][topic].append(score)

            if highlights_collector:
                highlights_collector.add_sentence(
                    review_idx=review_idx,
                    sentence=sentence,
                    topics=[topic],
                    sentiment_score=score,
                    categories=categories,
                )

            # Online selection - keep the candidate with the larger |score|
            if len(sentence) > 20:
                current = topic_best_example.get(topic)
                if current is None or abs(score) > abs(current[1]):
                    topic_best_example[topic] = (sentence, score)

        # Global aggregation: mean per review -> global sum
        global_topic_stats: dict[str, dict[str, float]] = defaultdict(lambda: {"sum_score": 0.0, "count": 0.0})

        for review_idx, topics_data in review_topic_scores.items():
            for topic, scores in topics_data.items():
                avg_review_score = sum(scores) / len(scores)
                global_topic_stats[topic]["sum_score"] += avg_review_score
                global_topic_stats[topic]["count"] += 1.0

        # Step 4: Final formatting
        final_results: list[TopicSentiment] = []

        for topic_name, stats in global_topic_stats.items():
            count = int(stats["count"])
            if count == 0:
                continue

            avg_global_score = stats["sum_score"] / stats["count"]
            normalized_score = max(-1.0, min(1.0, avg_global_score))

            if normalized_score > settings.sentiment_positive_threshold:
                sentiment = SentimentType.POSITIVE
            elif normalized_score < settings.sentiment_negative_threshold:
                sentiment = SentimentType.NEGATIVE
            else:
                sentiment = SentimentType.NEUTRAL

            # Pick the best example and validate its polarity
            best_example = None
            example_score = None
            candidate = topic_best_example.get(topic_name)
            if candidate:
                ex_sentence, ex_score = candidate
                # Validation: the example must agree with the sentiment direction
                if sentiment == SentimentType.NEUTRAL or \
                   (sentiment == SentimentType.POSITIVE and ex_score > 0) or \
                   (sentiment == SentimentType.NEGATIVE and ex_score < 0):
                    best_example = ex_sentence
                    example_score = ex_score

            final_results.append(
                TopicSentiment(
                    topic=topic_name,
                    sentiment=sentiment,
                    score=round(normalized_score, 3),
                    mention_count=count,
                    example=best_example,
                    example_score=example_score,
                )
            )

        final_results.sort(key=lambda x: x.mention_count, reverse=True)
        return final_results, skipped_sentences
515
+
516
+
517
# Module-level singleton instance; populated lazily by get_nlp_service().
_nlp_service: "NLPService | None" = None


def get_nlp_service() -> "NLPService":
    """Return the shared NLPService, constructing it on first call (lazy singleton)."""
    global _nlp_service
    if _nlp_service is None:
        _nlp_service = NLPService()
    return _nlp_service
backend/app/services/precache_service.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pre-cache Service — schedules and executes background analyses for top games.
3
+
4
+ Creates refresh schedules with checkpoints (e.g. 6h, 12h, 24h after update)
5
+ and processes due analyses each cycle, prioritized by game popularity.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ from datetime import datetime, timedelta, timezone
11
+ from typing import Any
12
+
13
+ from app.core.config import settings
14
+ from app.db.mongodb import mongodb
15
+ from app.services.analysis_runner import run_full_analysis, run_incremental_analysis
16
+ from app.services.nlp_service import NLPService
17
+ from app.services.steam_service import SteamService
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class PreCacheService:
23
+ """Manages refresh schedules and triggers pre-cache analyses."""
24
+
25
    def __init__(
        self, steam_svc: SteamService, nlp_svc: NLPService
    ) -> None:
        # Injected service dependencies used when running analyses.
        self._steam_svc = steam_svc
        self._nlp_svc = nlp_svc
30
+
31
+ def create_schedule(
32
+ self, app_id: str, game_name: str, update_at: datetime, *, is_release: bool = False
33
+ ) -> dict[str, Any]:
34
+ """Build a schedule document with checkpoints from config."""
35
+ checkpoints = []
36
+ for offset_hours in settings.precache_checkpoints_list:
37
+ checkpoints.append({
38
+ "offset_hours": offset_hours,
39
+ "due_at": update_at + timedelta(hours=offset_hours),
40
+ "completed": False,
41
+ })
42
+
43
+ return {
44
+ "app_id": str(app_id),
45
+ "game_name": game_name,
46
+ "update_at": update_at,
47
+ "checkpoints": checkpoints,
48
+ "is_release": is_release,
49
+ "status": "active",
50
+ "created_at": datetime.now(timezone.utc),
51
+ }
52
+
53
+ def create_bootstrap_schedule(
54
+ self, app_id: str, game_name: str
55
+ ) -> dict[str, Any]:
56
+ """Release schedule for a newly prioritized game, starting at 6h."""
57
+ now = datetime.now(timezone.utc)
58
+ return self.create_schedule(app_id, game_name, now, is_release=True)
59
+
60
    async def create_schedules_for_updates(
        self, updated_games: list[dict[str, Any]]
    ) -> int:
        """Bulk-create schedules for games that received updates.

        Existing active schedules are only replaced when the incoming update
        is strictly newer, so in-flight checkpoints are not reset by stale data.

        Returns:
            Number of schedules created or replaced.
        """
        # Fetch all active schedules once for O(1) per-game lookup.
        active_schedules = await mongodb.get_active_schedules()
        active_by_app_id = {s["app_id"]: s for s in active_schedules}

        created = 0
        for game in updated_games:
            app_id = str(game.get("appid", ""))
            name = game.get("name", "")
            update_at = game.get("update_at", datetime.now(timezone.utc))

            existing = active_by_app_id.get(app_id)
            if existing:
                existing_update_at = existing.get("update_at")
                # NOTE(review): assumes both datetimes share tz-awareness —
                # a naive/aware mix would raise TypeError here; confirm upstream.
                if existing_update_at and update_at <= existing_update_at:
                    continue  # Same or older patch — don't reset checkpoints

            schedule = self.create_schedule(app_id, name, update_at)
            await mongodb.upsert_refresh_schedule(schedule)
            created += 1

        logger.info(f"Created {created} refresh schedules for updated games")
        return created
85
+
86
+ async def bootstrap_missing_analyses(
87
+ self, top_games: list[dict[str, Any]]
88
+ ) -> int:
89
+ """For top games with no cached analysis, create release schedules."""
90
+ # Pre-fetch active schedule app_ids for O(1) lookup
91
+ active_schedules = await mongodb.get_active_schedules()
92
+ scheduled_app_ids = {s["app_id"] for s in active_schedules}
93
+
94
+ created = 0
95
+ for game in top_games:
96
+ app_id = str(game.get("appid", ""))
97
+ if not app_id or app_id in scheduled_app_ids:
98
+ continue
99
+
100
+ # Check if analysis already cached
101
+ cached = await mongodb.get_cached_analysis(app_id)
102
+ if cached is not None:
103
+ continue
104
+
105
+ schedule = self.create_bootstrap_schedule(app_id, game.get("name", ""))
106
+ await mongodb.upsert_refresh_schedule(schedule)
107
+ scheduled_app_ids.add(app_id)
108
+ created += 1
109
+
110
+ logger.info(f"Bootstrap: created {created} release schedules")
111
+ return created
112
+
113
+ async def process_due_analyses(self) -> int:
114
+ """
115
+ Main processing loop: find due checkpoints, prioritize, execute.
116
+
117
+ Returns:
118
+ Number of analyses executed.
119
+ """
120
+ now = datetime.now(timezone.utc)
121
+ schedules = await mongodb.get_active_schedules()
122
+ max_per_cycle = settings.precache_max_analyses_per_cycle
123
+ delay = settings.precache_batch_delay_seconds
124
+
125
+ # Find one due checkpoint per game
126
+ due_items: list[dict[str, Any]] = []
127
+ for schedule in schedules:
128
+ for cp in schedule.get("checkpoints", []):
129
+ if cp.get("completed"):
130
+ continue
131
+ if cp["due_at"] <= now:
132
+ due_items.append({
133
+ "app_id": schedule["app_id"],
134
+ "game_name": schedule.get("game_name", ""),
135
+ "offset_hours": cp["offset_hours"],
136
+ "due_at": cp["due_at"],
137
+ "positive": schedule.get("positive", 0),
138
+ "negative": schedule.get("negative", 0),
139
+ })
140
+ break # Only first due checkpoint per game
141
+
142
+ if not due_items:
143
+ logger.info("Pre-cache: no due analyses")
144
+ return 0
145
+
146
+ # Sort by popularity DESC, then due_at ASC
147
+ due_items.sort(
148
+ key=lambda x: (-(x.get("positive", 0) + x.get("negative", 0)), x["due_at"])
149
+ )
150
+
151
+ # Execute up to max_per_cycle
152
+ executed = 0
153
+ for item in due_items[:max_per_cycle]:
154
+ app_id = item["app_id"]
155
+ game_name = item["game_name"]
156
+ offset_hours = item["offset_hours"]
157
+
158
+ logger.info(f"Pre-cache: analyzing {app_id} ({game_name}) — checkpoint {offset_hours}h")
159
+
160
+ existing = await mongodb.get_analysis(app_id)
161
+ if existing and existing.get("results"):
162
+ result = await run_incremental_analysis(
163
+ app_id, game_name, self._steam_svc, self._nlp_svc
164
+ )
165
+ else:
166
+ result = await run_full_analysis(
167
+ app_id, game_name, self._steam_svc, self._nlp_svc
168
+ )
169
+
170
+ if result is not None:
171
+ executed += 1
172
+
173
+ # Mark checkpoint completed regardless of success
174
+ await mongodb.mark_checkpoint_completed(app_id, offset_hours)
175
+
176
+ # Check if all checkpoints done → complete schedule
177
+ await self._check_schedule_completion(app_id)
178
+
179
+ if executed < max_per_cycle and item != due_items[-1]:
180
+ await asyncio.sleep(delay)
181
+
182
+ logger.info(f"Pre-cache: executed {executed}/{len(due_items)} due analyses")
183
+ return executed
184
+
185
+ @staticmethod
186
+ async def _check_schedule_completion(app_id: str) -> None:
187
+ """If all checkpoints completed, mark schedule as completed."""
188
+ schedules = await mongodb.get_active_schedules()
189
+ for schedule in schedules:
190
+ if schedule["app_id"] != str(app_id):
191
+ continue
192
+ all_done = all(
193
+ cp.get("completed", False)
194
+ for cp in schedule.get("checkpoints", [])
195
+ )
196
+ if all_done:
197
+ await mongodb.complete_schedule(app_id)
198
+ logger.info(f"Schedule completed for {app_id}")
199
+ break
backend/app/services/priority_refresh_service.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Priority Refresh Service — maintains canonical priority game state in MongoDB.
3
+
4
+ Priority sources:
5
+ - top500: top 500 games by review count (local DB)
6
+ - top_sellers / new_releases / specials: Steam store featured categories
7
+
8
+ Priority state fields on games documents:
9
+ is_priority bool
10
+ priority_sources list[str]
11
+ priority_grace_until datetime | None
12
+ priority_last_confirmed_at datetime | None
13
+ """
14
+
15
+ import asyncio
16
+ import logging
17
+ from datetime import datetime, timedelta, timezone
18
+ from typing import Any
19
+
20
+ import httpx
21
+
22
+ from app.core.config import settings
23
+ from app.db.mongodb import mongodb
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class PriorityRefreshService:
    """Refreshes priority flags on the games collection each worker cycle.

    Active sources are the local top-500-by-reviews list plus Steam store
    featured categories. A game leaving all sources keeps priority for a
    grace period (settings.steam_priority_grace_days) before being demoted.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When no client is injected, one is created lazily and its
        # lifecycle is owned (closed) by this service.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the HTTP client, lazily creating an owned one on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=15.0)
        return self._client

    async def close(self) -> None:
        """Close the HTTP client, but only if this service created it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def refresh_priorities(self) -> dict[str, Any]:
        """
        Recompute is_priority for all games and write changes to MongoDB.

        Returns a summary dict with counts.
        """
        now = datetime.now(timezone.utc)
        grace_deadline = now + timedelta(days=settings.steam_priority_grace_days)

        # 1. Build active sources map (DLCs are excluded from top500)
        top500_ids: set[str] = {
            g["appid"]
            for g in await mongodb.get_top_games_by_reviews(500)
            if g.get("app_type") != "dlc"
        }

        category_ids: dict[str, set[str]] = await self._fetch_store_categories()

        # appid -> list of source labels ("top500" and/or category names)
        active_sources: dict[str, list[str]] = {}
        for appid in top500_ids:
            active_sources.setdefault(appid, []).append("top500")
        for cat_name, ids in category_ids.items():
            for appid in ids:
                active_sources.setdefault(appid, []).append(cat_name)

        # 1b. Bootstrap category games that are missing from the local DB.
        # top500 appids are safe — they come from existing DB records.
        # Category appids may reference games not yet in our DB.
        all_category_appids: set[str] = set()
        for ids in category_ids.values():
            all_category_appids.update(ids)

        bootstrap_summary: dict[str, Any] = {}
        if all_category_appids:
            _, bootstrap_summary = await self._bootstrap_missing_games(all_category_appids)
            # After bootstrap, remove from active_sources any category appid that
            # still has no DB record (failed bootstrap / delisted / per-cycle limit).
            # This prevents bulk_update_priority_fields from silently no-oping.
            existing_in_db = await mongodb.get_existing_appids(all_category_appids)
            for appid in all_category_appids - existing_in_db:
                active_sources.pop(appid, None)

        # 2. Load current priority state (only games that already have is_priority field)
        existing_priority_docs: list[dict[str, Any]] = []
        if mongodb.db is not None:
            try:
                collection = mongodb.db[mongodb.COLLECTION_GAMES]
                cursor = collection.find(
                    {"is_priority": {"$exists": True}},
                    {
                        "_id": 0,
                        "appid": 1,
                        "app_type": 1,
                        "is_priority": 1,
                        "priority_grace_until": 1,
                        "priority_sources": 1,
                    },
                )
                # Hard cap of 10k priority docs per cycle.
                existing_priority_docs = await cursor.to_list(length=10000)
            except Exception as e:
                # Best-effort: proceed with an empty map; every active game
                # will then simply count as newly-priority this cycle.
                logger.warning(f"Failed to load existing priority docs: {e}")

        existing_by_appid: dict[str, dict] = {
            str(d["appid"]): d for d in existing_priority_docs
        }

        # 2b. DLC inherits effective priority from its parent game.
        if settings.dlc_worker_analysis_enabled:
            priority_parent_ids: set[str] = set(active_sources.keys())
            for appid, doc in existing_by_appid.items():
                if doc.get("app_type") == "dlc":
                    continue
                if not doc.get("is_priority") or appid in active_sources:
                    continue

                # Parent still counts as priority while inside its grace window.
                # NOTE(review): assumes priority_grace_until round-trips as a
                # tz-aware datetime; a naive value from the driver would raise
                # on comparison with the aware `now` — confirm codec options.
                grace_until = doc.get("priority_grace_until")
                if grace_until is None or grace_until >= now:
                    priority_parent_ids.add(appid)

            for parent_appid in priority_parent_ids:
                dlcs = await mongodb.get_dlcs_by_parent_appid(parent_appid)
                for dlc in dlcs:
                    dlc_appid = str(dlc.get("appid", ""))
                    if dlc_appid:
                        # Inherited priority replaces (not extends) other sources.
                        active_sources[dlc_appid] = ["parent_priority"]

        # 2c. When DLC worker analysis is disabled, remove any DLC that entered
        # active_sources via other paths (e.g. Steam store categories).
        if not settings.dlc_worker_analysis_enabled:
            dlc_appids_to_remove = {
                appid
                for appid in active_sources
                if existing_by_appid.get(appid, {}).get("app_type") == "dlc"
            }
            for appid in dlc_appids_to_remove:
                del active_sources[appid]

        # 3. Compute updates
        updates: list[tuple[str, dict]] = []
        became_priority = 0
        entered_grace = 0
        expired_grace = 0
        reactivated = 0
        removed_parent_priority = 0

        # Active games — either new or confirming existing priority
        for appid, sources in active_sources.items():
            existing = existing_by_appid.get(appid)
            fields: dict[str, Any] = {
                "is_priority": True,
                "priority_sources": sources,
                "priority_grace_until": None,
                "priority_last_confirmed_at": now,
            }
            if existing is None or not existing.get("is_priority"):
                became_priority += 1
            elif existing.get("priority_grace_until") is not None:
                reactivated += 1
            updates.append((appid, fields))

        # Games that were priority but are no longer in any active source
        for appid, doc in existing_by_appid.items():
            if appid in active_sources:
                continue  # already handled above
            if not doc.get("is_priority"):
                continue  # already marked non-priority, skip

            # Inherited (parent_priority) flags get no grace period.
            if "parent_priority" in (doc.get("priority_sources") or []):
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                removed_parent_priority += 1
                continue

            grace_until = doc.get("priority_grace_until")

            if grace_until is None:
                # Just left all sources — start grace period
                updates.append((appid, {
                    "priority_grace_until": grace_deadline,
                    "priority_sources": [],
                }))
                entered_grace += 1
            elif grace_until < now:
                # Grace expired — remove priority
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                expired_grace += 1
            # else: still in grace and not expired — no update needed

        modified = await mongodb.bulk_update_priority_fields(updates)

        result = {
            "total_active": len(active_sources),
            "top500_count": len(top500_ids),
            "category_counts": {k: len(v) for k, v in category_ids.items()},
            "bootstrap": bootstrap_summary,
            "became_priority": became_priority,
            "reactivated": reactivated,
            "entered_grace": entered_grace,
            "expired_grace": expired_grace,
            "removed_parent_priority": removed_parent_priority,
            "db_modified": modified,
        }
        logger.info(f"Priority refresh complete: {result}")
        return result

    @staticmethod
    def _parse_app_type(data: dict[str, Any]) -> dict[str, Any]:
        """Parse app_type and parent_appid from an appdetails data block."""
        app_type = data.get("type") or "unknown"
        fullgame = data.get("fullgame")
        parent_appid = None
        # Only DLC entries carry a usable parent reference ("fullgame" block).
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])
        return {"app_type": str(app_type), "parent_appid": parent_appid}

    async def _fetch_app_details_bilingual(self, appid: str) -> dict[str, Any] | None:
        """
        Fetch appdetails for a single game in both english and schinese.

        Returns a minimal game dict (name, name_cn, app_type, parent_appid,
        header_image, cn_name_checked) or None on failure / not found.
        """
        client = await self._get_client()
        store_url = "https://store.steampowered.com/api/appdetails"

        async def _fetch_one(lang: str) -> dict[str, Any]:
            # Best-effort fetch for one language; {} on any failure.
            try:
                resp = await client.get(
                    store_url,
                    params={"appids": appid, "l": lang, "cc": settings.steam_region},
                )
                if resp.status_code != 200:
                    return {}
                entry = resp.json().get(str(appid))
                if entry and entry.get("success"):
                    return entry.get("data") or {}
                return {}
            except Exception as e:
                logger.warning(f"appdetails error for {appid} (lang={lang}): {e}")
                return {}

        data_en, data_cn = await asyncio.gather(
            _fetch_one("english"),
            _fetch_one("schinese"),
        )

        if not data_en and not data_cn:
            logger.warning(f"No appdetails for {appid} — skipping bootstrap")
            return None

        name_en = data_en.get("name") or data_cn.get("name")
        if not name_en:
            logger.warning(f"No name in appdetails for {appid} — skipping bootstrap")
            return None

        name_cn = data_cn.get("name")
        # Prefer the english payload for type/image; fall back to schinese.
        base = data_en or data_cn
        type_info = self._parse_app_type(base)

        return {
            "appid": appid,
            "name": name_en,
            # Store the Chinese name only when it actually differs.
            "name_cn": name_cn if name_cn and name_cn != name_en else None,
            "cn_name_checked": True,
            "app_type": type_info["app_type"],
            "parent_appid": type_info["parent_appid"],
            "header_image": base.get("header_image"),
        }

    async def _bootstrap_missing_games(
        self,
        category_appids: set[str],
    ) -> tuple[set[str], dict[str, Any]]:
        """
        Fetch Steam Store data and upsert games missing from the local DB.

        Returns:
            (bootstrapped_appids, summary_dict)
            bootstrapped_appids: set of appids that were newly upserted
        """
        existing = await mongodb.get_existing_appids(category_appids)
        missing = category_appids - existing

        if not missing:
            return set(), {"bootstrapped": 0, "failed": 0, "skipped_existing": len(existing)}

        # Cap work per cycle; leftover appids are retried on later cycles.
        limit = settings.steam_bootstrap_max_per_cycle
        appids_to_fetch = list(missing)[:limit]
        bootstrapped: set[str] = set()
        failed = 0

        for i, appid in enumerate(appids_to_fetch):
            game_data = await self._fetch_app_details_bilingual(appid)
            if game_data is None:
                failed += 1
            else:
                await mongodb.upsert_game(game_data)
                bootstrapped.add(appid)

            # Throttle between store calls (no delay after the last one).
            if i < len(appids_to_fetch) - 1:
                await asyncio.sleep(settings.steam_bootstrap_delay)

        summary = {
            "bootstrapped": len(bootstrapped),
            "failed": failed,
            "skipped_existing": len(existing),
            "missing_over_limit": max(0, len(missing) - limit),
        }
        if bootstrapped or failed:
            logger.info(f"Bootstrap missing games: {summary}")
        return bootstrapped, summary

    async def _fetch_region_categories(self, region: str) -> dict[str, set[str]]:
        """
        Fetch featured categories for a single Steam region (cc=region).

        Returns dict mapping category name -> set of appid strings.
        On any failure, returns {} so the caller can continue with other regions.
        """
        try:
            client = await self._get_client()
            resp = await client.get(
                settings.steam_priority_categories_url,
                params={"cc": region, "l": "schinese"},
            )
            if resp.status_code != 200:
                logger.warning(
                    f"Steam featuredcategories [{region}] returned {resp.status_code} — skipping region"
                )
                return {}

            data = resp.json()
        except Exception as e:
            logger.warning(
                f"Failed to fetch Steam store categories [{region}]: {e} — skipping region"
            )
            return {}

        result: dict[str, set[str]] = {}
        for cat_name in settings.steam_priority_categories_list:
            cat_data = data.get(cat_name)
            if not cat_data:
                continue
            items = cat_data.get("items", [])
            # NOTE(review): `type == 0` presumably selects game entries in the
            # featuredcategories payload — confirm against Steam's API docs.
            appids: set[str] = {
                str(item["id"])
                for item in items
                if item.get("type") == 0 and item.get("id") is not None
            }
            result[cat_name] = appids

        return result

    async def _fetch_store_categories(self) -> dict[str, set[str]]:
        """
        Fetch game appids from Steam store featured categories across all configured regions.

        Iterates over steam_priority_regions_list (default: CN, US) and merges results.
        If one region fails, the other is still used. If all fail, returns {} (fallback
        to top-500 only).

        Returns dict mapping category name -> set of appid strings.
        """
        regions = settings.steam_priority_regions_list
        if not regions:
            logger.warning(
                "steam_priority_regions is empty — skipping store categories fetch (top500 only)"
            )
            return {}

        merged: dict[str, set[str]] = {}
        for region in regions:
            region_data = await self._fetch_region_categories(region)
            for cat_name, appids in region_data.items():
                merged.setdefault(cat_name, set()).update(appids)
        return merged
backend/app/services/steam_errors.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom exceptions for Steam API errors.
3
+
4
+ Separate module to avoid circular imports between mongodb.py and steam_service.py.
5
+ """
6
+
7
+
8
class SteamAPIError(Exception):
    """Non-retryable Steam API failure (404, 403, other 4xx).

    Carries the HTTP status code and the affected app id so callers can
    decide how to react (e.g. cache the error).
    """

    def __init__(self, status_code: int, app_id: str, message: str = "") -> None:
        self.status_code = status_code
        self.app_id = app_id
        if not message:
            message = f"Steam API error {status_code} for app {app_id}"
        self.message = message
        super().__init__(message)
16
+
17
+
18
class SteamRateLimitError(SteamAPIError):
    """Steam answered 429 and every retry attempt was exhausted."""

    def __init__(self, app_id: str) -> None:
        rate_limit_message = f"Steam API rate limited for app {app_id}"
        super().__init__(status_code=429, app_id=app_id, message=rate_limit_message)
backend/app/services/steam_service.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Serwis do komunikacji ze Steam API.
3
+
4
+ Odpowiada za pobieranie informacji o grach oraz recenzji.
5
+ Wykorzystuje publiczne API Steam (nie wymaga klucza API).
6
+ Implementuje statystyczne próbkowanie recenzji (stratified sampling).
7
+ Retry z exponential backoff dla 429/5xx/timeout.
8
+ """
9
+
10
+ import asyncio
11
+ import logging
12
+ from dataclasses import dataclass
13
+ from typing import Any, AsyncGenerator
14
+
15
+ import httpx
16
+
17
+ from app.core.config import settings
18
+ from app.core.sampling import SamplePlan, create_sample_plan
19
+ from app.db.mongodb import mongodb
20
+ from app.models.schemas import GameInfo, ReviewBatch, ReviewItem
21
+ from app.services.steam_errors import SteamAPIError, SteamRateLimitError
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Status codes that should be retried: 429 (rate limited) plus transient 5xx.
_RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}
27
+
28
+
29
@dataclass
class ReviewStats:
    """Aggregate review statistics for a game.

    Populated from Steam's appreviews ``query_summary`` block
    (see ``get_review_stats``).
    """

    total: int     # total_reviews
    positive: int  # total_positive
    negative: int  # total_negative
36
+
37
+
38
+ class SteamService:
39
+ """
40
+ Serwis do pobierania danych ze Steam API.
41
+ """
42
+
43
+ STORE_API_URL = "https://store.steampowered.com/api"
44
+ REVIEW_API_URL = "https://store.steampowered.com/appreviews"
45
+ SEARCH_API_URL = "https://store.steampowered.com/api/storesearch"
46
+
47
+ def __init__(self, timeout: float = 30.0) -> None:
48
+ self.timeout = timeout
49
+ self.client = httpx.AsyncClient(timeout=self.timeout)
50
+
51
    async def close(self) -> None:
        """Close the shared HTTP client. Call once at shutdown."""
        await self.client.aclose()
54
+
55
    async def _request_with_retry(
        self,
        client: httpx.AsyncClient,
        url: str,
        params: dict[str, Any],
        context: str = "",
    ) -> httpx.Response:
        """
        Perform a GET request with retry and exponential backoff.

        Retries 429/5xx responses and timeouts/connection errors up to
        settings.steam_retry_max_attempts times; other 4xx errors raise
        immediately.

        Args:
            client: HTTP client to issue the request with.
            url: Target URL.
            params: Query parameters.
            context: App id or label used in error/log messages.

        Returns:
            The successful (HTTP 200) response.

        Raises:
            SteamAPIError: Non-retryable status, or retries exhausted.
            SteamRateLimitError: 429 persisted through all retries.
        """
        max_attempts = settings.steam_retry_max_attempts
        base_delay = settings.steam_retry_base_delay
        max_delay = settings.steam_retry_max_delay
        last_exception: Exception | None = None

        for attempt in range(max_attempts):
            try:
                response = await client.get(url, params=params)
                status = response.status_code

                if status == 200:
                    return response

                # Non-retryable client errors
                if status == 404:
                    raise SteamAPIError(404, context, f"Not found: {url}")
                if status == 403:
                    raise SteamAPIError(403, context, f"Forbidden: {url}")
                if 400 <= status < 500 and status not in _RETRYABLE_STATUS_CODES:
                    raise SteamAPIError(status, context, f"Client error {status}: {url}")

                # Retryable errors (429, 5xx)
                if attempt < max_attempts - 1:
                    # Exponential backoff, capped at max_delay.
                    delay = min(base_delay * (2 ** attempt), max_delay)

                    # Respect Retry-After header for 429
                    if status == 429:
                        retry_after = response.headers.get("Retry-After")
                        if retry_after:
                            try:
                                delay = min(float(retry_after), max_delay)
                            except ValueError:
                                # Non-numeric Retry-After — keep backoff delay.
                                pass

                    logger.warning(
                        f"Steam API {status} for {context}, "
                        f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                    )
                    await asyncio.sleep(delay)
                else:
                    # Exhausted retries
                    if status == 429:
                        raise SteamRateLimitError(context)
                    raise SteamAPIError(status, context, f"Server error {status} after {max_attempts} attempts: {url}")

            except (httpx.TimeoutException, httpx.ConnectError) as e:
                # Network-level failures use the same backoff schedule.
                last_exception = e
                if attempt < max_attempts - 1:
                    delay = min(base_delay * (2 ** attempt), max_delay)
                    logger.warning(
                        f"Steam API {type(e).__name__} for {context}, "
                        f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                    )
                    await asyncio.sleep(delay)
                else:
                    # status_code 0 marks a connection-level (non-HTTP) failure.
                    raise SteamAPIError(
                        0, context,
                        f"Connection failed after {max_attempts} attempts: {e}"
                    ) from e

        # Should not reach here, but just in case
        raise SteamAPIError(0, context, "Unexpected retry exhaustion") from last_exception
127
+
128
+ async def search_game(self, query: str) -> GameInfo | None:
129
+ """Wyszukuje grę po nazwie używając publicznego API wyszukiwarki Steam."""
130
+ client = self.client
131
+ params = {
132
+ "term": query,
133
+ "l": settings.steam_review_language,
134
+ "cc": settings.steam_region,
135
+ }
136
+
137
+ try:
138
+ response = await self._request_with_retry(
139
+ client, self.SEARCH_API_URL, params, context=f"search:{query}"
140
+ )
141
+ data = response.json()
142
+ except (SteamAPIError, SteamRateLimitError) as e:
143
+ logger.error(f"Błąd wyszukiwania gry '{query}': {e}")
144
+ return None
145
+
146
+ items = data.get("items", [])
147
+ if not items:
148
+ logger.warning(f"Nie znaleziono gry: {query}")
149
+ return None
150
+
151
+ first_result = items[0]
152
+ app_id = str(first_result.get("id"))
153
+
154
+ game_info = await self.get_game_info(app_id)
155
+
156
+ if game_info:
157
+ await mongodb.upsert_game({
158
+ "appid": game_info.app_id,
159
+ "name": game_info.name,
160
+ "name_cn": game_info.name_cn,
161
+ "cn_name_checked": True,
162
+ "header_image": game_info.header_image,
163
+ "total_reviews": game_info.total_reviews
164
+ })
165
+
166
+ return game_info
167
+
168
    async def get_game_info(self, app_id: str) -> GameInfo | None:
        """Fetch detailed game metadata (header image, names) from appdetails.

        Queries both schinese and english localizations concurrently and
        combines them into one GameInfo. Short-circuits to None when a
        previous Steam error for this app is still cached.
        """
        cached_error = await mongodb.get_steam_error(app_id)
        if cached_error:
            logger.info(
                f"Skipping Steam API for app {app_id} — "
                f"cached error {cached_error.get('status_code')}"
            )
            return None

        client = self.client
        details_url = f"{self.STORE_API_URL}/appdetails"

        async def fetch_localized(lang: str) -> dict[str, Any]:
            # Any SteamAPIError yields an empty dict (so asyncio.gather below
            # never raises); only 404 is persisted to the error cache.
            try:
                params = {"appids": app_id, "l": lang, "cc": settings.steam_region}
                resp = await self._request_with_retry(
                    client, details_url, params, context=app_id
                )
                return resp.json().get(app_id, {})
            except SteamAPIError as e:
                if e.status_code == 404:
                    await mongodb.cache_steam_error(
                        app_id, 404, settings.steam_error_cache_ttl_404
                    )
                return {}

        data_zh, data_en = await asyncio.gather(
            fetch_localized("schinese"),
            fetch_localized("english")
        )

        if not data_en.get("success") and not data_zh.get("success"):
            logger.warning(f"Nie znaleziono szczegółów gry: {app_id}")
            return None

        # NOTE(review): assumes every success=true payload carries a "data"
        # block; base_data would be None otherwise — confirm with the API.
        base_data = data_en.get("data") or data_zh.get("data")
        # Prefer the english name; fall back to whatever base payload has.
        name_en = data_en.get("data", {}).get("name") or base_data.get("name")
        name_zh = data_zh.get("data", {}).get("name")

        # Review totals come from a separate appreviews call.
        stats = await self.get_review_stats(app_id)

        return GameInfo(
            app_id=app_id,
            name=name_en,
            # Keep the Chinese name only when it differs from the english one.
            name_cn=name_zh if name_zh != name_en else None,
            header_image=base_data.get("header_image"),
            total_reviews=stats.total,
        )
217
+
218
    async def get_review_stats(self, app_id: str) -> ReviewStats:
        """Fetch aggregate review statistics needed to plan the sample.

        Uses the appreviews endpoint with num_per_page=0 so only the
        query_summary block is returned. Returns zeroed stats on any error
        or when a cached Steam error exists for this app.
        """
        cached_error = await mongodb.get_steam_error(app_id)
        if cached_error:
            logger.info(
                f"Skipping review stats for app {app_id} — "
                f"cached error {cached_error.get('status_code')}"
            )
            return ReviewStats(total=0, positive=0, negative=0)

        client = self.client
        url = f"{self.REVIEW_API_URL}/{app_id}"
        params = {
            "json": "1",
            "filter": "all",
            "num_per_page": "0",  # summary only, no review bodies
        }

        try:
            response = await self._request_with_retry(
                client, url, params, context=app_id
            )
            data = response.json()

            summary = data.get("query_summary", {})
            return ReviewStats(
                total=summary.get("total_reviews", 0),
                positive=summary.get("total_positive", 0),
                negative=summary.get("total_negative", 0),
            )
        except SteamAPIError as e:
            # Cache 404/429 so follow-up calls skip the Steam API for a while.
            if e.status_code in (404, 429):
                ttl = (
                    settings.steam_error_cache_ttl_429
                    if e.status_code == 429
                    else settings.steam_error_cache_ttl_404
                )
                await mongodb.cache_steam_error(app_id, e.status_code, ttl)
            logger.error(f"Błąd pobierania statystyk recenzji: {e}")
            return ReviewStats(total=0, positive=0, negative=0)
258
+
259
+ async def _fetch_reviews_batch(
260
+ self,
261
+ client: httpx.AsyncClient,
262
+ app_id: str,
263
+ review_type: str,
264
+ filter_type: str,
265
+ num_per_page: int,
266
+ cursor: str | None,
267
+ ) -> tuple[list[str], list[ReviewItem], str | None]:
268
+ """Pobiera pojedynczą paczkę recenzji (do 100 sztuk)."""
269
+ url = f"{self.REVIEW_API_URL}/{app_id}"
270
+ params: dict[str, Any] = {
271
+ "json": "1",
272
+ "filter": filter_type,
273
+ "review_type": review_type,
274
+ "language": settings.steam_review_language,
275
+ "num_per_page": str(num_per_page),
276
+ "cursor": cursor or "*",
277
+ "purchase_type": "all",
278
+ }
279
+
280
+ try:
281
+ response = await self._request_with_retry(
282
+ client, url, params, context=app_id
283
+ )
284
+ data = response.json()
285
+ except SteamRateLimitError:
286
+ await mongodb.cache_steam_error(
287
+ app_id, 429, settings.steam_error_cache_ttl_429
288
+ )
289
+ logger.error(f"Rate limited fetching reviews for {app_id}")
290
+ return [], [], None
291
+ except SteamAPIError as e:
292
+ logger.error(f"Błąd pobierania recenzji: {e}")
293
+ return [], [], None
294
+
295
+ if not data.get("success"):
296
+ return [], [], None
297
+
298
+ reviews_data = data.get("reviews", [])
299
+ review_texts: list[str] = []
300
+ review_items: list[ReviewItem] = []
301
+
302
+ for review in reviews_data:
303
+ text = review.get("review")
304
+ if not text:
305
+ continue
306
+ review_texts.append(text)
307
+ review_items.append(ReviewItem(
308
+ text=text,
309
+ recommendation_id=str(review.get("recommendationid", "")),
310
+ timestamp_created=review.get("timestamp_created", 0),
311
+ ))
312
+
313
+ new_cursor = data.get("cursor")
314
+ return review_texts, review_items, new_cursor
315
+
316
    async def fetch_reviews_stratified(
        self,
        app_id: str,
        sample_plan: SamplePlan,
    ) -> AsyncGenerator[ReviewBatch, None]:
        """Fetch reviews in stratified phases and yield them as batches.

        Phase 1 pulls the top "helpful" reviews (filter "all"/"all"); phases
        2a/2b pull the most recent positive and negative reviews.  Reviews
        already yielded by an earlier phase are deduplicated by review text.
        Each phase stops early when the API returns no reviews, repeats a
        cursor (loop guard), or returns the sentinel cursor "*".

        Args:
            app_id: Steam application id.
            sample_plan: Per-stratum targets (top_helpful, positive_count,
                negative_count).

        Yields:
            ReviewBatch objects containing only previously-unseen reviews.
        """
        batch_size = settings.review_batch_size
        all_reviews: set[str] = set()  # review texts seen so far (cross-phase dedup)
        seen_cursors: set[str] = set()
        client = self.client

        # --- PHASE 1: TOP HELPFUL ---
        cursor: str | None = "*"
        fetched = 0

        while fetched < sample_plan.top_helpful:
            to_fetch = min(batch_size, sample_plan.top_helpful - fetched)
            reviews, review_items, cursor = await self._fetch_reviews_batch(
                client, app_id, "all", "all", to_fetch, cursor
            )

            if not reviews:
                break
            # A repeated cursor means the API is looping — bail out rather than spin.
            if cursor and cursor in seen_cursors:
                logger.warning(f"Repeated cursor {cursor} for {app_id} (top_helpful). Shortfall: {sample_plan.top_helpful - fetched}")
                break
            if cursor:
                seen_cursors.add(cursor)

            all_reviews.update(reviews)
            fetched += len(reviews)
            yield ReviewBatch(reviews=reviews, review_items=review_items, cursor=cursor)

            # "*" (or no cursor) signals the end of the feed.
            if not cursor or cursor == "*":
                break

        # --- PHASE 2a: RECENT POSITIVE ---
        positive_target = sample_plan.positive_count
        if positive_target > 0:
            cursor = "*"
            fetched = 0
            seen_cursors_pos: set[str] = set()

            while fetched < positive_target:
                to_fetch = min(batch_size, positive_target - fetched)
                # If we hit many duplicates, keep requesting full batches rather
                # than only the remaining target (but never more than batch_size).
                if fetched > 0:
                    to_fetch = batch_size

                reviews, review_items, cursor = await self._fetch_reviews_batch(
                    client, app_id, "positive", "recent", to_fetch, cursor or "*"
                )
                if not reviews:
                    break
                if cursor and cursor in seen_cursors_pos:
                    logger.warning(f"Repeated cursor {cursor} for {app_id} (positive). Shortfall: {positive_target - fetched}")
                    break
                if cursor:
                    seen_cursors_pos.add(cursor)

                # Drop texts already seen in phase 1; keep only matching items.
                new_reviews = [r for r in reviews if r not in all_reviews]
                new_texts_set = set(new_reviews)
                new_items = [ri for ri in review_items if ri.text in new_texts_set]
                all_reviews.update(new_reviews)
                fetched += len(new_reviews)

                if new_reviews:
                    yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor)
                if not cursor or cursor == "*":
                    break

        # --- PHASE 2b: RECENT NEGATIVE ---
        negative_target = sample_plan.negative_count
        if negative_target > 0:
            cursor = "*"
            fetched = 0
            seen_cursors_neg: set[str] = set()

            while fetched < negative_target:
                to_fetch = min(batch_size, negative_target - fetched)
                # Same duplicate-compensation strategy as phase 2a.
                if fetched > 0:
                    to_fetch = batch_size

                reviews, review_items, cursor = await self._fetch_reviews_batch(
                    client, app_id, "negative", "recent", to_fetch, cursor or "*"
                )
                if not reviews:
                    break
                if cursor and cursor in seen_cursors_neg:
                    logger.warning(f"Repeated cursor {cursor} for {app_id} (negative). Shortfall: {negative_target - fetched}")
                    break
                if cursor:
                    seen_cursors_neg.add(cursor)

                new_reviews = [r for r in reviews if r not in all_reviews]
                new_texts_set = set(new_reviews)
                new_items = [ri for ri in review_items if ri.text in new_texts_set]
                all_reviews.update(new_reviews)
                fetched += len(new_reviews)

                if new_reviews:
                    yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor)
                if not cursor or cursor == "*":
                    break

        logger.info(f"Pobrano łącznie {len(all_reviews)} unikalnych recenzji")
424
+
425
    async def fetch_recent_reviews(
        self,
        app_id: str,
        exclude_ids: set[str] | None = None,
    ) -> list[ReviewItem]:
        """Fetch recent reviews for incremental analysis.

        Walks the "recent" review feed and returns items whose
        ``recommendation_id`` is not in ``exclude_ids``.  When no exclusions
        are given the game is treated as new and the total is additionally
        capped at 500; for known games, fetching stops early once >80% of a
        batch is already known (the boundary of previously-seen reviews has
        been crossed).

        Args:
            app_id: Steam application id.
            exclude_ids: recommendation_ids already stored; None/empty
                means "new game".

        Returns:
            Up to ``max_total`` previously-unseen ReviewItem objects.
        """
        is_new_game = not exclude_ids
        exclude_ids = exclude_ids or set()
        batch_size = settings.review_batch_size

        # Incremental fetch limit for new games
        if is_new_game:
            stats = await self.get_review_stats(app_id)
            max_total = min(stats.total, settings.recent_sample_limit, 500)
        else:
            max_total = settings.recent_sample_limit

        client = self.client
        cursor: str | None = "*"
        seen_cursors: set[str] = set()
        new_items: list[ReviewItem] = []

        while len(new_items) < max_total:
            to_fetch = min(batch_size, max_total - len(new_items))
            _, review_items, cursor = await self._fetch_reviews_batch(
                client, app_id, "all", "recent", to_fetch, cursor
            )

            if not review_items:
                break
            # Loop guard: a repeated cursor means the API would page forever.
            if cursor and cursor in seen_cursors:
                logger.warning(f"Repeated cursor {cursor} for {app_id} (recent). Shortfall: {max_total - len(new_items)}")
                break
            if cursor:
                seen_cursors.add(cursor)

            # Filter out already-known reviews
            batch_new = [ri for ri in review_items if ri.recommendation_id not in exclude_ids]

            # Early exit: if >80% of batch is known, we've passed the boundary
            known_ratio = 1 - (len(batch_new) / len(review_items)) if review_items else 0
            new_items.extend(batch_new)

            if not is_new_game and known_ratio > 0.8:
                logger.info(
                    f"Early exit for {app_id}: {known_ratio:.0%} of batch already known"
                )
                break

            # "*" (or no cursor) means the feed is exhausted.
            if not cursor or cursor == "*":
                break

        logger.info(f"Incremental fetch for {app_id}: {len(new_items)} new reviews")
        return new_items[:max_total]
481
+
482
+ async def fetch_reviews(
483
+ self,
484
+ app_id: str,
485
+ batch_size: int | None = None,
486
+ max_reviews: int | None = None,
487
+ ) -> AsyncGenerator[ReviewBatch, None]:
488
+ """Wrapper dla zachowania kompatybilności."""
489
+ stats = await self.get_review_stats(app_id)
490
+ if stats.total == 0:
491
+ return
492
+
493
+ sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
494
+ async for batch in self.fetch_reviews_stratified(app_id, sample_plan):
495
+ yield batch
496
+
497
+
498
# Global service instance (singleton) — shared by all importers of this module.
steam_service = SteamService()
backend/app/services/update_detection_service.py ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Update Detection Service — checks Steam News API for game updates.

Compares the latest news/patch date with the stored `last_game_update_at`
to detect games that have been recently updated.
"""

import logging
import re
from datetime import datetime, timezone
from typing import Any, NamedTuple, cast

import httpx

from app.core.config import settings
from app.db.mongodb import mongodb

logger = logging.getLogger(__name__)

STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"

# Matches two-segment versions: 1.2, v2.0, 0.6, 123.4
# Excludes three-segment (0.6.1) via negative lookahead, 4-digit years via \d{1,3},
# and sub-segments of longer versions (e.g. "6.1" within "0.6.1") via lookbehind.
VERSION_RE = re.compile(r'(?<!\d\.)\bv?\d{1,3}\.\d+\b(?!\.\d)')

# Phase 1 regex constants — "is this news item update-related at all?"
RELEASE_PHRASE_RE = re.compile(
    r'\b(out now|is out|is live|now live|now available|full release|'
    r'leaving early access|out of early access)\b',
    re.IGNORECASE
)
CONTENT_UPDATE_RE = re.compile(
    r'\b(major update|content update|big update|biggest update)\b',
    re.IGNORECASE
)
ACTION_WORD_RE = re.compile(
    r'\b(update|patch|release|available|launch|live|out)\b',
    re.IGNORECASE
)
HOTFIX_RE = re.compile(r'\b(hotfix|hot.?fix)\b', re.IGNORECASE)
BRANCH_RE = re.compile(
    r'\b(experimental branch|experimental.{0,10}patch|experimental.{0,10}build|'
    r'public.?test|pts build|beta branch|'
    r'on experimental|for experimental)\b',
    re.IGNORECASE
)
# NOTE(review): MAJOR_RELEASE_RE / MAJOR_CONTENT_RE are textually identical to
# RELEASE_PHRASE_RE / CONTENT_UPDATE_RE — kept as separate names so phase-1
# ("is update") and majorness wording can diverge independently later.
MAJOR_RELEASE_RE = re.compile(
    r'\b(out now|is out|is live|now live|now available|full release|'
    r'leaving early access|out of early access)\b',
    re.IGNORECASE
)
MAJOR_CONTENT_RE = re.compile(
    r'\b(major update|content update|big update|biggest update)\b',
    re.IGNORECASE
)
ONE_ZERO_RE = re.compile(r'\b1\.0\b(?!\.\d)')

# Phase 2 regex constants — "is this update major?" (blockers + positives)
EVENT_FESTIVAL_RE = re.compile(
    r'\b(festival|anniversary\s+event|community\s+event|'
    r'in-game\s+event|roadmap|preview)\b',
    re.IGNORECASE
)
UPDATE_OR_PATCH_RE = re.compile(r'\b(update|patch)\b', re.IGNORECASE)
NAMED_VERSION_RE = re.compile(r'\bV\d+\b')  # case-sensitive: uppercase V only
UPDATE_WORD_RE = re.compile(r'\bupdate\b', re.IGNORECASE)
PATCH_WORD_RE = re.compile(r'\bpatch\b', re.IGNORECASE)
MAINT_LANGUAGE_RE = re.compile(
    r'\b(fix(?:es|ed)?|bug\s*fix|improv(?:es?|ed|ements?)|stability|performance|tweak)\b',
    re.IGNORECASE
)

_NEWS_MAX_PAGES = 5  # Max pages in incremental mode (5 * 5 = 25 items)
75
+
76
+
77
class NewsCheckResult(NamedTuple):
    """Outcome of scanning one game's Steam news feed for updates."""

    latest_update_date: datetime | None  # date of most recent update-related item
    is_major: bool  # whether any item qualifies as major
    major_date: datetime | None  # date of most recent major item; None if not major
    newest_seen_gid: str | None = None  # GID of newest news item (for cursor persistence)
    newest_seen_at: datetime | None = None  # timestamp of newest news item
83
+
84
+
85
class UpdateDetectionService:
    """Detects game updates via Steam News API.

    Supports an optional injected httpx client (for tests/transports); when
    none is given the service lazily creates and owns one.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        self._client = client
        # Only close clients we created ourselves; injected ones belong to the caller.
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the HTTP client, lazily creating an owned one on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=15.0)
        return self._client

    async def close(self) -> None:
        """Close the owned client (no-op for injected clients)."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    @staticmethod
    def _is_update_related(item: dict) -> bool:
        """Return True if news item is update-related.

        Conditions (any one is sufficient):
        A: 'patchnotes' in tags
        B: feedlabel == 'Product Update'
        C: title matches release-style phrases
        D: title matches large content update phrases
        E: title has a version number AND an action word
        """
        tags = item.get("tags")
        if isinstance(tags, list):
            is_patch = "patchnotes" in tags
        else:
            # tags may arrive as a string; substring check is the fallback.
            is_patch = "patchnotes" in (tags or "")
        feedlabel = item.get("feedlabel") or ""
        if is_patch or feedlabel == "Product Update":
            return True

        # Conditions C/D/E: title-based signals — restricted to developer feed only.
        # Third-party news sites (GamingOnLinux etc.) can write about updates using
        # the same language, so we only trust these signals from the developer's own feed.
        if item.get("feedname") != "steam_community_announcements":
            return False

        title = item.get("title", "")
        if RELEASE_PHRASE_RE.search(title):
            return True
        if CONTENT_UPDATE_RE.search(title):
            return True
        if VERSION_RE.search(title) and ACTION_WORD_RE.search(title):
            return True
        # F: named version (V70) + "update" in title (developer feed only)
        if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title):
            return True

        return False

    @staticmethod
    def _is_major_update(item: dict) -> bool:
        """Return True if the news item represents a major update.

        Negative signals (blockers) are checked first:
        - hotfix keyword → not major
        - experimental branch / public test branch → not major
        - event/festival wording without update/patch → not major
        - "patch" + maintenance language (fixes/tweaks/...) → not major

        Positive signals (any one is sufficient):
        - version number in title (VERSION_RE)
        - release language (MAJOR_RELEASE_RE)
        - standalone '1.0' (ONE_ZERO_RE)
        - large content phrases (MAJOR_CONTENT_RE)
        - named version (V70) + "update"
        """
        title = item.get("title", "")

        if HOTFIX_RE.search(title):
            return False
        if BRANCH_RE.search(title):
            return False
        if EVENT_FESTIVAL_RE.search(title) and not UPDATE_OR_PATCH_RE.search(title):
            return False
        if PATCH_WORD_RE.search(title) and MAINT_LANGUAGE_RE.search(title):
            return False

        if VERSION_RE.search(title):
            return True
        if MAJOR_RELEASE_RE.search(title):
            return True
        if ONE_ZERO_RE.search(title):
            return True
        if MAJOR_CONTENT_RE.search(title):
            return True
        if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title):
            return True

        return False

    @staticmethod
    def _collect_update_candidates(
        news_items: list[dict],
    ) -> tuple[datetime | None, datetime | None]:
        """Scan all items, return (latest_update_date, major_date).

        latest_update_date: max date of all update-related items (or None)
        major_date: max date of major items (or None if no major found)
        """
        latest_update_ts: int | None = None
        major_ts: int | None = None

        for item in news_items:
            if not UpdateDetectionService._is_update_related(item):
                continue
            ts = item.get("date") or 0
            if not ts:
                continue  # undated items can't participate in max-date logic
            if latest_update_ts is None or ts > latest_update_ts:
                latest_update_ts = ts
            if UpdateDetectionService._is_major_update(item):
                if major_ts is None or ts > major_ts:
                    major_ts = ts

        latest_update_date = (
            datetime.fromtimestamp(latest_update_ts, tz=timezone.utc)
            if latest_update_ts is not None
            else None
        )
        major_date = (
            datetime.fromtimestamp(major_ts, tz=timezone.utc)
            if major_ts is not None
            else None
        )
        return latest_update_date, major_date

    @staticmethod
    async def _fetch_news_page(
        client: httpx.AsyncClient,
        app_id: str,
        count: int,
        enddate: int | None = None,
    ) -> list[dict]:
        """Fetch a single page of news items from Steam API.

        ``enddate`` (unix ts) asks Steam for items strictly at/before that time,
        which is how pagination is implemented.  Returns [] on HTTP error or
        request failure (callers treat [] as "page unavailable").
        """
        params: dict[str, Any] = {
            "appid": app_id,
            "count": count,
            "maxlength": 0,
        }
        if enddate is not None:
            params["enddate"] = enddate

        try:
            resp = await client.get(STEAM_NEWS_API_URL, params=params)
            if resp.status_code != 200:
                return []
            data = resp.json()
            return data.get("appnews", {}).get("newsitems", [])
        except (httpx.RequestError, ValueError, KeyError) as e:
            logger.debug(f"News page fetch failed for {app_id}: {e}")
            return []

    @staticmethod
    def _scan_batch_with_stopping(
        items: list[dict],
        last_seen_gid: str | None,
        last_seen_at_ts: int | None,
        refresh_cutoff_ts: int | None,
    ) -> tuple[list[dict], bool]:
        """Scan items (newest→oldest), collecting until a stop condition is met.

        Stop conditions (item is NOT included):
        - gid matches last_seen_gid
        - item date <= last_seen_at_ts
        - item date < refresh_cutoff_ts

        Returns (accepted_items, hit_stop).
        """
        accepted: list[dict] = []
        for item in items:
            gid = str(item.get("gid", ""))
            ts = item.get("date") or 0

            if last_seen_gid and gid and gid == last_seen_gid:
                return accepted, True
            if last_seen_at_ts is not None and ts and ts <= last_seen_at_ts:
                return accepted, True
            if refresh_cutoff_ts is not None and ts and ts < refresh_cutoff_ts:
                return accepted, True

            accepted.append(item)

        return accepted, False

    async def _get_latest_news_date(
        self,
        app_id: str,
        last_seen_gid: str | None = None,
        last_seen_at: datetime | None = None,
    ) -> NewsCheckResult:
        """Fetch and scan Steam news for update candidates.

        In initial mode (no cursor): fetches count=20, single page.
        In incremental mode (cursor present): fetches count=5 with pagination,
        stopping at the known cursor or the refresh window boundary.

        The returned cursor fields are only populated when the scan completed
        cleanly (``scan_complete``); an aborted pagination must not advance
        the persisted cursor or items could be skipped next time.
        """
        client = await self._get_client()

        is_incremental = last_seen_gid is not None or last_seen_at is not None
        count = settings.news_incremental_count if is_incremental else settings.news_initial_count

        # Compute stop thresholds for incremental mode
        last_seen_at_ts: int | None = None
        refresh_cutoff_ts: int | None = None
        if is_incremental:
            last_seen_at_ts = int(last_seen_at.timestamp()) if last_seen_at else None
            now_ts = int(datetime.now(timezone.utc).timestamp())
            cutoff_ts = now_ts - (settings.news_refresh_window_hours * 3600)

            # If cursor is older than the refresh window (worker was down),
            # disable the time cutoff and scan to the cursor instead.
            # _NEWS_MAX_PAGES protects against unbounded pagination.
            if last_seen_at_ts is not None and last_seen_at_ts < cutoff_ts:
                refresh_cutoff_ts = None
            else:
                refresh_cutoff_ts = cutoff_ts

        all_accepted: list[dict] = []
        newest_gid: str | None = None
        newest_ts: int = 0
        scan_complete = False
        pages_fetched = 0
        enddate: int | None = None

        while True:
            items = await self._fetch_news_page(client, app_id, count, enddate)

            if not items:
                if pages_fetched == 0:
                    # First page empty (no news or HTTP error) — newest_gid stays None
                    pass
                # Pagination page empty → incomplete scan → don't update cursor
                break

            pages_fetched += 1

            # Track newest item (from first page only)
            if newest_gid is None:
                for item in items:
                    gid = str(item.get("gid", ""))
                    ts = item.get("date") or 0
                    if gid and ts:
                        newest_gid = gid
                        newest_ts = ts
                        break

            if is_incremental:
                accepted, hit_stop = self._scan_batch_with_stopping(
                    items, last_seen_gid, last_seen_at_ts, refresh_cutoff_ts
                )
                all_accepted.extend(accepted)

                if hit_stop:
                    scan_complete = True
                    break
                if len(items) < count:
                    scan_complete = True  # API has no more items
                    break
                if pages_fetched >= _NEWS_MAX_PAGES:
                    scan_complete = True  # page limit reached
                    break
                oldest_ts = items[-1].get("date") or 0
                if not oldest_ts:
                    break  # can't paginate → incomplete scan
                enddate = oldest_ts - 1
            else:
                # Initial mode: single fetch, always clean
                all_accepted.extend(items)
                scan_complete = True
                break

        latest_update_date, major_date = self._collect_update_candidates(all_accepted)

        cursor_gid: str | None = None
        cursor_at: datetime | None = None
        if scan_complete and newest_gid:
            cursor_gid = newest_gid
            cursor_at = datetime.fromtimestamp(newest_ts, tz=timezone.utc)

        if latest_update_date is None:
            return NewsCheckResult(
                None, False, None,
                newest_seen_gid=cursor_gid,
                newest_seen_at=cursor_at,
            )

        return NewsCheckResult(
            latest_update_date=latest_update_date,
            is_major=major_date is not None,
            major_date=major_date,
            newest_seen_gid=cursor_gid,
            newest_seen_at=cursor_at,
        )

    async def check_for_updates(
        self, games: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """
        Check Steam News API for each game. Return games with confirmed major updates.

        Non-major patchnotes update last_game_update_at but do not trigger a schedule.
        DLCs are never checked directly; a parent's major update is propagated to
        its DLCs via the dlcs_by_parent index.
        """
        updated_games: list[dict[str, Any]] = []
        dlcs_by_parent: dict[str, list[dict[str, Any]]] = {}

        # Index DLCs by parent appid so a parent's patch date can be fanned out.
        for game in games:
            if game.get("app_type") == "dlc" and game.get("parent_appid"):
                dlcs_by_parent.setdefault(str(game["parent_appid"]), []).append(game)

        for game in games:
            app_id = str(game.get("appid", ""))
            if not app_id:
                continue

            if game.get("app_type") == "dlc":
                continue

            last_known = game.get("last_game_update_at")
            # Normalize last_known to datetime if it's a timestamp
            if last_known is not None and not isinstance(last_known, datetime):
                try:
                    last_known = datetime.fromtimestamp(float(last_known), tz=timezone.utc)
                except (ValueError, TypeError):
                    last_known = None

            result = await self._get_latest_news_date(
                app_id,
                last_seen_gid=game.get("last_seen_news_gid"),
                last_seen_at=game.get("last_seen_news_at"),
            )

            # Persist cursor before any early-continue — even if no updates found
            if result.newest_seen_gid:
                await mongodb.update_news_cursor(
                    app_id, result.newest_seen_gid, cast(datetime, result.newest_seen_at)
                )

            if result.latest_update_date is None:
                continue

            if last_known is None or result.latest_update_date > last_known:
                await mongodb.update_game_update_date(app_id, result.latest_update_date)

                if result.is_major:
                    current_patch_at = game.get("current_patch_at")
                    # NOTE(review): current_patch_at is compared as-is — assumes it is
                    # stored as a datetime (unlike last_game_update_at, which is
                    # normalized above); confirm against the DB schema.
                    patch_date = cast(datetime, result.major_date)  # always not None when is_major=True
                    if current_patch_at is None or patch_date > current_patch_at:
                        await mongodb.update_game_patch_date(app_id, patch_date)
                        updated_games.append({**game, "update_at": patch_date})

                        # Propagate the parent's patch date to all of its DLCs.
                        for dlc in dlcs_by_parent.get(app_id, []):
                            dlc_appid = str(dlc.get("appid", ""))
                            if not dlc_appid:
                                continue

                            await mongodb.update_game_patch_date(dlc_appid, patch_date)
                            updated_games.append({**dlc, "update_at": patch_date})

        logger.info(
            f"Update detection: {len(updated_games)}/{len(games)} games have new updates"
        )
        return updated_games
backend/pytest.ini ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [pytest]
2
+ testpaths = tests
3
+ python_files = test_*.py
4
+ python_functions = test_*
5
+ asyncio_mode = auto
6
+ addopts = -v --tb=short
backend/requirements.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web Framework
2
+ fastapi==0.109.0
3
+ uvicorn[standard]==0.27.0
4
+ sse-starlette==1.8.2
5
+
6
+ # Database
7
+ motor==3.3.2
8
+ pymongo==4.6.1
9
+
10
+ # Data Validation
11
+ pydantic==2.5.3
12
+ pydantic-settings==2.1.0
13
+
14
+ # HTTP Client
15
+ httpx==0.26.0
16
+
17
+ # AI/ML - Local Inference (ONNX Runtime only, no PyTorch needed at runtime)
18
+ numpy<2.0.0
19
+ transformers==4.37.2
20
+ optimum[onnxruntime]==1.16.2
21
+ huggingface-hub==0.20.3
22
+
23
+ # Rate Limiting
24
+ slowapi==0.1.9
25
+
26
+ # Utilities
27
+ python-dotenv==1.0.0
28
+ jieba==0.42.1
29
+
30
+ # Keyword Expansion (FastText)
31
+ gensim==4.3.3
32
+
33
+ # Code Quality
34
+ ruff==0.1.14
35
+ mypy==1.8.0
36
+
37
+ # Testing
38
+ pytest==7.4.4
39
+ pytest-asyncio==0.23.3
40
+ pytest-cov==4.1.0
41
+ anyio==4.12.1
42
zhconv==1.4.3  # Chinese simplified/traditional conversion (runtime dependency, not test-only)
backend/scripts/smoke_news_cursor.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Smoke Test: Incremental Steam News Cursor Flow
3
+
4
+ Validates that UpdateDetectionService correctly uses cursor-based incremental
5
+ news fetching against the real Steam API.
6
+
7
+ Test game: Factorio (427520) — stable, always has news, uses patchnotes tags.
8
+
9
+ Usage:
10
+ cd /mnt/d/sentiment_summarizer/backend
11
+ ../venv/bin/python scripts/smoke_news_cursor.py
12
+ """
13
+
14
+ import asyncio
15
+ import sys
16
+ from datetime import datetime, timezone
17
+ from pathlib import Path
18
+ from unittest.mock import AsyncMock, patch
19
+
20
+ import httpx
21
+
22
+ # Ensure backend/app is importable
23
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
24
+
25
+ from app.services.update_detection_service import UpdateDetectionService # noqa: E402
26
+
27
+ TEST_APP_ID = "427520" # Factorio
28
+
29
+
30
+ # ── helpers ──────────────────────────────────────────────────────────
31
+
32
+
33
+ def _ts() -> str:
34
+ return datetime.now(timezone.utc).strftime("%H:%M:%S")
35
+
36
+
37
def _print(status: str, msg: str) -> None:
    """Print one timestamped status line; known statuses get ANSI colours."""
    colour_tags = {
        "OK": "\033[32mOK\033[0m",
        "FAIL": "\033[31mFAIL\033[0m",
        "SKIP": "\033[33mSKIP\033[0m",
        "INFO": "\033[36mINFO\033[0m",
    }
    label = colour_tags.get(status, status)
    print(f"[{_ts()}] [{label}] {msg}")
45
+
46
+
47
class RecordingTransport(httpx.AsyncBaseTransport):
    """Pass-through transport that logs each request's URL and query params.

    Every request is forwarded to a real ``httpx.AsyncHTTPTransport``;
    ``recorded`` accumulates one ``{"url", "params"}`` dict per request so
    the smoke test can inspect what was actually sent.
    """

    def __init__(self) -> None:
        self.recorded: list[dict] = []
        self._delegate = httpx.AsyncHTTPTransport()

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        self.recorded.append(
            {"url": str(request.url), "params": dict(request.url.params)}
        )
        return await self._delegate.handle_async_request(request)

    async def aclose(self) -> None:
        await self._delegate.aclose()
61
+
62
+
63
+ # ── main ─────────────────────────────────────────────────────────────
64
+
65
+
66
async def run_smoke_test() -> int:
    """Run all five cursor smoke checks against the live Steam API.

    Returns the process exit code: 0 when all checks pass (or the API is
    unreachable, in which case checks are skipped), 1 when any check fails.
    """
    print(f"\nSteam News Cursor Smoke Test — Factorio ({TEST_APP_ID})")
    print("=" * 60)
    failures = 0

    # ── Check 1: initial scan returns cursor fields ───────────────────
    _print("INFO", f"Check 1: initial scan for {TEST_APP_ID} (Factorio)")
    result_initial = None
    svc1 = UpdateDetectionService()
    try:
        result_initial = await svc1._get_latest_news_date(TEST_APP_ID)
    finally:
        await svc1.close()

    if result_initial.newest_seen_gid is None:
        _print("SKIP", "No news items returned — Steam API may be rate-limiting or unreachable; skipping all checks")
        return 0

    c1_ok = True
    if not isinstance(result_initial.newest_seen_gid, str) or not result_initial.newest_seen_gid:
        _print("FAIL", f"newest_seen_gid is empty/non-string: {result_initial.newest_seen_gid!r}")
        c1_ok = False

    now = datetime.now(timezone.utc)
    if result_initial.newest_seen_at is None:
        _print("FAIL", "newest_seen_at is None")
        c1_ok = False
    elif not (
        datetime(2020, 1, 1, tzinfo=timezone.utc)
        <= result_initial.newest_seen_at
        <= datetime(now.year + 1, 1, 1, tzinfo=timezone.utc)
    ):
        _print("FAIL", f"newest_seen_at out of expected range: {result_initial.newest_seen_at!r}")
        c1_ok = False

    if c1_ok:
        _print(
            "OK",
            f"cursor GID={result_initial.newest_seen_gid}, "
            f"at={result_initial.newest_seen_at.isoformat()}",
        )
    else:
        failures += 1

    # NOTE(review): cursor_at may be None here if check 1 failed; check 3's
    # comparison against it would then raise — confirm this path is acceptable
    # for a smoke script.
    cursor_gid = result_initial.newest_seen_gid
    cursor_at = result_initial.newest_seen_at

    # ── Check 2: incremental scan uses count=5 ────────────────────────
    _print("INFO", "Check 2: incremental scan uses count=5")
    transport = RecordingTransport()
    client = httpx.AsyncClient(transport=transport, timeout=15.0)
    svc2 = UpdateDetectionService(client=client)
    result_inc = None
    try:
        result_inc = await svc2._get_latest_news_date(
            TEST_APP_ID, last_seen_gid=cursor_gid, last_seen_at=cursor_at
        )
    finally:
        await client.aclose()

    if not transport.recorded:
        _print("SKIP", "No requests recorded — Steam API may be unreachable")
    else:
        c2_ok = True
        for i, req in enumerate(transport.recorded):
            count_val = req["params"].get("count")
            enddate_val = req["params"].get("enddate", "n/a")
            if str(count_val) != "5":
                _print("FAIL", f"Request {i + 1}: count={count_val!r}, expected '5'")
                c2_ok = False
            else:
                _print("INFO", f"  Request {i + 1}: count=5 ✓ enddate={enddate_val}")
        if c2_ok:
            _print("OK", f"All {len(transport.recorded)} request(s) used count=5")
        else:
            failures += 1

    # ── Check 3: no items older than cursor boundary ──────────────────
    _print("INFO", "Check 3: incremental result respects cursor boundary")
    if result_inc is None:
        _print("SKIP", "No incremental result available")
    else:
        c3_ok = True
        if result_inc.latest_update_date is not None:
            if result_inc.latest_update_date <= cursor_at:
                _print(
                    "FAIL",
                    f"latest_update_date {result_inc.latest_update_date.isoformat()} "
                    f"is not strictly newer than cursor {cursor_at.isoformat()}",
                )
                c3_ok = False
            else:
                _print(
                    "INFO",
                    f"  latest_update_date={result_inc.latest_update_date.isoformat()} "
                    f"> cursor (new update found between scans)",
                )
        else:
            _print("INFO", "  latest_update_date=None (no new updates since cursor) — expected")
        if c3_ok:
            _print("OK", "Cursor boundary respected")
        else:
            failures += 1

    # ── Check 4: latest_update_date / major_date invariants ──────────
    _print("INFO", "Check 4: structural invariants on initial scan result")
    c4_ok = True
    if result_initial.latest_update_date is None:
        if result_initial.is_major or result_initial.major_date is not None:
            _print(
                "FAIL",
                f"latest_update_date=None but is_major={result_initial.is_major}, "
                f"major_date={result_initial.major_date!r}",
            )
            c4_ok = False
    elif result_initial.is_major:
        if result_initial.major_date is None:
            _print("FAIL", "is_major=True but major_date is None")
            c4_ok = False
        elif result_initial.major_date > result_initial.latest_update_date:
            _print(
                "FAIL",
                f"major_date {result_initial.major_date.isoformat()} "
                f"> latest_update_date {result_initial.latest_update_date.isoformat()}",
            )
            c4_ok = False
    else:
        if result_initial.major_date is not None:
            _print("FAIL", f"is_major=False but major_date={result_initial.major_date!r}")
            c4_ok = False
    if c4_ok:
        _print(
            "OK",
            f"invariants hold: latest_update_date={result_initial.latest_update_date}, "
            f"is_major={result_initial.is_major}, major_date={result_initial.major_date}",
        )
    else:
        failures += 1

    # ── Check 5: check_for_updates end-to-end, mocked DB ─────────────
    _print("INFO", "Check 5: check_for_updates end-to-end (mocked DB)")
    mock_mongodb = AsyncMock()
    svc5 = UpdateDetectionService()
    updated = None
    try:
        with patch("app.services.update_detection_service.mongodb", mock_mongodb):
            updated = await svc5.check_for_updates(
                [{"appid": TEST_APP_ID, "name": "Factorio"}]
            )
    finally:
        await svc5.close()

    c5_ok = True
    if not isinstance(updated, list):
        _print("FAIL", f"check_for_updates returned {type(updated).__name__}, expected list")
        c5_ok = False

    call_count = mock_mongodb.update_news_cursor.call_count
    if call_count == 0:
        # API may have failed between checks (swallowed internally by the service);
        # treat as skip — not a hard failure per the plan.
        _print("SKIP", "update_news_cursor not called — Steam API may have been unreachable for this call")
    elif call_count > 1:
        _print("FAIL", f"update_news_cursor called {call_count} times, expected 1")
        c5_ok = False
    else:
        args = mock_mongodb.update_news_cursor.call_args[0]
        if not (
            isinstance(args[0], str)
            and isinstance(args[1], str)
            and isinstance(args[2], datetime)
        ):
            _print(
                "FAIL",
                f"update_news_cursor arg types wrong: "
                f"{[type(a).__name__ for a in args]} — expected (str, str, datetime)",
            )
            c5_ok = False
        else:
            _print(
                "OK",
                f"check_for_updates returned list; "
                f"update_news_cursor({args[0]!r}, {args[1]!r}, {args[2].isoformat()!r})",
            )
    if not c5_ok:
        failures += 1

    # ── Summary ───────────────────────────────────────────────────────
    print("=" * 60)
    if failures == 0:
        _print("OK", "All checks passed")
        return 0
    else:
        _print("FAIL", f"{failures} check(s) failed")
        return 1
260
+ return 1
261
+
262
+
263
if __name__ == "__main__":
    # Exit with the smoke test's status code (0 = all checks passed/skipped).
    sys.exit(asyncio.run(run_smoke_test()))
backend/scripts/smoke_test.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Smoke Test — local verification of worker cycle and analysis pipeline.
3
+
4
+ Usage:
5
+ cd backend
6
+ python scripts/smoke_test.py analyze <appid> # run full analysis for a game
7
+ python scripts/smoke_test.py cycle # mini worker cycle (1 game)
8
+ """
9
+
10
+ import argparse
11
+ import asyncio
12
+ import logging
13
+ import sys
14
+ import time
15
+ from datetime import datetime, timezone
16
+ from pathlib import Path
17
+
18
+ # Ensure backend/app is importable
19
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
20
+
21
+ from app.core.config import settings # noqa: E402
22
+ from app.db.mongodb import mongodb # noqa: E402
23
+ from app.services.nlp_service import NLPService # noqa: E402
24
+ from app.services.steam_service import SteamService # noqa: E402
25
+ from app.services.update_detection_service import UpdateDetectionService # noqa: E402
26
+ from app.services.precache_service import PreCacheService # noqa: E402
27
+ from app.services.analysis_runner import run_full_analysis # noqa: E402
28
+
29
+ logging.basicConfig(
30
+ level=logging.INFO,
31
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
32
+ )
33
+ logger = logging.getLogger("smoke_test")
34
+
35
+
36
+ def _ts() -> str:
37
+ return datetime.now(timezone.utc).strftime("%H:%M:%S")
38
+
39
+
40
def _print(status: str, msg: str) -> None:
    """Print *msg* prefixed with a UTC timestamp and an ANSI-colored status tag.

    Unknown statuses are printed verbatim (uncolored).
    """
    colored = {
        "OK": "\033[32mOK\033[0m",
        "FAIL": "\033[31mFAIL\033[0m",
        "SKIP": "\033[33mSKIP\033[0m",
        "INFO": "\033[36mINFO\033[0m",
    }
    label = colored.get(status, status)
    print(f"[{_ts()}] [{label}] {msg}")
43
+
44
+
45
+ # ── analyze subcommand ──────────────────────────────────────────────
46
+
47
+
48
async def cmd_analyze(app_id: str) -> None:
    """Run the full analysis pipeline for one game and verify the cache write.

    Connects to MongoDB, runs ``run_full_analysis``, reports the result
    counts, then confirms the analysis document actually landed in the
    cache collection. Services and the DB connection are always closed.
    """
    _print("INFO", f"Starting analysis for app_id={app_id}")
    _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... / DB: {settings.mongodb_db_name}")

    await mongodb.connect()

    steam_svc = SteamService()
    nlp_svc = NLPService()

    try:
        started = time.monotonic()
        result = await run_full_analysis(app_id, f"smoke-{app_id}", steam_svc, nlp_svc)
        took = time.monotonic() - started

        if result is None:
            _print("FAIL", "run_full_analysis returned None")
            return

        game = result.get("game", {})
        _print("OK", f"Analysis complete in {took:.1f}s")
        _print("OK", f" Game: {game.get('name', '?')} (appid {game.get('app_id', '?')})")
        _print("OK", f" Reviews analyzed: {result.get('analyzed_reviews', 0)}")
        _print("OK", f" Topics found: {len(result.get('topics', []))}")
        _print("OK", f" General highlights: {len(result.get('general_highlights', []))}")

        # Confirm the pipeline persisted its output to MongoDB.
        cached = await mongodb.get_cached_analysis(app_id)
        if cached:
            _print("OK", " Cache write verified — document found in MongoDB")
        else:
            _print("FAIL", " Cache write verification FAILED — no document in MongoDB")
    finally:
        await steam_svc.close()
        await mongodb.disconnect()
87
+
88
+
89
+ # ── cycle subcommand ─────────────────────────────────────────────────
90
+
91
+
92
async def cmd_cycle() -> None:
    """Run a miniature worker cycle against the single top game.

    Steps: fetch top game → check synced_at datetime arithmetic → update
    detection → bootstrap missing analyses → process at most one due
    analysis. All services and the DB connection are closed on exit.
    """
    _print("INFO", "Starting mini worker cycle")
    _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... / DB: {settings.mongodb_db_name}")

    await mongodb.connect()

    steam_svc = SteamService()
    nlp_svc = NLPService()
    update_svc = UpdateDetectionService()

    try:
        # Step 1: pick the single game with the most reviews.
        _print("INFO", "Step 1: Fetching top game by reviews...")
        top_games = await mongodb.get_top_games_by_reviews(1)
        if not top_games:
            _print("SKIP", "No games in DB — run game sync first or use 'analyze' subcommand")
            return

        game = top_games[0]
        app_id = str(game.get("appid", ""))
        name = game.get("name", "?")
        _print("OK", f" Top game: {name} (appid {app_id})")

        # Step 2: naive-vs-aware datetime subtraction regression check.
        _print("INFO", "Step 2: Testing synced_at datetime comparison...")
        synced_at = game.get("synced_at")
        if not synced_at:
            _print("SKIP", " No synced_at field — game sync not run yet")
        else:
            try:
                age = datetime.now(timezone.utc) - synced_at
                hours = age.total_seconds() / 3600
                _print("OK", f" synced_at delta: {hours:.1f}h (tz={synced_at.tzinfo})")
            except TypeError as e:
                _print("FAIL", f" datetime subtraction failed: {e}")
                return

        # Step 3: update detection on just this game.
        _print("INFO", "Step 3: Update detection...")
        detect_start = time.monotonic()
        updated = await update_svc.check_for_updates([game])
        detect_took = time.monotonic() - detect_start
        _print("OK", f" Updates detected: {len(updated)} in {detect_took:.1f}s")

        # Step 4: bootstrap analyses that have never been run.
        _print("INFO", "Step 4: Bootstrap missing analyses...")
        precache_svc = PreCacheService(steam_svc, nlp_svc)
        bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games)
        _print("OK", f" Bootstrapped: {bootstrapped}")

        # Step 5: process due analyses, capped at one for the smoke run.
        _print("INFO", "Step 5: Processing due analyses (max 1)...")
        saved_limit = settings.precache_max_analyses_per_cycle
        # object.__setattr__ bypasses a (possibly) frozen settings model.
        object.__setattr__(settings, "precache_max_analyses_per_cycle", 1)
        try:
            executed = await precache_svc.process_due_analyses()
            _print("OK", f" Executed: {executed}")
        finally:
            object.__setattr__(settings, "precache_max_analyses_per_cycle", saved_limit)

        _print("OK", "Mini cycle complete")

    finally:
        await update_svc.close()
        await steam_svc.close()
        await mongodb.disconnect()
159
+
160
+
161
+ # ── main ─────────────────────────────────────────────────────────────
162
+
163
+
164
def main() -> None:
    """CLI entry point: dispatch to the 'analyze' or 'cycle' subcommand."""
    parser = argparse.ArgumentParser(description="SentimentStream smoke test")
    sub = parser.add_subparsers(dest="command")

    analyze_parser = sub.add_parser("analyze", help="Run full analysis for a game")
    analyze_parser.add_argument("appid", help="Steam app ID (e.g. 730)")
    sub.add_parser("cycle", help="Run mini worker cycle (top 1 game)")

    args = parser.parse_args()

    if args.command == "analyze":
        asyncio.run(cmd_analyze(args.appid))
    elif args.command == "cycle":
        asyncio.run(cmd_cycle())
    else:
        # No subcommand given: show usage and exit non-zero.
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()
backend/worker_main.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Worker Main App — lightweight FastAPI for background game sync and pre-cache.
3
+
4
+ Endpoints:
5
+ GET /health — MongoDB ping, last cycle summary, cycle_running flag
6
+ POST /trigger — token-protected, starts a worker cycle as background task
7
+ GET /logs — token-protected, read structured log tail
8
+ """
9
+
10
+ import asyncio
11
+ import logging
12
+ import os
13
+ import uuid
14
+ from contextlib import asynccontextmanager
15
+ from datetime import datetime, timezone
16
+ from typing import Any
17
+
18
+ from fastapi import FastAPI, Query, Request
19
+ from fastapi.responses import JSONResponse
20
+
21
+ from app.core.config import settings
22
+ from app.core.worker_logging import (
23
+ AsyncTimingContext,
24
+ WORKER_LOG_WHITELIST,
25
+ log_structured,
26
+ read_log_tail,
27
+ resolve_log_path,
28
+ set_cycle_id,
29
+ setup_app_logging,
30
+ setup_structured_logger,
31
+ )
32
+ from app.db.mongodb import mongodb
33
+ from app.services.game_sync_service import GameSyncService
34
+ from app.services.nlp_service import NLPService
35
+ from app.services.precache_service import PreCacheService
36
+ from app.services.steam_service import SteamService
37
+ from app.services.priority_refresh_service import PriorityRefreshService
38
+ from app.services.update_detection_service import UpdateDetectionService
39
+
40
+ logging.basicConfig(
41
+ level=logging.INFO,
42
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
43
+ )
44
+ logger = logging.getLogger(__name__)
45
+
46
+ # Cycle state
47
+ _cycle_running = False
48
+ _last_cycle_summary: dict[str, Any] = {}
49
+
50
+
51
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Connect MongoDB on startup, disconnect on shutdown.

    The teardown is wrapped in try/finally so the MongoDB connection is
    released even when the application body raises or is cancelled while
    suspended at ``yield`` — the original skipped disconnect in that case.
    """
    await mongodb.connect()
    setup_structured_logger("worker")
    setup_app_logging()
    logger.info("Worker started — MongoDB connected, structured logging initialized")
    try:
        yield
    finally:
        await mongodb.disconnect()
        logger.info("Worker shutting down")
61
+
62
+
63
# ASGI application instance, wired to the lifespan handler defined above.
app = FastAPI(title="SentimentStream Worker", lifespan=lifespan)
64
+
65
+
66
@app.get("/health")
async def health():
    """Health check: MongoDB ping plus last/current cycle status.

    Never raises — a failed ping degrades the status instead of erroring,
    so monitoring always gets an answer.
    """
    mongo_ok = False
    try:
        client = mongodb.client
        if client:
            await client.admin.command("ping")
            mongo_ok = True
    except Exception:
        # Deliberate best-effort: report "degraded" rather than fail.
        pass

    return {
        "status": "ok" if mongo_ok else "degraded",
        "mongodb": "connected" if mongo_ok else "disconnected",
        "cycle_running": _cycle_running,
        "last_cycle": _last_cycle_summary,
    }
83
+
84
+
85
def _check_bearer_token(request: Request) -> bool:
    """Validate the Bearer token in the Authorization header.

    Returns True only when a trigger token is configured AND the header is
    exactly ``Bearer <token>`` with a matching token. The comparison uses
    ``hmac.compare_digest`` so an attacker cannot recover the token via
    response-timing differences (the original used ``==``).
    """
    import hmac  # function-scope stdlib import keeps this fix self-contained

    auth = request.headers.get("Authorization", "")
    expected = settings.worker_trigger_token
    if not expected or not auth.startswith("Bearer "):
        return False
    supplied = auth[7:]
    return hmac.compare_digest(supplied.encode(), expected.encode())
90
+
91
+
92
@app.post("/trigger")
async def trigger(request: Request):
    """Token-protected trigger to start a worker cycle.

    Responses:
        401 — missing or invalid Bearer token
        503 — a cycle is already running
        200 — ``{"status": "started"}`` (cycle runs as a background task)
    """
    global _cycle_running, _cycle_task

    if not _check_bearer_token(request):
        return JSONResponse(status_code=401, content={"detail": "Unauthorized"})

    if _cycle_running:
        return JSONResponse(status_code=503, content={"detail": "Cycle already running"})

    # Keep a strong reference to the task: the event loop only holds weak
    # references, so a fire-and-forget task can be garbage-collected mid-run.
    _cycle_task = asyncio.create_task(_run_cycle())
    return {"status": "started"}
105
+
106
+
107
@app.get("/logs")
async def get_logs(
    request: Request,
    lines: int = Query(default=100, ge=1, le=1000),
    level: str | None = Query(default=None),
    event: str | None = Query(default=None),
    file: str = Query(default="worker"),
):
    """Token-protected endpoint to read structured log tail."""
    # Auth guard first — identical 401 shape to the /trigger endpoint.
    if not _check_bearer_token(request):
        return JSONResponse(status_code=401, content={"detail": "Unauthorized"})

    # Only whitelisted log names resolve; anything else is a 400.
    log_path = resolve_log_path(file, WORKER_LOG_WHITELIST)
    if log_path is None:
        detail = f"Unknown log file: '{file}'. Valid: {list(WORKER_LOG_WHITELIST.keys())}"
        return JSONResponse(status_code=400, content={"detail": detail})

    entries = read_log_tail(log_path, lines=lines, level=level, event=event)
    return {"entries": entries, "count": len(entries)}
128
+
129
+
130
async def _run_cycle() -> None:
    """Execute a full worker cycle.

    Pipeline: game sync (throttled to ~once per 20h) → CN-name / app-type
    enrichment → priority refresh → update detection → schedule creation →
    bootstrap missing analyses → process due analyses.

    Fix vs. original: the cycle-state finalization (summary, cycle_end log,
    ``_cycle_running = False``) now lives inside ``finally`` and each
    service ``close()`` is isolated — previously a raising close() (or task
    cancellation) left ``_cycle_running`` stuck True, so every subsequent
    /trigger returned 503 forever.
    """
    global _cycle_running, _last_cycle_summary
    _cycle_running = True
    started = datetime.now(timezone.utc)
    summary: dict[str, Any] = {"started_at": started.isoformat()}

    cycle_id = uuid.uuid4().hex[:8]
    set_cycle_id(cycle_id)
    log_structured("cycle_start", cycle_id=cycle_id)

    steam_svc = SteamService()
    nlp_svc = NLPService()
    game_sync_svc = GameSyncService()
    priority_svc = PriorityRefreshService()
    update_svc = UpdateDetectionService()

    try:
        # 1. Game sync (if enabled and not synced recently)
        if settings.game_sync_enabled:
            top_games = await mongodb.get_top_games_by_reviews(1)
            last_synced = top_games[0].get("synced_at") if top_games else None
            hours_since_sync = None
            if last_synced:
                delta = datetime.now(timezone.utc) - last_synced
                hours_since_sync = delta.total_seconds() / 3600

            if hours_since_sync is None or hours_since_sync > 20:
                async with AsyncTimingContext() as t_sync:
                    logger.info("Starting game sync...")
                    upserted, modified = await game_sync_svc.sync_all_games()
                    summary["game_sync"] = {"upserted": upserted, "modified": modified}
                log_structured("game_sync", elapsed_s=t_sync.elapsed_s,
                               detail=summary["game_sync"])

                async with AsyncTimingContext() as t_details:
                    enriched = await game_sync_svc.sync_top_game_details()
                    summary["game_details"] = {"enriched": enriched}
                log_structured("game_details", elapsed_s=t_details.elapsed_s,
                               detail=summary["game_details"])
            else:
                summary["game_sync"] = "skipped (recent)"
                log_structured("game_sync", detail="skipped (recent)")

            # ALWAYS enrich CN names if sync is enabled, even if main sync skipped
            async with AsyncTimingContext() as t_cn:
                cn_processed = await game_sync_svc.enrich_cn_names()
                summary["cn_enrichment"] = {"processed": cn_processed}
            log_structured("cn_enrichment", elapsed_s=t_cn.elapsed_s,
                           detail=summary["cn_enrichment"])

            async with AsyncTimingContext() as t_app_types:
                app_types_processed = await game_sync_svc.enrich_app_types()
                summary["app_type_enrichment"] = {"processed": app_types_processed}
            log_structured("app_type_enrichment", elapsed_s=t_app_types.elapsed_s,
                           detail=summary["app_type_enrichment"])

        # 1b. Priority refresh
        async with AsyncTimingContext() as t_priority:
            priority_result = await priority_svc.refresh_priorities()
            summary["priority_refresh"] = priority_result
        log_structured("priority_refresh", elapsed_s=t_priority.elapsed_s, detail=priority_result)

        # 2. Update detection
        async with AsyncTimingContext() as t_update:
            top_games = await mongodb.get_priority_games_for_analysis()
            updated_games = await update_svc.check_for_updates(top_games)
            summary["updates_detected"] = len(updated_games)
        log_structured("update_detection", elapsed_s=t_update.elapsed_s,
                       detail={"updates_detected": len(updated_games)})

        # 3. Create schedules for updated games
        precache_svc = PreCacheService(steam_svc, nlp_svc)

        async with AsyncTimingContext() as t_sched:
            if updated_games:
                await precache_svc.create_schedules_for_updates(updated_games)
        log_structured("create_schedules", elapsed_s=t_sched.elapsed_s,
                       detail={"updated_games": len(updated_games) if updated_games else 0})

        # 4. Bootstrap missing analyses
        async with AsyncTimingContext() as t_boot:
            bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games)
            summary["bootstrapped"] = bootstrapped
        log_structured("bootstrap_missing", elapsed_s=t_boot.elapsed_s,
                       detail={"bootstrapped": bootstrapped})

        # 5. Process due analyses
        if settings.precache_enabled:
            async with AsyncTimingContext() as t_analyses:
                executed = await precache_svc.process_due_analyses()
                summary["analyses_executed"] = executed
            log_structured("process_due_analyses", elapsed_s=t_analyses.elapsed_s,
                           detail={"executed": executed})
        else:
            summary["precache"] = "disabled"

    except Exception as e:
        logger.error(f"Cycle error: {e}", exc_info=True)
        summary["error"] = str(e)
        log_structured("cycle_error", level=logging.ERROR, error=str(e))
    finally:
        # Close each service independently so one bad close() cannot skip
        # the others or the state reset below.
        for svc in (game_sync_svc, priority_svc, update_svc, steam_svc):
            try:
                await svc.close()
            except Exception:
                logger.warning("Service close failed during cycle teardown", exc_info=True)

        # Finalize cycle state unconditionally — the worker must never be
        # left with _cycle_running stuck True.
        elapsed = (datetime.now(timezone.utc) - started).total_seconds()
        summary["elapsed_seconds"] = round(elapsed, 1)
        _last_cycle_summary = summary
        _cycle_running = False
        log_structured("cycle_end", elapsed_s=round(elapsed, 1),
                       detail=summary)
        set_cycle_id(None)
        logger.info(f"Cycle complete in {elapsed:.1f}s: {summary}")
scripts/benchmark_major_update.py ADDED
@@ -0,0 +1,848 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Benchmark script for the major update detection heuristic.
4
+
5
+ Evaluates UpdateDetectionService._is_update_related, _collect_update_candidates,
6
+ and _is_major_update against a curated set of Steam games.
7
+
8
+ Three modes:
9
+ --discover Fetch news for all games (count=20 by default, matches
10
+ production) and display all items with classification
11
+ details. Use this to identify ground truth.
12
+ --evaluate Item-level evaluation: for each ItemCase, find the item
13
+ by gid and check if _is_update_related / _is_major_update
14
+ match expectations.
15
+ --evaluate-service Service-level evaluation: for each ServiceCase, run the
16
+ full selection pipeline and compare the outcome.
17
+
18
+ Both --evaluate and --evaluate-service run by default when no mode is specified.
19
+
20
+ Examples:
21
+ python scripts/benchmark_major_update.py --discover
22
+ python scripts/benchmark_major_update.py --discover --count 50
23
+ python scripts/benchmark_major_update.py --evaluate
24
+ python scripts/benchmark_major_update.py --evaluate-service
25
+ python scripts/benchmark_major_update.py # runs both evaluate modes
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import argparse
31
+ import sys
32
+ from dataclasses import dataclass
33
+ from datetime import datetime, timezone
34
+ from pathlib import Path
35
+ from typing import Literal
36
+
37
+ import httpx
38
+
39
+ # ── import project service ────────────────────────────────────────────────────
40
+ sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
41
+ from app.services.update_detection_service import UpdateDetectionService # noqa: E402
42
+
43
+ STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"
44
+
45
# ── benchmark games ───────────────────────────────────────────────────────────
# (display name, Steam appid) pairs exercised by the benchmark.
GAMES: list[tuple[str, str]] = [
    ("Going Medieval", "1029780"),
    ("Timberborn", "1062090"),
    ("Hades II", "1145350"),
    ("Against the Storm", "1336490"),
    ("Valheim", "892970"),
    ("Manor Lords", "1363080"),
    ("Project Zomboid", "108600"),
    ("Dwarf Fortress", "975370"),
    ("Helldivers 2", "553850"),
    ("Deep Rock Galactic", "548430"),
    ("Lethal Company", "1966720"),
    ("Factorio", "427520"),
    ("Satisfactory", "526870"),
]
61
+
62
# ── ground truth structures ───────────────────────────────────────────────────

@dataclass
class ItemCase:
    """Per-item ground truth: is this specific event major?"""
    game_name: str
    appid: str
    gid: str
    title: str  # for display
    expected: Literal["major", "not_major", "ambiguous"]
    reasoning: str
73
+
74
+
75
@dataclass
class ServiceCase:
    """Per-game ground truth: what should the production code do?"""
    game_name: str
    appid: str
    expected_major: bool | None  # True / False / None = ambiguous
    reasoning: str
82
+
83
+
84
# ── item-level ground truth ───────────────────────────────────────────────────
# Populated from --discover run on 2026-03-19.
# Entries are positional ItemCase(game_name, appid, gid, title, expected, reasoning).
ITEM_CASES: list[ItemCase] = [
    # ── Going Medieval ────────────────────────────────────────────────────────
    ItemCase(
        "Going Medieval", "1029780", "1826992588604105",
        "Going Medieval is out now in 1.0!",
        "major",
        "1.0 full release out of Early Access — unambiguously major. "
        "Phase 1: RELEASE_PHRASE_RE matches 'is out now' → update-related. "
        "ONE_ZERO_RE matches '1.0' → major.",
    ),
    ItemCase(
        "Going Medieval", "1029780", "1827626365751261",
        "Experimental Branch Patch (1.0.48)",
        "not_major",
        "Experimental branch incremental patch. Three-segment version (1.0.48) "
        "excluded by VERSION_RE. BRANCH_RE blocks major classification.",
    ),
    ItemCase(
        "Going Medieval", "1029780", "1827626365750723",
        "Patch Notes (1.0.47)",
        "not_major",
        "Incremental stable patch, three-segment version. not_major is correct.",
    ),
    # ── Timberborn ────────────────────────────────────────────────────────────
    ItemCase(
        "Timberborn", "1062090", "1826992588592887",
        "Timberborn 1.0 is live!",
        "major",
        "1.0 full release out of Early Access — unambiguously major. "
        "Phase 1: RELEASE_PHRASE_RE matches 'is live' → update-related. "
        "ONE_ZERO_RE matches '1.0' → major.",
    ),
    ItemCase(
        "Timberborn", "1062090", "1826992588603124",
        "Patch notes 2026-03-17 (experimental)",
        "not_major",
        "Experimental branch date-based patch notes. No version number. not_major is correct.",
    ),
    # ── Hades II ──────────────────────────────────────────────────────────────
    ItemCase(
        "Hades II", "1145350", "1816215235360707",
        "Hades II v1.0 Hotfix 3",
        "not_major",
        "A bugfix hotfix on top of the v1.0 launch — not a content update. "
        "Phase 1: HOTFIX_RE blocks major classification. Correct: not_major.",
    ),
    ItemCase(
        "Hades II", "1145350", "1811772772516846",
        "Hades II v1.0 Hotfix 2",
        "not_major",
        "Same pattern: HOTFIX_RE blocks 'v1.0 Hotfix N' from being classified as major.",
    ),
    ItemCase(
        "Hades II", "1145350", "1811772772248738",
        "Hades II v1.0 Is Now Available!",
        "major",
        "v1.0 full launch — unambiguously major. "
        "Phase 1: RELEASE_PHRASE_RE matches 'Is Now Available' → update-related. "
        "No hotfix/branch blocker. VERSION_RE matches 'v1.0' → major.",
    ),
    # ── Against the Storm ─────────────────────────────────────────────────────
    ItemCase(
        "Against the Storm", "1336490", "1818752592135840",
        "Demo Update 1.9.6",
        "not_major",
        "Demo game update, three-segment version 1.9.6. "
        "Service correctly classifies as not_major.",
    ),
    ItemCase(
        "Against the Storm", "1336490", "1816849002010836",
        "Brineworks Update (1.9) available!",
        "major",
        "Named major content update with version 1.9. "
        "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
        "→ update-related. VERSION_RE → major.",
    ),
    # ── Valheim ───────────────────────────────────────────────────────────────
    ItemCase(
        "Valheim", "892970", "1825093633184197",
        "Patch 0.221.12",
        "not_major",
        "Three-segment maintenance patch. Correctly classified as not_major.",
    ),
    ItemCase(
        "Valheim", "892970", "1809869179994587",
        "Patch 0.221.4 (Public Test)",
        "not_major",
        "Public test branch three-segment patch. Correctly classified as not_major.",
    ),
    # ── Manor Lords ───────────────────────────────────────────────────────────
    ItemCase(
        "Manor Lords", "1363080", "1827626365750540",
        "Major Update #6: Battlefield Changes, New Map, and Family Based Progression",
        "major",
        "Developer-declared major content drop. "
        "Phase 1: CONTENT_UPDATE_RE matches 'Major Update' → update-related and major.",
    ),
    ItemCase(
        "Manor Lords", "1363080", "1826992588603500",
        "New BETA version is available for testing (0.8.065)",
        "not_major",
        "Beta/testing build announcement, not a production major update. "
        "Current heuristic misses it entirely, which is acceptable for this benchmark case.",
    ),
    # ── Project Zomboid ───────────────────────────────────────────────────────
    ItemCase(
        "Project Zomboid", "108600", "1826992588590120",
        "42.15.2 UNSTABLE HOTFIX Released",
        "not_major",
        "Unstable-branch hotfix. patchnotes tag makes it update-related, "
        "but HOTFIX_RE correctly blocks major classification.",
    ),
    ItemCase(
        "Project Zomboid", "108600", "1826362059930323",
        "Build 42.15.0 Unstable Released",
        "not_major",
        "Unstable build release, not a production major update. "
        "Current heuristic does not classify it as update-related because the three-segment "
        "build number fails VERSION_RE.",
    ),
    # ── Dwarf Fortress ────────────────────────────────────────────────────────
    ItemCase(
        "Dwarf Fortress", "975370", "1826362059918689",
        "Food fixes, AMA, community spotlight and more! Dwarf Fortress Patch 53.11",
        "not_major",
        "Maintenance patch with Dwarf Fortress' two-segment numbering scheme. "
        "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fixes' "
        "→ maintenance blocker fires before VERSION_RE → not_major.",
    ),
    ItemCase(
        "Dwarf Fortress", "975370", "1821288646585998",
        "Aquatic portraits, Naked dwarf fix and more Dwarf Fortress Patch 53.10",
        "not_major",
        "Another maintenance patch under the same numbering scheme. "
        "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fix' "
        "→ maintenance blocker fires → not_major.",
    ),
    # ── Helldivers 2 ──────────────────────────────────────────────────────────
    ItemCase(
        "Helldivers 2", "553850", "1826992588603352",
        "Machinery of Oppression: 6.1.0",
        "major",
        "Named content drop with new missions/enemies. This should count as a major update. "
        "Useful to test whether named major drops with three-segment versions are still found.",
    ),
    ItemCase(
        "Helldivers 2", "553850", "1826992588603981",
        "Revealing our Machinery of Oppression Content Roadmap!",
        "not_major",
        "Roadmap/announcement post, not the update itself. Should not be treated as major.",
    ),
    # ── Deep Rock Galactic ────────────────────────────────────────────────────
    ItemCase(
        "Deep Rock Galactic", "548430", "1825727806720055",
        "'Eight Years in Orbit' Anniversary Event is live now!",
        "not_major",
        "Live event announcement, not a game patch. "
        "Phase 2: EVENT_FESTIVAL_RE matches 'anniversary event'; no 'update'/'patch' in title "
        "→ UPDATE_OR_PATCH_RE guard fails → event blocker fires → not_major.",
    ),
    ItemCase(
        "Deep Rock Galactic", "548430", "1824644522847377",
        "Lunar Festival 2026 is now live!",
        "not_major",
        "Seasonal event announcement, not a major patch/update. "
        "Phase 2: EVENT_FESTIVAL_RE matches 'festival'; no 'update'/'patch' → event blocker fires → not_major.",
    ),
    # ── Lethal Company ────────────────────────────────────────────────────────
    ItemCase(
        "Lethal Company", "1966720", "1800991756395986",
        "V70 - The Incubating Update",
        "major",
        "Named major content update. "
        "Phase 2: NAMED_VERSION_RE matches 'V70'; UPDATE_WORD_RE matches 'Update' "
        "→ condition F makes it update-related; named version positive signal → major.",
    ),
    ItemCase(
        "Lethal Company", "1966720", "1801617199407807",
        "V72 Bug fix patch",
        "not_major",
        "Small bug-fix patch. patchnotes tag makes it update-related. "
        "Phase 2: PATCH_WORD_RE matches 'patch'; MAINT_LANGUAGE_RE matches 'bug fix' "
        "→ maintenance blocker fires → not_major.",
    ),
    # ── Factorio ──────────────────────────────────────────────────────────────
    ItemCase(
        "Factorio", "427520", "1827626365752749",
        "Version 2.0.76 released as stable",
        "not_major",
        "Stable maintenance patch under a three-segment versioning scheme. "
        "Useful as a clean true negative.",
    ),
    # ── Satisfactory ──────────────────────────────────────────────────────────
    ItemCase(
        "Satisfactory", "526870", "1826992588604352",
        "Update 1.2 is out now on Experimental!",
        "not_major",
        "Experimental-branch release, not a production major update. "
        "Phase 2: extended BRANCH_RE matches 'on Experimental' → branch blocker fires → not_major.",
    ),
    ItemCase(
        "Satisfactory", "526870", "1825093633185794",
        "Experimental Hotfix v1.1.3.1",
        "not_major",
        "Experimental hotfix on a three-segment version. Correct behavior is not_major.",
    ),
]
391
+
392
# ── service-level ground truth ────────────────────────────────────────────────
# What SHOULD the production code do for this game given the current news window?
# Populated from --discover run on 2026-03-19.
# Phase 1 semantics: verdict based on is_major (major_date is not None), not on selected item title.
# NOTE(review): these expectations are snapshots of a live Steam news feed — as
# each game's 20-item window rolls forward, individual cases may go stale and
# need re-grounding via another --discover run.
SERVICE_CASES: list[ServiceCase] = [
    ServiceCase(
        game_name="Going Medieval",
        appid="1029780",
        expected_major=True,
        reasoning=(
            "Game released 1.0 on 2026-03-17. Phase 1: 'is out now in 1.0!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Timberborn",
        appid="1062090",
        expected_major=True,
        reasoning=(
            "Game reached 1.0 on 2026-03-12. Phase 1: '1.0 is live!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Hades II",
        appid="1145350",
        expected_major=True,
        reasoning=(
            "Game launched v1.0 on 2025-09-25. Phase 1: 'v1.0 Is Now Available!' matches "
            "RELEASE_PHRASE_RE → update-related (developer feed). VERSION_RE matches 'v1.0' → major. "
            "Subsequent hotfixes (v1.0 Hotfix 2, 3) are correctly blocked by HOTFIX_RE. "
            "major_date = v1.0 launch date, latest_update_date = most recent hotfix date. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Against the Storm",
        appid="1336490",
        expected_major=True,
        reasoning=(
            "'Brineworks Update (1.9) available!' is a named major content update. "
            "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
            "→ update-related (developer feed). VERSION_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Valheim",
        appid="892970",
        expected_major=False,
        reasoning=(
            "Top items are three-segment maintenance patches. "
            "Correctly classified as not_major. TN."
        ),
    ),
    ServiceCase(
        game_name="Manor Lords",
        appid="1363080",
        expected_major=True,
        reasoning=(
            "Current window contains a clearly labeled 'Major Update #6' post. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Project Zomboid",
        appid="108600",
        expected_major=False,
        reasoning=(
            # NOTE(review): wording in this ground-truth string reads "These should
            # update activity" — presumably "should count as update activity".
            "Current window is dominated by unstable builds and hotfixes. "
            "These should update activity, but should not count as major releases."
        ),
    ),
    ServiceCase(
        game_name="Dwarf Fortress",
        appid="975370",
        expected_major=False,
        reasoning=(
            "Current window contains only maintenance patches (53.11/53.10/53.09 plus hotfixes). "
            "Phase 2: maintenance blocker (patch + fix language) correctly blocks all of them → no major_date."
        ),
    ),
    ServiceCase(
        game_name="Helldivers 2",
        appid="553850",
        expected_major=True,
        reasoning=(
            "Current window contains 'Machinery of Oppression: 6.1.0', a named content update. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Lethal Company",
        appid="1966720",
        expected_major=True,
        reasoning=(
            "Current window contains 'V70 - The Incubating Update', a named major content drop, "
            "plus newer bug-fix patches. Phase 2: NAMED_VERSION_RE + UPDATE_WORD_RE detects V70 → major_date set."
        ),
    ),
    ServiceCase(
        game_name="Factorio",
        appid="427520",
        expected_major=False,
        reasoning=(
            "Current window contains only three-segment stable maintenance releases (2.0.x). "
            "Expected: not_major."
        ),
    ),
    ServiceCase(
        game_name="Satisfactory",
        appid="526870",
        expected_major=False,
        reasoning=(
            "Current window contains an experimental 1.2 rollout and experimental hotfixes. "
            "Phase 2: extended BRANCH_RE ('on Experimental') blocks the 1.2 rollout → no major_date."
        ),
    ),
]
513
+
514
+
515
+ # ── helpers ───────────────────────────────────────────────────────────────────
516
+
517
+ def _fmt_ts(ts: int | None) -> str:
518
+ if not ts:
519
+ return "—"
520
+ try:
521
+ return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
522
+ except (OSError, ValueError):
523
+ return "—"
524
+
525
+
526
+ def _fmt_dt(dt: datetime | None) -> str:
527
+ if dt is None:
528
+ return "—"
529
+ return dt.strftime("%Y-%m-%d")
530
+
531
+
532
+ def _trunc(s: str, n: int) -> str:
533
+ return (s[:n] + "…") if len(s) > n else s
534
+
535
+
536
def _fetch_news(client: httpx.Client, appid: str, count: int) -> list[dict]:
    """Fetch up to *count* news items for *appid* from the Steam news API.

    Best-effort: any HTTP error or transport failure is reported to stderr
    and an empty list is returned, so one bad game does not abort a full run.
    """
    try:
        resp = client.get(
            STEAM_NEWS_API_URL,
            # maxlength=0 asks Steam for the full, untruncated body.
            params={"appid": appid, "count": count, "maxlength": 0},
        )
        if resp.status_code != 200:
            print(f" [WARN] HTTP {resp.status_code} for appid {appid}", file=sys.stderr)
            return []
        data = resp.json()
        # Steam nests items under appnews.newsitems; either level may be absent.
        return data.get("appnews", {}).get("newsitems", []) or []
    except Exception as exc:
        print(f" [WARN] Request failed for appid {appid}: {exc}", file=sys.stderr)
        return []
550
+
551
+
552
+ # ── Mode 1: discover ──────────────────────────────────────────────────────────
553
+
554
def run_discover(count: int) -> None:
    """Fetch news for every game in GAMES and print per-item classification.

    For each news item, shows whether the production heuristics
    (``_is_update_related`` / ``_is_major_update``) fire, then the
    service-level aggregate (``latest_update_date`` / ``major_date``)
    for the whole window — used to populate the ground-truth case lists.

    Args:
        count: News items to request per game (production uses 20).
    """
    if count != 20:
        print(f"NOTE: count={count} — beyond production window (prod uses count=20)\n")

    # Fixed column widths for the per-item report table.
    col_idx = 4
    col_gid = 20
    col_date = 10
    col_title = 40
    col_fl = 16
    col_tags = 24
    col_ur = 9
    col_maj = 7

    header = (
        f"{'#':<{col_idx}} "
        f"{'gid':<{col_gid}} "
        f"{'date':<{col_date}} "
        f"{'title':<{col_title}} "
        f"{'feedlabel':<{col_fl}} "
        f"{'tags':<{col_tags}} "
        f"{'upd_rel?':<{col_ur}} "
        f"{'major?':<{col_maj}}"
    )
    sep = "-" * len(header)

    with httpx.Client(timeout=30.0) as client:
        for game_name, appid in GAMES:
            print(f"\n{'=' * len(header)}")
            print(f" {game_name} (appid={appid})")
            print(f"{'=' * len(header)}")
            print(header)
            print(sep)

            items = _fetch_news(client, appid, count)
            if not items:
                print(" (no items returned)")
                continue

            for idx, item in enumerate(items, start=1):
                gid = str(item.get("gid") or "")[:col_gid]
                date_str = _fmt_ts(item.get("date"))
                title = _trunc(item.get("title", ""), col_title)
                feedlabel = _trunc(item.get("feedlabel") or "", col_fl)
                tags = _trunc(str(item.get("tags") or ""), col_tags)

                # Per-item verdicts from the production heuristics.
                is_ur = UpdateDetectionService._is_update_related(item)
                is_maj = UpdateDetectionService._is_major_update(item)

                ur_str = "Yes" if is_ur else "No"
                maj_str = "Yes" if is_maj else "No"

                print(
                    f"{idx:<{col_idx}} "
                    f"{gid:<{col_gid}} "
                    f"{date_str:<{col_date}} "
                    f"{title:<{col_title}} "
                    f"{feedlabel:<{col_fl}} "
                    f"{tags:<{col_tags}} "
                    f"{ur_str:<{col_ur}} "
                    f"{maj_str:<{col_maj}}"
                )

            # Aggregate over the whole window, exactly as the service does.
            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items)
            print(f"\n >> latest_update_date: {_fmt_dt(latest_update_date)} | major_date: {_fmt_dt(major_date)}")
            verdict = "MAJOR" if major_date is not None else "not_major"
            print(f" >> Service result: {verdict}")
620
+
621
+
622
+ # ── Mode 2: evaluate (item-level) ─────────────────────────────────────────────
623
+
624
def run_evaluate() -> None:
    """Item-level evaluation: classify each ITEM_CASES entry and score it.

    Fetches the current news window for every appid referenced by ITEM_CASES,
    looks each case up by gid, runs the production per-item heuristics, and
    prints a confusion-matrix style report (TP/TN/FP/FN + precision/recall).
    Cases whose gid has rolled out of the 20-item window are reported as
    NOT FOUND and excluded from the metrics.
    """
    if not ITEM_CASES:
        print("[evaluate] No item-level ground truth defined yet.")
        print(" Run --discover first, then populate ITEM_CASES in this script.")
        return

    # Build lookup: appid → {gid → item}
    gid_index: dict[str, dict[str, dict]] = {}
    needed_appids = {case.appid for case in ITEM_CASES}

    with httpx.Client(timeout=30.0) as client:
        for appid in needed_appids:
            items = _fetch_news(client, appid, count=20)
            gid_index[appid] = {str(item.get("gid", "")): item for item in items}

    tp = tn = fp = fn = amb = not_found = 0
    rows: list[tuple] = []  # one row per case, in ITEM_CASES order

    for case in ITEM_CASES:
        item = gid_index.get(case.appid, {}).get(case.gid)
        if item is None:
            not_found += 1
            # Truncate the title like the found-row branch so columns align.
            rows.append((case.game_name, _trunc(case.title, 30), "—", "—", "—", case.expected, "NOT FOUND"))
            continue

        is_ur = UpdateDetectionService._is_update_related(item)
        is_maj = UpdateDetectionService._is_major_update(item)

        predicted = "major" if (is_ur and is_maj) else "not_major"
        expected = case.expected

        if expected == "ambiguous":
            verdict = "ambiguous"
            amb += 1
        elif predicted == expected:
            verdict = "PASS"
            if expected == "major":
                tp += 1
            else:
                tn += 1
        elif predicted == "major":  # expected == "not_major"
            verdict = "FAIL (FP)"
            fp += 1
        else:
            verdict = "FAIL (FN)"
            fn += 1

        rows.append((
            case.game_name,
            _trunc(case.title, 30),
            _fmt_ts(item.get("date")),
            str(item.get("tags", ""))[:20],
            item.get("feedlabel", "")[:16],
            expected,
            "Yes" if is_ur else "No",
            "Yes" if is_maj else "No",
            verdict,
        ))

    # Print report
    print("\n" + "=" * 110)
    print("REPORT A — Item-level classification")
    print("=" * 110)
    hdr = f"{'Game':<18} {'Title':<30} {'Date':<10} {'Tags':<20} {'FeedLabel':<16} {'Expected':<10} {'UpdRel?':<8} {'Major?':<7} Verdict"
    print(hdr)
    print("-" * 110)
    for row in rows:
        if len(row) == 7:  # NOT FOUND rows carry no classification columns
            print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {'—':<20} {'—':<16} {row[5]:<10} {'—':<8} {'—':<7} {row[6]}")
        else:
            print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {row[3]:<20} {row[4]:<16} {row[5]:<10} {row[6]:<8} {row[7]:<7} {row[8]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total cases : {len(ITEM_CASES)} | not found: {not_found} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # Pair each case with its own row directly. The previous implementation
    # re-located rows via ITEM_CASES.index(c) (O(n²), wrong for duplicate
    # cases) and detected failures with a substring match on str(row).
    fps = [c for c, row in zip(ITEM_CASES, rows) if row[-1] == "FAIL (FP)"]
    fns = [c for c, row in zip(ITEM_CASES, rows) if row[-1] == "FAIL (FN)"]
    if fps:
        print("\nFalse Positives:")
        for c in fps:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
    if fns:
        print("\nFalse Negatives:")
        for c in fns:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
717
+
718
+
719
+ # ── Mode 3: evaluate-service (end-to-end) ─────────────────────────────────────
720
+
721
def run_evaluate_service() -> None:
    """Service-level (end-to-end) evaluation against SERVICE_CASES.

    For each game, fetches the live 20-item news window, runs the production
    aggregation (``_collect_update_candidates``), and compares the resulting
    ``major_date is not None`` verdict against the expected boolean.
    Prints a table plus TP/TN/FP/FN summary and per-failure reasoning.
    """
    if not SERVICE_CASES:
        print("[evaluate-service] No service-level ground truth defined yet.")
        print(" Run --discover first, then populate SERVICE_CASES in this script.")
        return

    tp = tn = fp = fn = amb = 0
    rows: list[tuple] = []  # one row per case, in SERVICE_CASES order

    with httpx.Client(timeout=30.0) as client:
        for case in SERVICE_CASES:
            items = _fetch_news(client, case.appid, count=20)
            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items)
            # Phase 1 semantics: the verdict is simply "was a major date found?"
            is_maj = major_date is not None

            latest_str = _fmt_dt(latest_update_date)
            major_str = _fmt_dt(major_date)
            maj_label = "Yes" if is_maj else "No"

            if case.expected_major is None:
                verdict = "ambiguous"
                amb += 1
            elif is_maj == case.expected_major:
                verdict = "PASS"
                if case.expected_major:
                    tp += 1
                else:
                    tn += 1
            else:
                if is_maj and not case.expected_major:
                    verdict = "FAIL (FP)"
                    fp += 1
                else:
                    verdict = "FAIL (FN)"
                    fn += 1

            rows.append((
                case.game_name,
                latest_str,
                major_str,
                maj_label,
                # None (ambiguous) is falsy, so the outer conditional falls
                # through to the nested one, which renders "None".
                "True" if case.expected_major else ("None" if case.expected_major is None else "False"),
                verdict,
            ))

    print("\n" + "=" * 100)
    print("REPORT B — Service-level (end-to-end)")
    print("=" * 100)
    hdr = f"{'Game':<18} {'LatestUpdate':<13} {'MajorDate':<11} {'Major?':<7} {'Expected':<9} Verdict"
    print(hdr)
    print("-" * 100)
    for row in rows:
        print(f"{row[0]:<18} {row[1]:<13} {row[2]:<11} {row[3]:<7} {row[4]:<9} {row[5]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total games : {len(SERVICE_CASES)} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # Echo the stored reasoning for every failing case (rows align 1:1 with cases).
    for idx, case in enumerate(SERVICE_CASES):
        verdict = rows[idx][5]
        if verdict.startswith("FAIL"):
            print(f"\n [{case.game_name}] {verdict} — {case.reasoning}")
789
+
790
+
791
+ # ── main ──────────────────────────────────────────────────────────────────────
792
+
793
+ def _parse_args() -> argparse.Namespace:
794
+ p = argparse.ArgumentParser(
795
+ description="Benchmark the major update detection heuristic against real Steam games."
796
+ )
797
+ p.add_argument(
798
+ "--discover",
799
+ action="store_true",
800
+ help="Fetch news for all games and display per-item classification details.",
801
+ )
802
+ p.add_argument(
803
+ "--evaluate",
804
+ action="store_true",
805
+ help="Run item-level evaluation against ITEM_CASES ground truth.",
806
+ )
807
+ p.add_argument(
808
+ "--evaluate-service",
809
+ action="store_true",
810
+ dest="evaluate_service",
811
+ help="Run service-level end-to-end evaluation against SERVICE_CASES ground truth.",
812
+ )
813
+ p.add_argument(
814
+ "--count",
815
+ type=int,
816
+ default=20,
817
+ help="Number of news items to fetch (default: 20, matches production). "
818
+ "Values > 20 are beyond the production window.",
819
+ )
820
+ return p.parse_args()
821
+
822
+
823
def main() -> int:
    """CLI entry point: run the selected benchmark mode(s).

    When no mode flag is given, both evaluation reports are produced
    (discover stays opt-in). Always returns 0.
    """
    args = _parse_args()

    want_discover = args.discover
    want_item_eval = args.evaluate
    want_svc_eval = args.evaluate_service

    # No explicit mode selected → default to both evaluation reports.
    if not (want_discover or want_item_eval or want_svc_eval):
        want_item_eval = want_svc_eval = True

    if want_discover:
        run_discover(count=args.count)
    if want_item_eval:
        run_evaluate()
    if want_svc_eval:
        run_evaluate_service()

    return 0
845
+
846
+
847
+ if __name__ == "__main__":
848
+ raise SystemExit(main())
scripts/check_db_stats.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import os
from motor.motor_asyncio import AsyncIOMotorClient
from dotenv import load_dotenv

# Load .env from the repository root or from the backend/ directory.
load_dotenv(".env")
load_dotenv("backend/.env")
9
+
10
async def check_stats():
    """Print coverage stats for Chinese names in the ``games`` collection.

    Reads MONGODB_URL / MONGODB_DB_NAME from the environment (populated from
    .env) and reports how many game documents carry a usable ``name_cn``.
    Connection/query errors are printed, not raised.
    """
    # Pull connection parameters loaded from .env
    mongo_url = os.getenv("MONGODB_URL")
    db_name = os.getenv("MONGODB_DB_NAME", "sentimentSummary")

    if not mongo_url:
        print("ERROR: MONGODB_URL not found in .env file!")
        return

    # Show only the part after '@' (host) so credentials are not echoed.
    print(f"Connecting to MongoDB: {mongo_url.split('@')[-1]}...")

    client = None
    try:
        client = AsyncIOMotorClient(mongo_url)
        db = client[db_name]
        collection = db["games"]

        total = await collection.count_documents({})
        # "Usable" name_cn: present, not None, and not a junk placeholder.
        with_cn = await collection.count_documents({
            "name_cn": {"$exists": True, "$ne": None, "$nin": ["", "null", "None"]}
        })

        print("\n" + "=" * 30)
        print("DATABASE STATS")
        print("=" * 30)
        print(f"Total games: {total}")
        print(f"With Chinese: {with_cn}")

        if total > 0:
            percentage = (with_cn / total) * 100
            print(f"Coverage: {percentage:.2f}%")
        # Closing separator now printed even when the collection is empty
        # (previously it was skipped for total == 0).
        print("=" * 30)
    except Exception as e:
        print(f"ERROR: Could not connect or query DB: {e}")
    finally:
        # Always release the client, even when a query fails mid-way
        # (previously close() was skipped on any exception).
        if client is not None:
            client.close()
45
+
46
+ if __name__ == "__main__":
47
+ asyncio.run(check_stats())
scripts/expand_keywords/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword expansion toolkit using FastText.
3
+
4
+ This package provides tools to:
5
+ 1. Fetch reviews from Steam games
6
+ 2. Train FastText models on review corpus
7
+ 3. Expand existing keyword dictionary with semantically similar words
8
+ """
scripts/expand_keywords/__main__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """Allow running as: python -m scripts.expand_keywords"""
2
+
3
+ from .main import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
scripts/expand_keywords/config.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration for keyword expansion: game list and settings.
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ # Base directories
8
+ BASE_DIR = Path(__file__).parent
9
+ DATA_DIR = BASE_DIR / "data"
10
+ REVIEWS_DIR = DATA_DIR / "reviews"
11
+ MODELS_DIR = DATA_DIR / "models"
12
+ OUTPUT_DIR = DATA_DIR / "output"
13
+
14
+ # Ensure directories exist
15
+ for dir_path in [REVIEWS_DIR, MODELS_DIR, OUTPUT_DIR]:
16
+ dir_path.mkdir(parents=True, exist_ok=True)
17
+
18
+ # Game list: (app_id, name, genre)
19
+ # Selected for variety across genres to get diverse vocabulary
20
+ GAMES: list[tuple[str, str, str]] = [
21
+ # Action RPG
22
+ ("1245620", "Elden Ring", "action_rpg"),
23
+ ("374320", "Dark Souls III", "action_rpg"),
24
+ # CRPG
25
+ ("1086940", "Baldur's Gate 3", "crpg"),
26
+ ("435150", "Divinity: Original Sin 2", "crpg"),
27
+ ("1184370", "Pathfinder: Wrath of the Righteous", "crpg"),
28
+ # Open World RPG
29
+ ("292030", "The Witcher 3", "open_world_rpg"),
30
+ ("489830", "Skyrim Special Edition", "open_world_rpg"),
31
+ ("1091500", "Cyberpunk 2077", "open_world_rpg"),
32
+ # FPS
33
+ ("730", "Counter-Strike 2", "fps_competitive"),
34
+ ("782330", "DOOM Eternal", "fps_single"),
35
+ ("1237970", "Titanfall 2", "fps_single"),
36
+ # Survival
37
+ ("892970", "Valheim", "survival"),
38
+ ("252490", "Rust", "survival"),
39
+ ("264710", "Subnautica", "survival"),
40
+ ("242760", "The Forest", "survival"),
41
+ # Strategy
42
+ ("289070", "Civilization VI", "strategy"),
43
+ ("1142710", "Total War: Warhammer III", "strategy"),
44
+ ("1466860", "Age of Empires IV", "strategy"),
45
+ # Roguelike
46
+ ("1145360", "Hades", "roguelike"),
47
+ ("588650", "Dead Cells", "roguelike"),
48
+ ("646570", "Slay the Spire", "roguelike"),
49
+ # Metroidvania
50
+ ("367520", "Hollow Knight", "metroidvania"),
51
+ ("1057090", "Ori and the Will of the Wisps", "metroidvania"),
52
+ # Simulation
53
+ ("255710", "Cities: Skylines", "simulation"),
54
+ ("427520", "Factorio", "simulation"),
55
+ ("526870", "Satisfactory", "simulation"),
56
+ # Horror
57
+ ("1196590", "Resident Evil Village", "horror"),
58
+ ("739630", "Phasmophobia", "horror"),
59
+ ("381210", "Dead by Daylight", "horror"),
60
+ # Live Service
61
+ ("1085660", "Destiny 2", "live_service"),
62
+ ("230410", "Warframe", "live_service"),
63
+ ("238960", "Path of Exile", "live_service"),
64
+ # Racing
65
+ ("1551360", "Forza Horizon 5", "racing"),
66
+ # Story Driven
67
+ ("1174180", "Red Dead Redemption 2", "story_driven"),
68
+ # Casual
69
+ ("413150", "Stardew Valley", "casual"),
70
+ ("105600", "Terraria", "casual"),
71
+ ]
72
+
73
+ # Fetching settings
74
+ SETTINGS = {
75
+ # Review fetching
76
+ "reviews_per_game": 2700, # ~80k total across ~30 games
77
+ "batch_size": 100, # Steam API batch size
78
+ "sleep_between_batches": 1.5, # Seconds between API calls
79
+ "sleep_between_games": 5.0, # Longer pause between games
80
+ "min_review_length": 50, # Filter short reviews (chars)
81
+ "max_retries": 3, # Retry count on failure
82
+ "retry_base_delay": 10.0, # Base delay for exponential backoff
83
+
84
+ # Preprocessing
85
+ "phrase_min_count": 10, # Min occurrences for phrase detection
86
+ "phrase_threshold": 10.0, # Phrase detection threshold
87
+
88
+ # FastText training
89
+ "fasttext_vector_size": 150,
90
+ "fasttext_window": 5,
91
+ "fasttext_min_count": 5,
92
+ "fasttext_epochs": 10,
93
+ "fasttext_workers": 4,
94
+
95
+ # Expansion
96
+ "similarity_threshold": 0.55,
97
+ "max_suggestions_per_seed": 20,
98
+ "min_frequency": 10, # Min word frequency in corpus
99
+ "auto_approve_threshold": 0.70, # Score threshold for auto-approval
100
+ }
101
+
102
+ # Steam API endpoint
103
+ STEAM_REVIEWS_API = "https://store.steampowered.com/appreviews/{app_id}"
104
+
105
+ # Steam language setting for reviews
106
+ STEAM_REVIEW_LANGUAGE = "schinese" # schinese, english, tchinese, etc.
scripts/expand_keywords/expander.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Keyword dictionary expansion with exclusive category assignment.
3
+
4
+ Key principle: Each word can only belong to ONE category.
5
+ This prevents cross-contamination where a word like "unplayable"
6
+ might be counted in both Bugs and Performance categories.
7
+
8
+ Algorithm:
9
+ 1. For each category: find candidate words similar to seed keywords
10
+ 2. Collect ALL candidates in a global pool
11
+ 3. Assign each word to the category with highest score
12
+ 4. Filter by similarity threshold and frequency
13
+ """
14
+
15
+ import json
16
+ import logging
17
+ import math
18
+ from collections import defaultdict
19
+ from dataclasses import dataclass, field
20
+ from datetime import datetime
21
+ from pathlib import Path
22
+
23
+ from gensim.models import FastText
24
+
25
+ from .config import OUTPUT_DIR, SETTINGS
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
@dataclass
class Candidate:
    """A candidate word for dictionary expansion.

    Carries the word itself, its best similarity to any seed keyword,
    its corpus frequency, and the seed(s) that surfaced it.
    """

    word: str
    similarity: float
    frequency: int
    source_seeds: list[str] = field(default_factory=list)

    @property
    def score(self) -> float:
        """
        Combined score from similarity and frequency.

        Formula: 0.7 * similarity + 0.3 * normalized_log_frequency
        Frequency factor normalized to ~0-1 range.
        """
        log_freq = math.log10(max(self.frequency, 1) + 1)
        return self.similarity * 0.7 + (log_freq / 5) * 0.3

    def to_dict(self) -> dict:
        """Serialize for JSON export; underscores in the word become spaces."""
        return {
            "word": self.word.replace("_", " "),
            "similarity": round(self.similarity, 3),
            "frequency": self.frequency,
            "score": round(self.score, 3),
            "source_seeds": self.source_seeds,
        }
58
+
59
+
60
class KeywordExpander:
    """
    Expands keyword dictionary using trained FastText model.

    Uses exclusive category assignment to prevent words
    appearing in multiple categories.
    """

    def __init__(
        self,
        model: FastText,
        existing_keywords: dict[str, list[str]],
        word_frequencies: dict[str, int],
        similarity_threshold: float | None = None,
        max_suggestions_per_seed: int | None = None,
        min_frequency: int | None = None,
    ):
        """
        Initialize expander.

        Args:
            model: Trained FastText model
            existing_keywords: Current TOPIC_KEYWORDS dictionary
            word_frequencies: Word frequency counts from corpus
            similarity_threshold: Minimum similarity for candidates
            max_suggestions_per_seed: Max similar words per seed
            min_frequency: Minimum corpus frequency
        """
        self.model = model
        self.existing = existing_keywords
        self.word_freq = word_frequencies

        # NOTE(review): `or` makes explicit 0 / 0.0 fall back to SETTINGS —
        # presumably intentional since 0 thresholds are not useful; confirm.
        self.similarity_threshold = similarity_threshold or SETTINGS["similarity_threshold"]
        self.max_suggestions = max_suggestions_per_seed or SETTINGS["max_suggestions_per_seed"]
        self.min_frequency = min_frequency or SETTINGS["min_frequency"]

        # Build set of all existing words (normalized to lowercase,
        # with spaces replaced by underscores to match corpus tokens).
        self.existing_words: set[str] = set()
        for words in existing_keywords.values():
            for w in words:
                self.existing_words.add(w.lower().replace(" ", "_"))

        logger.info(f"Expander initialized with {len(self.existing_words)} existing keywords")

    def _find_candidates_for_category(
        self,
        category: str,
        seeds: list[str],
    ) -> dict[str, Candidate]:
        """
        Find candidate words for a single category.

        Returns dict[word -> Candidate] with best similarity per word.
        """
        candidates: dict[str, Candidate] = {}

        for seed in seeds:
            # Normalize seed (e.g., "frame rate" -> "frame_rate")
            seed_normalized = seed.lower().replace(" ", "_")

            # Skip if seed not in vocabulary
            if seed_normalized not in self.model.wv:
                continue

            # Get similar words
            try:
                similar = self.model.wv.most_similar(
                    seed_normalized,
                    topn=self.max_suggestions,
                )
            except KeyError:
                continue

            for word, similarity in similar:
                # Skip existing words
                if word in self.existing_words:
                    continue

                # Skip below threshold
                if similarity < self.similarity_threshold:
                    continue

                # Check frequency
                freq = self.word_freq.get(word, 0)
                if freq < self.min_frequency:
                    continue

                # Update or add candidate
                if word in candidates:
                    # Keep higher similarity
                    # NOTE(review): the seed is only appended when it beats the
                    # current best similarity — a later seed with a lower score
                    # is not recorded in source_seeds. Confirm this is intended.
                    if similarity > candidates[word].similarity:
                        candidates[word].similarity = similarity
                        candidates[word].source_seeds.append(seed)
                else:
                    candidates[word] = Candidate(
                        word=word,
                        similarity=similarity,
                        frequency=freq,
                        source_seeds=[seed],
                    )

        return candidates

    def expand_all_exclusive(self) -> dict[str, list[Candidate]]:
        """
        Expand all categories with exclusive assignment.

        Each word is assigned only to the category where it has
        the highest score.

        Returns:
            Dict mapping category -> list of Candidates (sorted by score)
        """
        logger.info("Starting exclusive expansion...")

        # Step 1: Collect candidates from all categories
        # Format: word -> [(category, Candidate), ...]
        all_candidates: dict[str, list[tuple[str, Candidate]]] = defaultdict(list)

        for category, seeds in self.existing.items():
            category_candidates = self._find_candidates_for_category(category, seeds)
            for word, candidate in category_candidates.items():
                all_candidates[word].append((category, candidate))

            logger.info(f"[{category}] Found {len(category_candidates)} raw candidates")

        # Step 2: Assign each word to category with highest score
        final_assignments: dict[str, list[Candidate]] = defaultdict(list)

        for word, category_candidates in all_candidates.items():
            # Find category with highest score
            best_category, best_candidate = max(
                category_candidates,
                key=lambda x: x[1].score,
            )
            final_assignments[best_category].append(best_candidate)

        # Step 3: Sort candidates in each category by score
        for category in final_assignments:
            final_assignments[category].sort(key=lambda c: c.score, reverse=True)

        # Log results
        total = sum(len(cands) for cands in final_assignments.values())
        logger.info(f"Exclusive assignment complete: {total} total candidates")

        for category, cands in sorted(final_assignments.items()):
            logger.info(f" {category}: {len(cands)} candidates")

        return dict(final_assignments)

    def export_candidates(
        self,
        path: Path | str | None = None,
        include_threshold_in_name: bool = False,
    ) -> Path:
        """
        Export candidates to JSON for manual review.

        Args:
            path: Output path (default: output/candidates.json)
            include_threshold_in_name: Add threshold to filename for comparison

        Returns:
            Path to exported file
        """
        if path:
            path = Path(path)
        elif include_threshold_in_name:
            path = OUTPUT_DIR / f"candidates_t{self.similarity_threshold:.2f}.json"
        else:
            path = OUTPUT_DIR / "candidates.json"

        # NOTE: recomputes the full expansion on every call (no caching).
        results = self.expand_all_exclusive()

        export_data = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "similarity_threshold": self.similarity_threshold,
                "min_frequency": self.min_frequency,
                "total_candidates": sum(len(c) for c in results.values()),
            },
            "categories": {},
        }

        for category, candidates in sorted(results.items()):
            export_data["categories"][category] = [c.to_dict() for c in candidates]

        with open(path, "w", encoding="utf-8") as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Exported candidates to {path}")
        return path

    def generate_keywords_py(
        self,
        output_path: Path | str | None = None,
        auto_approve_threshold: float | None = None,
    ) -> Path:
        """
        Generate new keywords.py with expanded dictionary.

        Words with score >= auto_approve_threshold are added directly.
        Words below threshold are added as comments for manual review.

        Args:
            output_path: Output path (default: output/keywords_expanded.py)
            auto_approve_threshold: Score threshold for auto-approval

        Returns:
            Path to generated file
        """
        output_path = Path(output_path) if output_path else OUTPUT_DIR / "keywords_expanded.py"
        auto_approve = auto_approve_threshold or SETTINGS["auto_approve_threshold"]

        # NOTE: recomputes the full expansion on every call (no caching).
        results = self.expand_all_exclusive()

        # Emit a syntactically valid Python module line by line.
        lines = [
            '"""',
            "Expanded keyword dictionary for game review topic detection.",
            f"Generated: {datetime.now().isoformat()}",
            f"Auto-approve threshold: {auto_approve}",
            '"""',
            "",
            "TOPIC_KEYWORDS = {",
        ]

        for category, seeds in self.existing.items():
            lines.append(f'    "{category}": [')

            # Existing keywords
            lines.append("        # Existing")
            for seed in seeds:
                lines.append(f'        "{seed}",')

            # New candidates
            candidates = results.get(category, [])
            if candidates:
                # Auto-approved
                auto_approved = [c for c in candidates if c.score >= auto_approve]
                if auto_approved:
                    lines.append(f"        # NEW (auto-approved, score >= {auto_approve})")
                    for c in auto_approved:
                        word_display = c.word.replace("_", " ")
                        lines.append(f'        "{word_display}",  # score={c.score:.2f}')

                # Candidates requiring review
                review_needed = [c for c in candidates if c.score < auto_approve]
                if review_needed:
                    lines.append(f"        # CANDIDATES (score < {auto_approve}, require review)")
                    for c in review_needed:
                        word_display = c.word.replace("_", " ")
                        lines.append(f'        # "{word_display}",  # score={c.score:.2f}')

            lines.append("    ],")
            lines.append("")

        lines.append("}")
        lines.append("")

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))

        logger.info(f"Generated keywords file at {output_path}")
        return output_path

    def get_expansion_stats(self) -> dict:
        """Get statistics about the expansion."""
        # NOTE: recomputes the full expansion on every call (no caching).
        results = self.expand_all_exclusive()
        auto_threshold = SETTINGS["auto_approve_threshold"]

        stats = {
            "total_candidates": 0,
            "auto_approved": 0,
            "needs_review": 0,
            "by_category": {},
        }

        for category, candidates in results.items():
            auto = sum(1 for c in candidates if c.score >= auto_threshold)
            review = len(candidates) - auto

            stats["by_category"][category] = {
                "total": len(candidates),
                "auto_approved": auto,
                "needs_review": review,
            }
            stats["total_candidates"] += len(candidates)
            stats["auto_approved"] += auto
            stats["needs_review"] += review

        return stats
scripts/expand_keywords/fetcher.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Review fetcher with rate limiting and progress tracking.
3
+
4
+ Downloads reviews from Steam API with:
5
+ - Cursor-based pagination
6
+ - Sleep between requests to respect rate limits
7
+ - Progress persistence (JSONL per game + progress.json)
8
+ - Resume capability
9
+ """
10
+
11
+ import asyncio
12
+ import json
13
+ import logging
14
+ from dataclasses import dataclass, field
15
+ from datetime import datetime
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import httpx
20
+
21
+ from .config import GAMES, REVIEWS_DIR, SETTINGS, STEAM_REVIEW_LANGUAGE, STEAM_REVIEWS_API
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
@dataclass
class FetchProgress:
    """Tracks review-download progress for a single game."""
    app_id: str
    name: str
    # Number of reviews we aim to collect for this game.
    target: int
    # Reviews collected so far.
    fetched: int = 0
    # Steam pagination cursor; "*" means "start from the beginning".
    cursor: str = "*"
    completed: bool = False
    # ISO timestamp string of the last progress write ("" = never).
    last_updated: str = ""

    def to_dict(self) -> dict:
        """Serialize to a plain dict for JSON persistence."""
        return {
            key: getattr(self, key)
            for key in (
                "app_id",
                "name",
                "target",
                "fetched",
                "cursor",
                "completed",
                "last_updated",
            )
        }

    @classmethod
    def from_dict(cls, data: dict) -> "FetchProgress":
        """Rebuild from a dict, tolerating missing optional fields."""
        optional = {"fetched": 0, "cursor": "*", "completed": False, "last_updated": ""}
        extras = {key: data.get(key, default) for key, default in optional.items()}
        return cls(data["app_id"], data["name"], data["target"], **extras)
59
+
60
+
61
@dataclass
class ReviewFetcher:
    """
    Fetches reviews from Steam with rate limiting.

    Features:
    - Async HTTP client with timeout
    - Exponential backoff on rate limiting
    - Progress persistence (resume capability)
    - JSONL output per game
    """

    # Per-request HTTP timeout in seconds.
    timeout: float = 30.0
    # Where per-game progress is persisted between runs.
    progress_file: Path = field(default_factory=lambda: REVIEWS_DIR / "progress.json")

    def __post_init__(self):
        # In-memory progress keyed by app_id, hydrated from disk when present.
        self._progress: dict[str, FetchProgress] = {}
        self._load_progress()

    def _load_progress(self) -> None:
        """Load progress from file if exists; a corrupt file resets progress."""
        if self.progress_file.exists():
            try:
                with open(self.progress_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                for app_id, progress_data in data.items():
                    self._progress[app_id] = FetchProgress.from_dict(progress_data)
                logger.info(f"Loaded progress for {len(self._progress)} games")
            except (json.JSONDecodeError, KeyError) as e:
                # Don't abort the run on a damaged progress file; start fresh.
                logger.warning(f"Failed to load progress: {e}")
                self._progress = {}

    def _save_progress(self) -> None:
        """Save progress to file (pretty-printed JSON keyed by app_id)."""
        data = {app_id: prog.to_dict() for app_id, prog in self._progress.items()}
        with open(self.progress_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def get_progress(self) -> dict[str, dict]:
        """Get current progress for all games as plain dicts."""
        return {app_id: prog.to_dict() for app_id, prog in self._progress.items()}

    def _get_reviews_file(self, app_id: str) -> Path:
        """Get path to reviews JSONL file for a game."""
        return REVIEWS_DIR / f"{app_id}.jsonl"

    def _append_reviews(self, app_id: str, reviews: list[str]) -> None:
        """Append reviews to the game's JSONL file, one JSON object per line."""
        reviews_file = self._get_reviews_file(app_id)
        with open(reviews_file, "a", encoding="utf-8") as f:
            for review in reviews:
                f.write(json.dumps({"text": review}, ensure_ascii=False) + "\n")

    def load_reviews(self, app_id: str) -> list[str]:
        """Load reviews from JSONL file; malformed lines are skipped."""
        reviews_file = self._get_reviews_file(app_id)
        if not reviews_file.exists():
            return []

        reviews = []
        with open(reviews_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                    reviews.append(data["text"])
                except (json.JSONDecodeError, KeyError):
                    continue
        return reviews

    def load_all_reviews(self) -> list[str]:
        """Load all reviews from all downloaded games (concatenated)."""
        all_reviews = []
        for app_id, _, _ in GAMES:
            reviews = self.load_reviews(app_id)
            all_reviews.extend(reviews)
        logger.info(f"Loaded {len(all_reviews)} total reviews")
        return all_reviews

    async def _fetch_batch(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        cursor: str,
        batch_size: int,
    ) -> tuple[list[str], str | None]:
        """Fetch a single batch of reviews.

        Returns (reviews, next_cursor). ([], None) signals an HTTP error or an
        unsuccessful API response.
        """
        url = STEAM_REVIEWS_API.format(app_id=app_id)
        params: dict[str, Any] = {
            "json": "1",
            "filter": "recent",  # "recent" has more reviews available than "all"
            "review_type": "all",
            "language": STEAM_REVIEW_LANGUAGE,
            "num_per_page": str(batch_size),
            "cursor": cursor,
            "purchase_type": "all",
        }

        try:
            response = await client.get(url, params=params)
            response.raise_for_status()
            data = response.json()
        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching reviews for {app_id}: {e}")
            return [], None

        if not data.get("success"):
            logger.warning(f"API returned success=false for {app_id}")
            return [], None

        reviews_data = data.get("reviews", [])
        min_length = SETTINGS["min_review_length"]

        # Keep only non-empty review texts at or above the configured minimum length.
        reviews = [
            review.get("review", "").strip()
            for review in reviews_data
            if review.get("review") and len(review.get("review", "").strip()) >= min_length
        ]

        new_cursor = data.get("cursor")
        return reviews, new_cursor

    async def _fetch_with_backoff(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        cursor: str,
        batch_size: int,
    ) -> tuple[list[str], str | None]:
        """Fetch a batch, retrying with exponential backoff on empty responses.

        An empty batch that still carries a cursor is treated as possible rate
        limiting and retried up to SETTINGS["max_retries"] times.
        """
        max_retries = SETTINGS["max_retries"]
        base_delay = SETTINGS["retry_base_delay"]

        for attempt in range(max_retries):
            reviews, new_cursor = await self._fetch_batch(client, app_id, cursor, batch_size)

            # Success, or a definitive failure/end-of-data (cursor is None).
            if reviews or new_cursor is None:
                return reviews, new_cursor

            # Empty reviews with cursor - might be rate limited
            delay = base_delay * (2 ** attempt)
            logger.warning(f"Empty response, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
            await asyncio.sleep(delay)

        return [], None

    async def fetch_game_reviews(
        self,
        app_id: str,
        name: str,
        target: int,
        resume: bool = True,
    ) -> int:
        """
        Fetch reviews for a single game.

        Args:
            app_id: Steam app id.
            name: Human-readable game name (used in logs).
            target: Number of reviews to aim for (may overshoot by one batch).
            resume: Continue from persisted progress instead of starting over.

        Returns number of reviews fetched.
        """
        # Check if already completed
        if resume and app_id in self._progress:
            progress = self._progress[app_id]
            if progress.completed:
                logger.info(f"[{name}] Already completed ({progress.fetched} reviews)")
                return progress.fetched
            cursor = progress.cursor
            fetched = progress.fetched
        else:
            # Start fresh - clear existing file
            reviews_file = self._get_reviews_file(app_id)
            if reviews_file.exists():
                reviews_file.unlink()
            cursor = "*"
            fetched = 0

        # Initialize progress
        self._progress[app_id] = FetchProgress(
            app_id=app_id,
            name=name,
            target=target,
            fetched=fetched,
            cursor=cursor,
        )

        batch_size = SETTINGS["batch_size"]
        sleep_between = SETTINGS["sleep_between_batches"]
        # Cursors already seen in this run; used to break pagination loops.
        seen_cursors: set[str] = set()

        logger.info(f"[{name}] Starting fetch: target={target}, already={fetched}")

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            while fetched < target:
                reviews, new_cursor = await self._fetch_with_backoff(
                    client, app_id, cursor, batch_size
                )

                if not reviews:
                    logger.warning(f"[{name}] No more reviews available")
                    break

                if new_cursor and new_cursor in seen_cursors:
                    logger.warning(f"[{name}] Cursor loop detected")
                    break

                if new_cursor:
                    seen_cursors.add(new_cursor)

                # Save reviews
                self._append_reviews(app_id, reviews)
                fetched += len(reviews)

                # Update progress (persisted after every batch so runs can resume).
                self._progress[app_id].fetched = fetched
                self._progress[app_id].cursor = new_cursor or cursor
                # Local time; used only as a human-readable stamp.
                self._progress[app_id].last_updated = datetime.now().isoformat()
                self._save_progress()

                logger.info(f"[{name}] Fetched {fetched}/{target} reviews")

                # "*" means the API wrapped back to the start; stop either way.
                if not new_cursor or new_cursor == "*":
                    break

                cursor = new_cursor
                await asyncio.sleep(sleep_between)

        # Mark as completed
        self._progress[app_id].completed = True
        self._progress[app_id].last_updated = datetime.now().isoformat()
        self._save_progress()

        logger.info(f"[{name}] Completed with {fetched} reviews")
        return fetched

    async def fetch_all(
        self,
        resume: bool = True,
        limit_games: int | None = None,
    ) -> dict[str, int]:
        """
        Fetch reviews for all configured games.

        Args:
            resume: Continue from previous progress
            limit_games: Limit number of games (for testing)

        Returns:
            Dict mapping app_id to number of reviews fetched
        """
        results: dict[str, int] = {}
        sleep_between_games = SETTINGS["sleep_between_games"]
        reviews_per_game = SETTINGS["reviews_per_game"]

        games = GAMES[:limit_games] if limit_games else GAMES

        for i, (app_id, name, genre) in enumerate(games):
            logger.info(f"Processing game {i + 1}/{len(games)}: {name} ({genre})")

            count = await self.fetch_game_reviews(
                app_id=app_id,
                name=name,
                target=reviews_per_game,
                resume=resume,
            )
            results[app_id] = count

            # Sleep between games (except for last one)
            if i < len(games) - 1:
                logger.info(f"Sleeping {sleep_between_games}s before next game...")
                await asyncio.sleep(sleep_between_games)

        total = sum(results.values())
        logger.info(f"Total reviews fetched: {total}")
        return results

    def get_stats(self) -> dict:
        """Get statistics about fetched reviews (counts read from the JSONL files)."""
        stats = {
            "games_total": len(GAMES),
            "games_completed": 0,
            "games_in_progress": 0,
            "reviews_total": 0,
            "reviews_per_game": {},
        }

        for app_id, name, _ in GAMES:
            reviews_file = self._get_reviews_file(app_id)
            if reviews_file.exists():
                # Count lines with a context manager so the handle is closed
                # deterministically (previously the file was opened inline in a
                # generator expression and never explicitly closed).
                with open(reviews_file, "r", encoding="utf-8") as f:
                    count = sum(1 for _ in f)
                stats["reviews_per_game"][name] = count
                stats["reviews_total"] += count

                if app_id in self._progress and self._progress[app_id].completed:
                    stats["games_completed"] += 1
                else:
                    stats["games_in_progress"] += 1

        return stats
scripts/expand_keywords/keywords_base.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Keyword lists for detecting topics in game reviews.
Used by the hybrid approach (Keywords + ML Sentiment).

The categories were chosen based on the most common topics
raised in game reviews on the Steam platform.
"""

TOPIC_KEYWORDS = {
    # =========================================================================
    # CORE GAMEPLAY
    # =========================================================================
    "Gameplay": [
        # Basics
        "gameplay", "mechanics", "game mechanics", "core gameplay", "game loop",
        "combat", "combat system", "fighting", "battle", "battles",
        # Progression
        "progression", "leveling", "level up", "experience", "xp", "grind", "grinding",
        "skill tree", "talent tree", "unlock", "unlocks", "unlockables",
        # Quests and activities
        "quests", "quest", "missions", "mission", "objectives", "side quests",
        "main quest", "fetch quests", "puzzles", "puzzle", "exploration",
        # Design
        "game design", "level design", "map design", "pacing",
        "balancing", "balanced", "unbalanced", "overpowered", "underpowered", "meta",
        # Enemies
        "enemies", "enemy", "bosses", "boss fights", "boss battle", "mobs",
        # Movement and abilities
        "movement", "traversal", "parkour", "skills", "abilities", "powers",
        "spells", "weapons", "weapon variety", "builds", "build variety",
    ],

    "Fun": [
        # Positive
        "fun", "enjoyable", "entertaining", "addictive", "addicting", "engaging",
        "exciting", "thrilling", "satisfying", "rewarding", "immersive",
        "masterpiece", "gem", "hidden gem", "must play", "must buy",
        # Negative
        "boring", "tedious", "repetitive", "monotonous", "dull", "bland",
        "frustrating", "annoying", "unfun", "not fun", "waste of time",
        "disappointing", "letdown", "overhyped", "overrated", "underrated",
    ],

    "Difficulty": [
        # Difficulty levels
        "difficulty", "easy", "normal", "hard", "very hard", "nightmare",
        "easy mode", "hard mode", "difficulty settings", "difficulty options",
        # Difficulty descriptors
        "challenging", "too easy", "too hard", "too difficult", "punishing",
        "forgiving", "casual", "hardcore", "souls-like", "soulslike",
        "dark souls", "die a lot", "dying", "deaths", "unfair", "cheap deaths",
        # Difficulty curve
        "learning curve", "steep learning curve", "skill ceiling", "skill floor",
        "newcomer friendly", "beginner friendly", "accessible",
    ],

    # =========================================================================
    # TECHNICAL
    # =========================================================================
    "Performance": [
        # Performance
        "performance", "optimize", "optimized", "optimization", "well optimized",
        "poorly optimized", "unoptimized", "runs well", "runs smooth", "runs poorly",
        # FPS
        "fps", "framerate", "frame rate", "frames per second", "60fps", "30fps",
        "fps drops", "frame drops", "drops", "dips", "stuttering", "stutter",
        "hitching", "micro stutter",
        # Resources
        "cpu", "gpu", "ram", "vram", "memory", "memory leak", "memory usage",
        # Loading
        "loading", "loading times", "load times", "loading screens", "long loading",
        # Stability
        "smooth", "stable", "unstable", "lag", "lagging", "input lag",
    ],

    "Bugs": [
        # General
        "bugs", "bug", "buggy", "glitch", "glitches", "glitchy",
        "broken", "issues", "problems", "jank", "janky",
        # Crashes
        "crash", "crashes", "crashing", "crashed", "freeze", "freezing", "frozen",
        "ctd", "crash to desktop", "black screen", "stuck",
        # Specific bugs
        "save bug", "save corruption", "corrupted save", "lost progress",
        "clipping", "falling through", "invisible", "t-pose",
        "softlock", "soft lock", "softlocked", "game breaking",
        # Game state
        "unplayable", "unfinished", "early access", "beta", "alpha",
        "needs polish", "polished", "rough edges",
    ],

    # =========================================================================
    # AUDIO-VISUAL
    # =========================================================================
    "Graphics": [
        # General
        "graphics", "visuals", "visual", "graphic", "graphically",
        "looks", "look", "looking", "looks good", "looks bad", "looks great",
        # Style
        "art style", "art direction", "artstyle", "aesthetic", "stylized",
        "realistic", "photorealistic", "cartoony", "anime", "pixel art", "retro",
        # Technical
        "textures", "texture", "models", "model", "animations", "animation",
        "lighting", "lights", "shadows", "shadow", "reflections", "ray tracing",
        "rendering", "shaders", "particle effects", "particles",
        # Resolution
        "resolution", "4k", "1440p", "1080p", "720p", "upscaling", "dlss", "fsr",
        # Environment
        "environments", "environment", "scenery", "landscapes", "world design",
        "level of detail", "lod", "draw distance", "pop in", "pop-in",
        # Judgements
        "beautiful", "gorgeous", "stunning", "breathtaking", "pretty",
        "ugly", "dated", "outdated", "aged", "old looking",
    ],

    "Sound": [
        # Music
        "music", "soundtrack", "ost", "score", "composer", "tracks",
        "ambient", "ambient music", "battle music", "menu music",
        # Voice
        "voice", "voice acting", "voice actors", "voice over", "vo",
        "voice lines", "dialogue", "dubbed", "dubbing", "lip sync",
        # Sound effects
        "sound", "sounds", "audio", "sfx", "sound effects", "sound design",
        "footsteps", "gunshots", "explosions",
        # Quality
        "atmosphere", "atmospheric", "immersive audio", "spatial audio",
        "surround", "audio quality", "sound quality",
        # Problems
        "audio bug", "audio glitch", "no sound", "sound cutting", "loud", "quiet",
    ],

    # =========================================================================
    # CONTENT & VALUE
    # =========================================================================
    "Content": [
        # Length
        "hours", "hour", "length", "long", "short", "playtime", "play time",
        "how long", "game length", "campaign length",
        # Amount of content
        "content", "lots of content", "lack of content", "thin", "meaty",
        "activities", "things to do", "side content", "endgame", "end game",
        "post game", "new game plus", "ng+",
        # Replayability
        "replay", "replay value", "replayability", "replayable",
        "multiple endings", "different endings", "choices matter",
        "multiple playthroughs", "completionist", "100%", "100 percent",
    ],

    "Monetization": [
        # Price (ex-Price)
        "price", "pricing", "cost", "costs", "priced",
        "expensive", "overpriced", "cheap", "affordable",
        "value", "worth", "worth it", "not worth", "bang for buck",
        "value for money", "money well spent",
        "sale", "discount", "on sale", "full price", "wait for sale",
        "refund", "refunded", "steam sale",
        "aaa price", "indie price", "budget", "premium",
        "free to play", "f2p", "free",
        # MTX (ex-Microtransactions)
        "microtransactions", "microtransaction", "mtx", "monetization",
        "in app purchases", "iap", "real money", "cash shop", "item shop",
        "pay to win", "p2w", "pay2win", "paywall", "pay wall",
        "pay to progress", "paying", "whale", "whales",
        "loot box", "loot boxes", "lootbox", "gacha", "gambling",
        "rng", "random", "chance",
        "battle pass", "season pass", "battlepass", "seasons",
        "premium currency", "gems", "coins", "points",
        "cosmetics", "cosmetic", "skins", "skin", "outfits",
        "dlc", "expansion", "expansions", "dlcs",
        "cash grab", "money grab", "greedy", "predatory", "scam",
    ],

    # =========================================================================
    # MULTIPLAYER & COMMUNITY
    # =========================================================================
    "Multiplayer": [
        # Modes
        "multiplayer", "multi-player", "online", "offline",
        "co-op", "coop", "co op", "cooperative",
        "pvp", "pve", "pvpve", "versus",
        "singleplayer", "single player", "solo", "solo play",
        # Matchmaking
        "matchmaking", "queue", "queue times", "waiting",
        "servers", "server", "dedicated servers", "p2p", "peer to peer",
        "ping", "latency", "connection", "disconnects", "desync",
        # Players
        "players", "teammates", "team", "squad", "party",
        "randoms", "random teammates", "lobbies", "lobby",
        # Problems
        "cheaters", "cheater", "hackers", "hacker", "hacking", "cheating",
        "aimbots", "wallhacks", "anticheat", "anti cheat",
        "toxic", "toxicity", "griefing", "griefers",
    ],

    "Community": [
        # Community
        "community", "playerbase", "player base", "players",
        "active", "dead game", "dead", "alive", "population",
        # Modding
        "mods", "mod", "modding", "mod support", "workshop",
        "steam workshop", "nexus", "modders", "modded",
        "custom content", "user generated",
        # Developers (interaction)
        "devs", "developers", "dev team", "community manager",
        "communication", "transparent", "listening",
        # Player community
        "helpful", "friendly", "toxic community", "welcoming",
        "guides", "wiki", "tutorials", "newbie friendly",
    ],

    # =========================================================================
    # CONTROLS & UI
    # =========================================================================
    "Controls": [
        # Controls
        "controls", "control", "controlling", "control scheme",
        "keybinds", "keybind", "key bindings", "rebind", "remapping",
        # Devices
        "keyboard", "mouse", "kb+m", "kbm",
        "controller", "gamepad", "joystick", "controller support",
        "xbox controller", "ps controller", "dualsense",
        # Responsiveness
        "responsive", "unresponsive", "clunky", "sluggish", "tight",
        "smooth controls", "floaty", "heavy", "weighty",
        # Aiming
        "aiming", "aim", "aim assist", "auto aim",
        "camera", "camera controls", "camera angle",
    ],

    "UI": [
        # Interface
        "ui", "user interface", "interface", "hud",
        "menu", "menus", "main menu", "pause menu",
        "ux", "user experience",
        # UI design
        "clean ui", "cluttered", "minimalist", "intuitive",
        "confusing", "overwhelming", "readable", "readable text",
        # Elements
        "minimap", "map", "inventory", "crafting menu",
        "skill menu", "quest log", "journal",
        # Problems
        "font size", "text size", "too small", "can't read",
        "navigation", "navigating",
    ],

    # =========================================================================
    # STORY & NARRATIVE
    # =========================================================================
    "Story": [
        # Narrative
        "story", "storyline", "plot", "narrative", "storytelling",
        "writing", "written", "well written", "poorly written",
        # Plot elements
        "characters", "character", "protagonist", "main character",
        "villain", "antagonist", "npcs", "npc", "companions",
        "dialogue", "dialogues", "conversations", "choices",
        # World
        "lore", "world building", "worldbuilding", "universe",
        "setting", "backstory", "history",
        # Emotions
        "emotional", "emotions", "feels", "touching", "heartwarming",
        "dark", "mature", "gritty", "lighthearted",
        # Ending
        "ending", "endings", "conclusion", "finale",
        "twist", "plot twist", "predictable", "unpredictable",
        # Cutscenes
        "cutscenes", "cutscene", "cinematics", "cinematic",
        "script", "scripted", "linear", "open ended",
    ],

    # =========================================================================
    # DEVELOPER SUPPORT
    # =========================================================================
    "Support": [
        # Updates
        "updates", "update", "patch", "patches", "patched",
        "hotfix", "hotfixes", "bug fixes", "fixed",
        # Development status
        "abandoned", "dead", "no updates", "still updating",
        "active development", "roadmap", "planned",
        "early access", "full release", "1.0", "launch",
        # Developers
        "developer", "developers", "dev", "devs", "studio",
        "indie dev", "indie developer", "aaa developer",
        # Support
        "support", "customer support", "response", "feedback",
        "listening to feedback", "ignoring", "communication",
        # Ports
        "port", "ported", "console port", "pc port", "lazy port",
    ],

    # =========================================================================
    # PREDICTION & INTENT (NEW!)
    # =========================================================================
    "Retention": [
        # Positive (high retention)
        "addictive", "addicted", "can't stop playing", "hooked", "drug",
        "thousands of hours", "hundreds of hours", "worth it", "worth every penny",
        "buy it", "must buy", "highly recommend", "masterpiece", "goty",
        "game of the year", "10/10", "best game", "favorite game",
        # Negative (churn)
        "refund", "refunded", "refunding", "uninstalled", "uninstall", "delete",
        "waste of money", "waste of time", "don't buy", "do not buy",
        "regret", "regretting", "boring", "bored", "sleep", "sleepy",
        "wait for sale", "not worth it", "cash grab", "scam",
    ],
}

# =============================================================================
# EXCLUSIONS (context-aware filtering)
# =============================================================================
# Exclusion words - if one of them appears near a keyword, that keyword is
# ignored in the given context.
# Format: "keyword": ["nearby_word", "another_word"]

EXCLUSIONS = {
    # "fps" as a genre (FPS shooter) vs performance (60 fps)
    "fps": ["genre", "shooter", "first person", "fps game", "fps genre"],
    # "free" as in no-cost vs the "free to play" business model
    "free": ["drm free", "bug free", "free roam", "free world"],
    # "control" as input controls vs "controls" in a narrative sense
    "control": ["mind control", "control the world", "control freak"],
}
scripts/expand_keywords/main.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CLI for keyword expansion toolkit.
3
+
4
+ Usage:
5
+ # Fetch reviews from Steam (can be resumed)
6
+ python -m scripts.expand_keywords fetch --resume
7
+
8
+ # Train FastText model
9
+ python -m scripts.expand_keywords train
10
+
11
+ # Expand dictionary and export candidates
12
+ python -m scripts.expand_keywords expand --threshold 0.55
13
+
14
+ # Generate new keywords.py
15
+ python -m scripts.expand_keywords generate --auto-approve 0.7
16
+
17
+ # Run all steps
18
+ python -m scripts.expand_keywords run --resume
19
+
20
+ # Show statistics
21
+ python -m scripts.expand_keywords stats
22
+ """
23
+
24
+ import argparse
25
+ import asyncio
26
+ import logging
27
+ import sys
28
+ from pathlib import Path
29
+
30
+ # Add project root to path for imports
31
+ PROJECT_ROOT = Path(__file__).parent.parent.parent
32
+ sys.path.insert(0, str(PROJECT_ROOT))
33
+
34
+ from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS
35
+ from scripts.expand_keywords.expander import KeywordExpander
36
+ from scripts.expand_keywords.fetcher import ReviewFetcher
37
+ from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords
38
+ from scripts.expand_keywords.trainer import FastTextTrainer
39
+
40
+ # Configure logging
41
+ logging.basicConfig(
42
+ level=logging.INFO,
43
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
44
+ datefmt="%Y-%m-%d %H:%M:%S",
45
+ )
46
+ logger = logging.getLogger(__name__)
47
+
48
+
49
def load_existing_keywords() -> dict[str, list[str]]:
    """Read the production TOPIC_KEYWORDS dict out of keywords.py.

    Raises:
        FileNotFoundError: keywords.py is missing.
        ValueError: the module defines no (non-empty) TOPIC_KEYWORDS.
    """
    path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"
    if not path.exists():
        raise FileNotFoundError(f"Keywords file not found: {path}")

    # NOTE(review): exec of a repo-local file is acceptable here (trusted
    # input), but never point this at untrusted content.
    module_globals: dict = {}
    exec(path.read_text(encoding="utf-8"), module_globals)

    topic_keywords = module_globals.get("TOPIC_KEYWORDS")
    if not topic_keywords:
        raise ValueError("TOPIC_KEYWORDS not found in keywords.py")
    return topic_keywords
65
+
66
+
67
async def cmd_fetch(args: argparse.Namespace) -> None:
    """Download Steam reviews for the configured games (resumable)."""
    logger.info("Starting review fetch...")
    fetcher = ReviewFetcher()

    # Report what earlier runs already persisted before doing any work.
    snapshot = fetcher.get_stats()
    logger.info(f"Current stats: {snapshot['reviews_total']} reviews from {snapshot['games_completed']} games")

    await fetcher.fetch_all(resume=args.resume, limit_games=args.limit)

    # Re-read stats so the summary reflects this run's downloads.
    snapshot = fetcher.get_stats()
    logger.info(f"Final stats: {snapshot['reviews_total']} reviews from {snapshot['games_completed']} games")
86
+
87
+ def cmd_train(args: argparse.Namespace) -> None:
88
+ """Train FastText model."""
89
+ logger.info("Starting model training...")
90
+
91
+ # Load existing keywords for frozen n-grams
92
+ keywords = load_existing_keywords()
93
+ existing_ngrams = extract_ngrams_from_keywords(keywords)
94
+ logger.info(f"Loaded {len(existing_ngrams)} n-grams from existing dictionary")
95
+
96
+ # Load reviews
97
+ fetcher = ReviewFetcher()
98
+ reviews = fetcher.load_all_reviews()
99
+
100
+ if not reviews:
101
+ logger.error("No reviews found. Run 'fetch' first.")
102
+ return
103
+
104
+ logger.info(f"Loaded {len(reviews)} reviews")
105
+
106
+ # Preprocess
107
+ preprocessor = Preprocessor(existing_ngrams=existing_ngrams)
108
+ sentences = preprocessor.preprocess_corpus(reviews)
109
+ preprocessor.save()
110
+
111
+ # Train
112
+ trainer = FastTextTrainer()
113
+ trainer.train(sentences)
114
+ trainer.save()
115
+
116
+ logger.info("Training complete!")
117
+
118
+
119
+ def cmd_expand(args: argparse.Namespace) -> None:
120
+ """Expand dictionary and export candidates."""
121
+ logger.info("Starting dictionary expansion...")
122
+
123
+ # Load components
124
+ keywords = load_existing_keywords()
125
+
126
+ preprocessor = Preprocessor()
127
+ try:
128
+ preprocessor.load()
129
+ except FileNotFoundError:
130
+ logger.error("Preprocessor not found. Run 'train' first.")
131
+ return
132
+
133
+ trainer = FastTextTrainer()
134
+ try:
135
+ model = trainer.load()
136
+ except FileNotFoundError:
137
+ logger.error("Model not found. Run 'train' first.")
138
+ return
139
+
140
+ # Expand
141
+ expander = KeywordExpander(
142
+ model=model,
143
+ existing_keywords=keywords,
144
+ word_frequencies=preprocessor.get_word_frequencies(),
145
+ similarity_threshold=args.threshold,
146
+ )
147
+
148
+ # Export candidates (with threshold in filename if requested)
149
+ expander.export_candidates(include_threshold_in_name=args.compare)
150
+
151
+ # Show stats
152
+ stats = expander.get_expansion_stats()
153
+ logger.info(f"Expansion complete: {stats['total_candidates']} candidates")
154
+ logger.info(f" Auto-approved: {stats['auto_approved']}")
155
+ logger.info(f" Needs review: {stats['needs_review']}")
156
+
157
+
158
def cmd_compare(args: argparse.Namespace) -> None:
    """Run the expansion at several thresholds and print a comparison table.

    One candidates file per threshold is written (threshold embedded in the
    filename) so the outputs can be diffed afterwards.
    """
    logger.info("Comparing thresholds...")

    keywords = load_existing_keywords()

    # Both artifacts below are produced by the 'train' step.
    preprocessor = Preprocessor()
    trainer = FastTextTrainer()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    results = []
    for threshold in args.thresholds:
        # Each expander gets its own frequency dict (fresh call per loop).
        expander = KeywordExpander(
            model=model,
            existing_keywords=keywords,
            word_frequencies=preprocessor.get_word_frequencies(),
            similarity_threshold=threshold,
        )
        expander.export_candidates(include_threshold_in_name=True)
        results.append((threshold, expander.get_expansion_stats()))

    # Summary table across all thresholds.
    print("\n" + "=" * 60)
    print("THRESHOLD COMPARISON")
    print("=" * 60)
    print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}")
    print("-" * 60)
    for threshold, stats in results:
        print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}")
    print("-" * 60)
    print(f"\nOutput files saved to: {OUTPUT_DIR}/")
    print("Compare candidates_t*.json to see differences.")
209
+
210
+
211
def cmd_generate(args: argparse.Namespace) -> None:
    """Generate an expanded keywords.py from the trained model.

    Candidates scoring at or above ``--auto-approve`` are included
    automatically; requires the 'train' artifacts.
    """
    logger.info("Generating expanded keywords.py...")

    keywords = load_existing_keywords()

    # Both artifacts below are produced by the 'train' step.
    preprocessor = Preprocessor()
    trainer = FastTextTrainer()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    expander = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
    )
    output_path = expander.generate_keywords_py(auto_approve_threshold=args.auto_approve)

    logger.info(f"Generated: {output_path}")
244
+
245
+
246
async def cmd_run(args: argparse.Namespace) -> None:
    """Run the whole pipeline in order: fetch -> train -> expand -> generate."""
    logger.info("Running complete pipeline...")

    # Fetch is async; the remaining steps are plain synchronous calls.
    await cmd_fetch(args)
    for step in (cmd_train, cmd_expand, cmd_generate):
        step(args)

    logger.info("Pipeline complete!")
263
+
264
+
265
def cmd_stats(args: argparse.Namespace) -> None:
    """Print fetch, model, and expansion statistics (whatever exists on disk)."""
    # Fetch progress is always available (may be all zeros).
    fetch_stats = ReviewFetcher().get_stats()

    print("\n=== Fetch Statistics ===")
    print(f"Games configured: {fetch_stats['games_total']}")
    print(f"Games completed: {fetch_stats['games_completed']}")
    print(f"Games in progress: {fetch_stats['games_in_progress']}")
    print(f"Total reviews: {fetch_stats['reviews_total']}")

    per_game = fetch_stats["reviews_per_game"]
    if per_game:
        print("\nReviews per game:")
        for name, count in sorted(per_game.items()):
            print(f"  {name}: {count}")

    # Model section only when a trained model exists.
    model_path = MODELS_DIR / "fasttext.model"
    if model_path.exists():
        print("\n=== Model Statistics ===")
        model = FastTextTrainer().load()
        print(f"Vocabulary size: {len(model.wv)}")

    # Expansion section only when candidates have been exported.
    candidates_path = OUTPUT_DIR / "candidates.json"
    if candidates_path.exists():
        import json
        with open(candidates_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        print("\n=== Expansion Statistics ===")
        print(f"Total candidates: {data['metadata']['total_candidates']}")
        for cat, cands in data["categories"].items():
            print(f"  {cat}: {len(cands)}")
300
+
301
+
302
def cmd_similar(args: argparse.Namespace) -> None:
    """Find and print words similar to ``args.word`` (model smoke test).

    Prints up to ``args.topn`` (word, cosine similarity) matches, or a
    not-found message when the word is outside the vocabulary.
    """
    trainer = FastTextTrainer()
    try:
        # load() is called for its side effect (populates trainer.model);
        # the previous code bound the return value to an unused local.
        trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    similar = trainer.get_similar(args.word, topn=args.topn)

    if similar:
        print(f"\nWords similar to '{args.word}':")
        for w, sim in similar:
            print(f"  {w}: {sim:.3f}")
    else:
        print(f"Word '{args.word}' not found in vocabulary")
322
+
323
+
324
def main() -> None:
    """CLI entry point: build the argument parser and dispatch the command.

    Improvements over the previous version: sub-parsers that take no
    options are no longer bound to unused locals, and the long if/elif
    chain is replaced by a dispatch table.
    """
    parser = argparse.ArgumentParser(
        description="Keyword expansion toolkit using FastText",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # fetch command
    fetch_parser = subparsers.add_parser("fetch", help="Fetch reviews from Steam")
    fetch_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume from previous progress",
    )
    fetch_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )

    # train command (no options)
    subparsers.add_parser("train", help="Train FastText model")

    # expand command
    expand_parser = subparsers.add_parser("expand", help="Expand dictionary")
    expand_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    expand_parser.add_argument(
        "--compare", "-c",
        action="store_true",
        help="Include threshold in output filename (for comparison)",
    )

    # compare command
    compare_parser = subparsers.add_parser("compare", help="Compare multiple thresholds")
    compare_parser.add_argument(
        "--thresholds", "-t",
        type=float,
        nargs="+",
        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
    )

    # generate command
    generate_parser = subparsers.add_parser("generate", help="Generate keywords.py")
    generate_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # run command (all steps)
    run_parser = subparsers.add_parser("run", help="Run all steps")
    run_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume fetch from previous progress",
    )
    run_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )
    run_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    run_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # stats command (no options)
    subparsers.add_parser("stats", help="Show statistics")

    # similar command (for testing)
    similar_parser = subparsers.add_parser("similar", help="Find similar words")
    similar_parser.add_argument("word", help="Word to find similar words for")
    similar_parser.add_argument(
        "--topn", "-n",
        type=int,
        default=20,
        help="Number of results (default: 20)",
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Dispatch table; async commands get their own event loop via asyncio.run().
    handlers = {
        "fetch": lambda: asyncio.run(cmd_fetch(args)),
        "train": lambda: cmd_train(args),
        "expand": lambda: cmd_expand(args),
        "compare": lambda: cmd_compare(args),
        "generate": lambda: cmd_generate(args),
        "run": lambda: asyncio.run(cmd_run(args)),
        "stats": lambda: cmd_stats(args),
        "similar": lambda: cmd_similar(args),
    }
    handlers[args.command]()


if __name__ == "__main__":
    main()
scripts/expand_keywords/preprocessor.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text preprocessing with n-gram detection using gensim.Phrases.
3
+
4
+ Pipeline:
5
+ 1. Tokenization (jieba for Chinese, regex for English/mixed)
6
+ 2. Build Phrases models (bigrams, trigrams)
7
+ 3. Apply frozen n-grams from existing dictionary
8
+ 4. Apply detected phrases
9
+
10
+ This ensures that multi-word concepts like "帧率" or "加载画面"
11
+ are treated as single tokens during FastText training.
12
+
13
+ For Chinese text:
14
+ - Uses jieba for word segmentation (Chinese has no spaces)
15
+ - Keeps English words intact (common in gaming reviews: fps, bug, dlc)
16
+ - Removes punctuation but preserves Chinese characters
17
+ """
18
+
19
+ import logging
20
+ import pickle
21
+ import re
22
+ from collections import Counter
23
+ from pathlib import Path
24
+
25
+ import jieba
26
+ from gensim.models import Phrases
27
+ from gensim.models.phrases import Phraser
28
+
29
+ from .config import MODELS_DIR, SETTINGS
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class Preprocessor:
    """
    Text preprocessor with n-gram detection.

    Uses gensim Phrases for automatic phrase detection plus
    frozen n-grams from the existing keyword dictionary.

    Typical flow: ``preprocess_corpus(reviews)`` tokenizes, fits the
    bigram/trigram phrase models, applies them, and records word
    frequencies; ``save()``/``load()`` persist the fitted state via pickle.
    """

    def __init__(self, existing_ngrams: list[str] | None = None):
        """
        Initialize preprocessor.

        Args:
            existing_ngrams: Multi-word phrases from existing keywords.py
                (e.g., "frame rate", "loading screen")
        """
        # Frozen n-grams are kept as lowercase token tuples for O(1) lookup.
        self.frozen_ngrams: set[tuple[str, ...]] = set()
        if existing_ngrams:
            self.frozen_ngrams = self._normalize_ngrams(existing_ngrams)
            logger.info(f"Loaded {len(self.frozen_ngrams)} frozen n-grams")

        # Fitted by build_phrase_models() (or restored by load()); None before.
        self.bigram_model: Phraser | None = None
        self.trigram_model: Phraser | None = None
        # Token -> count over the processed corpus, filled by preprocess_corpus().
        self.word_frequencies: Counter = Counter()

    def _normalize_ngrams(self, ngrams: list[str]) -> set[tuple[str, ...]]:
        """Convert n-grams to lowercase tuple format for fast lookup.

        Single-word entries (no space) are silently skipped — only true
        multi-word phrases need freezing.
        """
        result = set()
        for ng in ngrams:
            if " " in ng:
                tokens = tuple(ng.lower().split())
                result.add(tokens)
        return result

    def tokenize(self, text: str) -> list[str]:
        """
        Tokenization for Chinese/mixed text using jieba.

        - Uses jieba for Chinese word segmentation
        - Keeps English words intact (common in gaming: fps, bug, dlc)
        - Removes punctuation (both Chinese and English)
        - Lowercases English text
        """
        # Remove URLs before punctuation stripping tears them into fragments.
        text = re.sub(r'https?://\S+', ' ', text)

        # Keep only CJK ideographs (U+4E00-U+9FFF, U+3400-U+4DBF),
        # ASCII alphanumerics, and whitespace; everything else (incl. Chinese
        # punctuation like 。!?,、;:) becomes a space.
        text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbfa-zA-Z0-9\s]', ' ', text)

        # Lowercase English text (no effect on CJK characters).
        text = text.lower()

        # Use jieba to segment Chinese text; it handles mixed
        # Chinese/English input well.
        tokens = list(jieba.cut(text))

        # Filter: remove empty strings and whitespace-only tokens.
        tokens = [t.strip() for t in tokens if t.strip()]

        return tokens

    def build_phrase_models(
        self,
        corpus: list[list[str]],
        min_count: int | None = None,
        threshold: float | None = None,
    ) -> None:
        """
        Build Phrases models for automatic n-gram detection.

        Args:
            corpus: List of tokenized documents
            min_count: Minimum phrase occurrences (default from settings)
            threshold: Scoring threshold (higher = fewer phrases)
        """
        # NOTE(review): `or` makes an explicit 0 fall back to the SETTINGS
        # default — confirm that is intended.
        min_count = min_count or SETTINGS["phrase_min_count"]
        threshold = threshold or SETTINGS["phrase_threshold"]

        logger.info(f"Building phrase models (min_count={min_count}, threshold={threshold})")

        # Build bigram model: "frame rate" -> "frame_rate"
        bigram_phrases = Phrases(
            corpus,
            min_count=min_count,
            threshold=threshold,
            delimiter="_",
        )
        # Phraser is the frozen, memory-light form of a fitted Phrases model.
        self.bigram_model = Phraser(bigram_phrases)

        # Apply bigrams to create input for trigram detection.
        bigram_corpus = [self.bigram_model[doc] for doc in corpus]

        # Build trigram model: "dark_souls like" -> "dark_souls_like"
        trigram_phrases = Phrases(
            bigram_corpus,
            min_count=min_count,
            threshold=threshold,
            delimiter="_",
        )
        self.trigram_model = Phraser(trigram_phrases)

        # Log detected phrases
        bigram_count = len(bigram_phrases.export_phrases())
        trigram_count = len(trigram_phrases.export_phrases())
        logger.info(f"Detected {bigram_count} bigrams, {trigram_count} trigrams")

    def _apply_frozen_ngrams(self, tokens: list[str]) -> list[str]:
        """
        Apply frozen n-grams from existing dictionary.

        These are always joined, even if not detected by Phrases.
        Greedy left-to-right scan: trigram matches take precedence over
        bigram matches at the same position; matches never overlap.
        """
        result = []
        i = 0

        while i < len(tokens):
            matched = False

            # Try trigrams first (longer matches preferred)
            if i + 2 < len(tokens):
                trigram = (tokens[i], tokens[i + 1], tokens[i + 2])
                if trigram in self.frozen_ngrams:
                    result.append("_".join(trigram))
                    i += 3
                    matched = True

            # Try bigrams
            if not matched and i + 1 < len(tokens):
                bigram = (tokens[i], tokens[i + 1])
                if bigram in self.frozen_ngrams:
                    result.append("_".join(bigram))
                    i += 2
                    matched = True

            if not matched:
                result.append(tokens[i])
                i += 1

        return result

    def apply_phrases(self, tokens: list[str]) -> list[str]:
        """
        Apply phrase models and frozen n-grams to tokens.

        Order:
        1. Frozen n-grams (from existing dictionary)
        2. Automatic Phrases (bigrams then trigrams)
        """
        # Apply frozen n-grams first so dictionary phrases win over
        # automatically detected ones.
        tokens = self._apply_frozen_ngrams(tokens)

        # Apply automatic phrase models (no-ops when models are not fitted).
        if self.bigram_model:
            tokens = list(self.bigram_model[tokens])
        if self.trigram_model:
            tokens = list(self.trigram_model[tokens])

        return tokens

    def preprocess_corpus(
        self,
        reviews: list[str],
        build_phrases: bool = True,
    ) -> list[list[str]]:
        """
        Full preprocessing pipeline.

        Args:
            reviews: Raw review texts
            build_phrases: Whether to build phrase models (skip if loading)

        Returns:
            List of tokenized documents with phrases applied
        """
        logger.info(f"Preprocessing {len(reviews)} reviews...")

        # Step 1: Tokenize all reviews
        tokenized = [self.tokenize(review) for review in reviews]
        logger.info("Tokenization complete")

        # Step 2: Build phrase models (skipped when reusing loaded models)
        if build_phrases:
            self.build_phrase_models(tokenized)

        # Step 3: Apply phrases and count frequencies
        processed = []
        for tokens in tokenized:
            phrased = self.apply_phrases(tokens)
            processed.append(phrased)
            self.word_frequencies.update(phrased)

        logger.info(f"Vocabulary size: {len(self.word_frequencies)}")
        return processed

    def get_word_frequencies(self) -> dict[str, int]:
        """Get word frequency dictionary (a fresh plain-dict copy)."""
        return dict(self.word_frequencies)

    def save(self, path: Path | None = None) -> None:
        """Save preprocessor state (phrase models, frequencies) as a pickle."""
        path = path or MODELS_DIR / "preprocessor.pkl"

        data = {
            "frozen_ngrams": self.frozen_ngrams,
            "bigram_model": self.bigram_model,
            "trigram_model": self.trigram_model,
            "word_frequencies": self.word_frequencies,
        }

        with open(path, "wb") as f:
            pickle.dump(data, f)

        logger.info(f"Saved preprocessor to {path}")

    def load(self, path: Path | None = None) -> None:
        """Load preprocessor state previously written by :meth:`save`.

        Raises:
            FileNotFoundError: If no pickle exists at ``path``.
        """
        path = path or MODELS_DIR / "preprocessor.pkl"

        if not path.exists():
            raise FileNotFoundError(f"Preprocessor not found at {path}")

        # NOTE(review): pickle.load can execute arbitrary code from the
        # file — only load artifacts produced locally by save().
        with open(path, "rb") as f:
            data = pickle.load(f)

        self.frozen_ngrams = data["frozen_ngrams"]
        self.bigram_model = data["bigram_model"]
        self.trigram_model = data["trigram_model"]
        self.word_frequencies = data["word_frequencies"]

        logger.info(f"Loaded preprocessor from {path}")
265
+
266
+
267
def extract_ngrams_from_keywords(keywords: dict[str, list[str]]) -> list[str]:
    """
    Extract multi-word phrases from keywords dictionary.

    A phrase is any entry containing a space; order follows the
    dictionary's category order, then each category's word order.

    Args:
        keywords: TOPIC_KEYWORDS dictionary from keywords.py

    Returns:
        List of multi-word phrases (e.g., ["frame rate", "loading screen"])
    """
    return [
        word
        for category_words in keywords.values()
        for word in category_words
        if " " in word
    ]
scripts/expand_keywords/trainer.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastText model training.
3
+
4
+ FastText is preferred over Word2Vec because:
5
+ - Better handling of typos and misspellings (common in reviews)
6
+ - Can generate vectors for out-of-vocabulary words
7
+ - Uses character n-grams internally
8
+ """
9
+
10
+ import logging
11
+ from pathlib import Path
12
+
13
+ from gensim.models import FastText
14
+
15
+ from .config import MODELS_DIR, SETTINGS
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class FastTextTrainer:
    """
    Trains FastText word embeddings on review corpus.

    FastText is used (rather than Word2Vec) for its character-n-gram
    subword handling, which copes with typos and out-of-vocabulary words.
    """

    def __init__(
        self,
        vector_size: int | None = None,
        window: int | None = None,
        min_count: int | None = None,
        epochs: int | None = None,
        workers: int | None = None,
    ):
        """
        Initialize trainer with hyperparameters.

        Any argument left as None falls back to its SETTINGS value.

        Args:
            vector_size: Dimensionality of word vectors
            window: Context window size
            min_count: Minimum word frequency
            epochs: Number of training iterations
            workers: Number of worker threads
        """
        self.vector_size = vector_size or SETTINGS["fasttext_vector_size"]
        self.window = window or SETTINGS["fasttext_window"]
        self.min_count = min_count or SETTINGS["fasttext_min_count"]
        self.epochs = epochs or SETTINGS["fasttext_epochs"]
        self.workers = workers or SETTINGS["fasttext_workers"]

        # Set by train() or load(); query methods require it to be non-None.
        self.model: FastText | None = None

    @staticmethod
    def _as_token(word: str) -> str:
        """Normalize a query word to vocabulary form: lowercase, spaces -> underscores."""
        return word.lower().replace(" ", "_")

    def train(self, sentences: list[list[str]]) -> FastText:
        """
        Train FastText model on tokenized sentences.

        Args:
            sentences: List of tokenized documents (output from preprocessor)

        Returns:
            Trained FastText model
        """
        logger.info(
            f"Training FastText model: "
            f"vector_size={self.vector_size}, window={self.window}, "
            f"min_count={self.min_count}, epochs={self.epochs}"
        )
        logger.info(f"Training on {len(sentences)} documents")

        self.model = FastText(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
            workers=self.workers,
            sg=1,  # Skip-gram (better for semantic similarity)
            min_n=3,  # Minimum character n-gram length
            max_n=6,  # Maximum character n-gram length
        )

        logger.info(f"Training complete. Vocabulary size: {len(self.model.wv)}")
        return self.model

    def save(self, path: Path | str | None = None) -> Path:
        """
        Save trained model.

        Args:
            path: Save path (default: models/fasttext.model)

        Returns:
            Path where model was saved

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model to save. Train first.")

        target = Path(path) if path else MODELS_DIR / "fasttext.model"
        self.model.save(str(target))
        logger.info(f"Saved model to {target}")
        return target

    def load(self, path: Path | str | None = None) -> FastText:
        """
        Load model from file.

        Args:
            path: Model path (default: models/fasttext.model)

        Returns:
            Loaded FastText model

        Raises:
            FileNotFoundError: If no model file exists at ``path``.
        """
        target = Path(path) if path else MODELS_DIR / "fasttext.model"
        if not target.exists():
            raise FileNotFoundError(f"Model not found at {target}")

        self.model = FastText.load(str(target))
        logger.info(f"Loaded model from {target}. Vocabulary size: {len(self.model.wv)}")
        return self.model

    def get_similar(
        self,
        word: str,
        topn: int = 10,
    ) -> list[tuple[str, float]]:
        """
        Get most similar words to a given word.

        Args:
            word: Query word
            topn: Number of results

        Returns:
            List of (word, similarity) tuples; empty on KeyError.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        try:
            return self.model.wv.most_similar(self._as_token(word), topn=topn)
        except KeyError:
            logger.warning(f"Word '{word}' not in vocabulary")
            return []

    def get_similarity(self, word1: str, word2: str) -> float:
        """
        Get similarity between two words.

        Args:
            word1: First word
            word2: Second word

        Returns:
            Cosine similarity (-1 to 1); 0.0 on KeyError.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        try:
            return float(
                self.model.wv.similarity(self._as_token(word1), self._as_token(word2))
            )
        except KeyError as e:
            logger.warning(f"Word not in vocabulary: {e}")
            return 0.0

    def word_in_vocab(self, word: str) -> bool:
        """Check if word is in vocabulary (False when no model is loaded)."""
        return self.model is not None and self._as_token(word) in self.model.wv

    def get_vocab_words(self) -> list[str]:
        """Get all words in vocabulary (empty when no model is loaded)."""
        if self.model is None:
            return []
        return list(self.model.wv.key_to_index)