Spaces:
Running
Running
GitHub Action commited on
Commit ·
8ff1b66
0
Parent(s):
deploy: worker release from GitHub
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff.
- .dockerignore +23 -0
- Dockerfile +47 -0
- README.md +14 -0
- backend/.env.example +38 -0
- backend/app/__init__.py +8 -0
- backend/app/core/__init__.py +1 -0
- backend/app/core/config.py +174 -0
- backend/app/core/freshness.py +71 -0
- backend/app/core/jieba_userdict.txt +14 -0
- backend/app/core/keywords.py +273 -0
- backend/app/core/rate_limit.py +6 -0
- backend/app/core/sampling.py +135 -0
- backend/app/core/stopwords_zh.py +39 -0
- backend/app/core/ttl_tiers.py +19 -0
- backend/app/core/worker_logging.py +316 -0
- backend/app/db/__init__.py +1 -0
- backend/app/db/mongodb.py +1152 -0
- backend/app/main.py +159 -0
- backend/app/models/__init__.py +19 -0
- backend/app/models/schemas.py +210 -0
- backend/app/routers/__init__.py +5 -0
- backend/app/routers/analyze.py +597 -0
- backend/app/routers/games.py +68 -0
- backend/app/services/__init__.py +6 -0
- backend/app/services/analysis_runner.py +643 -0
- backend/app/services/analysis_utils.py +259 -0
- backend/app/services/game_sync_service.py +290 -0
- backend/app/services/highlights_service.py +202 -0
- backend/app/services/nlp_service.py +524 -0
- backend/app/services/precache_service.py +199 -0
- backend/app/services/priority_refresh_service.py +387 -0
- backend/app/services/steam_errors.py +22 -0
- backend/app/services/steam_service.py +499 -0
- backend/app/services/update_detection_service.py +453 -0
- backend/pytest.ini +6 -0
- backend/requirements.txt +42 -0
- backend/scripts/smoke_news_cursor.py +264 -0
- backend/scripts/smoke_test.py +185 -0
- backend/worker_main.py +244 -0
- scripts/benchmark_major_update.py +848 -0
- scripts/check_db_stats.py +47 -0
- scripts/expand_keywords/__init__.py +8 -0
- scripts/expand_keywords/__main__.py +6 -0
- scripts/expand_keywords/config.py +106 -0
- scripts/expand_keywords/expander.py +350 -0
- scripts/expand_keywords/fetcher.py +355 -0
- scripts/expand_keywords/keywords_base.py +324 -0
- scripts/expand_keywords/main.py +447 -0
- scripts/expand_keywords/preprocessor.py +282 -0
- scripts/expand_keywords/trainer.py +185 -0
.dockerignore
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore everything by default
|
| 2 |
+
*
|
| 3 |
+
|
| 4 |
+
# Allow only what is needed for Docker build
|
| 5 |
+
!backend/
|
| 6 |
+
!frontend/
|
| 7 |
+
!scripts/
|
| 8 |
+
!Dockerfile
|
| 9 |
+
!README.md
|
| 10 |
+
!requirements.txt
|
| 11 |
+
!.gitignore
|
| 12 |
+
|
| 13 |
+
# Exclude unnecessary subfolders
|
| 14 |
+
backend/tests/
|
| 15 |
+
backend/__pycache__/
|
| 16 |
+
backend/.pytest_cache/
|
| 17 |
+
frontend/node_modules/
|
| 18 |
+
frontend/dist/
|
| 19 |
+
|
| 20 |
+
# Exclude specific files
|
| 21 |
+
*.pdf
|
| 22 |
+
.env
|
| 23 |
+
*.log
|
Dockerfile
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ------------------------------------------------------------------------------
|
| 2 |
+
# Stage 1: Quantize NLP model (torch needed ONLY here for PyTorch -> ONNX export)
|
| 3 |
+
# ------------------------------------------------------------------------------
|
| 4 |
+
FROM python:3.11-slim AS model-quantizer
|
| 5 |
+
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
RUN pip install --no-cache-dir \
|
| 9 |
+
--extra-index-url https://download.pytorch.org/whl/cpu \
|
| 10 |
+
"torch==2.2.0" \
|
| 11 |
+
"optimum[onnxruntime]==1.16.2" \
|
| 12 |
+
"transformers==4.37.2" \
|
| 13 |
+
"huggingface-hub==0.20.3" \
|
| 14 |
+
"numpy<2.0.0"
|
| 15 |
+
|
| 16 |
+
COPY scripts/quantize_model.py scripts/quantize_model.py
|
| 17 |
+
RUN python3 scripts/quantize_model.py
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ------------------------------------------------------------------------------
|
| 21 |
+
# Stage 2: Runtime (Python FastAPI Worker — no torch, no frontend)
|
| 22 |
+
# ------------------------------------------------------------------------------
|
| 23 |
+
FROM python:3.11-slim
|
| 24 |
+
|
| 25 |
+
WORKDIR /app
|
| 26 |
+
|
| 27 |
+
# Create non-root user for security
|
| 28 |
+
RUN useradd -m -u 1000 user
|
| 29 |
+
USER user
|
| 30 |
+
ENV HOME=/home/user \
|
| 31 |
+
PATH=/home/user/.local/bin:$PATH
|
| 32 |
+
|
| 33 |
+
# Install Python dependencies (no torch — ~700MB RAM saved)
|
| 34 |
+
COPY --chown=user:user backend/requirements.txt backend/requirements.txt
|
| 35 |
+
RUN pip install --no-cache-dir --upgrade -r backend/requirements.txt
|
| 36 |
+
|
| 37 |
+
# Copy Backend code
|
| 38 |
+
COPY --chown=user:user backend backend
|
| 39 |
+
|
| 40 |
+
# Copy pre-quantized ONNX model from Stage 1
|
| 41 |
+
COPY --chown=user:user --from=model-quantizer /app/backend/models/quantized backend/models/quantized
|
| 42 |
+
|
| 43 |
+
WORKDIR /app/backend
|
| 44 |
+
|
| 45 |
+
EXPOSE 7860
|
| 46 |
+
|
| 47 |
+
CMD ["uvicorn", "worker_main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SentimentStream Worker
|
| 3 |
+
emoji: ⚙️
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: agpl-3.0
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# SentimentStream Worker
|
| 13 |
+
|
| 14 |
+
Background worker for SentimentStream. Syncs games from SteamSpy, detects updates via Steam News API, and pre-caches sentiment analyses.
|
backend/.env.example
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MongoDB
|
| 2 |
+
MONGODB_URL=mongodb://admin:password@localhost:27017
|
| 3 |
+
MONGODB_DB_NAME=sentimentSummary
|
| 4 |
+
|
| 5 |
+
# App Settings
|
| 6 |
+
DEBUG=true
|
| 7 |
+
CORS_ORIGINS=http://localhost:5173,http://localhost:3000
|
| 8 |
+
|
| 9 |
+
# Cache Settings
|
| 10 |
+
CACHE_TTL_HOURS=24
|
| 11 |
+
|
| 12 |
+
# Steam API Settings
|
| 13 |
+
REVIEW_BATCH_SIZE=500
|
| 14 |
+
STEAM_REVIEW_LANGUAGE=schinese
|
| 15 |
+
STEAM_REGION=CN
|
| 16 |
+
|
| 17 |
+
# Steam API Retry
|
| 18 |
+
STEAM_RETRY_MAX_ATTEMPTS=3
|
| 19 |
+
STEAM_RETRY_BASE_DELAY=1.0
|
| 20 |
+
STEAM_RETRY_MAX_DELAY=10.0
|
| 21 |
+
|
| 22 |
+
# Sampling Settings - Statistical sampling parameters
|
| 23 |
+
SAMPLE_TOP_HELPFUL=50
|
| 24 |
+
SAMPLE_CONFIDENCE_LEVEL=0.95
|
| 25 |
+
SAMPLE_MARGIN_OF_ERROR=0.01
|
| 26 |
+
SAMPLE_MAX_REVIEWS=3000
|
| 27 |
+
|
| 28 |
+
# NLP Settings - Hugging Face Models
|
| 29 |
+
HF_SENTIMENT_MODEL=uer/roberta-base-finetuned-jd-binary-chinese
|
| 30 |
+
|
| 31 |
+
# NLP Settings - Analysis Parameters
|
| 32 |
+
TEXT_MAX_LENGTH=512
|
| 33 |
+
SENTIMENT_POSITIVE_THRESHOLD=0.1
|
| 34 |
+
SENTIMENT_NEGATIVE_THRESHOLD=-0.1
|
| 35 |
+
TOPIC_MIN_MENTIONS=5
|
| 36 |
+
|
| 37 |
+
# Deduplication Cache
|
| 38 |
+
DEDUP_CACHE_MAXSIZE=10000
|
backend/app/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SentimentStream Backend Application.
|
| 3 |
+
|
| 4 |
+
Narzędzie do analizy sentymentu i modelowania tematów
|
| 5 |
+
w recenzjach gier Steam w czasie rzeczywistym.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
__version__ = "0.1.0"
|
backend/app/core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Moduł konfiguracji aplikacji."""
|
backend/app/core/config.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Konfiguracja aplikacji.
|
| 3 |
+
|
| 4 |
+
Wykorzystuje Pydantic Settings do zarządzania zmiennymi środowiskowymi.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from functools import lru_cache
|
| 8 |
+
|
| 9 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Settings(BaseSettings):
|
| 13 |
+
"""
|
| 14 |
+
Ustawienia aplikacji ładowane ze zmiennych środowiskowych.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
model_config = SettingsConfigDict(
|
| 18 |
+
env_file=(".env", "backend/.env"),
|
| 19 |
+
env_file_encoding="utf-8",
|
| 20 |
+
case_sensitive=False,
|
| 21 |
+
extra="ignore"
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# MongoDB
|
| 25 |
+
mongodb_url: str = ""
|
| 26 |
+
mongodb_db_name: str = "sentimentSummary"
|
| 27 |
+
|
| 28 |
+
# App Mode
|
| 29 |
+
app_mode: str = "full" # "full" = monolith, "api" = API-only (no frontend)
|
| 30 |
+
|
| 31 |
+
# App Settings
|
| 32 |
+
debug: bool = False
|
| 33 |
+
cors_origins: str = "http://localhost:5173,http://localhost:3000"
|
| 34 |
+
|
| 35 |
+
# Cache Settings
|
| 36 |
+
cache_ttl_hours: int = 24
|
| 37 |
+
cache_ttl_short_hours: int = 12 # frequently updated games
|
| 38 |
+
cache_ttl_long_hours: int = 168 # stable games (7 days)
|
| 39 |
+
cache_ttl_worker_managed_hours: int = 1440 # 60 days to preserve stale fallback results
|
| 40 |
+
cache_ttl_on_demand_hours: int = 1440 # 60 days to preserve stale fallback results
|
| 41 |
+
|
| 42 |
+
# Incremental Analysis
|
| 43 |
+
incremental_enabled: bool = True
|
| 44 |
+
incremental_max_stored_ids: int = 5000
|
| 45 |
+
incremental_max_gap_days: int = 90 # fall back to full analysis after this many days without reviews
|
| 46 |
+
recent_sample_limit: int = 1000
|
| 47 |
+
niche_cache_max_age_days: int = 60
|
| 48 |
+
analysis_freshness_max_age_days: int = 60
|
| 49 |
+
patch_context_max_age_days: int = 90
|
| 50 |
+
dlc_min_reviews_for_analysis: int = 50
|
| 51 |
+
dlc_visible_in_search: bool = False # Temporary policy: hide DLC from autocomplete/suggestions
|
| 52 |
+
dlc_worker_analysis_enabled: bool = False # Temporary policy: exclude DLC from worker-managed analysis
|
| 53 |
+
|
| 54 |
+
# Steam API Settings
|
| 55 |
+
review_batch_size: int = 100
|
| 56 |
+
steam_review_language: str = "schinese" # Review fetch scope; product analyzes Simplified Chinese Steam reviews.
|
| 57 |
+
steam_region: str = "CN" # CN, US, etc.
|
| 58 |
+
|
| 59 |
+
# Steam API Retry
|
| 60 |
+
steam_retry_max_attempts: int = 3
|
| 61 |
+
steam_retry_base_delay: float = 1.0 # doubles each retry
|
| 62 |
+
steam_retry_max_delay: float = 10.0 # cap
|
| 63 |
+
|
| 64 |
+
# Steam API Error Cache TTL (seconds)
|
| 65 |
+
steam_error_cache_ttl_404: int = 3600 # 1h
|
| 66 |
+
steam_error_cache_ttl_429: int = 300 # 5min
|
| 67 |
+
|
| 68 |
+
# Sampling Settings - Statistical sampling parameters
|
| 69 |
+
sample_top_helpful: int = 50
|
| 70 |
+
sample_confidence_level: float = 0.95
|
| 71 |
+
sample_margin_of_error: float = 0.02
|
| 72 |
+
sample_max_reviews: int = 3000
|
| 73 |
+
sample_minority_min: int = 100
|
| 74 |
+
|
| 75 |
+
# NLP Settings - Analysis Parameters
|
| 76 |
+
text_max_length: int = 512
|
| 77 |
+
sentiment_positive_threshold: float = 0.1
|
| 78 |
+
sentiment_negative_threshold: float = -0.1
|
| 79 |
+
topic_min_mentions: int = 5
|
| 80 |
+
|
| 81 |
+
# NLP Settings - Deduplication Cache
|
| 82 |
+
dedup_cache_maxsize: int = 10000
|
| 83 |
+
|
| 84 |
+
# NLP Settings - Performance & Logic
|
| 85 |
+
nlp_onnx_intra_threads: int = 2
|
| 86 |
+
nlp_onnx_inter_threads: int = 2
|
| 87 |
+
nlp_negation_window: int = 3
|
| 88 |
+
|
| 89 |
+
# Prediction Settings
|
| 90 |
+
prediction_retention_threshold_pos: float = 0.2
|
| 91 |
+
prediction_retention_threshold_neg: float = -0.2
|
| 92 |
+
|
| 93 |
+
# Community Highlights
|
| 94 |
+
highlights_ngram_min: int = 2
|
| 95 |
+
highlights_ngram_max: int = 5
|
| 96 |
+
highlights_min_mentions: int = 3
|
| 97 |
+
highlights_max_doc_freq_ratio: float = 0.4
|
| 98 |
+
highlights_top_n_general: int = 15
|
| 99 |
+
highlights_top_n_per_topic: int = 5
|
| 100 |
+
|
| 101 |
+
# Worker — Pre-cache
|
| 102 |
+
worker_trigger_token: str = ""
|
| 103 |
+
precache_enabled: bool = False
|
| 104 |
+
precache_top_n_games: int = 500
|
| 105 |
+
precache_batch_delay_seconds: int = 10
|
| 106 |
+
precache_checkpoints_hours: str = "6,12,24,72,168,336"
|
| 107 |
+
precache_max_analyses_per_cycle: int = 50
|
| 108 |
+
|
| 109 |
+
# Worker — Priority Games
|
| 110 |
+
steam_priority_categories: str = "top_sellers,new_releases,specials"
|
| 111 |
+
steam_priority_regions: str = "CN,US"
|
| 112 |
+
steam_priority_grace_days: int = 3
|
| 113 |
+
steam_priority_categories_url: str = "https://store.steampowered.com/api/featuredcategories"
|
| 114 |
+
steam_bootstrap_max_per_cycle: int = 20
|
| 115 |
+
steam_bootstrap_delay: float = 1.5
|
| 116 |
+
|
| 117 |
+
# Worker — News Scan
|
| 118 |
+
news_refresh_window_hours: int = 6
|
| 119 |
+
news_initial_count: int = 20
|
| 120 |
+
news_incremental_count: int = 5
|
| 121 |
+
|
| 122 |
+
# Worker — Game Sync
|
| 123 |
+
game_sync_enabled: bool = False
|
| 124 |
+
game_sync_steamspy_delay: float = 61.0
|
| 125 |
+
game_sync_details_delay: float = 1.1
|
| 126 |
+
game_sync_top_n_details: int = 500
|
| 127 |
+
game_sync_cn_enrichment_delay: float = 1.5
|
| 128 |
+
game_sync_cn_enrichment_limit: int = 200
|
| 129 |
+
game_sync_app_type_enrichment_delay: float = 1.5
|
| 130 |
+
game_sync_app_type_enrichment_limit: int = 200
|
| 131 |
+
|
| 132 |
+
# Logging (both Live API and Worker)
|
| 133 |
+
worker_log_dir: str = "/data/worker_logs"
|
| 134 |
+
worker_log_fallback_dir: str = "/tmp/worker_logs"
|
| 135 |
+
worker_log_max_bytes: int = 5_000_000 # 5 MB per file
|
| 136 |
+
worker_log_backup_count: int = 3 # 3 rotated files = 20 MB max
|
| 137 |
+
nlp_verbose_logging: bool = False # re-enable NLP debug logs to stdout
|
| 138 |
+
nlp_debug_log_max_bytes: int = 2_000_000 # 2 MB per file
|
| 139 |
+
errors_log_max_bytes: int = 2_000_000 # 2 MB per file
|
| 140 |
+
|
| 141 |
+
# Rate Limiting
|
| 142 |
+
rate_limit_analyze: str = "10/minute"
|
| 143 |
+
rate_limit_default: str = "30/minute"
|
| 144 |
+
|
| 145 |
+
# NLP Settings - Hugging Face Models
|
| 146 |
+
# Using specialized Chinese model (RoBERTa-JD) - 90% accuracy on product reviews
|
| 147 |
+
hf_sentiment_model: str = "uer/roberta-base-finetuned-jd-binary-chinese"
|
| 148 |
+
|
| 149 |
+
@property
|
| 150 |
+
def cors_origins_list(self) -> list[str]:
|
| 151 |
+
"""Zwraca listę dozwolonych originów CORS."""
|
| 152 |
+
return [origin.strip() for origin in self.cors_origins.split(",")]
|
| 153 |
+
|
| 154 |
+
@property
|
| 155 |
+
def precache_checkpoints_list(self) -> list[int]:
|
| 156 |
+
"""Parse checkpoint hours from comma-separated string."""
|
| 157 |
+
return sorted(int(h.strip()) for h in self.precache_checkpoints_hours.split(","))
|
| 158 |
+
|
| 159 |
+
@property
|
| 160 |
+
def steam_priority_categories_list(self) -> list[str]:
|
| 161 |
+
return [c.strip() for c in self.steam_priority_categories.split(",") if c.strip()]
|
| 162 |
+
|
| 163 |
+
@property
|
| 164 |
+
def steam_priority_regions_list(self) -> list[str]:
|
| 165 |
+
return [r.strip() for r in self.steam_priority_regions.split(",") if r.strip()]
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
@lru_cache
|
| 169 |
+
def get_settings() -> Settings:
|
| 170 |
+
"""Zwraca singleton instancji Settings."""
|
| 171 |
+
return Settings()
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
settings = get_settings()
|
backend/app/core/freshness.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Product-level analysis freshness rules.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
from datetime import datetime, timezone
|
| 8 |
+
from enum import Enum
|
| 9 |
+
from typing import Any, cast
|
| 10 |
+
|
| 11 |
+
from app.core.config import settings
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class FreshnessStatus(str, Enum):
|
| 15 |
+
"""Product freshness state for an existing analysis."""
|
| 16 |
+
|
| 17 |
+
FRESH = "fresh"
|
| 18 |
+
STALE_BY_AGE = "stale_by_age"
|
| 19 |
+
STALE_BY_PATCH = "stale_by_patch"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _as_utc_datetime(value: Any) -> datetime | None:
|
| 23 |
+
if value is None:
|
| 24 |
+
return None
|
| 25 |
+
if isinstance(value, datetime):
|
| 26 |
+
return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
|
| 27 |
+
if isinstance(value, str):
|
| 28 |
+
parsed = datetime.fromisoformat(value)
|
| 29 |
+
return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
|
| 30 |
+
return None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_analysis_reference_at(document: dict[str, Any]) -> datetime | None:
|
| 34 |
+
"""Return the best available execution timestamp for freshness checks."""
|
| 35 |
+
raw = document.get("results")
|
| 36 |
+
results: dict[str, Any] = cast(dict[str, Any], raw) if isinstance(raw, dict) else {}
|
| 37 |
+
return (
|
| 38 |
+
_as_utc_datetime(results.get("analysis_date"))
|
| 39 |
+
or _as_utc_datetime(document.get("analyzed_at"))
|
| 40 |
+
or _as_utc_datetime(document.get("cached_at"))
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def evaluate_freshness(
|
| 45 |
+
document: dict[str, Any],
|
| 46 |
+
current_patch_at: datetime | None,
|
| 47 |
+
) -> FreshnessStatus:
|
| 48 |
+
"""
|
| 49 |
+
Evaluate analysis freshness using product rules:
|
| 50 |
+
patch recency first, then max age.
|
| 51 |
+
"""
|
| 52 |
+
analysis_at = get_analysis_reference_at(document)
|
| 53 |
+
if analysis_at is None:
|
| 54 |
+
return FreshnessStatus.STALE_BY_AGE
|
| 55 |
+
|
| 56 |
+
if current_patch_at is not None and analysis_at < current_patch_at:
|
| 57 |
+
return FreshnessStatus.STALE_BY_PATCH
|
| 58 |
+
|
| 59 |
+
age_days = (datetime.now(timezone.utc) - analysis_at).days
|
| 60 |
+
if age_days > settings.analysis_freshness_max_age_days:
|
| 61 |
+
return FreshnessStatus.STALE_BY_AGE
|
| 62 |
+
|
| 63 |
+
return FreshnessStatus.FRESH
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def get_staleness_reason(status: FreshnessStatus) -> str | None:
|
| 67 |
+
if status == FreshnessStatus.STALE_BY_AGE:
|
| 68 |
+
return "STALE_REASON_AGE"
|
| 69 |
+
if status == FreshnessStatus.STALE_BY_PATCH:
|
| 70 |
+
return "STALE_REASON_PATCH"
|
| 71 |
+
return None
|
backend/app/core/jieba_userdict.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
boss战 5 n
|
| 2 |
+
开放世界 5 n
|
| 3 |
+
大逃杀 5 n
|
| 4 |
+
战斗通行证 5 n
|
| 5 |
+
皮肤系统 5 n
|
| 6 |
+
氪金 10 v
|
| 7 |
+
开箱 5 v
|
| 8 |
+
人机对战 5 n
|
| 9 |
+
帧数不稳 5 n
|
| 10 |
+
内存泄漏 5 n
|
| 11 |
+
手感好 5 a
|
| 12 |
+
手感差 5 a
|
| 13 |
+
上手简单 5 a
|
| 14 |
+
劝退新手 5 v
|
backend/app/core/keywords.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Chinese keywords for game review topic detection.
|
| 3 |
+
Used in hybrid approach (Keywords + ML Sentiment).
|
| 4 |
+
|
| 5 |
+
Categories based on common topics in Steam game reviews.
|
| 6 |
+
Seed keywords will be expanded using the expand_keywords pipeline.
|
| 7 |
+
|
| 8 |
+
Structure: topic -> {single_char, compound, phrase}
|
| 9 |
+
- single_char: standalone Chinese characters (1 char, prone to false positives)
|
| 10 |
+
- compound: multi-char Chinese words or short English words
|
| 11 |
+
- phrase: multi-word phrases (EN or ZH)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
TOPIC_KEYWORDS: dict[str, dict[str, list[str]]] = {
|
| 15 |
+
# =========================================================================
|
| 16 |
+
# CORE GAMEPLAY - 核心玩法
|
| 17 |
+
# =========================================================================
|
| 18 |
+
"Gameplay": {
|
| 19 |
+
"single_char": ["刷", "肝"],
|
| 20 |
+
"compound": [
|
| 21 |
+
"玩法", "游戏性", "机制", "战斗", "任务", "关卡",
|
| 22 |
+
"探索", "技能", "装备", "gameplay",
|
| 23 |
+
],
|
| 24 |
+
"phrase": ["战斗系统"],
|
| 25 |
+
},
|
| 26 |
+
|
| 27 |
+
"Fun": {
|
| 28 |
+
"single_char": ["爽", "烂"],
|
| 29 |
+
"compound": [
|
| 30 |
+
# Positive
|
| 31 |
+
"好玩", "有趣", "上瘾", "神作", "佳作", "精品",
|
| 32 |
+
"沉浸", "过瘾", "带感", "回血", "爽游",
|
| 33 |
+
"解压", "杀时间",
|
| 34 |
+
# Negative
|
| 35 |
+
"无聊", "枯燥", "乏味", "垃圾", "辣鸡", "粪作",
|
| 36 |
+
"失望", "无趣",
|
| 37 |
+
],
|
| 38 |
+
"phrase": [
|
| 39 |
+
"电子伟哥", "治好了", "精神时光屋", "时光屋",
|
| 40 |
+
"电子阳痿", "电子ed",
|
| 41 |
+
],
|
| 42 |
+
},
|
| 43 |
+
|
| 44 |
+
"Difficulty": {
|
| 45 |
+
"single_char": [],
|
| 46 |
+
"compound": [
|
| 47 |
+
"难度", "简单", "困难", "硬核",
|
| 48 |
+
"劝退", "手残", "新手", "上手",
|
| 49 |
+
"souls", "魂类",
|
| 50 |
+
],
|
| 51 |
+
"phrase": ["太难", "太简单"],
|
| 52 |
+
},
|
| 53 |
+
|
| 54 |
+
# =========================================================================
|
| 55 |
+
# TECHNICAL - 技术
|
| 56 |
+
# =========================================================================
|
| 57 |
+
"Performance": {
|
| 58 |
+
"single_char": ["卡"],
|
| 59 |
+
"compound": [
|
| 60 |
+
"优化", "卡顿", "帧率", "帧数", "流畅", "掉帧",
|
| 61 |
+
"丝滑", "显卡", "显存", "延迟",
|
| 62 |
+
"fps", "cpu", "gpu",
|
| 63 |
+
],
|
| 64 |
+
"phrase": [
|
| 65 |
+
"稳60", "锁60", "解锁帧率", "吃配置", "带不动",
|
| 66 |
+
"PPT效果", "幻灯片", "帧生成", "输入延迟", "帧数不稳",
|
| 67 |
+
],
|
| 68 |
+
},
|
| 69 |
+
|
| 70 |
+
"Bugs": {
|
| 71 |
+
"single_char": [],
|
| 72 |
+
"compound": [
|
| 73 |
+
"闪退", "崩溃", "卡死", "报错", "存档",
|
| 74 |
+
"黑屏", "进不去", "打不开", "未响应", "无响应",
|
| 75 |
+
"弹窗", "坏档", "掉线",
|
| 76 |
+
"bug", "bugs",
|
| 77 |
+
],
|
| 78 |
+
"phrase": [
|
| 79 |
+
"存档损坏", "无法保存", "卡加载",
|
| 80 |
+
"加载失败", "连不上",
|
| 81 |
+
],
|
| 82 |
+
},
|
| 83 |
+
|
| 84 |
+
# =========================================================================
|
| 85 |
+
# AUDIO-VISUAL - 视听
|
| 86 |
+
# =========================================================================
|
| 87 |
+
"Graphics": {
|
| 88 |
+
"single_char": [],
|
| 89 |
+
"compound": [
|
| 90 |
+
"画面", "画质", "特效", "建模", "贴图",
|
| 91 |
+
"美术", "风格", "场景", "光影",
|
| 92 |
+
"4k", "hdr",
|
| 93 |
+
],
|
| 94 |
+
"phrase": [],
|
| 95 |
+
},
|
| 96 |
+
|
| 97 |
+
"Sound": {
|
| 98 |
+
"single_char": [],
|
| 99 |
+
"compound": [
|
| 100 |
+
"音乐", "音效", "配音", "配乐", "声音",
|
| 101 |
+
"原声",
|
| 102 |
+
"bgm", "ost",
|
| 103 |
+
],
|
| 104 |
+
"phrase": ["中文配音"],
|
| 105 |
+
},
|
| 106 |
+
|
| 107 |
+
# =========================================================================
|
| 108 |
+
# CONTENT & VALUE - 内容与价值
|
| 109 |
+
# =========================================================================
|
| 110 |
+
"Content": {
|
| 111 |
+
"single_char": [],
|
| 112 |
+
"compound": [
|
| 113 |
+
"内容", "时长", "流程", "耐玩", "通关",
|
| 114 |
+
"主线", "支线", "收集", "小时", "体量",
|
| 115 |
+
"注水", "重复", "换皮", "多周目",
|
| 116 |
+
"dlc",
|
| 117 |
+
],
|
| 118 |
+
"phrase": [
|
| 119 |
+
"素材复用", "拖时长", "强行延长", "通关后",
|
| 120 |
+
],
|
| 121 |
+
},
|
| 122 |
+
|
| 123 |
+
"Monetization": {
|
| 124 |
+
"single_char": [],
|
| 125 |
+
"compound": [
|
| 126 |
+
# ex-Price
|
| 127 |
+
"价格", "定价", "值得", "不值", "贵", "便宜",
|
| 128 |
+
"打折", "史低", "入手", "白嫖", "性价比",
|
| 129 |
+
# ex-Microtransactions
|
| 130 |
+
"氪金", "内购", "充值", "抽卡", "648",
|
| 131 |
+
"课金", "首充", "月卡", "战令", "季票",
|
| 132 |
+
"开箱", "箱子", "钥匙", "保底", "抽奖",
|
| 133 |
+
"p2w",
|
| 134 |
+
],
|
| 135 |
+
"phrase": [
|
| 136 |
+
"通行证", "pay to win",
|
| 137 |
+
],
|
| 138 |
+
},
|
| 139 |
+
|
| 140 |
+
# =========================================================================
|
| 141 |
+
# MULTIPLAYER & COMMUNITY - 多人与社区
|
| 142 |
+
# =========================================================================
|
| 143 |
+
"Multiplayer": {
|
| 144 |
+
"single_char": [],
|
| 145 |
+
"compound": [
|
| 146 |
+
"联机", "多人", "匹��", "服务器", "延迟",
|
| 147 |
+
"掉线", "开黑", "组队", "单机", "野排", "车队",
|
| 148 |
+
"单排", "组排", "路人", "挂机",
|
| 149 |
+
"pvp", "pve", "coop",
|
| 150 |
+
],
|
| 151 |
+
"phrase": [
|
| 152 |
+
"坑比", "猪队友", "送人头",
|
| 153 |
+
],
|
| 154 |
+
},
|
| 155 |
+
|
| 156 |
+
"Community": {
|
| 157 |
+
"single_char": [],
|
| 158 |
+
"compound": [
|
| 159 |
+
"社区", "玩家", "汉化",
|
| 160 |
+
"官方", "民间",
|
| 161 |
+
"mod", "mods",
|
| 162 |
+
],
|
| 163 |
+
"phrase": ["创意工坊"],
|
| 164 |
+
},
|
| 165 |
+
|
| 166 |
+
# =========================================================================
|
| 167 |
+
# CONTROLS & UI - 操控与界面
|
| 168 |
+
# =========================================================================
|
| 169 |
+
"Controls": {
|
| 170 |
+
"single_char": [],
|
| 171 |
+
"compound": [
|
| 172 |
+
"操作", "手感", "手柄", "键鼠", "键盘",
|
| 173 |
+
"摇杆", "触发", "键位", "改键",
|
| 174 |
+
"死区", "陀螺仪", "扳机", "震动",
|
| 175 |
+
],
|
| 176 |
+
"phrase": [
|
| 177 |
+
"自定义键位", "辅助瞄准", "触觉反馈", "自适应扳机",
|
| 178 |
+
],
|
| 179 |
+
},
|
| 180 |
+
|
| 181 |
+
"UI": {
|
| 182 |
+
"single_char": [],
|
| 183 |
+
"compound": [
|
| 184 |
+
"界面", "菜单", "字幕", "字体",
|
| 185 |
+
"中文", "汉化",
|
| 186 |
+
"ui", "hud",
|
| 187 |
+
],
|
| 188 |
+
"phrase": [],
|
| 189 |
+
},
|
| 190 |
+
|
| 191 |
+
# =========================================================================
|
| 192 |
+
# STORY & NARRATIVE - 剧情
|
| 193 |
+
# =========================================================================
|
| 194 |
+
"Story": {
|
| 195 |
+
"single_char": [],
|
| 196 |
+
"compound": [
|
| 197 |
+
"剧情", "故事", "人物", "角色", "结局",
|
| 198 |
+
"剧本", "叙事", "世界观", "背景", "喂屎",
|
| 199 |
+
"烂尾", "降智", "工具人", "脸谱化",
|
| 200 |
+
"剧情杀", "都合主义",
|
| 201 |
+
"npc",
|
| 202 |
+
],
|
| 203 |
+
"phrase": ["逻辑硬伤"],
|
| 204 |
+
},
|
| 205 |
+
|
| 206 |
+
# =========================================================================
|
| 207 |
+
# DEVELOPER SUPPORT - 开发支持
|
| 208 |
+
# =========================================================================
|
| 209 |
+
"Support": {
|
| 210 |
+
"single_char": [],
|
| 211 |
+
"compound": [
|
| 212 |
+
"更新", "修复", "维护", "开发商", "官方",
|
| 213 |
+
"补丁", "版本",
|
| 214 |
+
],
|
| 215 |
+
"phrase": [],
|
| 216 |
+
},
|
| 217 |
+
|
| 218 |
+
"Localization": {
|
| 219 |
+
"single_char": [],
|
| 220 |
+
"compound": [
|
| 221 |
+
"本地化", "汉化", "翻译", "机翻", "缺字", "乱码",
|
| 222 |
+
"繁体", "简体",
|
| 223 |
+
],
|
| 224 |
+
"phrase": [
|
| 225 |
+
"语言支持", "中文支持", "无中文", "不支援中文",
|
| 226 |
+
"文本质量", "字幕翻译", "界面翻译",
|
| 227 |
+
],
|
| 228 |
+
},
|
| 229 |
+
|
| 230 |
+
# =========================================================================
|
| 231 |
+
# REFINEMENT - 打磨
|
| 232 |
+
# =========================================================================
|
| 233 |
+
"Polish": {
|
| 234 |
+
"single_char": [],
|
| 235 |
+
"compound": [
|
| 236 |
+
"打磨", "精致", "粗糙", "用心", "敷衍", "细节",
|
| 237 |
+
"诚意", "偷懒", "不用心", "精良", "精美",
|
| 238 |
+
],
|
| 239 |
+
"phrase": ["粗制滥造"],
|
| 240 |
+
},
|
| 241 |
+
|
| 242 |
+
# =========================================================================
|
| 243 |
+
# RETENTION - 留存
|
| 244 |
+
# =========================================================================
|
| 245 |
+
"Retention": {
|
| 246 |
+
"single_char": [],
|
| 247 |
+
"compound": [
|
| 248 |
+
# Positive (High Retention)
|
| 249 |
+
"推荐", "安利", "入正", "入坑", "必玩",
|
| 250 |
+
"神作", "年度", "满分",
|
| 251 |
+
# Negative (Churn)
|
| 252 |
+
"退款", "卸载", "弃坑", "劝退", "不推荐",
|
| 253 |
+
"避雷", "踩雷", "退坑",
|
| 254 |
+
"回坑", "出坑", "已弃",
|
| 255 |
+
],
|
| 256 |
+
"phrase": [
|
| 257 |
+
"坚持玩", "每天玩", "停不下来", "刷了",
|
| 258 |
+
"已退", "退款了",
|
| 259 |
+
],
|
| 260 |
+
},
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
# =============================================================================
|
| 264 |
+
# EXCLUSIONS (Context-aware filtering)
|
| 265 |
+
# =============================================================================
|
| 266 |
+
# Words to exclude when they appear in certain contexts.
|
| 267 |
+
# Format: "keyword": ["context_word1", "context_word2"]
|
| 268 |
+
|
| 269 |
+
EXCLUSIONS = {
|
| 270 |
+
# "fps" as genre (FPS shooter) vs performance (60 fps)
|
| 271 |
+
"fps": ["射击", "枪战", "第一人称"],
|
| 272 |
+
# Empty for now - will be expanded based on false positives
|
| 273 |
+
}
|
backend/app/core/rate_limit.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared rate limiter instance for the application."""
|
| 2 |
+
|
| 3 |
+
from slowapi import Limiter
|
| 4 |
+
from slowapi.util import get_remote_address
|
| 5 |
+
|
| 6 |
+
limiter = Limiter(key_func=get_remote_address)
|
backend/app/core/sampling.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Moduł do obliczania statystycznej wielkości próbki.
|
| 3 |
+
|
| 4 |
+
Implementuje wzory statystyczne dla próbkowania populacji.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import math
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
|
| 10 |
+
from app.core.config import settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Wartości Z dla poziomów ufności
|
| 14 |
+
Z_SCORES = {
|
| 15 |
+
0.90: 1.645,
|
| 16 |
+
0.95: 1.96,
|
| 17 |
+
0.99: 2.576,
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
|
| 22 |
+
class SamplePlan:
|
| 23 |
+
"""
|
| 24 |
+
Plan próbkowania dla gry.
|
| 25 |
+
|
| 26 |
+
Attributes:
|
| 27 |
+
top_helpful: Liczba najprzydatniejszych recenzji.
|
| 28 |
+
statistical_sample: Wielkość próbki statystycznej.
|
| 29 |
+
positive_count: Ile pobrać pozytywnych (stratified).
|
| 30 |
+
negative_count: Ile pobrać negatywnych (stratified).
|
| 31 |
+
total: Łączna liczba recenzji do pobrania.
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
top_helpful: int
|
| 35 |
+
statistical_sample: int
|
| 36 |
+
positive_count: int
|
| 37 |
+
negative_count: int
|
| 38 |
+
total: int
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def calculate_sample_size(
|
| 42 |
+
population: int,
|
| 43 |
+
confidence_level: float | None = None,
|
| 44 |
+
margin_of_error: float | None = None,
|
| 45 |
+
) -> int:
|
| 46 |
+
"""
|
| 47 |
+
Oblicza minimalną wielkość próbki dla danej populacji.
|
| 48 |
+
Wykorzystuje wzór Cochrana z korektą dla populacji skończonej.
|
| 49 |
+
"""
|
| 50 |
+
if confidence_level is None:
|
| 51 |
+
confidence_level = settings.sample_confidence_level
|
| 52 |
+
if margin_of_error is None:
|
| 53 |
+
margin_of_error = settings.sample_margin_of_error
|
| 54 |
+
|
| 55 |
+
# 1. Pobieramy Z-score (np. 1.96 dla 95% ufności).
|
| 56 |
+
# Mówi on, jak bardzo wynik może odbiegać od średniej w jednostkach odchylenia standardowego.
|
| 57 |
+
z = Z_SCORES.get(confidence_level, 1.96)
|
| 58 |
+
|
| 59 |
+
# 2. Zakładamy p=0.5 (maksymalna zmienność).
|
| 60 |
+
# To daje nam najbezpieczniejszą (największą) wielkość próbki.
|
| 61 |
+
p = 0.5
|
| 62 |
+
|
| 63 |
+
# 3. Wzór Cochrana dla nieskończonej populacji:
|
| 64 |
+
# n0 = (Z^2 * p * (1-p)) / e^2
|
| 65 |
+
# Wyjaśnienie: Z kwadrat razy zmienność, podzielone przez kwadrat błędu.
|
| 66 |
+
n_0 = (z ** 2 * p * (1 - p)) / (margin_of_error ** 2)
|
| 67 |
+
|
| 68 |
+
# 4. Korekta dla populacji skończonej (Steam ma policzalną liczbę recenzji):
|
| 69 |
+
# n = n0 / (1 + (n0 - 1) / N)
|
| 70 |
+
# Wyjaśnienie: Zmniejszamy próbkę, bo wiemy dokładnie, ile osób (recenzji) jest w "całym świecie" tej gry.
|
| 71 |
+
n = n_0 / (1 + (n_0 - 1) / population)
|
| 72 |
+
|
| 73 |
+
# Zaokrąglamy w górę do pełnej recenzji
|
| 74 |
+
return math.ceil(n)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def create_sample_plan(
    total_reviews: int,
    positive_reviews: int,
    negative_reviews: int,
) -> SamplePlan:
    """
    Build a sampling plan combining two approaches: a fixed "top helpful"
    slice plus a statistically sized, stratified random sample.
    """
    helpful_slice = settings.sample_top_helpful
    hard_cap = settings.sample_max_reviews

    # Statistically required sample size, capped so the grand total
    # (helpful slice + sample) never exceeds the configured maximum.
    sample_size = calculate_sample_size(total_reviews)
    sample_size = min(sample_size, hard_cap - helpful_slice)

    # Share of positive / negative reviews in the whole population.
    if total_reviews > 0:
        share_pos = positive_reviews / total_reviews
        share_neg = negative_reviews / total_reviews
    else:
        share_pos = share_neg = 0.5

    # Stratified sampling: split the sample proportionally to those shares.
    want_pos = math.ceil(sample_size * share_pos)
    want_neg = math.ceil(sample_size * share_neg)

    # Minority protection: boost the smaller group up to minority_min
    # when enough reviews of that polarity actually exist.
    floor = settings.sample_minority_min
    if want_pos < floor and positive_reviews > want_pos:
        want_pos = min(floor, positive_reviews)
    if want_neg < floor and negative_reviews > want_neg:
        want_neg = min(floor, negative_reviews)

    # If the boost pushed us over the budget, shrink the larger group
    # (but never below the minority floor).
    overshoot = want_pos + want_neg - sample_size
    if overshoot > 0:
        if want_pos > want_neg:
            want_pos = max(want_pos - overshoot, floor)
        else:
            want_neg = max(want_neg - overshoot, floor)

    # Never request more reviews than actually exist.
    final_pos = min(want_pos, positive_reviews)
    final_neg = min(want_neg, negative_reviews)

    # Grand total = top helpful slice + statistical sample.
    return SamplePlan(
        top_helpful=helpful_slice,
        statistical_sample=sample_size,
        positive_count=final_pos,
        negative_count=final_neg,
        total=helpful_slice + final_pos + final_neg,
    )
|
backend/app/core/stopwords_zh.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Chinskie stop words dla NLP pipeline.
|
| 3 |
+
Uzywane przez Community Highlights (n-gram extraction) i potencjalnie inne moduly.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
# Single-character tokens that DO carry signal and must not be filtered out
# (review slang: laggy, grindy, satisfying, pay-to-win, weak/noob, addictive).
SINGLE_CHAR_EXCEPTIONS = {"卡", "肝", "爽", "氪", "菜", "毒"}

# Stop words — high-frequency tokens with no informational value.
STOPWORDS_ZH = {
    # Pronouns
    "我", "你", "他", "她", "它", "我们", "你们", "他们",
    # Particles and conjunctions
    "的", "了", "是", "在", "不", "有", "和", "就",
    "都", "也", "很", "要", "会", "可以", "这", "那",
    "还", "没", "着", "被", "把", "让", "给", "从",
    "到", "对", "但", "而", "或", "与",
    # Modal particles
    "吗", "呢", "啊", "吧", "呀", "嘛", "哦", "哈",
    # Adverbs
    "比较", "非常", "真的", "确实", "其实", "可能",
    "已经", "一直", "马上", "刚刚",
    # Generic verbs
    "觉得", "感觉", "知道", "看到", "说",
    # Numerals and determiners
    "一个", "一些", "这个", "那个", "什么", "怎么",
    "多少", "几个",
    # Filler in game reviews
    "这游戏", "这个游戏", "游戏", "玩家",
}


def is_stopword(token: str) -> bool:
    """Return True if *token* is a stop word or a low-value single character."""
    return token in STOPWORDS_ZH or (
        len(token) == 1 and token not in SINGLE_CHAR_EXCEPTIONS
    )
|
backend/app/core/ttl_tiers.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tiered TTL configuration for game cache expiry.
|
| 3 |
+
|
| 4 |
+
Popular games (worker-managed top N) get longer cache,
|
| 5 |
+
niche games (on-demand) get shorter cache.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from app.core.config import settings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
async def get_ttl_hours(app_id: str) -> int:
    """Return TTL in hours based on whether the game is a priority game."""
    # Imported locally — presumably to avoid a circular import at module load.
    from app.db.mongodb import mongodb

    priority_ids = await mongodb.get_priority_game_ids_for_analysis()

    # Per the original annotations both tiers currently resolve to 1440h (60d);
    # the split is kept so the tiers can diverge via configuration.
    is_priority = app_id in priority_ids
    return (
        settings.cache_ttl_worker_managed_hours
        if is_priority
        else settings.cache_ttl_on_demand_hours
    )
|
backend/app/core/worker_logging.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Structured logging infrastructure for Worker and Live API.
|
| 3 |
+
|
| 4 |
+
Provides JSON-line file logging with rotation, timing context managers,
|
| 5 |
+
and module-level accessors for use across the codebase.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import logging.handlers
|
| 11 |
+
import os
|
| 12 |
+
import time
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
from app.core.config import settings
|
| 16 |
+
|
| 17 |
+
# Module-level state
# _structured_logger: process-wide JSONL logger installed by setup_structured_logger().
# _cycle_id: identifier of the current worker cycle, stamped onto log entries.
# _app_logging_initialized: one-shot guard for setup_app_logging().
_structured_logger: logging.Logger | None = None
_cycle_id: str | None = None
_app_logging_initialized: bool = False

# Per-process log file whitelists (key → filename)
# Intended for resolve_log_path(): only whitelisted keys may be read back.
LIVE_LOG_WHITELIST: dict[str, str] = {
    "live": "live.jsonl",
    "errors": "errors.log",
    "nlp_debug": "nlp_debug.log",
}
WORKER_LOG_WHITELIST: dict[str, str] = {
    "worker": "worker.jsonl",
    "errors": "errors.log",
    "nlp_debug": "nlp_debug.log",
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class DebugOnlyFilter(logging.Filter):
    """Admit only DEBUG-level records; everything at INFO and above is dropped."""

    def filter(self, record: logging.LogRecord) -> bool:
        # Exact level comparison — anything other than DEBUG is rejected.
        return record.levelno == logging.DEBUG
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _get_writable_log_dir() -> str:
    """Return the first writable log directory (primary, else fallback)."""
    primary = settings.worker_log_dir
    try:
        # Probe writability by actually creating and removing a marker file.
        os.makedirs(primary, exist_ok=True)
        probe = os.path.join(primary, ".write_test")
        with open(probe, "w") as fh:
            fh.write("ok")
        os.remove(probe)
        return primary
    except (OSError, PermissionError):
        fallback = settings.worker_log_fallback_dir
        os.makedirs(fallback, exist_ok=True)
        return fallback
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class JsonLineFormatter(logging.Formatter):
    """Serialize each log record as a single JSON object per line (JSONL)."""

    # Structured attributes copied onto the JSON entry when set on the record.
    _EXTRA_KEYS = (
        "detail", "elapsed_s", "breakdown", "app_id",
        "game_name", "source", "reviews_processed",
        "topics_found", "analysis_type", "cycle_id", "error",
    )

    def format(self, record: logging.LogRecord) -> str:
        payload: dict[str, Any] = {
            "ts": self.formatTime(record, self.datefmt),
            "level": record.levelname,
            "event": getattr(record, "event", record.getMessage()),
        }

        for field in self._EXTRA_KEYS:
            value = getattr(record, field, None)
            if value is not None:
                payload[field] = value

        # Fall back to the module-level cycle id when the record carries none.
        if payload.get("cycle_id") is None:
            current = get_cycle_id()
            if current:
                payload["cycle_id"] = current

        # Strip any remaining None values before serializing.
        payload = {key: val for key, val in payload.items() if val is not None}

        # default=str stringifies non-JSON types (e.g. datetimes) rather than raising.
        return json.dumps(payload, default=str, ensure_ascii=False)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def setup_structured_logger(name: str) -> logging.Logger:
    """
    Create a rotating JSON-line file logger.

    Tries settings.worker_log_dir first, falls back to
    settings.worker_log_fallback_dir if the primary is not writable.

    Args:
        name: Logger name and file prefix (e.g. "worker" or "live").

    Returns:
        Configured logger instance.
    """
    structured = logging.getLogger(f"structured.{name}")
    structured.setLevel(logging.INFO)

    # Re-initialization guard: keep existing handlers instead of duplicating them.
    if structured.handlers:
        return structured

    target_dir = _get_writable_log_dir()
    file_handler = logging.handlers.RotatingFileHandler(
        os.path.join(target_dir, f"{name}.jsonl"),
        maxBytes=settings.worker_log_max_bytes,
        backupCount=settings.worker_log_backup_count,
        encoding="utf-8",
    )
    file_handler.setFormatter(JsonLineFormatter())
    structured.addHandler(file_handler)

    # Publish as the module-level default so log_structured() can find it.
    set_structured_logger(structured)

    return structured
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class TimingContext:
    """Sync context manager that measures wall-clock time via time.monotonic()."""

    def __init__(self) -> None:
        # Elapsed seconds (3 decimal places); 0.0 until the block completes.
        self.elapsed_s: float = 0.0
        self._t0: float = 0.0

    def __enter__(self) -> "TimingContext":
        self._t0 = time.monotonic()
        return self

    def __exit__(self, *exc: Any) -> None:
        self.elapsed_s = round(time.monotonic() - self._t0, 3)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class AsyncTimingContext:
    """Async context manager that measures wall-clock time via time.monotonic()."""

    def __init__(self) -> None:
        # Elapsed seconds (3 decimal places); 0.0 until the block completes.
        self.elapsed_s: float = 0.0
        self._t0: float = 0.0

    async def __aenter__(self) -> "AsyncTimingContext":
        self._t0 = time.monotonic()
        return self

    async def __aexit__(self, *exc: Any) -> None:
        self.elapsed_s = round(time.monotonic() - self._t0, 3)
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def read_log_tail(
|
| 155 |
+
path: str,
|
| 156 |
+
lines: int = 100,
|
| 157 |
+
level: str | None = None,
|
| 158 |
+
event: str | None = None,
|
| 159 |
+
) -> list[dict[str, Any]]:
|
| 160 |
+
"""
|
| 161 |
+
Read last N JSON lines from a log file, with optional filtering.
|
| 162 |
+
|
| 163 |
+
Args:
|
| 164 |
+
path: Path to .jsonl log file.
|
| 165 |
+
lines: Max number of lines to return.
|
| 166 |
+
level: Filter by log level (e.g. "ERROR").
|
| 167 |
+
event: Filter by event name substring.
|
| 168 |
+
|
| 169 |
+
Returns:
|
| 170 |
+
List of parsed JSON dicts, newest last.
|
| 171 |
+
"""
|
| 172 |
+
if not os.path.exists(path):
|
| 173 |
+
return []
|
| 174 |
+
|
| 175 |
+
# Read all lines, take last N (simple approach for small-ish files)
|
| 176 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 177 |
+
all_lines = f.readlines()
|
| 178 |
+
|
| 179 |
+
# Parse from the end, collect up to `lines` matching entries
|
| 180 |
+
results: list[dict[str, Any]] = []
|
| 181 |
+
for raw in reversed(all_lines):
|
| 182 |
+
raw = raw.strip()
|
| 183 |
+
if not raw:
|
| 184 |
+
continue
|
| 185 |
+
try:
|
| 186 |
+
entry = json.loads(raw)
|
| 187 |
+
except json.JSONDecodeError:
|
| 188 |
+
continue
|
| 189 |
+
|
| 190 |
+
if level and entry.get("level") != level:
|
| 191 |
+
continue
|
| 192 |
+
if event and event not in entry.get("event", ""):
|
| 193 |
+
continue
|
| 194 |
+
|
| 195 |
+
results.append(entry)
|
| 196 |
+
if len(results) >= lines:
|
| 197 |
+
break
|
| 198 |
+
|
| 199 |
+
results.reverse() # Restore chronological order
|
| 200 |
+
return results
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def resolve_log_path(file_key: str, whitelist: dict[str, str]) -> str | None:
    """
    Resolve a whitelisted log file key to its absolute path.

    Returns the expected path if the key is in the whitelist, None otherwise.
    The file may not exist yet (read_log_tail handles that gracefully).

    Args:
        file_key: Logical name for the log file (e.g. "live", "errors").
        whitelist: Mapping of allowed keys to filenames for this process.

    Returns:
        Absolute path to the log file, or None if key is not whitelisted.
    """
    mapped = whitelist.get(file_key)
    if not mapped:
        return None

    # Prefer the primary directory when it exists; otherwise use the fallback.
    if os.path.isdir(settings.worker_log_dir):
        return os.path.join(settings.worker_log_dir, mapped)
    return os.path.join(settings.worker_log_fallback_dir, mapped)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def setup_app_logging() -> None:
    """
    Set up application-wide file logging handlers. Idempotent.

    Creates:
        - errors.log: WARNING+ from all loggers (attached to the root logger)
        - nlp_debug.log: DEBUG-only NLP trace from app.services.nlp_service

    Call once during app lifespan startup, after setup_structured_logger().
    """
    global _app_logging_initialized
    if _app_logging_initialized:
        return
    _app_logging_initialized = True

    log_dir = _get_writable_log_dir()

    def _rotating(filename: str, max_bytes: int) -> logging.handlers.RotatingFileHandler:
        # Shared construction for the rotating file handlers below.
        return logging.handlers.RotatingFileHandler(
            os.path.join(log_dir, filename),
            maxBytes=max_bytes,
            backupCount=settings.worker_log_backup_count,
            encoding="utf-8",
        )

    # 1. errors.log — WARNING+ attached to the root logger, so records from
    #    every logger reach it via propagation.
    errors_handler = _rotating("errors.log", settings.errors_log_max_bytes)
    errors_handler.setLevel(logging.WARNING)
    errors_handler.setFormatter(JsonLineFormatter())
    logging.getLogger().addHandler(errors_handler)

    # 2. nlp_debug.log — DEBUG-only NLP trace (Dedup/Cache messages).
    nlp_handler = _rotating("nlp_debug.log", settings.nlp_debug_log_max_bytes)
    nlp_handler.setLevel(logging.DEBUG)
    nlp_handler.addFilter(DebugOnlyFilter())
    nlp_handler.setFormatter(JsonLineFormatter())

    nlp_logger = logging.getLogger("app.services.nlp_service")
    nlp_logger.setLevel(logging.DEBUG)
    nlp_logger.addHandler(nlp_handler)

    # 3. Optional: mirror the NLP debug stream to stdout.
    if settings.nlp_verbose_logging:
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.DEBUG)
        stream_handler.setFormatter(logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        ))
        nlp_logger.addHandler(stream_handler)
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def get_structured_logger() -> logging.Logger | None:
    """Get the module-level structured logger (if initialized)."""
    return _structured_logger


def set_structured_logger(logger: logging.Logger) -> None:
    """Set the module-level structured logger."""
    global _structured_logger
    _structured_logger = logger


def get_cycle_id() -> str | None:
    """Get the current worker cycle ID."""
    # Read by JsonLineFormatter to stamp entries that lack an explicit cycle_id.
    return _cycle_id


def set_cycle_id(cycle_id: str | None) -> None:
    """Set the current worker cycle ID."""
    # Pass None to clear the cycle id between worker cycles.
    global _cycle_id
    _cycle_id = cycle_id
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def log_structured(
    event: str,
    level: int = logging.INFO,
    **kwargs: Any,
) -> None:
    """
    Emit a structured log entry via the module-level logger.

    No-op if no structured logger has been initialized (e.g. in tests).
    """
    target = get_structured_logger()
    if target is None:
        return
    # The event name doubles as the message; extra fields are picked up
    # by JsonLineFormatter via record attributes.
    target.log(level, event, extra={"event": event, **kwargs})
|
backend/app/db/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Moduł bazy danych."""
|
backend/app/db/mongodb.py
ADDED
|
@@ -0,0 +1,1152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Moduł połączenia z bazą danych MongoDB.
|
| 3 |
+
|
| 4 |
+
Wykorzystuje Motor (async driver) do asynchronicznej komunikacji z MongoDB.
|
| 5 |
+
Implementuje cache wyników analizy z TTL 24h.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import logging
|
| 10 |
+
import re
|
| 11 |
+
from datetime import datetime, timedelta, timezone
|
| 12 |
+
from typing import Any
|
| 13 |
+
|
| 14 |
+
from bson.codec_options import CodecOptions
|
| 15 |
+
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
|
| 16 |
+
from pymongo import ASCENDING, DESCENDING, UpdateOne
|
| 17 |
+
from pymongo.errors import (
|
| 18 |
+
BulkWriteError,
|
| 19 |
+
ConnectionFailure,
|
| 20 |
+
OperationFailure,
|
| 21 |
+
PyMongoError,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
from app.core.config import settings
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class MongoDB:
|
| 30 |
+
"""
|
| 31 |
+
Klasa zarządzająca połączeniem z MongoDB.
|
| 32 |
+
|
| 33 |
+
Implementuje wzorzec Singleton poprzez globalną instancję.
|
| 34 |
+
Obsługuje cache wyników analizy z automatyczną walidacją TTL.
|
| 35 |
+
Przechowuje listę gier Steam do autouzupełniania.
|
| 36 |
+
|
| 37 |
+
Attributes:
|
| 38 |
+
client: Klient MongoDB (Motor).
|
| 39 |
+
db: Referencja do bazy danych.
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
    # MongoDB collection names used by this client.
    COLLECTION_ANALYSES = "analyses"
    COLLECTION_GAMES = "games"
    COLLECTION_STEAM_ERRORS = "steam_errors"
    COLLECTION_REFRESH_SCHEDULES = "refresh_schedules"
|
| 46 |
+
|
| 47 |
+
    def __init__(self) -> None:
        """Initialize the instance without an active connection."""
        # Both set by connect(); None until then.
        self.client: AsyncIOMotorClient | None = None  # type: ignore
        self.db: AsyncIOMotorDatabase | None = None  # type: ignore
|
| 51 |
+
|
| 52 |
+
async def connect(self, max_retries: int = 3) -> None:
|
| 53 |
+
"""
|
| 54 |
+
Nawiązuje połączenie z MongoDB z exponential backoff.
|
| 55 |
+
|
| 56 |
+
Tworzy indeksy dla optymalnej wydajności zapytań.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
max_retries: Maksymalna liczba prób połączenia.
|
| 60 |
+
|
| 61 |
+
Raises:
|
| 62 |
+
ConnectionError: Gdy nie można połączyć się z bazą po wszystkich próbach.
|
| 63 |
+
"""
|
| 64 |
+
for attempt in range(1, max_retries + 1):
|
| 65 |
+
try:
|
| 66 |
+
self.client = AsyncIOMotorClient(settings.mongodb_url, tz_aware=True)
|
| 67 |
+
codec_options: CodecOptions = CodecOptions(tz_aware=True)
|
| 68 |
+
self.db = self.client.get_database(
|
| 69 |
+
settings.mongodb_db_name, codec_options=codec_options
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
# Weryfikacja połączenia
|
| 73 |
+
await self.client.admin.command("ping")
|
| 74 |
+
logger.info(f"Połączono z MongoDB: {settings.mongodb_db_name}")
|
| 75 |
+
|
| 76 |
+
# Utwórz indeksy
|
| 77 |
+
await self._create_indexes()
|
| 78 |
+
return
|
| 79 |
+
|
| 80 |
+
except (ConnectionFailure, PyMongoError) as e:
|
| 81 |
+
if attempt < max_retries:
|
| 82 |
+
delay = 2 ** (attempt - 1) # 1s, 2s, 4s
|
| 83 |
+
logger.warning(
|
| 84 |
+
f"MongoDB connection attempt {attempt}/{max_retries} failed: {e}. "
|
| 85 |
+
f"Retrying in {delay}s..."
|
| 86 |
+
)
|
| 87 |
+
await asyncio.sleep(delay)
|
| 88 |
+
else:
|
| 89 |
+
logger.error(f"MongoDB connection failed after {max_retries} attempts: {e}")
|
| 90 |
+
raise ConnectionError(
|
| 91 |
+
f"Nie można połączyć się z MongoDB po {max_retries} próbach: {e}"
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
async def _create_indexes(self) -> None:
    """Create the indexes for every collection used by this wrapper.

    Safe to call repeatedly: create_index is idempotent, and the one-time
    TTL-index migration tolerates the old index being absent.
    """
    if self.db is None:
        return

    # Indexes for analyses
    analyses = self.db[self.COLLECTION_ANALYSES]
    await analyses.create_index("game_id", unique=True)

    # Migrate from old global TTL index (cached_at) to per-document TTL (expires_at)
    try:
        existing_indexes = await analyses.index_information()
        for idx_name, idx_info in existing_indexes.items():
            # Only a TTL index keyed on cached_at is the legacy one we replace.
            if idx_info.get("expireAfterSeconds") is not None and "cached_at" in str(idx_info.get("key")):
                await analyses.drop_index(idx_name)
                logger.info(f"Dropped old TTL index: {idx_name}")
                break
    except OperationFailure:
        pass  # Old index may not exist

    # expireAfterSeconds=0 makes MongoDB honor each document's expires_at.
    await analyses.create_index("expires_at", expireAfterSeconds=0)

    # Indexes for the games list
    games = self.db[self.COLLECTION_GAMES]
    await games.create_index("appid", unique=True)
    # Index supporting case-insensitive regex search
    await games.create_index("name_lower")
    await games.create_index("name_cn")
    # Sparse index for the checked flag (saves space, speeds up the query)
    await games.create_index("cn_name_checked", sparse=True)
    await games.create_index("parent_appid", sparse=True)

    # Compound index for sorting games by review count (worker game sync)
    await games.create_index(
        [("positive", DESCENDING), ("negative", DESCENDING)],
        sparse=True,
    )
    # Covers the search path: name prefix + type filter + review-count sort.
    await games.create_index(
        [
            ("name_lower", ASCENDING),
            ("app_type", ASCENDING),
            ("positive", DESCENDING),
            ("negative", DESCENDING),
        ]
    )
    await games.create_index("is_priority", sparse=True)

    # Indexes for the Steam API error cache
    steam_errors = self.db[self.COLLECTION_STEAM_ERRORS]
    await steam_errors.create_index("app_id", unique=True)
    await steam_errors.create_index("expires_at", expireAfterSeconds=0)

    # Indexes for refresh schedules (worker pre-cache)
    schedules = self.db[self.COLLECTION_REFRESH_SCHEDULES]
    await schedules.create_index("app_id", unique=True)
    await schedules.create_index("status")

    logger.debug("Utworzono indeksy MongoDB")
|
| 152 |
+
|
| 153 |
+
async def disconnect(self) -> None:
    """Close the MongoDB connection if one was established."""
    if not self.client:
        return
    self.client.close()
    logger.info("Rozłączono z MongoDB")
|
| 158 |
+
|
| 159 |
+
def _is_document_expired(self, document: dict[str, Any]) -> bool:
    """Check if a cache document is expired using expires_at or cached_at fallback.

    With tz_aware=True on the Motor client, datetimes coming from MongoDB
    are timezone-aware. Timestamps stored as ISO strings, however, may
    parse to *naive* datetimes; those are interpreted as UTC so the
    comparison with the aware `now` cannot raise TypeError.
    """
    now = datetime.now(timezone.utc)

    # New-format: per-document expires_at
    expires_at = document.get("expires_at")
    if expires_at:
        if isinstance(expires_at, str):
            expires_at = datetime.fromisoformat(expires_at)
        if expires_at.tzinfo is None:
            # Naive value (legacy string without offset) — assume UTC.
            expires_at = expires_at.replace(tzinfo=timezone.utc)
        return now >= expires_at

    # Old-format fallback: cached_at + default TTL
    cached_at = document.get("cached_at")
    if cached_at:
        if isinstance(cached_at, str):
            cached_at = datetime.fromisoformat(cached_at)
        if cached_at.tzinfo is None:
            cached_at = cached_at.replace(tzinfo=timezone.utc)
        ttl_hours = document.get("ttl_hours", settings.cache_ttl_hours)
        return now - cached_at > timedelta(hours=ttl_hours)

    return True  # No timestamp info = treat as expired
|
| 183 |
+
|
| 184 |
+
async def get_cached_analysis_full(self, game_id: str) -> dict[str, Any] | None:
    """Return the full cache document (review IDs, TTL info).

    None is returned when the document is missing, expired, or the read
    fails.
    """
    if self.db is None:
        return None

    try:
        doc = await self.db[self.COLLECTION_ANALYSES].find_one({"game_id": game_id})
        if not doc:
            return None

        if self._is_document_expired(doc):
            logger.info(f"Cache expired for game {game_id}")
            return None
    except PyMongoError as e:
        logger.error(f"Error reading cache: {e}")
        return None

    doc.pop("_id", None)
    return doc
|
| 208 |
+
|
| 209 |
+
async def get_stale_analysis(self, game_id: str) -> dict[str, Any] | None:
    """Return the cache document even when it is expired.

    The incremental path uses this to recover previously analyzed review
    IDs; only a completely missing document yields None.
    """
    # TTL is deliberately ignored — delegate to the raw read path.
    return await self.get_analysis(game_id)
|
| 215 |
+
|
| 216 |
+
async def get_analysis(self, game_id: str) -> dict[str, Any] | None:
    """Return an analysis document regardless of TTL.

    Product freshness is evaluated outside MongoDB, so this method is the
    canonical read path for "show stale result + refresh" behavior.
    """
    if self.db is None:
        return None

    try:
        doc = await self.db[self.COLLECTION_ANALYSES].find_one({"game_id": game_id})
    except PyMongoError as e:
        logger.error(f"Error reading stale cache: {e}")
        return None

    if not doc:
        return None
    doc.pop("_id", None)
    return doc
|
| 239 |
+
|
| 240 |
+
async def get_cached_analysis(self, game_id: str) -> dict[str, Any] | None:
    """Return cached analysis results, or None when expired/missing.

    Backward-compatible wrapper around get_cached_analysis_full.
    """
    doc = await self.get_cached_analysis_full(game_id)
    if doc is None:
        return None

    results = doc.get("results")
    # Legacy documents kept cached_at only at the top level; surface it on
    # the results payload without mutating the stored dict.
    needs_backfill = (
        isinstance(results, dict)
        and results.get("cached_at") is None
        and doc.get("cached_at") is not None
    )
    if needs_backfill:
        results = {**results, "cached_at": doc["cached_at"]}
    return results
|
| 252 |
+
|
| 253 |
+
async def save_analysis(
    self,
    game_id: str,
    results: dict[str, Any],
    analyzed_review_ids: list[str] | None = None,
    latest_review_timestamp: int = 0,
    ttl_hours: int | None = None,
    analyzed_at: datetime | None = None,
) -> None:
    """
    Save analysis results to cache with per-document TTL.

    Purges review IDs to keep only the most recent ones (space efficiency).

    Args:
        game_id: Steam game identifier.
        results: Analysis payload to persist.
        analyzed_review_ids: IDs of the reviews covered by this analysis.
        latest_review_timestamp: Unix timestamp of the newest analyzed review.
        ttl_hours: Cache lifetime; defaults to settings.cache_ttl_hours.
        analyzed_at: Explicit analysis timestamp; when omitted it is derived
            from results["analysis_date"] / results["cached_at"], else now.
    """
    if self.db is None:
        logger.warning("Brak połączenia z MongoDB - nie zapisano cache")
        return

    collection = self.db[self.COLLECTION_ANALYSES]

    # `ttl_hours or default` would silently discard an explicit 0, so we
    # compare against None instead.
    effective_ttl = ttl_hours if ttl_hours is not None else settings.cache_ttl_hours
    now = datetime.now(timezone.utc)
    analysis_date = analyzed_at
    if analysis_date is None:
        raw_value = results.get("analysis_date") or results.get("cached_at")
        if isinstance(raw_value, str):
            analysis_date = datetime.fromisoformat(raw_value)
        elif isinstance(raw_value, datetime):
            analysis_date = raw_value
    if analysis_date is None:
        analysis_date = now

    if results.get("analysis_date") is None:
        results = {**results, "analysis_date": analysis_date}

    # Purge old IDs — keep only the most recent N
    if analyzed_review_ids:
        analyzed_review_ids = analyzed_review_ids[-settings.incremental_max_stored_ids:]

    document: dict[str, Any] = {
        "game_id": game_id,
        "results": results,
        "analyzed_review_ids": analyzed_review_ids or [],
        "latest_review_timestamp": latest_review_timestamp,
        "cached_at": now,
        "analyzed_at": analysis_date,
        "ttl_hours": effective_ttl,
        "expires_at": now + timedelta(hours=effective_ttl),
    }

    try:
        await collection.update_one(
            {"game_id": game_id},
            {"$set": document},
            upsert=True,
        )
        logger.info(f"Saved cache for game {game_id} (TTL: {effective_ttl}h)")

    except PyMongoError as e:
        logger.error(f"Error saving cache: {e}")
|
| 312 |
+
|
| 313 |
+
async def delete_cached_analysis(self, game_id: str) -> bool:
    """Delete the cached analysis for a game.

    Args:
        game_id: Steam game identifier.

    Returns:
        True when a document was removed, False otherwise.
    """
    if self.db is None:
        return False

    try:
        outcome = await self.db[self.COLLECTION_ANALYSES].delete_one(
            {"game_id": game_id}
        )
    except PyMongoError as e:
        logger.error(f"Błąd usuwania cache: {e}")
        return False
    return outcome.deleted_count > 0
|
| 334 |
+
|
| 335 |
+
# ========== Steam API Error Cache ==========
|
| 336 |
+
|
| 337 |
+
async def get_steam_error(self, app_id: str) -> dict[str, Any] | None:
    """Look up a cached Steam API error for this app_id.

    Returns:
        Dict with app_id, status_code, expires_at — or None.
        NOTE(review): expiry is enforced only by the TTL index, which can
        lag; entries just past expires_at may still be returned.
    """
    if self.db is None:
        return None

    try:
        record = await self.db[self.COLLECTION_STEAM_ERRORS].find_one(
            {"app_id": app_id}
        )
    except PyMongoError as e:
        logger.error(f"Błąd odczytu steam error cache: {e}")
        return None

    if not record:
        return None
    record.pop("_id", None)
    return record
|
| 360 |
+
|
| 361 |
+
async def cache_steam_error(
    self, app_id: str, status_code: int, ttl_seconds: int
) -> None:
    """
    Cache a Steam API error with an automatic TTL.

    The MongoDB TTL index removes the document once expires_at passes.

    Args:
        app_id: Steam app identifier.
        status_code: HTTP status returned by the Steam API.
        ttl_seconds: Lifetime of the cached error entry.
    """
    if self.db is None:
        return

    collection = self.db[self.COLLECTION_STEAM_ERRORS]

    # Capture one timestamp so cached_at and expires_at are exactly
    # ttl_seconds apart (two now() calls drift by microseconds).
    now = datetime.now(timezone.utc)
    document = {
        "app_id": app_id,
        "status_code": status_code,
        "cached_at": now,
        "expires_at": now + timedelta(seconds=ttl_seconds),
    }

    try:
        await collection.update_one(
            {"app_id": app_id},
            {"$set": document},
            upsert=True,
        )
        logger.info(
            f"Cached Steam error {status_code} for app {app_id} (TTL: {ttl_seconds}s)"
        )
    except PyMongoError as e:
        logger.error(f"Błąd zapisu steam error cache: {e}")
|
| 392 |
+
|
| 393 |
+
# ========== Metody dla listy gier (autouzupełnianie) ==========
|
| 394 |
+
|
| 395 |
+
async def get_games_count(self) -> int:
    """Return how many games are stored in the database."""
    if self.db is None:
        return 0
    return await self.db[self.COLLECTION_GAMES].count_documents({})
|
| 402 |
+
|
| 403 |
+
async def save_games_batch(self, games: list[dict[str, str]]) -> int:
    """
    Save a batch of games to the database (bulk insert).

    Args:
        games: List of dicts with keys 'appid', 'name' and optionally
            'developer', 'publisher'.

    Returns:
        Number of games inserted.
    """
    if self.db is None or not games:
        return 0

    collection = self.db[self.COLLECTION_GAMES]

    # Add name_lower for case-insensitive search
    documents = []
    for game in games:
        if not game.get("name"):
            continue

        doc = {
            "appid": game["appid"],
            "name": game["name"],
            "name_lower": game["name"].lower(),
        }

        # Optional fields
        if game.get("developer"):
            doc["developer"] = game["developer"]
        if game.get("publisher"):
            doc["publisher"] = game["publisher"]

        documents.append(doc)

    # insert_many raises InvalidOperation on an empty list, which can
    # happen when every entry was skipped for having no name.
    if not documents:
        return 0

    try:
        # ordered=False keeps going past duplicate-key errors
        result = await collection.insert_many(documents, ordered=False)
        return len(result.inserted_ids)
    except BulkWriteError as e:
        # Duplicates are expected with ordered=False — count successful inserts
        inserted = e.details.get("nInserted", 0)
        logger.debug(f"Pominięto duplikaty podczas zapisu gier ({inserted} inserted)")
        return inserted
    except PyMongoError as e:
        logger.error(f"Błąd zapisu gier: {e}")
        return 0
|
| 450 |
+
|
| 451 |
+
async def clear_games(self) -> None:
    """Remove every game document from the database."""
    if self.db is None:
        return
    await self.db[self.COLLECTION_GAMES].delete_many({})
    logger.info("Usunięto wszystkie gry z bazy")
|
| 459 |
+
|
| 460 |
+
async def upsert_game(self, game_data: dict[str, Any]) -> None:
    """Insert or update a single game document.

    Used mainly by the Fallback Search mechanism.
    """
    if self.db is None:
        return

    collection = self.db[self.COLLECTION_GAMES]
    appid = str(game_data["appid"])

    # Base document with the lowercase name for case-insensitive search.
    update_doc: dict[str, Any] = {
        "appid": appid,
        "name": game_data["name"],
        "name_lower": game_data["name"].lower(),
    }

    # A found CN name implies the check happened; a bare flag also counts.
    if game_data.get("name_cn"):
        update_doc["name_cn"] = game_data["name_cn"]
        update_doc["cn_name_checked"] = True
    elif game_data.get("cn_name_checked"):
        update_doc["cn_name_checked"] = True

    for optional in ("header_image", "total_reviews"):
        if game_data.get(optional) is not None:
            update_doc[optional] = game_data[optional]

    # Worker-supplied fields
    worker_fields = (
        "positive", "negative", "tags", "genre", "ccu",
        "last_game_update_at", "synced_at", "developer", "publisher",
        "app_type", "parent_appid", "dlc_checked_at",
    )
    for field in worker_fields:
        if game_data.get(field) is not None:
            update_doc[field] = game_data[field]

    try:
        await collection.update_one(
            {"appid": appid},
            {"$set": update_doc},
            upsert=True
        )
        logger.debug(f"Zsynchronizowano grę {appid} w MongoDB")
    except PyMongoError as e:
        logger.error(f"Błąd upsert gry {appid}: {e}")
|
| 507 |
+
|
| 508 |
+
async def search_games(self, query: str, limit: int = 10) -> list[dict[str, Any]]:
    """
    Search games by name (EN or CN).

    Substring matching is case-insensitive; results are ranked so exact
    matches come before prefix matches, full games before DLC/demos, and
    higher-reviewed titles first within a rank.

    Args:
        query: Text to search for (minimum 2 characters after stripping).
        limit: Maximum number of results.

    Returns:
        Games matching the query (appid, name, name_cn, developer,
        publisher, app_type, parent_appid).
    """
    normalized_query = query.strip()
    if self.db is None or not normalized_query or len(normalized_query) < 2:
        return []

    collection = self.db[self.COLLECTION_GAMES]

    try:
        query_lower = normalized_query.lower()
        # Escape regex metacharacters so user input matches literally.
        name_pattern = re.escape(query_lower)
        name_prefix_pattern = f"^{name_pattern}"
        name_exact_pattern = f"^{name_pattern}$"
        cn_pattern = re.escape(normalized_query)
        cn_prefix_pattern = f"^{cn_pattern}"
        cn_exact_pattern = f"^{cn_pattern}$"

        # Substring match on either the lowercase EN name or the CN name.
        match_filter: dict[str, Any] = {
            "$or": [
                {"name_lower": {"$regex": name_pattern}},
                {"name_cn": {"$regex": cn_pattern, "$options": "i"}},
            ]
        }
        if not settings.dlc_visible_in_search:
            match_filter["app_type"] = {"$ne": "dlc"}

        pipeline = [
            {"$match": match_filter},
            {
                "$addFields": {
                    # match_rank: 0 = exact, 1 = prefix, 2 = other substring.
                    "match_rank": {
                        "$switch": {
                            "branches": [
                                {
                                    "case": {
                                        "$or": [
                                            {
                                                "$regexMatch": {
                                                    "input": {"$ifNull": ["$name_lower", ""]},
                                                    "regex": name_exact_pattern,
                                                }
                                            },
                                            {
                                                "$regexMatch": {
                                                    "input": {"$ifNull": ["$name_cn", ""]},
                                                    "regex": cn_exact_pattern,
                                                    "options": "i",
                                                }
                                            },
                                        ]
                                    },
                                    "then": 0,
                                },
                                {
                                    "case": {
                                        "$or": [
                                            {
                                                "$regexMatch": {
                                                    "input": {"$ifNull": ["$name_lower", ""]},
                                                    "regex": name_prefix_pattern,
                                                }
                                            },
                                            {
                                                "$regexMatch": {
                                                    "input": {"$ifNull": ["$name_cn", ""]},
                                                    "regex": cn_prefix_pattern,
                                                    "options": "i",
                                                }
                                            },
                                        ]
                                    },
                                    "then": 1,
                                },
                            ],
                            "default": 2,
                        }
                    },
                    # type_rank: games/unknown first, then DLC, then demos.
                    "type_rank": {
                        "$switch": {
                            "branches": [
                                {
                                    "case": {
                                        "$in": [
                                            {"$ifNull": ["$app_type", "unknown"]},
                                            ["game", "unknown"],
                                        ]
                                    },
                                    "then": 0,
                                },
                                {"case": {"$eq": ["$app_type", "dlc"]}, "then": 1},
                                {"case": {"$eq": ["$app_type", "demo"]}, "then": 2},
                            ],
                            "default": 1,
                        }
                    },
                    # Total review count used as a popularity tiebreaker.
                    "review_count": {
                        "$add": [
                            {"$ifNull": ["$positive", 0]},
                            {"$ifNull": ["$negative", 0]},
                        ]
                    },
                }
            },
            {
                "$sort": {
                    "match_rank": 1,
                    "type_rank": 1,
                    "review_count": -1,
                    "name": 1,
                }
            },
            {"$limit": limit},
            {
                "$project": {
                    "_id": 0,
                    "appid": 1,
                    "name": 1,
                    "name_cn": 1,
                    "developer": 1,
                    "publisher": 1,
                    "app_type": 1,
                    "parent_appid": 1,
                }
            },
        ]

        cursor = collection.aggregate(pipeline)
        results = await cursor.to_list(length=limit)
        return results

    except PyMongoError as e:
        logger.error(f"Błąd wyszukiwania gier: {e}")
        return []
|
| 652 |
+
|
| 653 |
+
|
| 654 |
+
async def get_game_update_date(self, app_id: str) -> datetime | None:
    """Return the last game-update timestamp, or None when unknown."""
    if self.db is None:
        return None

    try:
        doc = await self.db[self.COLLECTION_GAMES].find_one(
            {"appid": str(app_id)},
            {"_id": 0, "last_game_update_at": 1},
        )
    except PyMongoError as e:
        logger.error(f"Error getting game update date for {app_id}: {e}")
        return None

    if not doc:
        return None
    value = doc.get("last_game_update_at")
    # Guard against legacy non-datetime values in the field.
    return value if isinstance(value, datetime) else None
|
| 674 |
+
|
| 675 |
+
async def get_games_without_cn_name(self, limit: int = 200) -> list[dict[str, Any]]:
    """Return games that have no Chinese name and were never checked.

    Ordered by positive review count (descending) so the most popular
    titles get enriched first.
    """
    if self.db is None:
        return []

    stages = [
        {"$match": {
            "name_cn": {"$exists": False},
            "cn_name_checked": {"$ne": True},  # skip games already checked
        }},
        # Games missing 'positive' sort to the end (sparse index handling).
        {"$sort": {"positive": -1}},
        {"$limit": limit},
        {"$project": {"_id": 0, "appid": 1, "name": 1}},
    ]
    try:
        cursor = self.db[self.COLLECTION_GAMES].aggregate(stages)
        return await cursor.to_list(length=limit)
    except PyMongoError as e:
        logger.error(f"Error getting games without CN name: {e}")
        return []
|
| 700 |
+
|
| 701 |
+
async def mark_cn_name_checked(self, app_id: str, name_cn: str | None = None) -> None:
    """Flag a game as checked for a Chinese name, optionally storing it."""
    if self.db is None:
        return

    fields: dict[str, Any] = {"cn_name_checked": True}
    if name_cn:
        fields["name_cn"] = name_cn

    try:
        await self.db[self.COLLECTION_GAMES].update_one(
            {"appid": str(app_id)},
            {"$set": fields}
        )
    except PyMongoError as e:
        logger.error(f"Error marking CN name checked for {app_id}: {e}")
|
| 721 |
+
|
| 722 |
+
async def get_games_missing_app_type(self, limit: int = 200) -> list[dict[str, Any]]:
    """
    Return high-signal games that still need Steam Store type enrichment.

    We prioritize already-priority games first, then any app with enough reviews
    to qualify a DLC for worker-managed analysis.

    Args:
        limit: Maximum number of games to return.

    Returns:
        List of {"appid", "name"} dicts, priority games first, then by
        descending total review count.
    """
    if self.db is None:
        return []

    collection = self.db[self.COLLECTION_GAMES]
    try:
        pipeline = [
            {
                # Derived total used both for filtering and sorting below.
                "$addFields": {
                    "total_reviews_sum": {
                        "$add": [
                            {"$ifNull": ["$positive", 0]},
                            {"$ifNull": ["$negative", 0]},
                        ]
                    }
                }
            },
            {
                "$match": {
                    # Only apps never enriched with Store type metadata.
                    "dlc_checked_at": {"$exists": False},
                    "$or": [
                        {"is_priority": True},
                        {
                            "total_reviews_sum": {
                                "$gte": settings.dlc_min_reviews_for_analysis
                            }
                        },
                    ],
                }
            },
            {"$sort": {"is_priority": -1, "total_reviews_sum": -1}},
            {"$limit": limit},
            {"$project": {"_id": 0, "appid": 1, "name": 1}},
        ]
        cursor = collection.aggregate(pipeline)
        return await cursor.to_list(length=limit)
    except PyMongoError as e:
        logger.error(f"Error getting games missing app type: {e}")
        return []
|
| 767 |
+
|
| 768 |
+
async def mark_app_type_checked(
    self,
    app_id: str,
    *,
    app_type: str,
    parent_appid: str | None = None,
) -> None:
    """Persist Steam Store app type metadata.

    Args:
        app_id: Steam app identifier.
        app_type: Store-reported type (e.g. 'game', 'dlc', 'demo').
        parent_appid: Owning app for DLC; stored as None otherwise.
    """
    if self.db is None:
        return

    fields: dict[str, Any] = {
        "app_type": app_type,
        "parent_appid": str(parent_appid) if parent_appid else None,
        "dlc_checked_at": datetime.now(timezone.utc),
    }

    try:
        await self.db[self.COLLECTION_GAMES].update_one(
            {"appid": str(app_id)},
            {"$set": fields},
        )
    except PyMongoError as e:
        logger.error(f"Error marking app type checked for {app_id}: {e}")
|
| 793 |
+
|
| 794 |
+
# ========== Worker Methods ==========
|
| 795 |
+
|
| 796 |
+
async def upsert_games_batch(self, games: list[dict[str, Any]]) -> tuple[int, int]:
    """
    Bulk upsert games via UpdateOne operations.

    Returns:
        (upserted_count, modified_count)
    """
    if self.db is None or not games:
        return (0, 0)

    optional_fields = (
        "developer", "publisher", "positive", "negative",
        "tags", "genre", "ccu", "synced_at",
        "app_type", "parent_appid", "dlc_checked_at",
    )

    operations = []
    for game in games:
        appid = str(game.get("appid", ""))
        name = game.get("name", "")
        # Skip entries missing either identifier.
        if not (appid and name):
            continue

        update_doc: dict[str, Any] = {
            "appid": appid,
            "name": name,
            "name_lower": name.lower(),
        }
        update_doc.update(
            (field, game[field])
            for field in optional_fields
            if game.get(field) is not None
        )
        operations.append(
            UpdateOne({"appid": appid}, {"$set": update_doc}, upsert=True)
        )

    if not operations:
        return (0, 0)

    try:
        result = await self.db[self.COLLECTION_GAMES].bulk_write(
            operations, ordered=False
        )
        return (result.upserted_count, result.modified_count)
    except BulkWriteError as e:
        details = e.details or {}
        return (details.get("nUpserted", 0), details.get("nModified", 0))
    except PyMongoError as e:
        logger.error(f"Error in upsert_games_batch: {e}")
        return (0, 0)
|
| 844 |
+
|
| 845 |
+
async def get_top_games_by_reviews(self, limit: int = 500) -> list[dict[str, Any]]:
    """Top N games sorted by total review count (positive + negative) DESC."""
    if self.db is None:
        return []

    stages = [
        # Only games whose review counters were synced by the worker.
        {"$match": {"positive": {"$exists": True}, "negative": {"$exists": True}}},
        {"$addFields": {"total_reviews_sum": {"$add": ["$positive", "$negative"]}}},
        {"$sort": {"total_reviews_sum": -1}},
        {"$limit": limit},
        {"$project": {"_id": 0}},
    ]
    try:
        cursor = self.db[self.COLLECTION_GAMES].aggregate(stages)
        return await cursor.to_list(length=limit)
    except PyMongoError as e:
        logger.error(f"Error getting top games: {e}")
        return []
|
| 864 |
+
|
| 865 |
+
async def update_game_update_date(self, app_id: str, update_at: datetime) -> None:
    """Store the latest game update timestamp."""
    if self.db is None:
        return
    try:
        await self.db[self.COLLECTION_GAMES].update_one(
            {"appid": str(app_id)},
            {"$set": {"last_game_update_at": update_at}},
        )
    except PyMongoError as e:
        logger.error(f"Error updating game update date for {app_id}: {e}")
|
| 878 |
+
|
| 879 |
+
async def update_game_patch_date(self, app_id: str, patch_date: datetime) -> None:
    """Store the latest confirmed major-update timestamp."""
    if self.db is None:
        return
    try:
        await self.db[self.COLLECTION_GAMES].update_one(
            {"appid": str(app_id)},
            {"$set": {"current_patch_at": patch_date}},
        )
    except PyMongoError as e:
        logger.error(f"Error updating game patch date for {app_id}: {e}")
|
| 892 |
+
|
| 893 |
+
async def update_news_cursor(self, app_id: str, gid: str, date: datetime) -> None:
    """Store the latest seen news GID and its date as an incremental scan cursor."""
    if self.db is None:
        return
    cursor_fields = {"last_seen_news_gid": gid, "last_seen_news_at": date}
    try:
        await self.db[self.COLLECTION_GAMES].update_one(
            {"appid": str(app_id)},
            {"$set": cursor_fields},
        )
    except PyMongoError as e:
        logger.error(f"Error updating news cursor for {app_id}: {e}")
|
| 906 |
+
|
| 907 |
+
async def get_game_patch_date(self, app_id: str) -> datetime | None:
    """Get the latest confirmed major-update timestamp for a game."""
    if self.db is None:
        return None

    try:
        doc = await self.db[self.COLLECTION_GAMES].find_one(
            {"appid": str(app_id)},
            {"_id": 0, "current_patch_at": 1},
        )
    except PyMongoError as e:
        logger.error(f"Error getting game patch date for {app_id}: {e}")
        return None

    if not doc:
        return None
    value = doc.get("current_patch_at")
    # Guard against legacy non-datetime values in the field.
    return value if isinstance(value, datetime) else None
|
| 927 |
+
|
| 928 |
+
async def upsert_refresh_schedule(self, schedule: dict[str, Any]) -> None:
    """Create or replace a refresh schedule document.

    NOTE(review): `schedule` must contain an 'app_id' key; a missing key
    raises KeyError before any write is attempted.
    """
    if self.db is None:
        return
    try:
        await self.db[self.COLLECTION_REFRESH_SCHEDULES].update_one(
            {"app_id": schedule["app_id"]},
            {"$set": schedule},
            upsert=True,
        )
    except PyMongoError as e:
        logger.error(f"Error upserting refresh schedule for {schedule.get('app_id')}: {e}")
|
| 942 |
+
|
| 943 |
+
async def get_active_schedules(self) -> list[dict[str, Any]]:
    """All schedules with status: 'active'."""
    if self.db is None:
        return []
    try:
        cursor = self.db[self.COLLECTION_REFRESH_SCHEDULES].find(
            {"status": "active"}, {"_id": 0}
        )
        # Hard ceiling so a single read cannot balloon in memory.
        return await cursor.to_list(length=10000)
    except PyMongoError as e:
        logger.error(f"Error getting active schedules: {e}")
        return []
|
| 955 |
+
|
| 956 |
+
    async def has_due_refresh_schedule(self, app_id: str) -> bool:
        """True when an active schedule has at least one due, incomplete checkpoint.

        A checkpoint counts as due when its `due_at` is at or before the
        current UTC time and it has not been marked completed.
        """
        if self.db is None:
            return False

        collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
        now = datetime.now(timezone.utc)
        try:
            # $elemMatch requires a SINGLE checkpoint element to satisfy both
            # conditions (incomplete AND past due); separate top-level filters
            # could be satisfied by different array elements.
            document = await collection.find_one(
                {
                    "app_id": str(app_id),
                    "status": "active",
                    "checkpoints": {
                        "$elemMatch": {
                            "completed": False,
                            "due_at": {"$lte": now},
                        }
                    },
                },
                # Minimal projection — only existence of a match matters.
                {"_id": 0, "app_id": 1},
            )
            return document is not None
        except PyMongoError as e:
            # Fail closed: on DB errors report "nothing due".
            logger.error(f"Error checking due refresh schedule for {app_id}: {e}")
            return False
|
| 981 |
+
|
| 982 |
+
async def mark_checkpoint_completed(self, app_id: str, offset_hours: int) -> None:
|
| 983 |
+
"""Mark a specific checkpoint as completed using positional $ update."""
|
| 984 |
+
if self.db is None:
|
| 985 |
+
return
|
| 986 |
+
|
| 987 |
+
collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
|
| 988 |
+
try:
|
| 989 |
+
await collection.update_one(
|
| 990 |
+
{"app_id": str(app_id), "checkpoints.offset_hours": offset_hours},
|
| 991 |
+
{"$set": {"checkpoints.$.completed": True}},
|
| 992 |
+
)
|
| 993 |
+
except PyMongoError as e:
|
| 994 |
+
logger.error(f"Error marking checkpoint for {app_id}/{offset_hours}h: {e}")
|
| 995 |
+
|
| 996 |
+
async def complete_schedule(self, app_id: str) -> None:
|
| 997 |
+
"""Set schedule status to 'completed'."""
|
| 998 |
+
if self.db is None:
|
| 999 |
+
return
|
| 1000 |
+
|
| 1001 |
+
collection = self.db[self.COLLECTION_REFRESH_SCHEDULES]
|
| 1002 |
+
try:
|
| 1003 |
+
await collection.update_one(
|
| 1004 |
+
{"app_id": str(app_id)},
|
| 1005 |
+
{"$set": {"status": "completed"}},
|
| 1006 |
+
)
|
| 1007 |
+
except PyMongoError as e:
|
| 1008 |
+
logger.error(f"Error completing schedule for {app_id}: {e}")
|
| 1009 |
+
|
| 1010 |
+
# ========== Priority Games Methods ==========
|
| 1011 |
+
|
| 1012 |
+
async def get_priority_games(self) -> list[dict[str, Any]]:
|
| 1013 |
+
"""All games with is_priority == True, all fields except _id."""
|
| 1014 |
+
if self.db is None:
|
| 1015 |
+
return []
|
| 1016 |
+
|
| 1017 |
+
collection = self.db[self.COLLECTION_GAMES]
|
| 1018 |
+
try:
|
| 1019 |
+
cursor = collection.find({"is_priority": True}, {"_id": 0})
|
| 1020 |
+
return await cursor.to_list(length=10000)
|
| 1021 |
+
except PyMongoError as e:
|
| 1022 |
+
logger.error(f"Error getting priority games: {e}")
|
| 1023 |
+
return []
|
| 1024 |
+
|
| 1025 |
+
    async def get_priority_games_for_analysis(self) -> list[dict[str, Any]]:
        """
        Priority games eligible for worker-managed analysis.

        DLC stays linked to the priority universe via is_priority, but low-review DLC
        falls back to on-demand mode instead of occupying worker capacity.
        """
        if self.db is None:
            return []

        collection = self.db[self.COLLECTION_GAMES]
        if settings.dlc_worker_analysis_enabled:
            # DLC are admitted only when their combined review count
            # (positive + negative, missing fields treated as 0) meets the
            # configured minimum; non-DLC priority games always qualify.
            query: dict[str, Any] = {
                "is_priority": True,
                "$or": [
                    {"app_type": {"$ne": "dlc"}},
                    {
                        "$expr": {
                            "$gte": [
                                {
                                    "$add": [
                                        {"$ifNull": ["$positive", 0]},
                                        {"$ifNull": ["$negative", 0]},
                                    ]
                                },
                                settings.dlc_min_reviews_for_analysis,
                            ]
                        }
                    },
                ],
            }
        else:
            # DLC worker analysis disabled: exclude every DLC outright.
            query = {
                "is_priority": True,
                "app_type": {"$ne": "dlc"},
            }

        try:
            cursor = collection.find(query, {"_id": 0})
            return await cursor.to_list(length=10000)
        except PyMongoError as e:
            logger.error(f"Error getting priority games for analysis: {e}")
            return []
|
| 1068 |
+
|
| 1069 |
+
async def get_priority_game_ids(self) -> set[str]:
|
| 1070 |
+
"""Lightweight set of appids for is_priority == True games."""
|
| 1071 |
+
if self.db is None:
|
| 1072 |
+
return set()
|
| 1073 |
+
|
| 1074 |
+
collection = self.db[self.COLLECTION_GAMES]
|
| 1075 |
+
try:
|
| 1076 |
+
cursor = collection.find({"is_priority": True}, {"_id": 0, "appid": 1})
|
| 1077 |
+
docs = await cursor.to_list(length=10000)
|
| 1078 |
+
return {str(d["appid"]) for d in docs if d.get("appid")}
|
| 1079 |
+
except PyMongoError as e:
|
| 1080 |
+
logger.error(f"Error getting priority game ids: {e}")
|
| 1081 |
+
return set()
|
| 1082 |
+
|
| 1083 |
+
async def get_priority_game_ids_for_analysis(self) -> set[str]:
|
| 1084 |
+
"""App IDs that should behave as worker-managed in runtime decisions."""
|
| 1085 |
+
docs = await self.get_priority_games_for_analysis()
|
| 1086 |
+
return {str(d["appid"]) for d in docs if d.get("appid")}
|
| 1087 |
+
|
| 1088 |
+
async def get_dlcs_by_parent_appid(self, parent_appid: str) -> list[dict[str, Any]]:
|
| 1089 |
+
"""Return DLC documents linked to a given base game."""
|
| 1090 |
+
if self.db is None:
|
| 1091 |
+
return []
|
| 1092 |
+
|
| 1093 |
+
collection = self.db[self.COLLECTION_GAMES]
|
| 1094 |
+
try:
|
| 1095 |
+
cursor = collection.find(
|
| 1096 |
+
{"app_type": "dlc", "parent_appid": str(parent_appid)},
|
| 1097 |
+
{"_id": 0},
|
| 1098 |
+
)
|
| 1099 |
+
return await cursor.to_list(length=1000)
|
| 1100 |
+
except PyMongoError as e:
|
| 1101 |
+
logger.error(f"Error getting DLCs for parent {parent_appid}: {e}")
|
| 1102 |
+
return []
|
| 1103 |
+
|
| 1104 |
+
async def get_existing_appids(self, appids: set[str]) -> set[str]:
|
| 1105 |
+
"""Return the subset of the given appids that have a document in games."""
|
| 1106 |
+
if self.db is None or not appids:
|
| 1107 |
+
return set()
|
| 1108 |
+
|
| 1109 |
+
collection = self.db[self.COLLECTION_GAMES]
|
| 1110 |
+
try:
|
| 1111 |
+
cursor = collection.find(
|
| 1112 |
+
{"appid": {"$in": list(appids)}},
|
| 1113 |
+
{"_id": 0, "appid": 1},
|
| 1114 |
+
)
|
| 1115 |
+
docs = await cursor.to_list(length=len(appids) + 1)
|
| 1116 |
+
return {str(d["appid"]) for d in docs if d.get("appid")}
|
| 1117 |
+
except PyMongoError as e:
|
| 1118 |
+
logger.error(f"Error in get_existing_appids: {e}")
|
| 1119 |
+
return set()
|
| 1120 |
+
|
| 1121 |
+
async def bulk_update_priority_fields(self, updates: list[tuple[str, dict]]) -> int:
|
| 1122 |
+
"""
|
| 1123 |
+
Batch UpdateOne operations for priority fields.
|
| 1124 |
+
|
| 1125 |
+
Args:
|
| 1126 |
+
updates: List of (appid, fields_dict) tuples.
|
| 1127 |
+
|
| 1128 |
+
Returns:
|
| 1129 |
+
modified_count
|
| 1130 |
+
"""
|
| 1131 |
+
if self.db is None or not updates:
|
| 1132 |
+
return 0
|
| 1133 |
+
|
| 1134 |
+
collection = self.db[self.COLLECTION_GAMES]
|
| 1135 |
+
operations = [
|
| 1136 |
+
UpdateOne({"appid": appid}, {"$set": fields})
|
| 1137 |
+
for appid, fields in updates
|
| 1138 |
+
]
|
| 1139 |
+
|
| 1140 |
+
try:
|
| 1141 |
+
result = await collection.bulk_write(operations, ordered=False)
|
| 1142 |
+
return result.modified_count
|
| 1143 |
+
except BulkWriteError as e:
|
| 1144 |
+
details = e.details or {}
|
| 1145 |
+
return details.get("nModified", 0)
|
| 1146 |
+
except PyMongoError as e:
|
| 1147 |
+
logger.error(f"Error in bulk_update_priority_fields: {e}")
|
| 1148 |
+
return 0
|
| 1149 |
+
|
| 1150 |
+
|
| 1151 |
+
# Global singleton instance shared across the application.
mongodb = MongoDB()
|
backend/app/main.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
from contextlib import asynccontextmanager
|
| 4 |
+
from typing import AsyncGenerator
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI, Query, Request, Response
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from fastapi.staticfiles import StaticFiles
|
| 9 |
+
from fastapi.responses import FileResponse, JSONResponse
|
| 10 |
+
from slowapi import _rate_limit_exceeded_handler
|
| 11 |
+
from slowapi.errors import RateLimitExceeded
|
| 12 |
+
from starlette.middleware.base import BaseHTTPMiddleware
|
| 13 |
+
|
| 14 |
+
from app.core.config import settings
|
| 15 |
+
from app.core.rate_limit import limiter
|
| 16 |
+
from app.core.worker_logging import (
|
| 17 |
+
LIVE_LOG_WHITELIST,
|
| 18 |
+
read_log_tail,
|
| 19 |
+
resolve_log_path,
|
| 20 |
+
setup_app_logging,
|
| 21 |
+
setup_structured_logger,
|
| 22 |
+
)
|
| 23 |
+
from app.db.mongodb import mongodb
|
| 24 |
+
from app.routers import analyze, games
|
| 25 |
+
from app.services.nlp_service import get_nlp_service
|
| 26 |
+
from app.services.steam_service import steam_service
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Process-wide logging configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Attach defensive HTTP headers to every outgoing response."""

    async def dispatch(self, request: Request, call_next):
        resp: Response = await call_next(request)
        for header, value in (
            ("X-Content-Type-Options", "nosniff"),
            ("X-Frame-Options", "SAMEORIGIN"),
            ("Referrer-Policy", "strict-origin-when-cross-origin"),
        ):
            resp.headers[header] = value
        return resp
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """
    Manage the application's lifecycle.

    On startup: validates configuration, connects to MongoDB and wires up
    structured/app logging. On shutdown: closes the Steam HTTP client and
    the MongoDB connection.
    """
    # Fail fast — the app cannot operate without a configured database.
    if not settings.mongodb_url:
        raise RuntimeError(
            "MONGODB_URL is not set. Please configure it in .env or environment variables."
        )
    await mongodb.connect()
    setup_structured_logger("live")
    setup_app_logging()
    yield
    # Shutdown: release external resources in reverse order of need.
    await steam_service.close()
    await mongodb.disconnect()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
app = FastAPI(
    title="SentimentStream API",
    description="API do analizy sentymentu recenzji gier Steam w czasie rzeczywistym",
    version="1.0.0",
    lifespan=lifespan,
)

# Rate limiter wiring (slowapi).
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)  # type: ignore[arg-type]

# CORS configuration — allowed origins come from settings.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins_list,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["Content-Type", "Accept"],
)

# Security headers on every response.
app.add_middleware(SecurityHeadersMiddleware)

# Router registration.
app.include_router(analyze.router, prefix="/api", tags=["analyze"])
app.include_router(games.router, prefix="/api", tags=["games"])
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.get("/api/logs")
|
| 94 |
+
async def get_logs(
|
| 95 |
+
request: Request,
|
| 96 |
+
lines: int = Query(default=100, ge=1, le=1000),
|
| 97 |
+
level: str | None = Query(default=None),
|
| 98 |
+
event: str | None = Query(default=None),
|
| 99 |
+
file: str = Query(default="live"),
|
| 100 |
+
):
|
| 101 |
+
"""Token-protected endpoint to read structured log tail."""
|
| 102 |
+
auth = request.headers.get("Authorization", "")
|
| 103 |
+
expected = settings.worker_trigger_token
|
| 104 |
+
if expected:
|
| 105 |
+
if not auth.startswith("Bearer ") or auth[7:] != expected:
|
| 106 |
+
return JSONResponse(status_code=401, content={"detail": "Unauthorized"})
|
| 107 |
+
|
| 108 |
+
log_path = resolve_log_path(file, LIVE_LOG_WHITELIST)
|
| 109 |
+
if log_path is None:
|
| 110 |
+
return JSONResponse(
|
| 111 |
+
status_code=400,
|
| 112 |
+
content={"detail": f"Unknown log file: '{file}'. Valid: {list(LIVE_LOG_WHITELIST.keys())}"},
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
entries = read_log_tail(log_path, lines=lines, level=level, event=event)
|
| 116 |
+
return {"entries": entries, "count": len(entries)}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@app.get("/health")
|
| 120 |
+
async def health_check() -> dict:
|
| 121 |
+
"""Endpoint sprawdzający stan aplikacji z rzeczywistą weryfikacją zależności."""
|
| 122 |
+
mongo_ok = False
|
| 123 |
+
if mongodb.client is not None:
|
| 124 |
+
try:
|
| 125 |
+
await mongodb.client.admin.command("ping")
|
| 126 |
+
mongo_ok = True
|
| 127 |
+
except Exception:
|
| 128 |
+
pass
|
| 129 |
+
|
| 130 |
+
nlp_svc = get_nlp_service()
|
| 131 |
+
model_ok = hasattr(nlp_svc, "classifier") and nlp_svc.classifier is not None
|
| 132 |
+
|
| 133 |
+
overall = "healthy" if (mongo_ok and model_ok) else "degraded"
|
| 134 |
+
return {
|
| 135 |
+
"status": overall,
|
| 136 |
+
"mongodb": "connected" if mongo_ok else "disconnected",
|
| 137 |
+
"model": "loaded" if model_ok else "not_loaded",
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# Static frontend files — only mounted when the build exists (e.g. in Docker,
# where the path is /app/frontend/dist). Locally the Vite dev server serves
# the frontend, so the directory is usually absent and the mount is skipped.
static_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "frontend", "dist")

if settings.app_mode != "api" and os.path.exists(static_dir):
    app.mount("/assets", StaticFiles(directory=os.path.join(static_dir, "assets")), name="assets")

    # Catch-all for the SPA (React Router handles client-side routes).
    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        # Only treat real API paths as misses: a plain startswith("api")
        # would also swallow legitimate static paths such as "apiary.html".
        if full_path == "api" or full_path.startswith("api/"):
            return {"error": "API route not found"}

        file_path = os.path.join(static_dir, full_path)
        if os.path.isfile(file_path):
            return FileResponse(file_path)

        # Unknown path: hand back the SPA shell and let the router resolve it.
        return FileResponse(os.path.join(static_dir, "index.html"))
|
backend/app/models/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Modele danych Pydantic."""
|
| 2 |
+
|
| 3 |
+
from app.models.schemas import (
|
| 4 |
+
AnalysisProgress,
|
| 5 |
+
AnalysisResult,
|
| 6 |
+
GameInfo,
|
| 7 |
+
ReviewBatch,
|
| 8 |
+
SentimentType,
|
| 9 |
+
TopicSentiment,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"AnalysisProgress",
|
| 14 |
+
"AnalysisResult",
|
| 15 |
+
"GameInfo",
|
| 16 |
+
"ReviewBatch",
|
| 17 |
+
"SentimentType",
|
| 18 |
+
"TopicSentiment",
|
| 19 |
+
]
|
backend/app/models/schemas.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Modele danych Pydantic.
|
| 3 |
+
|
| 4 |
+
Definiuje struktury danych używane w API oraz do walidacji.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from enum import Enum
|
| 9 |
+
|
| 10 |
+
from pydantic import BaseModel, Field
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SentimentType(str, Enum):
    """Sentiment label assigned to a topic."""

    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class PredictionType(str, Enum):
    """Predicted direction of the player-count trend."""

    INCREASING = "increasing"
    DECREASING = "decreasing"
    STABLE = "stable"
    UNCERTAIN = "uncertain"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class UserCountPrediction(BaseModel):
    """
    Player-count trend prediction.

    Attributes:
        trend: Predicted direction (increase/decrease/stable/uncertain).
        confidence: Prediction confidence (0.0 - 1.0).
        reasoning: Short justification for the prediction.
    """

    trend: PredictionType
    confidence: float
    reasoning: str
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class GameInfo(BaseModel):
    """
    Steam game information.

    Attributes:
        app_id: Unique Steam identifier of the game.
        name: Game name.
        name_cn: Chinese game name (if available).
        header_image: Header image URL.
        total_reviews: Total number of reviews.
        target_count: Target number of reviews to analyze (sample size).
        last_game_update_at: Unix timestamp of the last game update, if known.
    """

    app_id: str
    name: str
    name_cn: str | None = None
    header_image: str | None = None
    total_reviews: int = 0
    target_count: int | None = None
    last_game_update_at: int | None = None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class TopicSentiment(BaseModel):
    """
    Sentiment for a single topic.

    Attributes:
        topic: Topic name (e.g. "Graphics", "Gameplay").
        sentiment: Sentiment type.
        score: Sentiment score (-1.0 to 1.0).
        mention_count: Number of mentions of the topic.
        example: Example sentence taken from a review.
        example_score: Score of the example, used for comparisons during aggregation.
    """

    topic: str
    sentiment: SentimentType
    score: float = Field(ge=-1.0, le=1.0)
    mention_count: int = 0
    example: str | None = None
    example_score: float | None = None  # example's score, compared during aggregation
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class Highlight(BaseModel):
    """A frequently repeated phrase extracted from reviews."""

    phrase: str
    mention_count: int
    sentiment: SentimentType
    score: float
    ngram_size: int
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class TopicHighlights(BaseModel):
    """Highlights belonging to one specific topic."""

    topic: str
    highlights: list[Highlight]
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class AnalysisProgress(BaseModel):
    """
    Analysis progress snapshot (sent over SSE).

    Attributes:
        processed: Number of reviews processed so far.
        total: Total number of reviews to process.
        current_topics: Current topic results.
        skipped_count: Number of sentences skipped (no keywords).
    """

    processed: int
    total: int
    current_topics: list[TopicSentiment] = []
    skipped_count: int = 0
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
class AnalysisResult(BaseModel):
    """
    Final analysis result.

    Attributes:
        game: Game information.
        general_topics: Topics with sentiment (full aggregate).
        prediction: Player-count trend prediction.
        analyzed_reviews: Number of reviews analyzed.
        skipped_count: Total number of skipped sentences.
        cached_at: When the result was written to the cache.
    """

    game: GameInfo
    general_topics: list[TopicSentiment]
    prediction: UserCountPrediction | None = None
    analyzed_reviews: int
    skipped_count: int = 0
    # Highlight phrases for each context window.
    general_highlights: list[Highlight] = []
    recent_highlights: list[Highlight] | None = None
    current_patch_highlights: list[Highlight] | None = None
    topic_highlights: list[TopicHighlights] = []
    cached_at: datetime | None = None
    # Context-scoped topic breakdowns (recent / current patch / last patch).
    recent_topics: list[TopicSentiment] | None = None
    recent_reviews_count: int = 0
    current_patch_topics: list[TopicSentiment] | None = None
    current_patch_reviews_count: int = 0
    last_patch_topics: list[TopicSentiment] | None = None
    last_patch_reviews_count: int = 0
    current_patch_timestamp: int | None = None
    # Freshness metadata attached when serving from cache.
    analysis_date: datetime | None = None
    current_patch_date: datetime | None = None
    preferred_context: str | None = None
    freshness_status: str | None = None
    staleness_reason: str | None = None
    is_refreshing: bool = False
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class ReviewItem(BaseModel):
    """Single review with metadata for incremental tracking."""

    text: str
    # Steam's unique recommendation id — used to de-duplicate reviews.
    recommendation_id: str
    # Unix timestamp of when the review was created.
    timestamp_created: int
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class ReviewBatch(BaseModel):
    """
    A batch of reviews to process.

    Attributes:
        reviews: Review texts.
        review_items: Reviews with metadata (for incremental analysis).
        cursor: Steam API pagination cursor.
    """

    reviews: list[str]
    review_items: list[ReviewItem] = []
    cursor: str | None = None
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class SSEEvent(BaseModel):
    """
    Server-Sent Events message.

    Attributes:
        event: Event type (progress/complete/error).
        data: Event payload.
    """

    event: str
    data: AnalysisProgress | AnalysisResult | dict
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
class CachedAnalysis(BaseModel):
    """
    Cache document stored in MongoDB.

    Holds analysis results with a timestamp used for TTL validation.

    Attributes:
        game_id: Steam game identifier (cache key).
        results: Sentiment analysis results.
        cached_at: When the document was written to the cache.
        analyzed_at: When the underlying analysis actually ran, if known.
    """

    game_id: str
    results: AnalysisResult
    cached_at: datetime
    analyzed_at: datetime | None = None
|
backend/app/routers/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Routery API."""
|
| 2 |
+
|
| 3 |
+
from app.routers import analyze
|
| 4 |
+
|
| 5 |
+
__all__ = ["analyze"]
|
backend/app/routers/analyze.py
ADDED
|
@@ -0,0 +1,597 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Router API do analizy sentymentu.
|
| 3 |
+
|
| 4 |
+
Zawiera endpoint do streamowania wyników analizy przez SSE.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import asyncio
|
| 8 |
+
import contextlib
|
| 9 |
+
import json
|
| 10 |
+
import logging
|
| 11 |
+
import time
|
| 12 |
+
from datetime import datetime, timezone
|
| 13 |
+
from typing import Any, AsyncGenerator
|
| 14 |
+
|
| 15 |
+
from fastapi import APIRouter, HTTPException, Depends, Path, Query, Request
|
| 16 |
+
from sse_starlette.sse import EventSourceResponse
|
| 17 |
+
|
| 18 |
+
from app.core.config import settings
|
| 19 |
+
from app.core.freshness import (
|
| 20 |
+
FreshnessStatus,
|
| 21 |
+
evaluate_freshness,
|
| 22 |
+
get_staleness_reason,
|
| 23 |
+
)
|
| 24 |
+
from app.core.sampling import SamplePlan, create_sample_plan
|
| 25 |
+
from app.core.ttl_tiers import get_ttl_hours
|
| 26 |
+
from app.core.worker_logging import get_structured_logger, log_structured
|
| 27 |
+
from app.db.mongodb import mongodb
|
| 28 |
+
from app.core.rate_limit import limiter
|
| 29 |
+
from app.models.schemas import (
|
| 30 |
+
AnalysisProgress,
|
| 31 |
+
AnalysisResult,
|
| 32 |
+
GameInfo,
|
| 33 |
+
Highlight,
|
| 34 |
+
TopicHighlights,
|
| 35 |
+
TopicSentiment,
|
| 36 |
+
)
|
| 37 |
+
from app.services.analysis_utils import (
|
| 38 |
+
aggregate_topics,
|
| 39 |
+
calculate_prediction,
|
| 40 |
+
coerce_utc_datetime,
|
| 41 |
+
compute_preferred_context,
|
| 42 |
+
datetime_from_timestamp,
|
| 43 |
+
filter_topics_by_min_mentions,
|
| 44 |
+
normalize_legacy_results,
|
| 45 |
+
serialize_datetime,
|
| 46 |
+
)
|
| 47 |
+
from app.services.highlights_service import HighlightsCollector
|
| 48 |
+
from app.services.analysis_runner import iter_incremental_analysis_events
|
| 49 |
+
from app.services.nlp_service import NLPService
|
| 50 |
+
from app.services.nlp_service import get_nlp_service as _get_nlp_service_instance
|
| 51 |
+
from app.services.steam_errors import SteamAPIError, SteamRateLimitError
|
| 52 |
+
from app.services.steam_service import SteamService, steam_service
|
| 53 |
+
|
| 54 |
+
logger = logging.getLogger(__name__)

router = APIRouter()

# Background refresh concurrency control: app_ids with an in-flight refresh
# are tracked to avoid starting duplicates, and the semaphore caps how many
# background refreshes may run at once.
_refreshing_app_ids: set[str] = set()
_refresh_semaphore = asyncio.Semaphore(3)  # max 3 concurrent background refreshes
|
| 61 |
+
|
| 62 |
+
# Helper factories for FastAPI dependency injection.
def get_nlp_service() -> NLPService:
    """Return the shared NLP service instance (DI hook)."""
    return _get_nlp_service_instance()
|
| 65 |
+
|
| 66 |
+
def get_steam_service() -> SteamService:
    """Return the shared Steam service instance (DI hook)."""
    return steam_service
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
@router.get("/health")
|
| 71 |
+
async def health_check():
|
| 72 |
+
"""
|
| 73 |
+
Endpoint do sprawdzania stanu aplikacji (Health Check).
|
| 74 |
+
"""
|
| 75 |
+
return {
|
| 76 |
+
"status": "ok",
|
| 77 |
+
"services": {
|
| 78 |
+
"mongodb": "connected",
|
| 79 |
+
"nlp": "ready",
|
| 80 |
+
"steam_api": "reachable"
|
| 81 |
+
}
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _build_analysis_payload(
    document: dict[str, Any],
    freshness_status: FreshnessStatus,
    *,
    current_patch_at: datetime | None = None,
    is_refreshing: bool = False,
) -> dict[str, Any]:
    """Build the JSON-serializable analysis payload sent to clients.

    Args:
        document: Raw analysis document loaded from MongoDB.
        freshness_status: Result of the freshness evaluation for this document.
        current_patch_at: Confirmed major-update date from the DB, or None.
        is_refreshing: Whether a background refresh is running/queued.

    Returns:
        A dict based on the normalized ``results`` sub-document, enriched with
        freshness metadata, serialized dates and a recomputed
        ``preferred_context``.
    """
    results = normalize_legacy_results(document.get("results", {}))
    payload = dict(results)
    # First available date wins: explicit analysis date, then the document's
    # analyzed_at, then either cached_at as a last resort.
    analysis_date = (
        coerce_utc_datetime(payload.get("analysis_date"))
        or coerce_utc_datetime(document.get("analyzed_at"))
        or coerce_utc_datetime(payload.get("cached_at"))
        or coerce_utc_datetime(document.get("cached_at"))
    )
    if current_patch_at is not None:
        current_patch_date: datetime | None = current_patch_at
    else:
        # No confirmed major update in DB — nullify current_patch fields so
        # legacy cached values don't appear as a valid Current Patch tab.
        current_patch_date = None
        payload["current_patch_topics"] = None
        payload["current_patch_reviews_count"] = 0
        payload["current_patch_highlights"] = None
        payload["current_patch_timestamp"] = None

    # Normalize cached_at to its serialized form, preferring the payload's own
    # value and falling back to the document-level one.
    if payload.get("cached_at") is None and document.get("cached_at") is not None:
        payload["cached_at"] = serialize_datetime(document["cached_at"])
    elif payload.get("cached_at") is not None:
        payload["cached_at"] = serialize_datetime(payload["cached_at"])

    payload["analysis_date"] = serialize_datetime(analysis_date)
    payload["current_patch_date"] = serialize_datetime(current_patch_date)
    payload["freshness_status"] = freshness_status.value
    payload["staleness_reason"] = get_staleness_reason(freshness_status)
    payload["is_refreshing"] = is_refreshing
    # Always recompute preferred_context from the current patch date so cached
    # documents with a stale stored value get the correct tab on read.
    patch_ts_for_context = int(current_patch_date.timestamp()) if current_patch_date else None
    payload["preferred_context"] = compute_preferred_context(patch_ts_for_context)
    return payload
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
async def _full_analysis(
    game: GameInfo,
    sample_plan: SamplePlan,
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
    stale_doc: dict[str, Any] | None = None,
) -> AsyncGenerator[dict, None]:
    """Full analysis path — Producer-Consumer queue pattern.

    Streams review batches from Steam (producer task) while running NLP on
    them (consumer loop), yielding SSE ``progress`` events along the way and a
    final ``complete`` event. The finished result is persisted to MongoDB.

    Args:
        game: Resolved game the analysis runs for.
        sample_plan: Stratified sampling plan (total + positive/negative split).
        steam_svc: Steam API client used to fetch reviews.
        nlp_svc: NLP service used for sentiment/topic extraction.
        patch_timestamp: Unix timestamp of the current major patch, if known;
            enables the "current patch" split.
        stale_doc: Previously cached analysis document, if any; used to
            carry over / archive last-patch topics.

    Yields:
        SSE event dicts: ``progress`` events during processing and one final
        ``complete`` event carrying the serialized ``AnalysisResult``.
    """
    total_target = sample_plan.total
    ttl_hours = await get_ttl_hours(game.app_id)
    nlp_cumulative_s: float = 0.0

    # Producer-Consumer queue (max 5 batches in flight)
    queue: asyncio.Queue = asyncio.Queue(maxsize=5)

    async def fetch_worker():
        """Producer: push review batches (or a raised exception) onto the queue."""
        try:
            async for batch in steam_svc.fetch_reviews_stratified(game.app_id, sample_plan):
                await queue.put(batch)
        except Exception as e:
            # Relay all exceptions to consumer via queue — they'll be re-raised
            # and caught by the SSE generator's specific exception handlers.
            await queue.put(e)
        finally:
            await queue.put(None)  # sentinel: no more batches

    fetch_task = asyncio.create_task(fetch_worker())

    processed = 0
    total_skipped = 0
    aggregated_topics: list[TopicSentiment] = []
    recent_processed = 0
    recent_limit = settings.recent_sample_limit
    all_review_ids: list[str] = []
    latest_timestamp = 0
    highlights_collector = HighlightsCollector()
    current_patch_topics: list[TopicSentiment] = []
    current_patch_count = 0
    # (review timestamp, topics) pairs — used to rebuild "recent" topics later.
    review_topic_results: list[tuple[int, list[TopicSentiment]]] = []

    try:
        while True:
            item = await queue.get()

            if item is None:  # producer finished
                break
            if isinstance(item, Exception):  # producer failed — propagate
                raise item

            batch = item
            if not batch.reviews:
                continue

            # Collect review IDs for incremental cache
            for ri in batch.review_items:
                all_review_ids.append(ri.recommendation_id)
                if ri.timestamp_created > latest_timestamp:
                    latest_timestamp = ri.timestamp_created

            batch_skipped = 0
            if patch_timestamp and batch.review_items:
                # Patch date known: tag each review as recent/current-patch and
                # additionally aggregate topics for the current-patch tab.
                # (The NLP call itself is identical for both cases, so it is
                # performed once; only the aggregation differs.)
                for ri, text in zip(batch.review_items, batch.reviews):
                    cat = []
                    if recent_processed < recent_limit:
                        cat.append("recent")
                    in_current_patch = ri.timestamp_created >= patch_timestamp
                    if in_current_patch:
                        cat.append("current_patch")

                    nlp_start = time.monotonic()
                    res, skipped = await nlp_svc.analyze_batch(
                        [text], highlights_collector=highlights_collector, categories=cat
                    )
                    nlp_cumulative_s += time.monotonic() - nlp_start
                    batch_skipped += skipped

                    if res:
                        aggregated_topics = aggregate_topics(aggregated_topics, res)
                        if in_current_patch:
                            current_patch_topics = aggregate_topics(current_patch_topics, res)
                        review_topic_results.append((ri.timestamp_created, res))
                    if in_current_patch:
                        current_patch_count += 1
                    recent_processed += 1
            else:
                # No patch split — only the "recent" category applies. When the
                # batch has no review_items, fall back to index iteration and a
                # zero timestamp.
                for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews):
                    cat = ["recent"] if recent_processed < recent_limit else []

                    nlp_start = time.monotonic()
                    res, skipped = await nlp_svc.analyze_batch(
                        [text], highlights_collector=highlights_collector, categories=cat
                    )
                    nlp_cumulative_s += time.monotonic() - nlp_start
                    batch_skipped += skipped
                    ts = ri.timestamp_created if batch.review_items else 0
                    if res:
                        aggregated_topics = aggregate_topics(aggregated_topics, res)
                        review_topic_results.append((ts, res))
                    recent_processed += 1

            total_skipped += batch_skipped
            processed += len(batch.reviews)

            progress = AnalysisProgress(
                processed=processed,
                total=total_target,
                current_topics=aggregated_topics,
                skipped_count=total_skipped,
            )
            yield {"event": "progress", "data": progress.model_dump_json()}

        await fetch_task
    except BaseException:
        # On any failure (incl. client disconnect / cancellation), stop the
        # producer as well before propagating.
        fetch_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await fetch_task
        raise

    # Build recent_topics from highest-timestamp reviews
    review_topic_results.sort(key=lambda x: x[0], reverse=True)
    recent_entries = review_topic_results[:recent_limit]
    recent_topics: list[TopicSentiment] = []
    for _, topics_batch in recent_entries:
        for topic in topics_batch:
            recent_topics = aggregate_topics(recent_topics, [topic])
    recent_reviews_count = len(recent_entries)

    # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
    aggregated_topics = filter_topics_by_min_mentions(aggregated_topics)
    recent_topics = filter_topics_by_min_mentions(recent_topics)
    current_patch_topics = filter_topics_by_min_mentions(current_patch_topics)

    prediction = calculate_prediction(aggregated_topics)

    highlights_data = highlights_collector.compute_highlights()
    general_highlights = highlights_data["general"]
    recent_highlights = highlights_data["recent"]
    current_patch_highlights = highlights_data["current_patch"]
    topic_highlights_dict = highlights_data["topics"]

    # Restrict topic highlights to topics that survived the min-mentions filter,
    # so the topic_highlights set is always consistent with general_topics.
    _surviving_topics = {t.topic for t in aggregated_topics}
    topic_highlights_list = [
        TopicHighlights(
            topic=topic,
            highlights=[Highlight(**h) for h in highlights],
        )
        for topic, highlights in topic_highlights_dict.items()
        if topic in _surviving_topics
    ]

    # Show recent tab if we have enough reviews to make the split meaningful
    has_recent_split = processed > recent_limit

    has_current_patch = patch_timestamp is not None and current_patch_count > 0
    analysis_generated_at = datetime.now(timezone.utc)
    current_patch_date = datetime_from_timestamp(patch_timestamp)

    # Archive last_patch_topics when this full analysis replaces a doc with a different patch.
    last_patch_topics: list[TopicSentiment] | None = None
    last_patch_reviews_count = 0
    if stale_doc:
        old_r = normalize_legacy_results(stale_doc.get("results", {}))
        old_patch_ts = old_r.get("current_patch_timestamp")
        if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
            # Patch changed: yesterday's "current patch" becomes "last patch".
            raw_cp = old_r.get("current_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None
            last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0)
        else:
            # Same patch: carry the previously archived last-patch data forward.
            raw_lp = old_r.get("last_patch_topics")
            last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None
            last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0)

    result = AnalysisResult(
        game=game,
        general_topics=aggregated_topics,
        recent_topics=recent_topics if has_recent_split else None,
        recent_reviews_count=recent_reviews_count if has_recent_split else 0,
        current_patch_topics=current_patch_topics if has_current_patch else None,
        current_patch_reviews_count=current_patch_count if has_current_patch else 0,
        last_patch_topics=last_patch_topics,
        last_patch_reviews_count=last_patch_reviews_count,
        current_patch_timestamp=patch_timestamp,
        analysis_date=analysis_generated_at,
        current_patch_date=current_patch_date,
        prediction=prediction,
        analyzed_reviews=processed,
        skipped_count=total_skipped,
        general_highlights=[Highlight(**h) for h in general_highlights],
        recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None,
        current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None,
        topic_highlights=topic_highlights_list,
        cached_at=analysis_generated_at,
        preferred_context=compute_preferred_context(patch_timestamp),
        freshness_status=FreshnessStatus.FRESH.value,
        is_refreshing=False,
    )
    await mongodb.save_analysis(
        game.app_id,
        result.model_dump(),
        analyzed_review_ids=all_review_ids,
        latest_review_timestamp=latest_timestamp,
        ttl_hours=ttl_hours,
        analyzed_at=analysis_generated_at,
    )

    # Log structured timing for full analysis
    if get_structured_logger():
        log_structured(
            "full_analysis_complete",
            app_id=game.app_id,
            game_name=game.name if hasattr(game, "name") else str(game.app_id),
            source="live",
            reviews_processed=processed,
            topics_found=len(aggregated_topics),
            detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)},
        )

    yield {"event": "complete", "data": result.model_dump_json()}
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
async def _incremental_analysis(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
) -> AsyncGenerator[dict, None]:
    """Incremental analysis SSE wrapper over the shared service implementation."""
    event_stream = iter_incremental_analysis_events(
        game,
        stale_doc,
        steam_svc,
        nlp_svc,
        patch_timestamp=patch_timestamp,
        source="live",
    )
    async for sse_event in event_stream:
        yield sse_event
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
async def _background_refresh(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_ts: int | None,
) -> None:
    """Fire-and-forget incremental analysis for stale niche caches."""
    async with _refresh_semaphore:
        try:
            # Drain the SSE event stream; the events themselves are discarded —
            # the useful work is the side effect (cache refresh).
            events = _incremental_analysis(
                game, stale_doc, steam_svc, nlp_svc, patch_timestamp=patch_ts
            )
            async for _event in events:
                pass
        except Exception as e:
            logger.error(f"Background refresh failed for {game.app_id}: {e}")
        else:
            logger.info(f"Background refresh completed for {game.app_id}")
        finally:
            # Always release the in-flight marker, even on failure.
            _refreshing_app_ids.discard(game.app_id)
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
async def analysis_event_generator(
    game_name: str,
    steam_service: SteamService,
    nlp_service: NLPService,
    *,
    appid: str | None = None,
) -> AsyncGenerator[dict, None]:
    """
    Main SSE event generator. Decides between full and incremental analysis paths.

    Flow: resolve the game, check for a cached analysis and its freshness, then
    either (a) return the fresh cached result, (b) return the stale result and —
    for non-priority ("niche") games — refresh inline, or (c) run a full live
    analysis when nothing is cached. Errors are always surfaced as an
    ``analysis_error`` SSE event; a structured timing record is emitted on exit.

    Args:
        game_name: User-supplied game name (used for search when no appid).
        steam_service: Steam API client.
        nlp_service: NLP service for sentiment/topic extraction.
        appid: Optional Steam app id; when given, name search is skipped.

    Yields:
        SSE event dicts (``result``, ``game_found``, ``state``, ``progress``,
        ``complete``, or ``analysis_error``).
    """
    t_start = time.monotonic()
    analysis_type = "unknown"
    app_id = ""
    resolved_game_name = game_name
    reviews_processed = 0

    try:
        # 1. Resolve game — use appid directly if provided, otherwise search by name
        if appid:
            game = await steam_service.get_game_info(appid)
        else:
            game = await steam_service.search_game(game_name)
        if not game:
            yield {
                "event": "analysis_error",
                "data": json.dumps({"message": "ERROR_GAME_NOT_FOUND"}),
            }
            return

        app_id = game.app_id
        resolved_game_name = game.name if hasattr(game, "name") else game_name

        # 1b. Fetch game patch date for current_patch tab / freshness evaluation
        patch_date = await mongodb.get_game_patch_date(game.app_id)
        patch_ts = int(patch_date.timestamp()) if patch_date else None
        if patch_ts:
            game = game.model_copy(update={"last_game_update_at": patch_ts})

        # 2. Load any existing analysis and evaluate product freshness.
        analysis_doc = await mongodb.get_analysis(game.app_id)
        priority_ids = await mongodb.get_priority_game_ids_for_analysis()
        is_priority = game.app_id in priority_ids
        is_niche = not is_priority

        if analysis_doc and analysis_doc.get("results"):
            freshness_status = evaluate_freshness(analysis_doc, patch_date)

            if freshness_status == FreshnessStatus.FRESH:
                # Fresh cache hit — serve it and stop.
                analysis_type = "cached"
                payload = _build_analysis_payload(
                    analysis_doc,
                    freshness_status,
                    current_patch_at=patch_date,
                )
                yield {"event": "result", "data": json.dumps(payload)}
                return

            # Stale cache: serve it immediately so the client has something to
            # render, then (for niche games only) refresh inline below.
            analysis_type = "stale_result"
            # Priority games are refreshed by the scheduler; niche games are
            # always considered "refreshing" since we do it right here.
            is_refreshing = (
                await mongodb.has_due_refresh_schedule(game.app_id)
                if is_priority
                else True
            )
            stale_payload = _build_analysis_payload(
                analysis_doc,
                freshness_status,
                current_patch_at=patch_date,
                is_refreshing=is_refreshing,
            )
            yield {"event": "result", "data": json.dumps(stale_payload)}

            if is_priority:
                # Scheduler owns the refresh for priority games — done here.
                return

            try:
                # Prefer the cheaper incremental path when we have the review
                # ids from the previous run; otherwise re-run a full analysis.
                if settings.incremental_enabled and analysis_doc.get("analyzed_review_ids"):
                    refresh_generator = _incremental_analysis(
                        game, analysis_doc, steam_service, nlp_service, patch_timestamp=patch_ts
                    )
                else:
                    stats = await steam_service.get_review_stats(game.app_id)
                    sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
                    game = game.model_copy(update={"target_count": sample_plan.total})
                    refresh_generator = _full_analysis(
                        game,
                        sample_plan,
                        steam_service,
                        nlp_service,
                        patch_timestamp=patch_ts,
                        stale_doc=analysis_doc,
                    )
                async for event in refresh_generator:
                    # Capture the analyzed-review count from the final event
                    # for the structured log written in `finally`.
                    if event.get("event") == "complete":
                        try:
                            data = json.loads(event["data"])
                            reviews_processed = data.get("analyzed_reviews", 0)
                        except (json.JSONDecodeError, KeyError):
                            pass
                    yield event
                return
            except Exception as e:
                # The stale result was already delivered, so a refresh failure
                # is logged but not surfaced to the client.
                logger.error(f"Refresh failed for {game.app_id}: {e}")
                return

        # 3. No cache at all — live analysis

        analysis_type = "full"
        stats = await steam_service.get_review_stats(game.app_id)
        sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
        total_target = sample_plan.total
        game = game.model_copy(update={"target_count": total_target})

        yield {"event": "game_found", "data": game.model_dump_json()}

        if is_niche:
            # Tell the client this niche game has never been analyzed before.
            yield {
                "event": "state",
                "data": json.dumps({"type": "first_live_analysis"}),
            }

        async for event in _full_analysis(game, sample_plan, steam_service, nlp_service, patch_timestamp=patch_ts):
            if event.get("event") == "complete":
                try:
                    data = json.loads(event["data"])
                    reviews_processed = data.get("analyzed_reviews", 0)
                except (json.JSONDecodeError, KeyError):
                    pass
            yield event

    except SteamRateLimitError as e:
        logger.warning(f"Steam rate limit: {e}")
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_STEAM_RATE_LIMIT"}),
        }
    except SteamAPIError as e:
        logger.error(f"Steam API error: {e}")
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_STEAM_API"}),
        }
    except Exception as e:
        # Safety net — SSE generator must always send an error event, never crash silently.
        logger.error(f"Analysis error: {e}", exc_info=True)
        yield {
            "event": "analysis_error",
            "data": json.dumps({"message": "ERROR_INTERNAL"}),
        }
    finally:
        # Emit a structured timing record regardless of outcome.
        elapsed = round(time.monotonic() - t_start, 3)
        if get_structured_logger():
            log_structured(
                "live_analysis",
                app_id=app_id,
                game_name=resolved_game_name,
                analysis_type=analysis_type,
                elapsed_s=elapsed,
                reviews_processed=reviews_processed,
                source="live",
            )
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
@router.get("/analyze/{game_name}")
@limiter.limit(settings.rate_limit_analyze)
async def analyze_game(
    request: Request,
    game_name: str = Path(..., min_length=1, max_length=200),
    appid: str | None = Query(None, min_length=1, max_length=20),
    steam_service: SteamService = Depends(get_steam_service),
    nlp_service: NLPService = Depends(get_nlp_service),
) -> EventSourceResponse:
    """Game sentiment analysis endpoint (SSE stream)."""
    event_stream = analysis_event_generator(
        game_name, steam_service, nlp_service, appid=appid
    )
    return EventSourceResponse(event_stream)
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
@router.get("/game/{game_name}")
@limiter.limit(settings.rate_limit_default)
async def get_game_info(
    request: Request,
    game_name: str = Path(..., min_length=1, max_length=200),
    steam_service: SteamService = Depends(get_steam_service),
) -> dict:
    """Return basic information about a game resolved by name.

    Raises:
        HTTPException: 404 when no game matches the given name.
    """
    game = await steam_service.search_game(game_name)
    if not game:
        raise HTTPException(status_code=404, detail="ERROR_GAME_NOT_FOUND")
    return game.model_dump()
|
backend/app/routers/games.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Router API do wyszukiwania gier.
|
| 3 |
+
|
| 4 |
+
Zawiera endpoint do pobierania sugestii gier dla autouzupełniania.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, Query, Request
|
| 8 |
+
|
| 9 |
+
from app.core.config import settings
|
| 10 |
+
from app.db.mongodb import mongodb
|
| 11 |
+
from app.core.rate_limit import limiter
|
| 12 |
+
|
| 13 |
+
router = APIRouter()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@router.get("/games/suggestions")
@limiter.limit(settings.rate_limit_default)
async def get_game_suggestions(
    request: Request,
    q: str = Query(..., min_length=2, max_length=100, description="Tekst do wyszukania"),
    limit: int = Query(10, ge=1, le=20, description="Maksymalna liczba wyników"),
) -> list[dict[str, str]]:
    """Autocomplete endpoint returning games whose names match the query.

    The search is case-insensitive and requires at least 2 characters.

    Args:
        q: Text to look for within game names.
        limit: Maximum number of results (1-20).

    Returns:
        Matching games as ``{"appid": ..., "name": ...}`` dicts.

    Example:
        ```
        GET /api/games/suggestions?q=cyber&limit=5

        [
            {"appid": "1091500", "name": "Cyberpunk 2077"},
            {"appid": "12345", "name": "Cyber Shadow"},
            ...
        ]
        ```
    """
    return await mongodb.search_games(q, limit)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@router.get("/games/count")
@limiter.limit(settings.rate_limit_default)
async def get_games_count(request: Request) -> dict[str, int]:
    """Return the number of games stored in the database.

    Returns:
        ``{"count": <number of games>}``.

    Example:
        ```
        GET /api/games/count

        {"count": 85432}
        ```
    """
    total = await mongodb.get_games_count()
    return {"count": total}
|
backend/app/services/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Serwisy biznesowe aplikacji."""
|
| 2 |
+
|
| 3 |
+
from app.services.nlp_service import get_nlp_service
|
| 4 |
+
from app.services.steam_service import steam_service
|
| 5 |
+
|
| 6 |
+
__all__ = ["get_nlp_service", "steam_service"]
|
backend/app/services/analysis_runner.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Analysis Runner — programmatic (non-SSE) analysis for the Worker.
|
| 3 |
+
|
| 4 |
+
Extracts the core full-analysis logic from analyze.py without SSE wrapping.
|
| 5 |
+
Used by the pre-cache service to run analyses in the background.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import contextlib
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
import time
|
| 13 |
+
from datetime import datetime, timezone
|
| 14 |
+
from typing import Any, AsyncGenerator
|
| 15 |
+
|
| 16 |
+
from app.core.config import settings
|
| 17 |
+
from app.core.freshness import FreshnessStatus
|
| 18 |
+
from app.core.sampling import create_sample_plan
|
| 19 |
+
from app.core.ttl_tiers import get_ttl_hours
|
| 20 |
+
from app.core.worker_logging import AsyncTimingContext, get_structured_logger, log_structured
|
| 21 |
+
from app.db.mongodb import mongodb
|
| 22 |
+
from app.models.schemas import (
|
| 23 |
+
AnalysisProgress,
|
| 24 |
+
AnalysisResult,
|
| 25 |
+
GameInfo,
|
| 26 |
+
Highlight,
|
| 27 |
+
TopicHighlights,
|
| 28 |
+
TopicSentiment,
|
| 29 |
+
)
|
| 30 |
+
from app.services.highlights_service import HighlightsCollector
|
| 31 |
+
from app.services.analysis_utils import (
|
| 32 |
+
aggregate_topics,
|
| 33 |
+
calculate_prediction,
|
| 34 |
+
compute_preferred_context,
|
| 35 |
+
datetime_from_timestamp,
|
| 36 |
+
filter_topics_by_min_mentions,
|
| 37 |
+
normalize_legacy_results,
|
| 38 |
+
scale_topics,
|
| 39 |
+
serialize_datetime,
|
| 40 |
+
)
|
| 41 |
+
from app.services.nlp_service import NLPService
|
| 42 |
+
from app.services.steam_service import SteamService
|
| 43 |
+
|
| 44 |
+
logger = logging.getLogger(__name__)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
async def iter_incremental_analysis_events(
    game: GameInfo,
    stale_doc: dict[str, Any],
    steam_svc: SteamService,
    nlp_svc: NLPService,
    patch_timestamp: int | None = None,
    *,
    source: str = "live",
) -> AsyncGenerator[dict[str, str], None]:
    """Yield incremental-analysis progress and final result events.

    Fetches only reviews whose ids are not in the stale doc's analyzed set,
    runs NLP on them, merges the resulting topic deltas into the cached
    aggregates, persists the refreshed document via ``mongodb.save_analysis``,
    and yields SSE-shaped events: ``{"event": "progress"|"complete",
    "data": <json string>}``.

    Args:
        game: Game the analysis belongs to.
        stale_doc: Previously saved analysis document; read for ``results``,
            ``analyzed_review_ids`` and ``latest_review_timestamp``.
        steam_svc: Steam client used to fetch the new reviews.
        nlp_svc: NLP service used to score the new reviews.
        patch_timestamp: Unix timestamp of the current patch, when known;
            drives the current_patch / last_patch topic split.
        source: Caller tag recorded in structured logs ("live" or "worker").
    """
    ttl_hours = await get_ttl_hours(game.app_id)
    # Rehydrate the cached aggregates; normalize_legacy_results upgrades
    # older document shapes first so the .get() keys below are uniform.
    old_results = normalize_legacy_results(stale_doc.get("results", {}))
    old_review_ids: list[str] = stale_doc.get("analyzed_review_ids", [])
    old_review_ids_set = set(old_review_ids)
    nlp_cumulative_s: float = 0.0

    old_general = [TopicSentiment(**topic) for topic in old_results.get("general_topics", [])]
    old_recent = (
        [TopicSentiment(**topic) for topic in old_results.get("recent_topics", [])]
        if old_results.get("recent_topics")
        else []
    )
    old_current_patch = (
        [TopicSentiment(**topic) for topic in old_results.get("current_patch_topics", [])]
        if old_results.get("current_patch_topics")
        else []
    )
    old_last_patch = (
        [TopicSentiment(**topic) for topic in old_results.get("last_patch_topics", [])]
        if old_results.get("last_patch_topics")
        else None
    )
    old_last_patch_count = old_results.get("last_patch_reviews_count", 0)
    old_patch_ts = old_results.get("current_patch_timestamp")

    # Only reviews we have not analyzed yet (deduplicated by recommendation id).
    new_items = await steam_svc.fetch_recent_reviews(
        game.app_id,
        exclude_ids=old_review_ids_set,
    )

    if not new_items:
        # Nothing new since the last run: re-stamp the cached results as
        # fresh (new cached_at/analysis_date), persist, and emit them as-is.
        refreshed_at = datetime.now(timezone.utc)
        refreshed_results = {
            **old_results,
            "cached_at": refreshed_at,
            "analysis_date": refreshed_at,
            "current_patch_date": datetime_from_timestamp(
                patch_timestamp if patch_timestamp is not None else old_results.get("current_patch_timestamp")
            ),
            "freshness_status": FreshnessStatus.FRESH.value,
            "staleness_reason": None,
            "is_refreshing": False,
        }
        await mongodb.save_analysis(
            game.app_id,
            refreshed_results,
            analyzed_review_ids=old_review_ids,
            latest_review_timestamp=stale_doc.get("latest_review_timestamp", 0),
            ttl_hours=ttl_hours,
            analyzed_at=refreshed_at,
        )
        yield {
            "event": "complete",
            "data": json.dumps(refreshed_results, default=serialize_datetime),
        }
        return

    new_texts = [item.text for item in new_items]
    new_review_ids = [item.recommendation_id for item in new_items]
    # Highest created-timestamp among the new reviews; falls back to the
    # stored watermark when the generator is empty.
    latest_timestamp = max(
        (item.timestamp_created for item in new_items),
        default=stale_doc.get("latest_review_timestamp", 0),
    )

    batch_size = settings.review_batch_size
    delta_topics: list[TopicSentiment] = []
    delta_current_patch_topics: list[TopicSentiment] = []
    delta_current_patch_count = 0
    highlights_collector = HighlightsCollector()
    processed = 0
    total_skipped = 0

    # Analyze the delta in batches, yielding a progress event per batch.
    for i in range(0, len(new_texts), batch_size):
        chunk_texts = new_texts[i:i + batch_size]
        chunk_items = new_items[i:i + batch_size]

        batch_skipped = 0
        if patch_timestamp:
            # With a known patch timestamp, each review is analyzed alone so
            # it can be tagged with "current_patch" individually and its
            # topics folded into the per-patch delta as well.
            for review_item, text in zip(chunk_items, chunk_texts):
                categories = ["recent"]
                if review_item.timestamp_created >= patch_timestamp:
                    categories.append("current_patch")

                nlp_start = time.monotonic()
                result_topics, skipped = await nlp_svc.analyze_batch(
                    [text],
                    highlights_collector=highlights_collector,
                    categories=categories,
                )
                nlp_cumulative_s += time.monotonic() - nlp_start
                batch_skipped += skipped
                if result_topics:
                    delta_topics = aggregate_topics(delta_topics, result_topics)
                    if review_item.timestamp_created >= patch_timestamp:
                        delta_current_patch_topics = aggregate_topics(
                            delta_current_patch_topics,
                            result_topics,
                        )
                        delta_current_patch_count += 1
            total_skipped += batch_skipped
        else:
            # No patch boundary: the whole chunk goes through in one call.
            nlp_start = time.monotonic()
            batch_results, batch_skipped = await nlp_svc.analyze_batch(
                chunk_texts,
                highlights_collector=highlights_collector,
                categories=["recent"],
            )
            nlp_cumulative_s += time.monotonic() - nlp_start
            if batch_results:
                delta_topics = aggregate_topics(delta_topics, batch_results)
            total_skipped += batch_skipped

        processed += len(chunk_texts)

        progress = AnalysisProgress(
            processed=processed,
            total=len(new_texts),
            current_topics=delta_topics,
            skipped_count=total_skipped,
        )
        yield {"event": "progress", "data": progress.model_dump_json()}

    new_general = aggregate_topics(old_general, delta_topics)

    old_recent_count = old_results.get("recent_reviews_count", 0)
    new_count = len(new_texts)

    # Cap the "recent" window: when the merged count would exceed the recent
    # sample limit, proportionally downscale the old recent aggregates
    # (retaining at least 20%) before merging the new delta.
    if (
        old_recent_count + new_count > settings.recent_sample_limit
        and old_recent
        and old_recent_count > 0
    ):
        overflow = old_recent_count + new_count - settings.recent_sample_limit
        retain_ratio = max(0.2, 1.0 - overflow / old_recent_count)
        scaled_old = scale_topics(old_recent, retain_ratio)
        new_recent = aggregate_topics(scaled_old, delta_topics)
        recent_count = int(old_recent_count * retain_ratio) + new_count
    else:
        new_recent = aggregate_topics(old_recent, delta_topics) if old_recent else delta_topics
        recent_count = old_recent_count + new_count

    last_patch_topics = old_last_patch
    last_patch_count = old_last_patch_count

    # Patch rollover: when the patch changed since the cached doc, the old
    # current-patch aggregates become last_patch and the current-patch
    # window restarts empty.
    if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
        last_patch_topics = old_current_patch if old_current_patch else None
        last_patch_count = old_results.get("current_patch_reviews_count", 0)
        old_current_patch = []

    new_current_patch = (
        aggregate_topics(old_current_patch, delta_current_patch_topics)
        if old_current_patch
        else (delta_current_patch_topics if delta_current_patch_topics else [])
    )
    # The old count only carries over when the patch did NOT change.
    base_current_patch_count = (
        0
        if (patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts)
        else old_results.get("current_patch_reviews_count", 0)
    )
    new_current_patch_count = base_current_patch_count + delta_current_patch_count
    has_current_patch = patch_timestamp is not None and (
        new_current_patch_count > 0 or bool(old_current_patch)
    )

    # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
    new_general = filter_topics_by_min_mentions(new_general)
    new_recent = filter_topics_by_min_mentions(new_recent)
    new_current_patch = filter_topics_by_min_mentions(new_current_patch)

    prediction = calculate_prediction(new_general)

    highlights_data = highlights_collector.compute_highlights()
    general_highlights = highlights_data["general"]
    recent_highlights = highlights_data["recent"]
    current_patch_highlights = highlights_data["current_patch"]
    topic_highlights_dict = highlights_data["topics"]

    # Restrict topic highlights to topics that survived the min-mentions filter,
    # so the topic_highlights set is always consistent with general_topics.
    _surviving_topics = {t.topic for t in new_general}
    topic_highlights_list = [
        TopicHighlights(
            topic=topic,
            highlights=[Highlight(**highlight) for highlight in highlights],
        )
        for topic, highlights in topic_highlights_dict.items()
        if topic in _surviving_topics
    ]

    merged_review_ids = old_review_ids + new_review_ids

    analysis_generated_at = datetime.now(timezone.utc)
    result = AnalysisResult(
        game=game,
        general_topics=new_general,
        recent_topics=new_recent,
        recent_reviews_count=recent_count,
        current_patch_topics=new_current_patch if has_current_patch else None,
        current_patch_reviews_count=new_current_patch_count if has_current_patch else 0,
        last_patch_topics=last_patch_topics,
        last_patch_reviews_count=last_patch_count,
        current_patch_timestamp=patch_timestamp,
        analysis_date=analysis_generated_at,
        current_patch_date=datetime_from_timestamp(patch_timestamp),
        prediction=prediction,
        # Lifetime counters accumulate across incremental runs.
        analyzed_reviews=old_results.get("analyzed_reviews", 0) + processed,
        skipped_count=old_results.get("skipped_count", 0) + total_skipped,
        general_highlights=[Highlight(**highlight) for highlight in general_highlights],
        recent_highlights=[Highlight(**highlight) for highlight in recent_highlights] if recent_highlights else None,
        current_patch_highlights=[Highlight(**highlight) for highlight in current_patch_highlights] if current_patch_highlights else None,
        topic_highlights=topic_highlights_list,
        cached_at=analysis_generated_at,
        preferred_context=compute_preferred_context(patch_timestamp),
        freshness_status=FreshnessStatus.FRESH.value,
        is_refreshing=False,
    )
    await mongodb.save_analysis(
        game.app_id,
        result.model_dump(),
        analyzed_review_ids=merged_review_ids,
        latest_review_timestamp=latest_timestamp,
        ttl_hours=ttl_hours,
        analyzed_at=analysis_generated_at,
    )

    if get_structured_logger():
        log_structured(
            "incremental_analysis_complete",
            app_id=game.app_id,
            game_name=game.name if hasattr(game, "name") else str(game.app_id),
            source=source,
            reviews_processed=processed,
            topics_found=len(new_general),
            detail={"nlp_cumulative_s": round(nlp_cumulative_s, 3)},
        )

    yield {"event": "complete", "data": result.model_dump_json()}
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
async def run_incremental_analysis(
    app_id: str,
    game_name: str,
    steam_svc: SteamService,
    nlp_svc: NLPService,
) -> dict[str, Any] | None:
    """Run a non-SSE incremental analysis for worker jobs.

    Drains the incremental event stream and returns the final ``complete``
    payload as a dict. Falls back to :func:`run_full_analysis` when no
    usable cached document exists or when the gap since the newest analyzed
    review exceeds the configured limit. Returns ``None`` on error.
    """
    slog = get_structured_logger()

    try:
        stale_doc = await mongodb.get_analysis(app_id)

        # Guard: without results + analyzed ids there is nothing to merge
        # into, so run a full pass instead.
        has_usable_doc = bool(
            stale_doc
            and stale_doc.get("results")
            and stale_doc.get("analyzed_review_ids")
        )
        if not has_usable_doc:
            return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc)

        # Long gap guard: if the most recent review we have is too old, Steam's
        # cursor-based API may not reliably surface all reviews since then.
        latest_ts = stale_doc.get("latest_review_timestamp", 0)
        if latest_ts > 0:
            gap_days = (time.time() - latest_ts) / 86400
            if gap_days > settings.incremental_max_gap_days:
                logger.info(
                    f"Incremental gap {gap_days:.0f}d > {settings.incremental_max_gap_days}d "
                    f"for {app_id} ({game_name}) — falling back to full analysis"
                )
                return await run_full_analysis(app_id, game_name, steam_svc, nlp_svc, stale_doc=stale_doc)

        # Resolve game info: live Steam data first, then the cached copy,
        # then a minimal stub built from the arguments.
        game = await steam_svc.get_game_info(app_id)
        if not game:
            cached_game = stale_doc.get("results", {}).get("game")
            game = (
                GameInfo(**cached_game)
                if isinstance(cached_game, dict)
                else GameInfo(app_id=app_id, name=game_name)
            )

        # Attach the latest known patch timestamp so the analysis can split
        # topics by patch window.
        patch_date = await mongodb.get_game_patch_date(app_id)
        patch_timestamp = int(patch_date.timestamp()) if patch_date else None
        if patch_timestamp:
            game = game.model_copy(update={"last_game_update_at": patch_timestamp})

        # Drain the event stream; only the terminal "complete" event carries
        # the payload we return.
        final_payload: dict[str, Any] | None = None
        event_stream = iter_incremental_analysis_events(
            game,
            stale_doc,
            steam_svc,
            nlp_svc,
            patch_timestamp=patch_timestamp,
            source="worker",
        )
        async for event in event_stream:
            if event.get("event") == "complete":
                final_payload = json.loads(event["data"])
        return final_payload
    except Exception as e:
        logger.error(f"Incremental analysis runner error for {app_id} ({game_name}): {e}", exc_info=True)
        if slog:
            log_structured(
                "analysis_error",
                level=logging.ERROR,
                app_id=app_id,
                game_name=game_name,
                source="worker",
                error=str(e),
            )
        return None
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
async def run_full_analysis(
    app_id: str,
    game_name: str,
    steam_svc: SteamService,
    nlp_svc: NLPService,
    stale_doc: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
    """
    Run a full analysis for a game (no SSE, no streaming).

    Three timed phases: (1) setup — game info, review stats, sample plan;
    (2) fetch + analyze — a producer task streams stratified review batches
    through a bounded queue while this coroutine consumes and scores them;
    (3) save — prediction, highlights, and persistence to MongoDB.

    Args:
        app_id: Steam app id to analyze.
        game_name: Display name, used only for logging.
        steam_svc: Steam client for game info / stats / review fetching.
        nlp_svc: NLP service used to score reviews.
        stale_doc: Previous analysis document, if any; used only to carry
            last_patch topic aggregates across a patch change.

    Returns:
        Analysis result dict, or None on error.
    """
    slog = get_structured_logger()

    try:
        # Phase 1: Setup — game info + review stats + sample plan
        async with AsyncTimingContext() as t_setup:
            # 1. Get game info
            game = await steam_svc.get_game_info(app_id)
            if not game:
                logger.warning(f"Analysis runner: game info not found for {app_id}")
                return None

            # 2. Get review stats
            stats = await steam_svc.get_review_stats(app_id)
            if stats.total == 0:
                logger.warning(f"Analysis runner: no reviews for {app_id}")
                return None

            # 3. Create sample plan
            sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
            ttl_hours = await get_ttl_hours(app_id)

            # 3b. Fetch game patch date for current_patch splitting
            patch_date = await mongodb.get_game_patch_date(app_id)
            patch_timestamp = int(patch_date.timestamp()) if patch_date else None
            if patch_timestamp and isinstance(game, GameInfo):
                game = game.model_copy(update={"last_game_update_at": patch_timestamp})

        # Phase 2: Fetch + Analyze — producer-consumer loop
        nlp_cumulative_s: float = 0.0

        async with AsyncTimingContext() as t_fetch_analyze:
            # 4. Producer-consumer fetch + analyze.
            # Bounded queue (maxsize=5) applies backpressure so fetching
            # cannot run unboundedly ahead of NLP.
            queue: asyncio.Queue = asyncio.Queue(maxsize=5)

            async def fetch_worker():
                # Producer: forwards batches into the queue; an Exception is
                # forwarded as an item (re-raised by the consumer) and None
                # is always enqueued last as the end-of-stream sentinel.
                try:
                    async for batch in steam_svc.fetch_reviews_stratified(app_id, sample_plan):
                        await queue.put(batch)
                except Exception as e:
                    await queue.put(e)
                finally:
                    await queue.put(None)

            fetch_task = asyncio.create_task(fetch_worker())

            processed = 0
            total_skipped = 0
            aggregated_topics: list[TopicSentiment] = []
            recent_processed = 0
            recent_limit = settings.recent_sample_limit
            all_review_ids: list[str] = []
            latest_timestamp = 0
            highlights_collector = HighlightsCollector()
            current_patch_topics: list[TopicSentiment] = []
            current_patch_count = 0
            # (timestamp, topics) per scored review; later sorted to pick
            # the newest ones for the recent window.
            review_topic_results: list[tuple[int, list[TopicSentiment]]] = []

            try:
                while True:
                    item = await queue.get()

                    if item is None:
                        break  # end-of-stream sentinel from fetch_worker
                    if isinstance(item, Exception):
                        raise item  # propagate producer failure

                    batch = item
                    if not batch.reviews:
                        continue

                    # Track every fetched review id and the newest timestamp
                    # (watermark for future incremental runs).
                    for ri in batch.review_items:
                        all_review_ids.append(ri.recommendation_id)
                        if ri.timestamp_created > latest_timestamp:
                            latest_timestamp = ri.timestamp_created

                    batch_skipped = 0
                    if patch_timestamp and batch.review_items:
                        # Per-review analysis so each review can be tagged
                        # with "current_patch" individually.
                        for ri, text in zip(batch.review_items, batch.reviews):
                            is_recent = recent_processed < recent_limit
                            cat = []
                            if is_recent:
                                cat.append("recent")

                            if ri.timestamp_created >= patch_timestamp:
                                cat.append("current_patch")
                                nlp_start = time.monotonic()
                                res, skipped = await nlp_svc.analyze_batch(
                                    [text], highlights_collector=highlights_collector, categories=cat
                                )
                                nlp_cumulative_s += time.monotonic() - nlp_start
                                batch_skipped += skipped
                                if res:
                                    aggregated_topics = aggregate_topics(aggregated_topics, res)
                                    current_patch_topics = aggregate_topics(current_patch_topics, res)
                                    review_topic_results.append((ri.timestamp_created, res))
                                    current_patch_count += 1
                            else:
                                nlp_start = time.monotonic()
                                res, skipped = await nlp_svc.analyze_batch(
                                    [text], highlights_collector=highlights_collector, categories=cat
                                )
                                nlp_cumulative_s += time.monotonic() - nlp_start
                                batch_skipped += skipped
                                if res:
                                    aggregated_topics = aggregate_topics(aggregated_topics, res)
                                    review_topic_results.append((ri.timestamp_created, res))
                            recent_processed += 1
                    else:
                        # No patch boundary (or no review metadata): fall back
                        # to enumerate so ri is an index when items are absent.
                        for ri, text in zip(batch.review_items, batch.reviews) if batch.review_items else enumerate(batch.reviews):
                            is_recent = recent_processed < recent_limit
                            cat = ["recent"] if is_recent else []

                            nlp_start = time.monotonic()
                            res, skipped = await nlp_svc.analyze_batch(
                                [text], highlights_collector=highlights_collector, categories=cat
                            )
                            nlp_cumulative_s += time.monotonic() - nlp_start
                            batch_skipped += skipped
                            # Without metadata, ri is an int index — use 0 as timestamp.
                            ts = ri.timestamp_created if batch.review_items else 0
                            if res:
                                aggregated_topics = aggregate_topics(aggregated_topics, res)
                                review_topic_results.append((ts, res))
                            recent_processed += 1

                    total_skipped += batch_skipped
                    processed += len(batch.reviews)

                await fetch_task
            except BaseException:
                # Cancel the producer on any failure/cancellation so it does
                # not block forever on a full queue, then re-raise.
                fetch_task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await fetch_task
                raise

        # Phase 3: Save — highlights + MongoDB save
        async with AsyncTimingContext() as t_save:
            # 5. Compute prediction + highlights

            # Build recent_topics from highest-timestamp reviews
            review_topic_results.sort(key=lambda x: x[0], reverse=True)
            recent_entries = review_topic_results[:recent_limit]
            recent_topics: list[TopicSentiment] = []
            for _, topics_batch in recent_entries:
                for ts in topics_batch:
                    recent_topics = aggregate_topics(recent_topics, [ts])
            recent_reviews_count = len(recent_entries)

            # Apply min-mentions filter on final aggregates (not per-review — see nlp_service.py).
            aggregated_topics = filter_topics_by_min_mentions(aggregated_topics)
            recent_topics = filter_topics_by_min_mentions(recent_topics)
            current_patch_topics = filter_topics_by_min_mentions(current_patch_topics)

            prediction = calculate_prediction(aggregated_topics)

            highlights_data = highlights_collector.compute_highlights()
            general_highlights = highlights_data["general"]
            recent_highlights = highlights_data["recent"]
            current_patch_highlights = highlights_data["current_patch"]
            topic_highlights_dict = highlights_data["topics"]

            # Restrict topic highlights to topics that survived the min-mentions filter,
            # so the topic_highlights set is always consistent with general_topics.
            _surviving_topics = {t.topic for t in aggregated_topics}
            topic_highlights_list = [
                TopicHighlights(
                    topic=topic,
                    highlights=[Highlight(**h) for h in highlights],
                )
                for topic, highlights in topic_highlights_dict.items()
                if topic in _surviving_topics
            ]

            # A separate recent split is only meaningful when we analyzed
            # more reviews than the recent window holds.
            has_recent_split = processed > recent_limit
            has_current_patch = patch_timestamp is not None and current_patch_count > 0
            analysis_generated_at = datetime.now(timezone.utc)
            current_patch_date = (
                datetime.fromtimestamp(patch_timestamp, tz=timezone.utc)
                if patch_timestamp is not None
                else None
            )

            # Archive last_patch_topics when full analysis replaces a doc with a different patch.
            last_patch_topics: list[TopicSentiment] | None = None
            last_patch_reviews_count = 0
            if stale_doc:
                old_r = normalize_legacy_results(stale_doc.get("results", {}))
                old_patch_ts = old_r.get("current_patch_timestamp")
                if patch_timestamp and old_patch_ts and patch_timestamp != old_patch_ts:
                    # Patch changed: old current-patch aggregates become last_patch.
                    raw_cp = old_r.get("current_patch_topics")
                    last_patch_topics = [TopicSentiment(**t) for t in raw_cp] if raw_cp else None
                    last_patch_reviews_count = old_r.get("current_patch_reviews_count", 0)
                else:
                    # Same patch: carry the previous last_patch data forward.
                    raw_lp = old_r.get("last_patch_topics")
                    last_patch_topics = [TopicSentiment(**t) for t in raw_lp] if raw_lp else None
                    last_patch_reviews_count = old_r.get("last_patch_reviews_count", 0)

            result = AnalysisResult(
                game=game,
                general_topics=aggregated_topics,
                recent_topics=recent_topics if has_recent_split else None,
                recent_reviews_count=recent_reviews_count if has_recent_split else 0,
                current_patch_topics=current_patch_topics if has_current_patch else None,
                current_patch_reviews_count=current_patch_count if has_current_patch else 0,
                last_patch_topics=last_patch_topics,
                last_patch_reviews_count=last_patch_reviews_count,
                current_patch_timestamp=patch_timestamp,
                analysis_date=analysis_generated_at,
                current_patch_date=current_patch_date,
                prediction=prediction,
                analyzed_reviews=processed,
                skipped_count=total_skipped,
                general_highlights=[Highlight(**h) for h in general_highlights],
                recent_highlights=[Highlight(**h) for h in recent_highlights] if recent_highlights else None,
                current_patch_highlights=[Highlight(**h) for h in current_patch_highlights] if current_patch_highlights else None,
                topic_highlights=topic_highlights_list,
                cached_at=analysis_generated_at,
                preferred_context=compute_preferred_context(patch_timestamp),
                freshness_status=FreshnessStatus.FRESH.value,
                is_refreshing=False,
            )

            # 6. Save to cache
            await mongodb.save_analysis(
                game.app_id,
                result.model_dump(),
                analyzed_review_ids=all_review_ids,
                latest_review_timestamp=latest_timestamp,
                ttl_hours=ttl_hours,
                analyzed_at=analysis_generated_at,
            )

        total_elapsed = t_setup.elapsed_s + t_fetch_analyze.elapsed_s + t_save.elapsed_s

        logger.info(
            f"Analysis runner: completed {app_id} ({game_name}) — "
            f"{processed} reviews, {len(aggregated_topics)} topics"
        )

        if slog:
            log_structured(
                "analysis_complete",
                app_id=app_id,
                game_name=game_name,
                elapsed_s=round(total_elapsed, 3),
                source="worker",
                breakdown={
                    "setup_s": t_setup.elapsed_s,
                    "fetch_analyze_s": t_fetch_analyze.elapsed_s,
                    "nlp_cumulative_s": round(nlp_cumulative_s, 3),
                    "save_s": t_save.elapsed_s,
                },
                reviews_processed=processed,
                topics_found=len(aggregated_topics),
            )

        return result.model_dump()

    except Exception as e:
        logger.error(f"Analysis runner error for {app_id} ({game_name}): {e}", exc_info=True)
        if slog:
            log_structured(
                "analysis_error",
                level=logging.ERROR,
                app_id=app_id,
                game_name=game_name,
                source="worker",
                error=str(e),
            )
        return None
|
backend/app/services/analysis_utils.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared analysis helpers used by both live and worker paths."""
|
| 2 |
+
|
| 3 |
+
import time
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from app.core.config import settings
|
| 8 |
+
from app.models.schemas import (
|
| 9 |
+
PredictionType,
|
| 10 |
+
SentimentType,
|
| 11 |
+
TopicSentiment,
|
| 12 |
+
UserCountPrediction,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def calculate_prediction(topics: list[TopicSentiment]) -> UserCountPrediction:
    """Compute the player-count trend prediction from aggregated topics.

    Decision ladder, first match wins:
    1. Retention sentiment (when mentioned more than 5 times) against the
       configured positive/negative thresholds.
    2. Mean Bugs/Performance sentiment below -0.3 → decreasing.
    3. Mean Gameplay/Fun sentiment above 0.4 → increasing, below -0.2 →
       decreasing.
    4. Otherwise stable at 0.5 confidence.
    """
    by_name = {entry.topic: entry for entry in topics}

    def _mean_score(*names: str) -> tuple[float, int]:
        # Average score over the subset of named topics actually present.
        present = [by_name[n] for n in names if n in by_name]
        if not present:
            return 0.0, 0
        return sum(t.score for t in present) / len(present), len(present)

    # 1. Retention signal — strongest indicator when mentioned often enough.
    retention = by_name.get("Retention")
    if retention and retention.mention_count > 5:
        # Confidence grows with mention volume, capped at 0.95.
        confidence = min(0.95, 0.5 + (retention.mention_count / 100))
        if retention.score > settings.prediction_retention_threshold_pos:
            return UserCountPrediction(
                trend=PredictionType.INCREASING,
                confidence=confidence,
                reasoning="PREDICTION_REASONING_RETENTION_HIGH",
            )
        if retention.score < settings.prediction_retention_threshold_neg:
            return UserCountPrediction(
                trend=PredictionType.DECREASING,
                confidence=confidence,
                reasoning="PREDICTION_REASONING_RETENTION_LOW",
            )

    # 2. Technical-health signal (Bugs + Performance).
    tech_avg, tech_present = _mean_score("Bugs", "Performance")
    if tech_present > 0 and tech_avg < -0.3:
        return UserCountPrediction(
            trend=PredictionType.DECREASING,
            confidence=0.75,
            reasoning="PREDICTION_REASONING_TECH_ISSUES",
        )

    # 3. Enjoyment signal (Gameplay + Fun).
    gameplay_avg, gameplay_present = _mean_score("Gameplay", "Fun")
    if gameplay_present > 0:
        if gameplay_avg > 0.4:
            return UserCountPrediction(
                trend=PredictionType.INCREASING,
                confidence=0.8,
                reasoning="PREDICTION_REASONING_GAMEPLAY_HIGH",
            )
        if gameplay_avg < -0.2:
            return UserCountPrediction(
                trend=PredictionType.DECREASING,
                confidence=0.6,
                reasoning="PREDICTION_REASONING_GAMEPLAY_LOW",
            )

    # 4. No decisive signal.
    return UserCountPrediction(
        trend=PredictionType.STABLE,
        confidence=0.5,
        reasoning="PREDICTION_REASONING_STABLE",
    )
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def aggregate_topics(
    existing: list[TopicSentiment],
    new_batch: list[TopicSentiment],
) -> list[TopicSentiment]:
    """Merge topic aggregates using weighted mention counts.

    Each input topic contributes `score * mention_count`, so the merged score
    is the mention-weighted average across both inputs, clamped to [-1, 1].
    The example with the largest absolute score is kept, but only surfaced
    when its polarity agrees with the merged sentiment.

    Args:
        existing: Previously accumulated topic aggregates.
        new_batch: Topic aggregates from the newest batch of reviews.

    Returns:
        Merged aggregates sorted by mention_count descending.
    """
    topic_data: dict[str, dict[str, Any]] = {}

    def better_example(
        current: tuple[str, float] | None,
        new: tuple[str, float] | None,
    ) -> tuple[str, float] | None:
        # Keep whichever example carries the stronger (absolute) score.
        if new is None:
            return current
        if current is None:
            return new
        return new if abs(new[1]) > abs(current[1]) else current

    # Both inputs were previously accumulated by two identical loops;
    # process them in a single pass to keep the logic in one place.
    for topic in existing + new_batch:
        data = topic_data.setdefault(
            topic.topic, {"scores": [], "count": 0, "example": None}
        )
        data["scores"].append(topic.score * topic.mention_count)
        data["count"] += topic.mention_count
        new_example = (
            (topic.example, topic.example_score)
            if topic.example and topic.example_score is not None
            else None
        )
        data["example"] = better_example(data["example"], new_example)

    results: list[TopicSentiment] = []
    for topic_name, data in topic_data.items():
        count = data["count"]
        if count == 0:
            continue

        # Mention-weighted average, clamped to the valid sentiment range.
        average_score = sum(data["scores"]) / count
        normalized_score = max(-1.0, min(1.0, average_score))

        if normalized_score > settings.sentiment_positive_threshold:
            sentiment = SentimentType.POSITIVE
        elif normalized_score < settings.sentiment_negative_threshold:
            sentiment = SentimentType.NEGATIVE
        else:
            sentiment = SentimentType.NEUTRAL

        best_example = None
        example_score = None
        example_data = data["example"]
        if example_data:
            example_text, candidate_score = example_data
            # Only surface an example whose polarity matches the aggregate
            # sentiment (any example is acceptable for a neutral aggregate).
            if sentiment == SentimentType.NEUTRAL or (
                sentiment == SentimentType.POSITIVE and candidate_score > 0
            ) or (
                sentiment == SentimentType.NEGATIVE and candidate_score < 0
            ):
                best_example = example_text
                example_score = candidate_score

        results.append(
            TopicSentiment(
                topic=topic_name,
                sentiment=sentiment,
                score=round(normalized_score, 3),
                mention_count=count,
                example=best_example,
                example_score=example_score,
            )
        )

    results.sort(key=lambda item: item.mention_count, reverse=True)
    return results
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def scale_topics(topics: list[TopicSentiment], factor: float) -> list[TopicSentiment]:
    """Scale mention counts for the approximate recent sliding window."""
    scaled: list[TopicSentiment] = []
    for entry in topics:
        # Never let a scaled count drop below one mention.
        adjusted = max(1, int(entry.mention_count * factor))
        scaled.append(entry.model_copy(update={"mention_count": adjusted}))
    return scaled
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def filter_topics_by_min_mentions(
    topics: list[TopicSentiment],
    min_mentions: int | None = None,
) -> list[TopicSentiment]:
    """Filter topics below the minimum mention threshold.

    Preserves existing sort order. Only filters — does not modify score or sentiment.
    Applied at the final aggregate level, never at the per-review level.
    """
    # Fall back to the configured default when no explicit threshold is given.
    if min_mentions is None:
        threshold = settings.topic_min_mentions
    else:
        threshold = min_mentions
    return [entry for entry in topics if entry.mention_count >= threshold]
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def compute_preferred_context(patch_timestamp: int | None) -> str:
    """Choose the default user-facing context tab.

    Returns 'current_patch' only when a recent major patch exists; otherwise
    returns 'general' so the UI defaults to the full-picture view.
    """
    if patch_timestamp is not None:
        # Age of the patch in days relative to the current wall clock.
        age_in_days = (time.time() - patch_timestamp) / 86400
        if age_in_days <= settings.patch_context_max_age_days:
            return "current_patch"
    return "general"
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# Mapping from legacy persisted field names to their current schema names.
_LEGACY_FIELD_MAP = {
    "topics": "general_topics",
    "historical_topics": "general_topics",
    "post_update_topics": "current_patch_topics",
    "post_update_reviews_count": "current_patch_reviews_count",
    "post_update_highlights": "current_patch_highlights",
    "previous_update_topics": "last_patch_topics",
    "previous_update_reviews_count": "last_patch_reviews_count",
    "last_update_timestamp": "current_patch_timestamp",
}


def normalize_legacy_results(results: dict[str, Any]) -> dict[str, Any]:
    """Map legacy persisted result fields to the current schema."""
    renamed: dict[str, Any] = {}
    for field, value in results.items():
        # The retired incremental-analysis marker is dropped entirely.
        if field == "is_incremental":
            continue
        target = _LEGACY_FIELD_MAP.get(field, field)
        # First writer wins when two legacy fields collapse onto one key.
        if target not in renamed:
            renamed[target] = value
    return renamed
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def serialize_datetime(value: Any) -> str | Any:
|
| 239 |
+
"""Serialize datetimes in SSE payloads and persisted compatibility helpers."""
|
| 240 |
+
if isinstance(value, datetime):
|
| 241 |
+
return value.isoformat()
|
| 242 |
+
return value
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def coerce_utc_datetime(value: Any) -> datetime | None:
|
| 246 |
+
"""Coerce persisted datetime values into timezone-aware UTC datetimes."""
|
| 247 |
+
if isinstance(value, datetime):
|
| 248 |
+
return value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
|
| 249 |
+
if isinstance(value, str):
|
| 250 |
+
parsed = datetime.fromisoformat(value)
|
| 251 |
+
return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
|
| 252 |
+
return None
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def datetime_from_timestamp(timestamp: int | None) -> datetime | None:
|
| 256 |
+
"""Convert a unix timestamp into UTC datetime."""
|
| 257 |
+
if timestamp is None:
|
| 258 |
+
return None
|
| 259 |
+
return datetime.fromtimestamp(timestamp, tz=timezone.utc)
|
backend/app/services/game_sync_service.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Game Sync Service — fetches game data from SteamSpy and upserts to MongoDB.
|
| 3 |
+
|
| 4 |
+
Replaces the manual scripts/fetch_games_to_mongodb.py with an automated,
|
| 5 |
+
rate-limited sync that runs as part of the Worker cycle.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime, timezone
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
import httpx
|
| 14 |
+
|
| 15 |
+
from app.core.config import settings
|
| 16 |
+
from app.db.mongodb import mongodb
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)

# Base URLs for the two external data sources this service polls.
STEAMSPY_API_URL = "https://steamspy.com/api.php"
STEAM_STORE_API_URL = "https://store.steampowered.com/api"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class GameSyncService:
    """Syncs game data from SteamSpy into MongoDB."""

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When no client is injected, this service lazily creates its own
        # and is then responsible for closing it (see close()).
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        # Lazily create the shared HTTP client on first use.
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=30.0)
        return self._client

    async def close(self) -> None:
        # Only close a client we created ourselves; an injected client
        # belongs to the caller.
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def sync_all_games(self) -> tuple[int, int]:
        """
        Fetch all games from SteamSpy (paginated, up to 90 pages).

        Returns:
            (total_upserted, total_modified)
        """
        client = await self._get_client()
        total_upserted = 0
        total_modified = 0
        now = datetime.now(timezone.utc)

        for page in range(90):
            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "all", "page": page},
                )
                resp.raise_for_status()
                data = resp.json()

                # An empty page means we've walked past the last page.
                if not data:
                    logger.info(f"SteamSpy page {page} empty — sync complete")
                    break

                games = self._parse_all_response(data, now)
                if games:
                    upserted, modified = await mongodb.upsert_games_batch(games)
                    total_upserted += upserted
                    total_modified += modified

                logger.info(
                    f"SteamSpy page {page}: {len(games)} games "
                    f"(upserted={total_upserted}, modified={total_modified})"
                )

            except httpx.HTTPStatusError as e:
                # Abort the whole sync on an error; the next cycle retries.
                logger.error(f"SteamSpy HTTP error on page {page}: {e}")
                break
            except httpx.RequestError as e:
                logger.error(f"SteamSpy request error on page {page}: {e}")
                break

            # Rate limit: SteamSpy allows ~1 request per minute
            if page < 89:
                await asyncio.sleep(settings.game_sync_steamspy_delay)

        logger.info(
            f"Game sync complete: upserted={total_upserted}, modified={total_modified}"
        )
        return (total_upserted, total_modified)

    async def sync_top_game_details(self, limit: int | None = None) -> int:
        """
        Enrich top N games with detailed info (tags, genre, ccu) from SteamSpy.

        Returns:
            Number of games enriched.
        """
        limit = limit or settings.game_sync_top_n_details
        client = await self._get_client()

        top_games = await mongodb.get_top_games_by_reviews(limit)
        enriched = 0

        for game in top_games:
            appid = game.get("appid", "")
            if not appid:
                continue

            try:
                resp = await client.get(
                    STEAMSPY_API_URL,
                    params={"request": "appdetails", "appid": appid},
                )
                resp.raise_for_status()
                detail = resp.json()

                update = self._parse_detail_response(detail)
                if update:
                    await mongodb.upsert_game({"appid": appid, "name": game["name"], **update})
                    enriched += 1

            except httpx.HTTPStatusError as e:
                # Per-game errors are non-fatal; continue with the next game.
                logger.warning(f"SteamSpy detail error for {appid}: {e}")
            except httpx.RequestError as e:
                logger.warning(f"SteamSpy detail request error for {appid}: {e}")

            await asyncio.sleep(settings.game_sync_details_delay)

        logger.info(f"Enriched {enriched}/{len(top_games)} games with details")
        return enriched

    async def enrich_cn_names(self, limit: int | None = None) -> int:
        """
        Enrich games with Chinese names from Steam Store API.

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_cn_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_without_cn_name(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            name_en = game.get("name")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                if app_data and app_data.get("success"):
                    info = app_data.get("data", {})
                    name_cn = info.get("name")

                    # If names are different, we found a translation
                    if name_cn and name_cn != name_en:
                        await mongodb.mark_cn_name_checked(appid, name_cn)
                    else:
                        await mongodb.mark_cn_name_checked(appid)
                else:
                    # Not found or error in API - still mark as checked
                    await mongodb.mark_cn_name_checked(appid)

                processed += 1

            except httpx.HTTPError as e:
                logger.warning(f"Error fetching CN name for {appid}: {e}")
                # Don't mark as checked on network error, try again next cycle

            # Respect rate limits
            await asyncio.sleep(settings.game_sync_cn_enrichment_delay)

        logger.info(f"Enriched CN names for {processed}/{len(games_to_check)} games")
        return processed

    async def enrich_app_types(self, limit: int | None = None) -> int:
        """
        Enrich app_type/parent_appid using Steam Store appdetails.

        Returns:
            Number of games processed.
        """
        limit = limit or settings.game_sync_app_type_enrichment_limit
        client = await self._get_client()

        games_to_check = await mongodb.get_games_missing_app_type(limit)
        processed = 0

        for game in games_to_check:
            appid = game.get("appid")
            if not appid:
                continue

            try:
                app_data = await self._fetch_store_app_data(client, appid)
                # On a missing/unsuccessful payload, fall through with {} so
                # the game is still marked checked (as type "unknown").
                info = app_data.get("data", {}) if app_data and app_data.get("success") else {}

                parsed = self._parse_store_type_response(info)
                await mongodb.mark_app_type_checked(
                    appid,
                    app_type=parsed["app_type"],
                    parent_appid=parsed["parent_appid"],
                )
                processed += 1

            except httpx.HTTPError as e:
                # Network errors leave the game unchecked for a later retry.
                logger.warning(f"Error fetching app type for {appid}: {e}")

            await asyncio.sleep(settings.game_sync_app_type_enrichment_delay)

        logger.info(f"Enriched app types for {processed}/{len(games_to_check)} games")
        return processed

    @staticmethod
    def _parse_all_response(
        data: dict[str, Any], synced_at: datetime
    ) -> list[dict[str, Any]]:
        """Parse SteamSpy 'all' response into list of game dicts."""
        games: list[dict[str, Any]] = []
        for appid_str, info in data.items():
            name = info.get("name", "")
            # Skip nameless entries; they can't be meaningfully upserted.
            if not name:
                continue

            games.append({
                "appid": str(appid_str),
                "name": name,
                "developer": info.get("developer", ""),
                "publisher": info.get("publisher", ""),
                "positive": info.get("positive", 0),
                "negative": info.get("negative", 0),
                "synced_at": synced_at,
            })
        return games

    @staticmethod
    def _parse_detail_response(detail: dict[str, Any]) -> dict[str, Any]:
        """Parse SteamSpy 'appdetails' response into enrichment fields."""
        update: dict[str, Any] = {}

        tags = detail.get("tags")
        if isinstance(tags, dict) and tags:
            # Sort by vote count descending, keep top 20 tag names
            sorted_tags = sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]
            update["tags"] = [tag_name for tag_name, _ in sorted_tags]

        genre = detail.get("genre")
        if genre:
            update["genre"] = genre

        ccu = detail.get("ccu")
        if ccu is not None:
            update["ccu"] = ccu

        return update

    @staticmethod
    def _parse_store_type_response(info: dict[str, Any]) -> dict[str, Any]:
        """Extract app_type and (for DLC) parent_appid from a store payload."""
        # Missing/empty type is normalized to the sentinel "unknown".
        app_type = info.get("type") or "unknown"
        fullgame = info.get("fullgame")

        # Only DLC entries reference a parent game via "fullgame".
        parent_appid = None
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])

        return {
            "app_type": str(app_type),
            "parent_appid": parent_appid,
        }

    @staticmethod
    async def _fetch_store_app_data(
        client: httpx.AsyncClient, appid: str
    ) -> dict[str, Any] | None:
        """Fetch one appdetails payload from Steam Store."""
        resp = await client.get(
            f"{STEAM_STORE_API_URL}/appdetails",
            params={
                "appids": appid,
                "l": "schinese",
                "cc": "CN",
            },
        )
        resp.raise_for_status()
        data = resp.json()
        # The store API keys the response by appid; None when absent.
        return data.get(str(appid))
|
backend/app/services/highlights_service.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Serwis ekstrakcji Community Highlights z recenzji.
|
| 3 |
+
Uzywa n-gramow (2-5 tokenow) + TF-IDF do identyfikacji najczesciej uzywanych fraz.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
from collections import Counter, defaultdict
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
import jieba
|
| 11 |
+
|
| 12 |
+
from app.core.config import settings
|
| 13 |
+
from app.core.stopwords_zh import is_stopword
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class HighlightsCollector:
    """
    Stateful collector — accumulates data incrementally over the whole
    analysis cycle to conserve RAM. Highlights are computed once at the end.
    """

    def __init__(self) -> None:
        # Per-topic and per-category n-gram frequencies.
        self._topic_ngrams: dict[str, Counter] = defaultdict(Counter)
        self._category_ngrams: dict[str, Counter] = defaultdict(Counter)
        # Global n-gram frequency across all reviews.
        self._global_counts: Counter = Counter()
        # Number of distinct reviews each n-gram appeared in (document frequency).
        self._ngram_doc_freq: Counter = Counter()
        # Running sums/counts used to average sentiment per n-gram.
        self._ngram_sentiment_sum: dict[str, float] = defaultdict(float)
        self._ngram_sentiment_count: Counter = Counter()
        self._review_count = 0
        self._current_review_seen_ngrams: set[str] = set()

    def start_review(self) -> None:
        """Signal the start of a new review (used for Document Frequency)."""
        self._review_count += 1
        self._current_review_seen_ngrams = set()
        # Periodically drop n-grams seen only once, to bound memory usage.
        # Triggering here — once per 500 reviews — instead of inside
        # add_sentence avoids re-running the full-vocabulary sweep for
        # every sentence of each 500th review.
        if self._review_count % 500 == 0:
            self._prune_singletons()

    def add_sentence(
        self,
        review_idx: int,  # Kept for compatibility; use start_review() for separation
        sentence: str,
        topics: list[str],
        sentiment_score: float,
        categories: list[str] | None = None,
    ) -> None:
        """Called per sentence during analyze_batch()."""
        # Simple ASCII check for English phrases (avoids incorrect
        # segmentation of English text by jieba).
        is_ascii = all(ord(c) < 128 for c in sentence)
        if is_ascii:
            words = [w for w in sentence.split() if not is_stopword(w) and len(w.strip()) > 0]
        else:
            words = [w for w in jieba.lcut(sentence) if not is_stopword(w) and len(w.strip()) > 0]

        if len(words) < 2:
            return

        for n in range(settings.highlights_ngram_min, settings.highlights_ngram_max + 1):
            for i in range(len(words) - n + 1):
                ngram = " ".join(words[i : i + n])

                # 1. Global counters
                self._global_counts[ngram] += 1
                self._ngram_sentiment_sum[ngram] += sentiment_score
                self._ngram_sentiment_count[ngram] += 1

                # 2. Incremental Document Frequency (once per review)
                if ngram not in self._current_review_seen_ngrams:
                    self._ngram_doc_freq[ngram] += 1
                    self._current_review_seen_ngrams.add(ngram)

                # 3. Topic and category counters
                for topic in topics:
                    self._topic_ngrams[topic][ngram] += 1
                if categories:
                    for category in categories:
                        self._category_ngrams[category][ngram] += 1

    def _prune_singletons(self) -> None:
        """Deep-clean n-grams with count=1 (memory saving)."""
        singletons = [k for k, v in self._global_counts.items() if v <= 1]
        for k in singletons:
            del self._global_counts[k]
            if k in self._ngram_sentiment_sum:
                del self._ngram_sentiment_sum[k]
            # Counter.__delitem__ ignores missing keys, so these are safe
            # even if a counter never saw this n-gram.
            del self._ngram_sentiment_count[k]
            del self._ngram_doc_freq[k]

            # Clean topic counters
            for topic in self._topic_ngrams:
                if k in self._topic_ngrams[topic]:
                    del self._topic_ngrams[topic][k]

            # Clean category counters
            for cat in self._category_ngrams:
                if k in self._category_ngrams[cat]:
                    del self._category_ngrams[cat][k]

    def compute_highlights(self) -> dict[str, Any]:
        """
        Compute highlights once the analysis has finished.
        """
        if self._review_count == 0:
            return {
                "general": [],
                "recent": [],
                "current_patch": [],
                "topics": {}
            }

        results: dict[str, Any] = {
            "general": self._compute_tfidf_highlights(
                self._global_counts,
                top_n=settings.highlights_top_n_general,
            ),
            "recent": self._compute_tfidf_highlights(
                self._category_ngrams.get("recent", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "current_patch": self._compute_tfidf_highlights(
                self._category_ngrams.get("current_patch", Counter()),
                top_n=settings.highlights_top_n_general,
            ),
            "topics": {}
        }

        for topic, counter in self._topic_ngrams.items():
            h = self._compute_tfidf_highlights(
                counter,
                top_n=settings.highlights_top_n_per_topic,
            )
            if h:
                results["topics"][topic] = h

        return results

    def _compute_tfidf_highlights(self, counter: Counter, top_n: int) -> list[dict]:
        """TF-IDF scoring + filtering + dedup."""
        candidates = []
        n = self._review_count
        total_count = sum(counter.values()) if counter.values() else 1

        for ngram, count in counter.items():
            df = self._ngram_doc_freq.get(ngram, 0)

            # Drop phrases that are too rare or so common they carry no signal.
            if df < settings.highlights_min_mentions:
                continue
            if df / n > settings.highlights_max_doc_freq_ratio:
                continue

            idf = math.log(n / df) if df > 0 else 0
            tf = count / total_count
            tfidf = tf * idf
            rank_score = count * tfidf

            # Average sentiment from the running sum and count
            s_sum = self._ngram_sentiment_sum.get(ngram, 0.0)
            s_count = self._ngram_sentiment_count.get(ngram, 0)
            avg_score = s_sum / s_count if s_count > 0 else 0.0

            candidates.append({
                "phrase": ngram,
                "mention_count": df,
                "score": round(avg_score, 3),
                "sentiment": (
                    "positive" if avg_score > settings.sentiment_positive_threshold
                    else "negative" if avg_score < settings.sentiment_negative_threshold
                    else "neutral"
                ),
                "ngram_size": len(ngram.split()),
                "_rank": rank_score,
            })

        candidates.sort(key=lambda x: x["_rank"], reverse=True)

        # Substring absorption: a lower-ranked phrase contained in a
        # higher-ranked one is dropped, unless the two differ in negation
        # (不/没/无), since that flips the meaning.
        absorbed: set[int] = set()
        for i, c in enumerate(candidates):
            if i in absorbed:
                continue
            for j in range(i + 1, len(candidates)):
                if j in absorbed:
                    continue
                if candidates[j]["phrase"] in c["phrase"]:
                    parent_has_neg = any(neg in c["phrase"] for neg in ["不", "没", "无"])
                    child_has_neg = any(neg in candidates[j]["phrase"] for neg in ["不", "没", "无"])
                    if parent_has_neg == child_has_neg:
                        absorbed.add(j)

        results = [c for i, c in enumerate(candidates) if i not in absorbed]

        # Re-sort by mention_count descending for display order.
        # TF-IDF sort above selected the top candidates; this ensures the final
        # list the UI receives is ordered from most-mentioned to least-mentioned,
        # with score and phrase as stable tie-breakers.
        results.sort(key=lambda x: (-x["mention_count"], -x["score"], x["phrase"]))

        for r in results[:top_n]:
            r.pop("_rank", None)

        return results[:top_n]
|
backend/app/services/nlp_service.py
ADDED
|
@@ -0,0 +1,524 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Serwis NLP do analizy sentymentu i modelowania tematów.
|
| 3 |
+
|
| 4 |
+
Architektura: Local Inference (CPU).
|
| 5 |
+
Wykorzystuje model Transformer (DistilBERT) uruchamiany bezpośrednio w aplikacji,
|
| 6 |
+
co eliminuje opóźnienia sieciowe i zapewnia deterministyczny czas wykonania.
|
| 7 |
+
|
| 8 |
+
Optymalizacje:
|
| 9 |
+
1. Pre-kompilacja wzorców Regex (O(1) matching).
|
| 10 |
+
2. Wykonywanie inferencji w Executorze (nie blokuje Event Loop).
|
| 11 |
+
3. Batching zapytań do modelu (wykorzystanie instrukcji wektorowych CPU).
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import asyncio
|
| 17 |
+
import logging
|
| 18 |
+
import re
|
| 19 |
+
from collections import OrderedDict, defaultdict
|
| 20 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 21 |
+
from typing import TYPE_CHECKING
|
| 22 |
+
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
import jieba
|
| 26 |
+
from transformers import AutoTokenizer, pipeline
|
| 27 |
+
from optimum.onnxruntime import ORTModelForSequenceClassification
|
| 28 |
+
from zhconv import convert
|
| 29 |
+
|
| 30 |
+
from app.core.config import settings
|
| 31 |
+
from app.core.keywords import EXCLUSIONS, TOPIC_KEYWORDS
|
| 32 |
+
from app.models.schemas import SentimentType, TopicSentiment
|
| 33 |
+
|
| 34 |
+
if TYPE_CHECKING:
|
| 35 |
+
from app.services.highlights_service import HighlightsCollector
|
| 36 |
+
|
| 37 |
+
logger = logging.getLogger(__name__)
|
| 38 |
+
|
| 39 |
+
CARD_LAG_PREFIXES = frozenset({"不", "很", "好", "太", "真", "挺", "老", "总"})
|
| 40 |
+
CARD_STANDALONE_PREVIOUS_TOKENS = frozenset({"有点", "一直", "偶尔"})
|
| 41 |
+
|
| 42 |
+
# Zakresy Unicode dla Emoji i symboli graficznych
|
| 43 |
+
# UWAGA: Poprzedni pattern "\U000024C2-\U0001F251" był zbyt szeroki i usuwał chińskie znaki!
|
| 44 |
+
# Teraz używamy precyzyjnych zakresów tylko dla emoji.
|
| 45 |
+
EMOJI_PATTERN = re.compile(
|
| 46 |
+
"["
|
| 47 |
+
"\U0001F600-\U0001F64F" # Emoticons
|
| 48 |
+
"\U0001F300-\U0001F5FF" # Misc Symbols and Pictographs
|
| 49 |
+
"\U0001F680-\U0001F6FF" # Transport and Map
|
| 50 |
+
"\U0001F1E0-\U0001F1FF" # Flags (iOS)
|
| 51 |
+
"\U0001F900-\U0001F9FF" # Supplemental Symbols and Pictographs
|
| 52 |
+
"\U0001FA00-\U0001FA6F" # Chess Symbols
|
| 53 |
+
"\U0001FA70-\U0001FAFF" # Symbols and Pictographs Extended-A
|
| 54 |
+
"\U00002702-\U000027B0" # Dingbats
|
| 55 |
+
"\U0000FE00-\U0000FE0F" # Variation Selectors
|
| 56 |
+
"]+",
|
| 57 |
+
flags=re.UNICODE,
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
# Inteligentny podział na zdania (wspiera angielski i chiński)
|
| 61 |
+
# Chiński: 。!?;
|
| 62 |
+
# Angielski: .!?
|
| 63 |
+
# Interpunkcja do usunięcia przy deduplikacji (EN + ZH)
|
| 64 |
+
DEDUP_PUNCTUATION = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~。!?,、;:""''【】()《》~…·]')
|
| 65 |
+
|
| 66 |
+
SENTENCE_SPLIT_PATTERN = re.compile(r"""
|
| 67 |
+
(?<=[.!?。!?;])\s* # Koniec zdania (EN + ZH punctuation)
|
| 68 |
+
| # LUB
|
| 69 |
+
(?<=[a-z]),\s+ # Przecinek po literze + spacja...
|
| 70 |
+
(?=but\b|however\b|although\b|though\b) # ...przed spójnikiem przeciwstawnym (EN)
|
| 71 |
+
|
|
| 72 |
+
\s+(?=but\b|however\b|although\b|though\b) # Spójnik bez przecinka (EN)
|
| 73 |
+
|
|
| 74 |
+
(?<=。|!|?|;) # Po chińskiej interpunkcji (bez spacji)
|
| 75 |
+
|
|
| 76 |
+
(?=但是|然而|虽然|不过|可是) # Przed chińskim spójnikiem przeciwstawnym
|
| 77 |
+
""", re.VERBOSE | re.IGNORECASE)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class NLPService:
    """
    NLP service implementing a hybrid analysis:
    1. Keywords (regex) -> topic detection.
    2. DistilBERT (local ONNX model) -> sentiment analysis.
    """

    def __init__(self) -> None:
        """
        Initialise the ML pipeline and compile the text patterns.
        The model is loaded once at application start (singleton pattern).

        Raises:
            FileNotFoundError: if the pre-built quantized ONNX model is missing.
            Exception: any model-loading failure is logged and re-raised (fatal).
        """
        logger.info("Inicjalizacja serwisu NLP (ONNX Optimized)...")

        # 0. Jieba user dict — gaming-specific terms
        userdict_path = Path(__file__).parent.parent / "core" / "jieba_userdict.txt"
        if userdict_path.exists():
            jieba.load_userdict(str(userdict_path))
            logger.info(f"Załadowano jieba user dict: {userdict_path}")

        # 1. Regex compilation.
        # Keywords are merged into one efficient "automaton" (regex).
        # NOTE: \b does not work with Chinese characters, so different patterns
        # are used for ASCII words (with \b) and Chinese ones (without \b).
        self.topic_patterns = {}
        self.single_char_topic_keywords = {}
        self.exclusion_patterns = {}

        for topic, keyword_groups in TOPIC_KEYWORDS.items():
            ascii_keywords: list[str] = []
            chinese_keywords: list[str] = []
            chinese_single_char_keywords: list[str] = []

            for group_name, group in keyword_groups.items():
                for keyword in group:
                    if keyword.isascii():
                        ascii_keywords.append(keyword)
                    elif group_name == "single_char" and len(keyword) == 1:
                        chinese_single_char_keywords.append(keyword)
                    else:
                        chinese_keywords.append(keyword)

            self.single_char_topic_keywords[topic] = chinese_single_char_keywords

            patterns = []
            if ascii_keywords:
                # Use word boundaries for ASCII keywords
                sorted_ascii = sorted(ascii_keywords, key=len, reverse=True)
                patterns.append(r'\b(' + '|'.join(re.escape(k) for k in sorted_ascii) + r')\b')
            if chinese_keywords:
                # No word boundaries for Chinese (they don't have spaces),
                # but prefer longer keywords so compounds win over partial overlaps.
                sorted_chinese = sorted(chinese_keywords, key=len, reverse=True)
                patterns.append('(' + '|'.join(re.escape(k) for k in sorted_chinese) + ')')

            if patterns:
                combined_pattern = '|'.join(patterns)
                self.topic_patterns[topic] = re.compile(combined_pattern, re.IGNORECASE)

        for keyword, exclusions in EXCLUSIONS.items():
            if exclusions:
                pattern_str = '|'.join(re.escape(e) for e in exclusions)
                self.exclusion_patterns[keyword] = re.compile(pattern_str, re.IGNORECASE)

        # 2. Load the ONNX model
        logger.info(f"Ładowanie modelu ONNX {settings.hf_sentiment_model}...")
        try:
            from onnxruntime import GraphOptimizationLevel, SessionOptions

            # OPTIMISATION FOR HF SPACES (shared CPU):
            # the free tier gives 2 vCPUs; limiting threads avoids context
            # switching and contention for resources.
            session_options = SessionOptions()
            session_options.intra_op_num_threads = settings.nlp_onnx_intra_threads
            session_options.inter_op_num_threads = settings.nlp_onnx_inter_threads
            session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

            # Load pre-built quantized INT8 ONNX model (no PyTorch needed at runtime)
            quantized_path = Path(__file__).resolve().parent.parent.parent / "models" / "quantized"
            model_file = quantized_path / "model_quantized.onnx"
            if not model_file.exists():
                raise FileNotFoundError(
                    f"Quantized ONNX model not found at {model_file}. "
                    "Run 'python scripts/quantize_model.py' to generate it."
                )

            logger.info(f"Loading quantized INT8 model from {quantized_path}")
            model = ORTModelForSequenceClassification.from_pretrained(
                str(quantized_path),
                file_name="model_quantized.onnx",
                session_options=session_options,
            )
            tokenizer = AutoTokenizer.from_pretrained(str(quantized_path))

            self.classifier = pipeline(
                "sentiment-analysis",
                model=model,
                tokenizer=tokenizer,
                device="cpu",
            )

            logger.info("Model NLP ONNX ready: INT8 quantized, graph_optimization=ALL")
        except Exception as e:
            # Deliberate broad catch — model loading can fail with OSError, RuntimeError,
            # ONNX errors, HF Hub errors, etc. Always fatal, always re-raised.
            logger.error(f"Krytyczny błąd ładowania modelu ONNX: {e}")
            raise

        # Thread pool so heavy AI computation does not block the server (event loop)
        self.executor = ThreadPoolExecutor(max_workers=1)

        # Sentiment cache: normalized_text -> (label_str, score)
        self._sentiment_cache: OrderedDict[str, tuple[str, float]] = OrderedDict()
        self._cache_maxsize = settings.dedup_cache_maxsize

    def clean_text(self, text: str) -> str:
        """Remove noise (emoji, excess whitespace), lowercase, and truncate the text."""
        text = EMOJI_PATTERN.sub("", text)
        text = text.lower()
        text = re.sub(r"\s+", " ", text).strip()
        max_len = settings.text_max_length
        return text[:max_len] if len(text) > max_len else text

    def _normalize_for_dedup(self, text: str) -> str:
        """Normalise a sentence into a deduplication key (word order preserved)."""
        text = DEDUP_PUNCTUATION.sub("", text).lower()
        text = re.sub(r"\s+", " ", text).strip()
        # Fold traditional Chinese into simplified so variants dedupe together
        return convert(text, 'zh-cn')

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split a review into logical units (sentences/clauses)."""
        parts = SENTENCE_SPLIT_PATTERN.split(text)
        return [p.strip() for p in parts if p and p.strip()]

    def _has_negation(self, text: str, position: int) -> bool:
        """
        Detect a negation before a keyword (within the window defined in config).
        Useful for more precise aspect-level analysis in Chinese.
        """
        window = settings.nlp_negation_window
        left_context = text[max(0, position-window):position]
        return any(neg in left_context for neg in ["不", "没", "别", "无"])

    @staticmethod
    def _is_valid_single_char_token(keyword: str, token: str, previous_token: str | None) -> bool:
        """Validate a single-character Chinese keyword in the context of the whole token."""
        # Only "卡" (lag/stutter vs. "card") is ambiguous enough to need context checks.
        if keyword != "卡":
            return True
        if token == keyword:
            return previous_token is None or previous_token in CARD_STANDALONE_PREVIOUS_TOKENS
        return token.endswith(keyword) and token[:-1] in CARD_LAG_PREFIXES

    def _find_single_char_keyword_match(self, sentence: str, keywords: list[str]) -> tuple[int, str] | None:
        """Return the first valid match (offset, char) for a Chinese single-char keyword."""
        if not keywords:
            return None

        keyword_set = set(keywords)
        tokenized_sentence = list(jieba.tokenize(sentence))
        for index, (token, start, _) in enumerate(tokenized_sentence):
            previous_token = tokenized_sentence[index - 1][0] if index > 0 else None
            for offset, char in enumerate(token):
                if char not in keyword_set:
                    continue
                if self._is_valid_single_char_token(char, token, previous_token):
                    return start + offset, char
        return None

    def _detect_topics_regex(self, sentence: str) -> dict[str, bool]:
        """
        Fast topic detection using pre-compiled regexes.

        Complexity: O(N) in sentence length, independent of the keyword count.
        Returns a mapping of topic -> whether the match was negated.
        """
        detected = {}

        # TEMPORARY conversion to simplified Chinese for matching purposes only.
        # The original text (traditional/simplified) is preserved in the database,
        # while keywords.py can stay in zh-cn.
        sentence_simp = convert(sentence, 'zh-cn')

        for topic in TOPIC_KEYWORDS:
            regex_match = None
            if topic in self.topic_patterns:
                regex_match = self.topic_patterns[topic].search(sentence_simp)

            single_char_match = self._find_single_char_keyword_match(
                sentence_simp,
                self.single_char_topic_keywords.get(topic, []),
            )

            matched_word: str | None = None
            match_start: int | None = None

            # Prefer whichever match appears earliest in the sentence.
            if regex_match and single_char_match:
                if single_char_match[0] < regex_match.start():
                    match_start, matched_word = single_char_match
                else:
                    match_start = regex_match.start()
                    matched_word = regex_match.group(0).lower()
            elif regex_match:
                match_start = regex_match.start()
                matched_word = regex_match.group(0).lower()
            elif single_char_match:
                match_start, matched_word = single_char_match

            if matched_word is not None and match_start is not None:
                is_excluded = False

                if matched_word in self.exclusion_patterns:
                    if self.exclusion_patterns[matched_word].search(sentence_simp):
                        is_excluded = True

                if not is_excluded:
                    negated = self._has_negation(sentence_simp, match_start)
                    detected[topic] = negated

        return detected

    def _run_inference(self, texts: list[str]) -> list[dict]:
        """Wrapper around the Hugging Face pipeline, executed in a worker thread."""
        # batch_size=16 optimises matrix operations on CPU (AVX)
        # truncation=True, max_length=512 keeps inputs within the ONNX position limit
        # (max_position_embeddings=512); the pipeline accounts for special tokens automatically
        return self.classifier(texts, batch_size=16, truncation=True, max_length=512)

    @staticmethod
    def _map_label(label_str: str, score: float) -> tuple[SentimentType, float]:
        """Map a raw model label to (SentimentType, signed score)."""
        label_lower = label_str.lower()
        if 'positive' in label_lower or 'label_1' in label_lower:
            return (SentimentType.POSITIVE, score)
        elif 'negative' in label_lower or 'label_0' in label_lower:
            return (SentimentType.NEGATIVE, -score)
        return (SentimentType.NEUTRAL, 0.0)

    def _cache_put(self, key: str, value: tuple[str, float]) -> None:
        """Insert into the LRU cache, evicting oldest entries beyond the size limit."""
        self._sentiment_cache[key] = value
        self._sentiment_cache.move_to_end(key)
        while len(self._sentiment_cache) > self._cache_maxsize:
            self._sentiment_cache.popitem(last=False)

    async def analyze_sentiment_batch(
        self, texts: list[str]
    ) -> list[tuple[SentimentType, float]]:
        """
        Asynchronous interface for sentiment analysis.

        Offloads computation to a worker thread so the API is not blocked,
        and uses an LRU cache to skip sentences already analysed.
        """
        cleaned_texts = [self.clean_text(t) for t in texts]
        norm_keys = [self._normalize_for_dedup(t) for t in cleaned_texts]

        # Split into cache hits and misses
        final_sentiments: list[tuple[SentimentType, float]] = [(SentimentType.NEUTRAL, 0.0)] * len(texts)
        miss_indices: list[int] = []  # indices into cleaned_texts that must go to the model
        miss_texts: list[str] = []

        for i, (cleaned, key) in enumerate(zip(cleaned_texts, norm_keys)):
            if not cleaned:
                continue
            cached = self._sentiment_cache.get(key)
            if cached is not None:
                self._sentiment_cache.move_to_end(key)
                final_sentiments[i] = self._map_label(cached[0], cached[1])
            else:
                miss_indices.append(i)
                miss_texts.append(cleaned)

        cache_hits = len(texts) - len(miss_texts)
        logger.debug(f"Cache: {cache_hits} hits, {len(miss_texts)} misses (cache size: {len(self._sentiment_cache)})")

        if not miss_texts:
            return final_sentiments

        # Run the model ONLY on cache misses.
        # FIX: asyncio.get_running_loop() is the correct API inside a coroutine;
        # get_event_loop() is deprecated for this use since Python 3.10.
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(self.executor, self._run_inference, miss_texts)

        for j, res in enumerate(results):
            original_idx = miss_indices[j]
            label_str = res['label']
            score = res['score']

            # Store the raw result in the cache
            self._cache_put(norm_keys[original_idx], (label_str, score))

            final_sentiments[original_idx] = self._map_label(label_str, score)

        return final_sentiments

    async def analyze_batch(
        self,
        reviews: list[str],
        highlights_collector: HighlightsCollector | None = None,
        categories: list[str] | None = None,
    ) -> tuple[list[TopicSentiment], int]:
        """
        Main entry point for processing a batch of reviews.

        Combines segmentation, topic detection, and sentiment analysis.
        Returns (aggregated topic sentiments, number of skipped sentences).
        """
        if not reviews:
            return [], 0

        # Step 1: Pre-processing and identifying sentences to analyse
        sentiment_tasks = []
        skipped_sentences = 0

        for review_idx, review in enumerate(reviews):
            if highlights_collector:
                highlights_collector.start_review()

            cleaned = self.clean_text(review)
            if not cleaned or len(cleaned) < 5:
                continue

            sentences = self._split_into_sentences(cleaned)
            for sentence in sentences:
                topics_map = self._detect_topics_regex(sentence)
                if topics_map:
                    for topic, is_negated in topics_map.items():
                        sentiment_tasks.append((review_idx, topic, sentence, is_negated))
                else:
                    skipped_sentences += 1

        if not sentiment_tasks:
            return [], skipped_sentences

        # Step 2: Deduplication + sentiment analysis
        all_sentences = [task[2] for task in sentiment_tasks]

        # Dedup: normalise -> find uniques -> run inference on uniques only
        norm_keys = [self._normalize_for_dedup(s) for s in all_sentences]
        unique_map: dict[str, int] = {}  # normalized_key -> index in unique_texts
        unique_texts: list[str] = []

        for i, key in enumerate(norm_keys):
            if key not in unique_map:
                unique_map[key] = len(unique_texts)
                unique_texts.append(all_sentences[i])

        dedup_total = len(all_sentences)
        dedup_unique = len(unique_texts)
        dedup_pct = round((1 - dedup_unique / dedup_total) * 100) if dedup_total else 0
        logger.debug(f"Dedup: {dedup_total} -> {dedup_unique} sentences ({dedup_pct}% reduced)")

        unique_results = await self.analyze_sentiment_batch(unique_texts)

        # Map the unique results back onto all sentences
        sentiment_results = [unique_results[unique_map[key]] for key in norm_keys]

        # Step 3: Aggregation
        # review_id -> topic -> list of scores
        review_topic_scores: dict[int, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
        # topic -> (sentence, score) - online selection of the best example
        topic_best_example: dict[str, tuple[str, float]] = {}

        for i, (review_idx, topic, sentence, is_negated) in enumerate(sentiment_tasks):
            _, score = sentiment_results[i]

            # BULLETPROOF PIPELINE: if a negation was detected (e.g. "don't like the
            # gameplay") but the model still returned a positive sentiment, flip it.
            if is_negated and score > 0:
                score = -score

            review_topic_scores[review_idx][topic].append(score)

            if highlights_collector:
                highlights_collector.add_sentence(
                    review_idx=review_idx,
                    sentence=sentence,
                    topics=[topic],
                    sentiment_score=score,
                    categories=categories,
                )

            # Online selection — keep the candidate with the highest |score|
            if len(sentence) > 20:
                current = topic_best_example.get(topic)
                if current is None or abs(score) > abs(current[1]):
                    topic_best_example[topic] = (sentence, score)

        # Global aggregation: mean per review -> sum across reviews
        global_topic_stats: dict[str, dict[str, float]] = defaultdict(lambda: {"sum_score": 0.0, "count": 0.0})

        for review_idx, topics_data in review_topic_scores.items():
            for topic, scores in topics_data.items():
                avg_review_score = sum(scores) / len(scores)
                global_topic_stats[topic]["sum_score"] += avg_review_score
                global_topic_stats[topic]["count"] += 1.0

        # Step 4: Final formatting
        final_results: list[TopicSentiment] = []

        for topic_name, stats in global_topic_stats.items():
            count = int(stats["count"])
            if count == 0:
                continue

            avg_global_score = stats["sum_score"] / stats["count"]
            normalized_score = max(-1.0, min(1.0, avg_global_score))

            if normalized_score > settings.sentiment_positive_threshold:
                sentiment = SentimentType.POSITIVE
            elif normalized_score < settings.sentiment_negative_threshold:
                sentiment = SentimentType.NEGATIVE
            else:
                sentiment = SentimentType.NEUTRAL

            # Pick the best example and validate direction consistency
            best_example = None
            example_score = None
            candidate = topic_best_example.get(topic_name)
            if candidate:
                ex_sentence, ex_score = candidate
                # Validation: the example must agree with the sentiment direction
                if sentiment == SentimentType.NEUTRAL or \
                   (sentiment == SentimentType.POSITIVE and ex_score > 0) or \
                   (sentiment == SentimentType.NEGATIVE and ex_score < 0):
                    best_example = ex_sentence
                    example_score = ex_score

            final_results.append(
                TopicSentiment(
                    topic=topic_name,
                    sentiment=sentiment,
                    score=round(normalized_score, 3),
                    mention_count=count,
                    example=best_example,
                    example_score=example_score,
                )
            )

        final_results.sort(key=lambda x: x.mention_count, reverse=True)
        return final_results, skipped_sentences
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
# Process-wide singleton, created lazily on first request.
_nlp_service: "NLPService | None" = None


def get_nlp_service() -> "NLPService":
    """Return the shared NLPService instance, constructing it on first use."""
    global _nlp_service
    if _nlp_service is not None:
        return _nlp_service
    _nlp_service = NLPService()
    return _nlp_service
|
backend/app/services/precache_service.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pre-cache Service — schedules and executes background analyses for top games.
|
| 3 |
+
|
| 4 |
+
Creates refresh schedules with checkpoints (e.g. 6h, 12h, 24h after update)
|
| 5 |
+
and processes due analyses each cycle, prioritized by game popularity.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime, timedelta, timezone
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
from app.core.config import settings
|
| 14 |
+
from app.db.mongodb import mongodb
|
| 15 |
+
from app.services.analysis_runner import run_full_analysis, run_incremental_analysis
|
| 16 |
+
from app.services.nlp_service import NLPService
|
| 17 |
+
from app.services.steam_service import SteamService
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class PreCacheService:
|
| 23 |
+
"""Manages refresh schedules and triggers pre-cache analyses."""
|
| 24 |
+
|
| 25 |
+
def __init__(
|
| 26 |
+
self, steam_svc: SteamService, nlp_svc: NLPService
|
| 27 |
+
) -> None:
|
| 28 |
+
self._steam_svc = steam_svc
|
| 29 |
+
self._nlp_svc = nlp_svc
|
| 30 |
+
|
| 31 |
+
def create_schedule(
|
| 32 |
+
self, app_id: str, game_name: str, update_at: datetime, *, is_release: bool = False
|
| 33 |
+
) -> dict[str, Any]:
|
| 34 |
+
"""Build a schedule document with checkpoints from config."""
|
| 35 |
+
checkpoints = []
|
| 36 |
+
for offset_hours in settings.precache_checkpoints_list:
|
| 37 |
+
checkpoints.append({
|
| 38 |
+
"offset_hours": offset_hours,
|
| 39 |
+
"due_at": update_at + timedelta(hours=offset_hours),
|
| 40 |
+
"completed": False,
|
| 41 |
+
})
|
| 42 |
+
|
| 43 |
+
return {
|
| 44 |
+
"app_id": str(app_id),
|
| 45 |
+
"game_name": game_name,
|
| 46 |
+
"update_at": update_at,
|
| 47 |
+
"checkpoints": checkpoints,
|
| 48 |
+
"is_release": is_release,
|
| 49 |
+
"status": "active",
|
| 50 |
+
"created_at": datetime.now(timezone.utc),
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
def create_bootstrap_schedule(
|
| 54 |
+
self, app_id: str, game_name: str
|
| 55 |
+
) -> dict[str, Any]:
|
| 56 |
+
"""Release schedule for a newly prioritized game, starting at 6h."""
|
| 57 |
+
now = datetime.now(timezone.utc)
|
| 58 |
+
return self.create_schedule(app_id, game_name, now, is_release=True)
|
| 59 |
+
|
| 60 |
+
async def create_schedules_for_updates(
|
| 61 |
+
self, updated_games: list[dict[str, Any]]
|
| 62 |
+
) -> int:
|
| 63 |
+
"""Bulk-create schedules for games that received updates."""
|
| 64 |
+
active_schedules = await mongodb.get_active_schedules()
|
| 65 |
+
active_by_app_id = {s["app_id"]: s for s in active_schedules}
|
| 66 |
+
|
| 67 |
+
created = 0
|
| 68 |
+
for game in updated_games:
|
| 69 |
+
app_id = str(game.get("appid", ""))
|
| 70 |
+
name = game.get("name", "")
|
| 71 |
+
update_at = game.get("update_at", datetime.now(timezone.utc))
|
| 72 |
+
|
| 73 |
+
existing = active_by_app_id.get(app_id)
|
| 74 |
+
if existing:
|
| 75 |
+
existing_update_at = existing.get("update_at")
|
| 76 |
+
if existing_update_at and update_at <= existing_update_at:
|
| 77 |
+
continue # Same or older patch — don't reset checkpoints
|
| 78 |
+
|
| 79 |
+
schedule = self.create_schedule(app_id, name, update_at)
|
| 80 |
+
await mongodb.upsert_refresh_schedule(schedule)
|
| 81 |
+
created += 1
|
| 82 |
+
|
| 83 |
+
logger.info(f"Created {created} refresh schedules for updated games")
|
| 84 |
+
return created
|
| 85 |
+
|
| 86 |
+
async def bootstrap_missing_analyses(
|
| 87 |
+
self, top_games: list[dict[str, Any]]
|
| 88 |
+
) -> int:
|
| 89 |
+
"""For top games with no cached analysis, create release schedules."""
|
| 90 |
+
# Pre-fetch active schedule app_ids for O(1) lookup
|
| 91 |
+
active_schedules = await mongodb.get_active_schedules()
|
| 92 |
+
scheduled_app_ids = {s["app_id"] for s in active_schedules}
|
| 93 |
+
|
| 94 |
+
created = 0
|
| 95 |
+
for game in top_games:
|
| 96 |
+
app_id = str(game.get("appid", ""))
|
| 97 |
+
if not app_id or app_id in scheduled_app_ids:
|
| 98 |
+
continue
|
| 99 |
+
|
| 100 |
+
# Check if analysis already cached
|
| 101 |
+
cached = await mongodb.get_cached_analysis(app_id)
|
| 102 |
+
if cached is not None:
|
| 103 |
+
continue
|
| 104 |
+
|
| 105 |
+
schedule = self.create_bootstrap_schedule(app_id, game.get("name", ""))
|
| 106 |
+
await mongodb.upsert_refresh_schedule(schedule)
|
| 107 |
+
scheduled_app_ids.add(app_id)
|
| 108 |
+
created += 1
|
| 109 |
+
|
| 110 |
+
logger.info(f"Bootstrap: created {created} release schedules")
|
| 111 |
+
return created
|
| 112 |
+
|
| 113 |
+
async def process_due_analyses(self) -> int:
|
| 114 |
+
"""
|
| 115 |
+
Main processing loop: find due checkpoints, prioritize, execute.
|
| 116 |
+
|
| 117 |
+
Returns:
|
| 118 |
+
Number of analyses executed.
|
| 119 |
+
"""
|
| 120 |
+
now = datetime.now(timezone.utc)
|
| 121 |
+
schedules = await mongodb.get_active_schedules()
|
| 122 |
+
max_per_cycle = settings.precache_max_analyses_per_cycle
|
| 123 |
+
delay = settings.precache_batch_delay_seconds
|
| 124 |
+
|
| 125 |
+
# Find one due checkpoint per game
|
| 126 |
+
due_items: list[dict[str, Any]] = []
|
| 127 |
+
for schedule in schedules:
|
| 128 |
+
for cp in schedule.get("checkpoints", []):
|
| 129 |
+
if cp.get("completed"):
|
| 130 |
+
continue
|
| 131 |
+
if cp["due_at"] <= now:
|
| 132 |
+
due_items.append({
|
| 133 |
+
"app_id": schedule["app_id"],
|
| 134 |
+
"game_name": schedule.get("game_name", ""),
|
| 135 |
+
"offset_hours": cp["offset_hours"],
|
| 136 |
+
"due_at": cp["due_at"],
|
| 137 |
+
"positive": schedule.get("positive", 0),
|
| 138 |
+
"negative": schedule.get("negative", 0),
|
| 139 |
+
})
|
| 140 |
+
break # Only first due checkpoint per game
|
| 141 |
+
|
| 142 |
+
if not due_items:
|
| 143 |
+
logger.info("Pre-cache: no due analyses")
|
| 144 |
+
return 0
|
| 145 |
+
|
| 146 |
+
# Sort by popularity DESC, then due_at ASC
|
| 147 |
+
due_items.sort(
|
| 148 |
+
key=lambda x: (-(x.get("positive", 0) + x.get("negative", 0)), x["due_at"])
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
# Execute up to max_per_cycle
|
| 152 |
+
executed = 0
|
| 153 |
+
for item in due_items[:max_per_cycle]:
|
| 154 |
+
app_id = item["app_id"]
|
| 155 |
+
game_name = item["game_name"]
|
| 156 |
+
offset_hours = item["offset_hours"]
|
| 157 |
+
|
| 158 |
+
logger.info(f"Pre-cache: analyzing {app_id} ({game_name}) — checkpoint {offset_hours}h")
|
| 159 |
+
|
| 160 |
+
existing = await mongodb.get_analysis(app_id)
|
| 161 |
+
if existing and existing.get("results"):
|
| 162 |
+
result = await run_incremental_analysis(
|
| 163 |
+
app_id, game_name, self._steam_svc, self._nlp_svc
|
| 164 |
+
)
|
| 165 |
+
else:
|
| 166 |
+
result = await run_full_analysis(
|
| 167 |
+
app_id, game_name, self._steam_svc, self._nlp_svc
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
if result is not None:
|
| 171 |
+
executed += 1
|
| 172 |
+
|
| 173 |
+
# Mark checkpoint completed regardless of success
|
| 174 |
+
await mongodb.mark_checkpoint_completed(app_id, offset_hours)
|
| 175 |
+
|
| 176 |
+
# Check if all checkpoints done → complete schedule
|
| 177 |
+
await self._check_schedule_completion(app_id)
|
| 178 |
+
|
| 179 |
+
if executed < max_per_cycle and item != due_items[-1]:
|
| 180 |
+
await asyncio.sleep(delay)
|
| 181 |
+
|
| 182 |
+
logger.info(f"Pre-cache: executed {executed}/{len(due_items)} due analyses")
|
| 183 |
+
return executed
|
| 184 |
+
|
| 185 |
+
@staticmethod
async def _check_schedule_completion(app_id: str) -> None:
    """Mark the schedule for *app_id* completed once every checkpoint is done.

    Scans the active schedules for the one matching this game; if each of
    its checkpoints carries ``completed: True`` the schedule is closed via
    ``mongodb.complete_schedule``. At most one schedule per game is examined.
    """
    target = str(app_id)
    for sched in await mongodb.get_active_schedules():
        if sched["app_id"] != target:
            continue
        # NOTE(review): all(...) over an empty checkpoint list is True, so a
        # schedule with no checkpoints is immediately completed — confirm
        # that is intended.
        checkpoints = sched.get("checkpoints", [])
        if all(cp.get("completed", False) for cp in checkpoints):
            await mongodb.complete_schedule(app_id)
            logger.info(f"Schedule completed for {app_id}")
        break
|
backend/app/services/priority_refresh_service.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Priority Refresh Service — maintains canonical priority game state in MongoDB.
|
| 3 |
+
|
| 4 |
+
Priority sources:
|
| 5 |
+
- top500: top 500 games by review count (local DB)
|
| 6 |
+
- top_sellers / new_releases / specials: Steam store featured categories
|
| 7 |
+
|
| 8 |
+
Priority state fields on games documents:
|
| 9 |
+
is_priority bool
|
| 10 |
+
priority_sources list[str]
|
| 11 |
+
priority_grace_until datetime | None
|
| 12 |
+
priority_last_confirmed_at datetime | None
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import asyncio
|
| 16 |
+
import logging
|
| 17 |
+
from datetime import datetime, timedelta, timezone
|
| 18 |
+
from typing import Any
|
| 19 |
+
|
| 20 |
+
import httpx
|
| 21 |
+
|
| 22 |
+
from app.core.config import settings
|
| 23 |
+
from app.db.mongodb import mongodb
|
| 24 |
+
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class PriorityRefreshService:
    """Refreshes priority flags on the games collection each worker cycle.

    Priority comes from two kinds of sources (see module docstring):
    the local top-500-by-reviews list and Steam store featured categories.
    The service recomputes the canonical ``is_priority`` state, bootstraps
    games seen in store categories but missing from the local DB, and
    manages a grace period before a game loses priority.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When no client is injected, one is created lazily and owned (and
        # therefore closed) by this service; an injected client is not closed.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        # Lazily create the HTTP client on first use.
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=15.0)
        return self._client

    async def close(self) -> None:
        # Only close a client this service created itself.
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    async def refresh_priorities(self) -> dict[str, Any]:
        """
        Recompute is_priority for all games and write changes to MongoDB.

        Pipeline: build the active-source map (top500 + store categories),
        bootstrap category games missing from the DB, apply DLC
        inheritance/removal rules, diff against the stored priority state,
        and bulk-write the resulting field updates.

        Returns a summary dict with counts.
        """
        now = datetime.now(timezone.utc)
        grace_deadline = now + timedelta(days=settings.steam_priority_grace_days)

        # 1. Build active sources map
        top500_ids: set[str] = {
            g["appid"]
            for g in await mongodb.get_top_games_by_reviews(500)
            if g.get("app_type") != "dlc"
        }

        category_ids: dict[str, set[str]] = await self._fetch_store_categories()

        # Map appid -> list of source labels ("top500" plus category names).
        active_sources: dict[str, list[str]] = {}
        for appid in top500_ids:
            active_sources.setdefault(appid, []).append("top500")
        for cat_name, ids in category_ids.items():
            for appid in ids:
                active_sources.setdefault(appid, []).append(cat_name)

        # 1b. Bootstrap category games that are missing from the local DB.
        # top500 appids are safe — they come from existing DB records.
        # Category appids may reference games not yet in our DB.
        all_category_appids: set[str] = set()
        for ids in category_ids.values():
            all_category_appids.update(ids)

        bootstrap_summary: dict[str, Any] = {}
        if all_category_appids:
            _, bootstrap_summary = await self._bootstrap_missing_games(all_category_appids)
            # After bootstrap, remove from active_sources any category appid that
            # still has no DB record (failed bootstrap / delisted / per-cycle limit).
            # This prevents bulk_update_priority_fields from silently no-oping.
            existing_in_db = await mongodb.get_existing_appids(all_category_appids)
            for appid in all_category_appids - existing_in_db:
                active_sources.pop(appid, None)

        # 2. Load current priority state (only games that already have is_priority field)
        existing_priority_docs: list[dict[str, Any]] = []
        if mongodb.db is not None:
            try:
                collection = mongodb.db[mongodb.COLLECTION_GAMES]
                cursor = collection.find(
                    {"is_priority": {"$exists": True}},
                    {
                        "_id": 0,
                        "appid": 1,
                        "app_type": 1,
                        "is_priority": 1,
                        "priority_grace_until": 1,
                        "priority_sources": 1,
                    },
                )
                # NOTE(review): hard cap of 10000 docs — confirm this exceeds
                # the realistic number of priority-flagged games.
                existing_priority_docs = await cursor.to_list(length=10000)
            except Exception as e:
                logger.warning(f"Failed to load existing priority docs: {e}")

        existing_by_appid: dict[str, dict] = {
            str(d["appid"]): d for d in existing_priority_docs
        }

        # 2b. DLC inherits effective priority from its parent game.
        if settings.dlc_worker_analysis_enabled:
            # Parents = games active this cycle, plus previously-priority games
            # still inside their grace window.
            priority_parent_ids: set[str] = set(active_sources.keys())
            for appid, doc in existing_by_appid.items():
                if doc.get("app_type") == "dlc":
                    continue
                if not doc.get("is_priority") or appid in active_sources:
                    continue

                grace_until = doc.get("priority_grace_until")
                # NOTE(review): assumes grace_until and `now` have matching
                # tz-awareness (driver returning naive datetimes would make
                # this comparison raise) — confirm the Mongo client config.
                if grace_until is None or grace_until >= now:
                    priority_parent_ids.add(appid)

            for parent_appid in priority_parent_ids:
                dlcs = await mongodb.get_dlcs_by_parent_appid(parent_appid)
                for dlc in dlcs:
                    dlc_appid = str(dlc.get("appid", ""))
                    if dlc_appid:
                        # Inherited priority replaces any other source labels.
                        active_sources[dlc_appid] = ["parent_priority"]

        # 2c. When DLC worker analysis is disabled, remove any DLC that entered
        # active_sources via other paths (e.g. Steam store categories).
        if not settings.dlc_worker_analysis_enabled:
            dlc_appids_to_remove = {
                appid
                for appid in active_sources
                if existing_by_appid.get(appid, {}).get("app_type") == "dlc"
            }
            for appid in dlc_appids_to_remove:
                del active_sources[appid]

        # 3. Compute updates
        updates: list[tuple[str, dict]] = []
        became_priority = 0
        entered_grace = 0
        expired_grace = 0
        reactivated = 0
        removed_parent_priority = 0

        # Active games — either new or confirming existing priority
        for appid, sources in active_sources.items():
            existing = existing_by_appid.get(appid)
            fields: dict[str, Any] = {
                "is_priority": True,
                "priority_sources": sources,
                "priority_grace_until": None,
                "priority_last_confirmed_at": now,
            }
            if existing is None or not existing.get("is_priority"):
                became_priority += 1
            elif existing.get("priority_grace_until") is not None:
                # Was in grace, re-entered an active source.
                reactivated += 1
            updates.append((appid, fields))

        # Games that were priority but are no longer in any active source
        for appid, doc in existing_by_appid.items():
            if appid in active_sources:
                continue  # already handled above
            if not doc.get("is_priority"):
                continue  # already marked non-priority, skip

            # Inherited (parent_priority) flags get no grace period — they are
            # dropped as soon as the parent stops being priority.
            if "parent_priority" in (doc.get("priority_sources") or []):
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                removed_parent_priority += 1
                continue

            grace_until = doc.get("priority_grace_until")

            if grace_until is None:
                # Just left all sources — start grace period
                updates.append((appid, {
                    "priority_grace_until": grace_deadline,
                    "priority_sources": [],
                }))
                entered_grace += 1
            elif grace_until < now:
                # Grace expired — remove priority
                updates.append((appid, {
                    "is_priority": False,
                    "priority_sources": [],
                    "priority_grace_until": None,
                }))
                expired_grace += 1
            # else: still in grace and not expired — no update needed

        modified = await mongodb.bulk_update_priority_fields(updates)

        result = {
            "total_active": len(active_sources),
            "top500_count": len(top500_ids),
            "category_counts": {k: len(v) for k, v in category_ids.items()},
            "bootstrap": bootstrap_summary,
            "became_priority": became_priority,
            "reactivated": reactivated,
            "entered_grace": entered_grace,
            "expired_grace": expired_grace,
            "removed_parent_priority": removed_parent_priority,
            "db_modified": modified,
        }
        logger.info(f"Priority refresh complete: {result}")
        return result

    @staticmethod
    def _parse_app_type(data: dict[str, Any]) -> dict[str, Any]:
        """Parse app_type and parent_appid from an appdetails data block.

        parent_appid is only populated for DLC entries that carry a
        ``fullgame.appid`` reference; otherwise it is None.
        """
        app_type = data.get("type") or "unknown"
        fullgame = data.get("fullgame")
        parent_appid = None
        if app_type == "dlc" and isinstance(fullgame, dict) and fullgame.get("appid") is not None:
            parent_appid = str(fullgame["appid"])
        return {"app_type": str(app_type), "parent_appid": parent_appid}

    async def _fetch_app_details_bilingual(self, appid: str) -> dict[str, Any] | None:
        """
        Fetch appdetails for a single game in both english and schinese.

        Returns a minimal game dict (name, name_cn, app_type, parent_appid,
        header_image, cn_name_checked) or None on failure / not found.
        """
        client = await self._get_client()
        store_url = "https://store.steampowered.com/api/appdetails"

        async def _fetch_one(lang: str) -> dict[str, Any]:
            # Best-effort single-language fetch; {} signals "no usable data".
            try:
                resp = await client.get(
                    store_url,
                    params={"appids": appid, "l": lang, "cc": settings.steam_region},
                )
                if resp.status_code != 200:
                    return {}
                entry = resp.json().get(str(appid))
                if entry and entry.get("success"):
                    return entry.get("data") or {}
                return {}
            except Exception as e:
                logger.warning(f"appdetails error for {appid} (lang={lang}): {e}")
                return {}

        data_en, data_cn = await asyncio.gather(
            _fetch_one("english"),
            _fetch_one("schinese"),
        )

        if not data_en and not data_cn:
            logger.warning(f"No appdetails for {appid} — skipping bootstrap")
            return None

        name_en = data_en.get("name") or data_cn.get("name")
        if not name_en:
            logger.warning(f"No name in appdetails for {appid} — skipping bootstrap")
            return None

        # Store the Chinese name only when it actually differs.
        name_cn = data_cn.get("name")
        base = data_en or data_cn
        type_info = self._parse_app_type(base)

        return {
            "appid": appid,
            "name": name_en,
            "name_cn": name_cn if name_cn and name_cn != name_en else None,
            "cn_name_checked": True,
            "app_type": type_info["app_type"],
            "parent_appid": type_info["parent_appid"],
            "header_image": base.get("header_image"),
        }

    async def _bootstrap_missing_games(
        self,
        category_appids: set[str],
    ) -> tuple[set[str], dict[str, Any]]:
        """
        Fetch Steam Store data and upsert games missing from the local DB.

        At most ``settings.steam_bootstrap_max_per_cycle`` games are fetched
        per call, with ``settings.steam_bootstrap_delay`` seconds between
        requests to stay polite toward the store API.

        Returns:
            (bootstrapped_appids, summary_dict)
            bootstrapped_appids: set of appids that were newly upserted
        """
        existing = await mongodb.get_existing_appids(category_appids)
        missing = category_appids - existing

        if not missing:
            return set(), {"bootstrapped": 0, "failed": 0, "skipped_existing": len(existing)}

        limit = settings.steam_bootstrap_max_per_cycle
        appids_to_fetch = list(missing)[:limit]
        bootstrapped: set[str] = set()
        failed = 0

        for i, appid in enumerate(appids_to_fetch):
            game_data = await self._fetch_app_details_bilingual(appid)
            if game_data is None:
                failed += 1
            else:
                await mongodb.upsert_game(game_data)
                bootstrapped.add(appid)

            # Throttle between fetches (skip the sleep after the last one).
            if i < len(appids_to_fetch) - 1:
                await asyncio.sleep(settings.steam_bootstrap_delay)

        summary = {
            "bootstrapped": len(bootstrapped),
            "failed": failed,
            "skipped_existing": len(existing),
            "missing_over_limit": max(0, len(missing) - limit),
        }
        if bootstrapped or failed:
            logger.info(f"Bootstrap missing games: {summary}")
        return bootstrapped, summary

    async def _fetch_region_categories(self, region: str) -> dict[str, set[str]]:
        """
        Fetch featured categories for a single Steam region (cc=region).

        Returns dict mapping category name -> set of appid strings.
        On any failure, returns {} so the caller can continue with other regions.
        """
        try:
            client = await self._get_client()
            resp = await client.get(
                settings.steam_priority_categories_url,
                params={"cc": region, "l": "schinese"},
            )
            if resp.status_code != 200:
                logger.warning(
                    f"Steam featuredcategories [{region}] returned {resp.status_code} — skipping region"
                )
                return {}

            data = resp.json()
        except Exception as e:
            logger.warning(
                f"Failed to fetch Steam store categories [{region}]: {e} — skipping region"
            )
            return {}

        result: dict[str, set[str]] = {}
        for cat_name in settings.steam_priority_categories_list:
            cat_data = data.get(cat_name)
            if not cat_data:
                continue
            items = cat_data.get("items", [])
            # NOTE(review): type == 0 appears to select app (game) entries as
            # opposed to other item kinds — confirm against the store payload.
            appids: set[str] = {
                str(item["id"])
                for item in items
                if item.get("type") == 0 and item.get("id") is not None
            }
            result[cat_name] = appids

        return result

    async def _fetch_store_categories(self) -> dict[str, set[str]]:
        """
        Fetch game appids from Steam store featured categories across all configured regions.

        Iterates over steam_priority_regions_list (default: CN, US) and merges results.
        If one region fails, the other is still used. If all fail, returns {} (fallback
        to top-500 only).

        Returns dict mapping category name -> set of appid strings.
        """
        regions = settings.steam_priority_regions_list
        if not regions:
            logger.warning(
                "steam_priority_regions is empty — skipping store categories fetch (top500 only)"
            )
            return {}

        merged: dict[str, set[str]] = {}
        for region in regions:
            region_data = await self._fetch_region_categories(region)
            for cat_name, appids in region_data.items():
                merged.setdefault(cat_name, set()).update(appids)
        return merged
|
backend/app/services/steam_errors.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Custom exceptions for Steam API errors.
|
| 3 |
+
|
| 4 |
+
Separate module to avoid circular imports between mongodb.py and steam_service.py.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SteamAPIError(Exception):
    """Non-retryable Steam API failure (404, 403, or another 4xx status)."""

    def __init__(self, status_code: int, app_id: str, message: str = "") -> None:
        # Synthesize a default message from the status and app id when the
        # caller does not supply one.
        if not message:
            message = f"Steam API error {status_code} for app {app_id}"
        self.status_code = status_code
        self.app_id = app_id
        self.message = message
        super().__init__(message)


class SteamRateLimitError(SteamAPIError):
    """Steam API kept answering 429 until every retry attempt was used up."""

    def __init__(self, app_id: str) -> None:
        super().__init__(
            status_code=429,
            app_id=app_id,
            message=f"Steam API rate limited for app {app_id}",
        )
|
backend/app/services/steam_service.py
ADDED
|
@@ -0,0 +1,499 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Serwis do komunikacji ze Steam API.
|
| 3 |
+
|
| 4 |
+
Odpowiada za pobieranie informacji o grach oraz recenzji.
|
| 5 |
+
Wykorzystuje publiczne API Steam (nie wymaga klucza API).
|
| 6 |
+
Implementuje statystyczne próbkowanie recenzji (stratified sampling).
|
| 7 |
+
Retry z exponential backoff dla 429/5xx/timeout.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import asyncio
|
| 11 |
+
import logging
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from typing import Any, AsyncGenerator
|
| 14 |
+
|
| 15 |
+
import httpx
|
| 16 |
+
|
| 17 |
+
from app.core.config import settings
|
| 18 |
+
from app.core.sampling import SamplePlan, create_sample_plan
|
| 19 |
+
from app.db.mongodb import mongodb
|
| 20 |
+
from app.models.schemas import GameInfo, ReviewBatch, ReviewItem
|
| 21 |
+
from app.services.steam_errors import SteamAPIError, SteamRateLimitError
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
# Status codes that should be retried
|
| 26 |
+
_RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class ReviewStats:
    """Aggregate review statistics for a game (from Steam's query_summary)."""

    total: int  # total number of reviews
    positive: int  # number of positive (thumbs-up) reviews
    negative: int  # number of negative (thumbs-down) reviews
|
| 37 |
+
|
| 38 |
+
class SteamService:
|
| 39 |
+
"""
|
| 40 |
+
Serwis do pobierania danych ze Steam API.
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
STORE_API_URL = "https://store.steampowered.com/api"
|
| 44 |
+
REVIEW_API_URL = "https://store.steampowered.com/appreviews"
|
| 45 |
+
SEARCH_API_URL = "https://store.steampowered.com/api/storesearch"
|
| 46 |
+
|
| 47 |
+
def __init__(self, timeout: float = 30.0) -> None:
    """Create the service with one shared async HTTP client.

    Args:
        timeout: request timeout in seconds for the shared client.
    """
    self.timeout = timeout
    # A single client is reused for every request; callers must await close().
    self.client = httpx.AsyncClient(timeout=timeout)
|
| 50 |
+
|
| 51 |
+
async def close(self) -> None:
    """Close the shared HTTP client, releasing its connections (call on shutdown)."""
    await self.client.aclose()
|
| 54 |
+
|
| 55 |
+
async def _request_with_retry(
    self,
    client: httpx.AsyncClient,
    url: str,
    params: dict[str, Any],
    context: str = "",
) -> httpx.Response:
    """
    Perform a GET request with retries and exponential backoff.

    Retries 429 and 5xx responses as well as timeouts/connection errors,
    sleeping min(base_delay * 2**attempt, max_delay) between attempts
    (honoring a numeric Retry-After header on 429). Raises SteamAPIError
    for non-retryable 4xx responses or once retries are exhausted, and
    SteamRateLimitError when 429 persists through every attempt.

    Args:
        client: HTTP client used to issue the request.
        url: absolute request URL.
        params: query-string parameters.
        context: identifier (app id / search term) used in logs and exceptions.
    """
    max_attempts = settings.steam_retry_max_attempts
    base_delay = settings.steam_retry_base_delay
    max_delay = settings.steam_retry_max_delay
    last_exception: Exception | None = None

    for attempt in range(max_attempts):
        try:
            response = await client.get(url, params=params)
            status = response.status_code

            if status == 200:
                return response

            # Non-retryable client errors
            if status == 404:
                raise SteamAPIError(404, context, f"Not found: {url}")
            if status == 403:
                raise SteamAPIError(403, context, f"Forbidden: {url}")
            if 400 <= status < 500 and status not in _RETRYABLE_STATUS_CODES:
                raise SteamAPIError(status, context, f"Client error {status}: {url}")

            # Retryable errors (429, 5xx)
            if attempt < max_attempts - 1:
                delay = min(base_delay * (2 ** attempt), max_delay)

                # Respect Retry-After header for 429
                if status == 429:
                    retry_after = response.headers.get("Retry-After")
                    if retry_after:
                        try:
                            delay = min(float(retry_after), max_delay)
                        except ValueError:
                            # Non-numeric Retry-After (e.g. HTTP-date) — keep
                            # the computed backoff delay instead.
                            pass

                logger.warning(
                    f"Steam API {status} for {context}, "
                    f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                )
                await asyncio.sleep(delay)
            else:
                # Exhausted retries
                if status == 429:
                    raise SteamRateLimitError(context)
                raise SteamAPIError(status, context, f"Server error {status} after {max_attempts} attempts: {url}")

        except (httpx.TimeoutException, httpx.ConnectError) as e:
            # Transient network failures get the same backoff treatment.
            last_exception = e
            if attempt < max_attempts - 1:
                delay = min(base_delay * (2 ** attempt), max_delay)
                logger.warning(
                    f"Steam API {type(e).__name__} for {context}, "
                    f"retry {attempt + 1}/{max_attempts - 1} after {delay:.1f}s"
                )
                await asyncio.sleep(delay)
            else:
                # status_code 0 marks "no HTTP response at all".
                raise SteamAPIError(
                    0, context,
                    f"Connection failed after {max_attempts} attempts: {e}"
                ) from e

    # Should not reach here, but just in case
    raise SteamAPIError(0, context, "Unexpected retry exhaustion") from last_exception
|
| 127 |
+
|
| 128 |
+
async def search_game(self, query: str) -> GameInfo | None:
    """Look a game up by name via Steam's public storesearch endpoint.

    Fetches full metadata for the top search hit, caches it in MongoDB,
    and returns it. Returns None when the search fails or yields nothing.
    """
    search_params = {
        "term": query,
        "l": settings.steam_review_language,
        "cc": settings.steam_region,
    }

    try:
        response = await self._request_with_retry(
            self.client, self.SEARCH_API_URL, search_params,
            context=f"search:{query}",
        )
        data = response.json()
    except (SteamAPIError, SteamRateLimitError) as e:
        logger.error(f"Błąd wyszukiwania gry '{query}': {e}")
        return None

    hits = data.get("items", [])
    if not hits:
        logger.warning(f"Nie znaleziono gry: {query}")
        return None

    # Only the top hit is considered; resolve its full details.
    top_app_id = str(hits[0].get("id"))
    game_info = await self.get_game_info(top_app_id)

    if game_info:
        # Persist the resolved game so future lookups can skip the search.
        await mongodb.upsert_game({
            "appid": game_info.app_id,
            "name": game_info.name,
            "name_cn": game_info.name_cn,
            "cn_name_checked": True,
            "header_image": game_info.header_image,
            "total_reviews": game_info.total_reviews,
        })

    return game_info
|
| 167 |
+
|
| 168 |
+
async def get_game_info(self, app_id: str) -> GameInfo | None:
    """Fetch detailed game metadata (names, header image) from appdetails.

    English and Simplified-Chinese details are requested concurrently so
    the localized name can be stored alongside the English one. A cached
    Steam error (earlier 404/429) short-circuits to None without hitting
    the API again.

    Returns:
        GameInfo on success, None when the app is unknown or unavailable.
    """
    cached_error = await mongodb.get_steam_error(app_id)
    if cached_error:
        logger.info(
            f"Skipping Steam API for app {app_id} — "
            f"cached error {cached_error.get('status_code')}"
        )
        return None

    client = self.client
    details_url = f"{self.STORE_API_URL}/appdetails"

    async def fetch_localized(lang: str) -> dict[str, Any]:
        """Fetch one localized appdetails entry; {} on any Steam error."""
        try:
            params = {"appids": app_id, "l": lang, "cc": settings.steam_region}
            resp = await self._request_with_retry(
                client, details_url, params, context=app_id
            )
            # Steam can map the appid to null — normalize to an empty dict
            # so callers can always .get() safely.
            return resp.json().get(app_id) or {}
        except SteamAPIError as e:
            if e.status_code == 404:
                # Remember the 404 so subsequent lookups skip the API call.
                await mongodb.cache_steam_error(
                    app_id, 404, settings.steam_error_cache_ttl_404
                )
            return {}

    data_zh, data_en = await asyncio.gather(
        fetch_localized("schinese"),
        fetch_localized("english")
    )

    if not data_en.get("success") and not data_zh.get("success"):
        logger.warning(f"Nie znaleziono szczegółów gry: {app_id}")
        return None

    # Fix: "success" can be true while "data" is missing/null; previously
    # base_data could be None and crash on base_data.get(...). Guard it.
    base_data = data_en.get("data") or data_zh.get("data") or {}
    if not base_data:
        logger.warning(f"Nie znaleziono szczegółów gry: {app_id}")
        return None

    name_en = (data_en.get("data") or {}).get("name") or base_data.get("name")
    name_zh = (data_zh.get("data") or {}).get("name")

    stats = await self.get_review_stats(app_id)

    return GameInfo(
        app_id=app_id,
        name=name_en,
        name_cn=name_zh if name_zh != name_en else None,
        header_image=base_data.get("header_image"),
        total_reviews=stats.total,
    )
|
| 217 |
+
|
| 218 |
+
    async def get_review_stats(self, app_id: str) -> ReviewStats:
        """Fetch aggregate review statistics used for sample planning.

        Uses num_per_page=0 so Steam returns only the query summary, not
        review bodies.

        Returns:
            ReviewStats with totals; zeroed stats when a cached or fresh
            Steam API error occurs.
        """
        # Short-circuit on a previously cached Steam error (404/429).
        cached_error = await mongodb.get_steam_error(app_id)
        if cached_error:
            logger.info(
                f"Skipping review stats for app {app_id} — "
                f"cached error {cached_error.get('status_code')}"
            )
            return ReviewStats(total=0, positive=0, negative=0)

        client = self.client
        url = f"{self.REVIEW_API_URL}/{app_id}"
        params = {
            "json": "1",
            "filter": "all",
            # 0 reviews per page: we only need the summary block.
            "num_per_page": "0",
        }

        try:
            response = await self._request_with_retry(
                client, url, params, context=app_id
            )
            data = response.json()

            summary = data.get("query_summary", {})
            return ReviewStats(
                total=summary.get("total_reviews", 0),
                positive=summary.get("total_positive", 0),
                negative=summary.get("total_negative", 0),
            )
        except SteamAPIError as e:
            # Cache 404/429 so subsequent calls back off for the configured TTL.
            if e.status_code in (404, 429):
                ttl = (
                    settings.steam_error_cache_ttl_429
                    if e.status_code == 429
                    else settings.steam_error_cache_ttl_404
                )
                await mongodb.cache_steam_error(app_id, e.status_code, ttl)
            logger.error(f"Błąd pobierania statystyk recenzji: {e}")
            return ReviewStats(total=0, positive=0, negative=0)
|
| 258 |
+
|
| 259 |
+
    async def _fetch_reviews_batch(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        review_type: str,
        filter_type: str,
        num_per_page: int,
        cursor: str | None,
    ) -> tuple[list[str], list[ReviewItem], str | None]:
        """Fetch a single batch of reviews (up to 100 items).

        Args:
            client: Shared HTTP client.
            app_id: Steam application id.
            review_type: "all", "positive" or "negative".
            filter_type: Steam sort filter, e.g. "all" or "recent".
            num_per_page: Number of reviews requested for this page.
            cursor: Pagination cursor from the previous call; "*" starts over.

        Returns:
            (texts, items, next_cursor); empty lists and None cursor on failure.
        """
        url = f"{self.REVIEW_API_URL}/{app_id}"
        params: dict[str, Any] = {
            "json": "1",
            "filter": filter_type,
            "review_type": review_type,
            "language": settings.steam_review_language,
            "num_per_page": str(num_per_page),
            "cursor": cursor or "*",
            "purchase_type": "all",
        }

        try:
            response = await self._request_with_retry(
                client, url, params, context=app_id
            )
            data = response.json()
        except SteamRateLimitError:
            # Remember the 429 so other code paths back off for a while.
            await mongodb.cache_steam_error(
                app_id, 429, settings.steam_error_cache_ttl_429
            )
            logger.error(f"Rate limited fetching reviews for {app_id}")
            return [], [], None
        except SteamAPIError as e:
            logger.error(f"Błąd pobierania recenzji: {e}")
            return [], [], None

        if not data.get("success"):
            return [], [], None

        reviews_data = data.get("reviews", [])
        review_texts: list[str] = []
        review_items: list[ReviewItem] = []

        for review in reviews_data:
            text = review.get("review")
            # Skip reviews with empty bodies; they carry nothing to analyze.
            if not text:
                continue
            review_texts.append(text)
            review_items.append(ReviewItem(
                text=text,
                recommendation_id=str(review.get("recommendationid", "")),
                timestamp_created=review.get("timestamp_created", 0),
            ))

        new_cursor = data.get("cursor")
        return review_texts, review_items, new_cursor
|
| 315 |
+
|
| 316 |
+
    async def fetch_reviews_stratified(
        self,
        app_id: str,
        sample_plan: SamplePlan,
    ) -> AsyncGenerator[ReviewBatch, None]:
        """
        Stream review batches according to a stratified sample plan.

        Works in phases:
          1.  top-helpful reviews (review_type="all", filter="all"),
          2a. recent positive reviews,
          2b. recent negative reviews.
        Texts already yielded in an earlier phase are dropped, and a repeated
        pagination cursor aborts the phase to guard against Steam cursor loops.
        """
        batch_size = settings.review_batch_size
        all_reviews: set[str] = set()   # texts already yielded (cross-phase dedupe)
        seen_cursors: set[str] = set()  # cursors seen in phase 1 (loop guard)
        client = self.client

        # --- PHASE 1: TOP HELPFUL ---
        cursor: str | None = "*"
        fetched = 0

        while fetched < sample_plan.top_helpful:
            to_fetch = min(batch_size, sample_plan.top_helpful - fetched)
            reviews, review_items, cursor = await self._fetch_reviews_batch(
                client, app_id, "all", "all", to_fetch, cursor
            )

            if not reviews:
                break
            if cursor and cursor in seen_cursors:
                logger.warning(f"Repeated cursor {cursor} for {app_id} (top_helpful). Shortfall: {sample_plan.top_helpful - fetched}")
                break
            if cursor:
                seen_cursors.add(cursor)

            all_reviews.update(reviews)
            fetched += len(reviews)
            yield ReviewBatch(reviews=reviews, review_items=review_items, cursor=cursor)

            # No cursor (or "*") means Steam has no further pages.
            if not cursor or cursor == "*":
                break

        # --- PHASE 2a: RECENT POSITIVE ---
        positive_target = sample_plan.positive_count
        if positive_target > 0:
            cursor = "*"
            fetched = 0
            seen_cursors_pos: set[str] = set()

            while fetched < positive_target:
                to_fetch = min(batch_size, positive_target - fetched)
                # After the first page, request full batches to compensate for
                # duplicates already collected in earlier phases (max batch_size).
                if fetched > 0:
                    to_fetch = batch_size

                reviews, review_items, cursor = await self._fetch_reviews_batch(
                    client, app_id, "positive", "recent", to_fetch, cursor or "*"
                )
                if not reviews:
                    break
                if cursor and cursor in seen_cursors_pos:
                    logger.warning(f"Repeated cursor {cursor} for {app_id} (positive). Shortfall: {positive_target - fetched}")
                    break
                if cursor:
                    seen_cursors_pos.add(cursor)

                # Keep only texts not yielded before; realign items to texts.
                new_reviews = [r for r in reviews if r not in all_reviews]
                new_texts_set = set(new_reviews)
                new_items = [ri for ri in review_items if ri.text in new_texts_set]
                all_reviews.update(new_reviews)
                fetched += len(new_reviews)

                if new_reviews:
                    yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor)
                if not cursor or cursor == "*":
                    break

        # --- PHASE 2b: RECENT NEGATIVE ---
        negative_target = sample_plan.negative_count
        if negative_target > 0:
            cursor = "*"
            fetched = 0
            seen_cursors_neg: set[str] = set()

            while fetched < negative_target:
                to_fetch = min(batch_size, negative_target - fetched)
                if fetched > 0:
                    to_fetch = batch_size

                reviews, review_items, cursor = await self._fetch_reviews_batch(
                    client, app_id, "negative", "recent", to_fetch, cursor or "*"
                )
                if not reviews:
                    break
                if cursor and cursor in seen_cursors_neg:
                    logger.warning(f"Repeated cursor {cursor} for {app_id} (negative). Shortfall: {negative_target - fetched}")
                    break
                if cursor:
                    seen_cursors_neg.add(cursor)

                new_reviews = [r for r in reviews if r not in all_reviews]
                new_texts_set = set(new_reviews)
                new_items = [ri for ri in review_items if ri.text in new_texts_set]
                all_reviews.update(new_reviews)
                fetched += len(new_reviews)

                if new_reviews:
                    yield ReviewBatch(reviews=new_reviews, review_items=new_items, cursor=cursor)
                if not cursor or cursor == "*":
                    break

        logger.info(f"Pobrano łącznie {len(all_reviews)} unikalnych recenzji")
|
| 424 |
+
|
| 425 |
+
    async def fetch_recent_reviews(
        self,
        app_id: str,
        exclude_ids: set[str] | None = None,
    ) -> list[ReviewItem]:
        """
        Fetch recent reviews for incremental analysis.

        Args:
            app_id: Steam application id.
            exclude_ids: Recommendation ids already stored. When empty/None the
                game is treated as new and the fetch size is additionally capped.

        Returns:
            Up to the configured recent-sample limit of ReviewItems whose
            recommendation ids are not in exclude_ids.
        """
        is_new_game = not exclude_ids
        exclude_ids = exclude_ids or set()
        batch_size = settings.review_batch_size

        # New games: cap at total reviews, the configured limit, and 500.
        if is_new_game:
            stats = await self.get_review_stats(app_id)
            max_total = min(stats.total, settings.recent_sample_limit, 500)
        else:
            max_total = settings.recent_sample_limit

        client = self.client
        cursor: str | None = "*"
        seen_cursors: set[str] = set()
        new_items: list[ReviewItem] = []

        while len(new_items) < max_total:
            to_fetch = min(batch_size, max_total - len(new_items))
            _, review_items, cursor = await self._fetch_reviews_batch(
                client, app_id, "all", "recent", to_fetch, cursor
            )

            if not review_items:
                break
            # A repeated cursor indicates a Steam pagination loop — bail out.
            if cursor and cursor in seen_cursors:
                logger.warning(f"Repeated cursor {cursor} for {app_id} (recent). Shortfall: {max_total - len(new_items)}")
                break
            if cursor:
                seen_cursors.add(cursor)

            # Filter out already-known reviews.
            batch_new = [ri for ri in review_items if ri.recommendation_id not in exclude_ids]

            # Early exit: if >80% of the batch is known, we've crossed the
            # boundary into reviews seen on a previous run.
            known_ratio = 1 - (len(batch_new) / len(review_items)) if review_items else 0
            new_items.extend(batch_new)

            if not is_new_game and known_ratio > 0.8:
                logger.info(
                    f"Early exit for {app_id}: {known_ratio:.0%} of batch already known"
                )
                break

            if not cursor or cursor == "*":
                break

        logger.info(f"Incremental fetch for {app_id}: {len(new_items)} new reviews")
        return new_items[:max_total]
|
| 481 |
+
|
| 482 |
+
async def fetch_reviews(
|
| 483 |
+
self,
|
| 484 |
+
app_id: str,
|
| 485 |
+
batch_size: int | None = None,
|
| 486 |
+
max_reviews: int | None = None,
|
| 487 |
+
) -> AsyncGenerator[ReviewBatch, None]:
|
| 488 |
+
"""Wrapper dla zachowania kompatybilności."""
|
| 489 |
+
stats = await self.get_review_stats(app_id)
|
| 490 |
+
if stats.total == 0:
|
| 491 |
+
return
|
| 492 |
+
|
| 493 |
+
sample_plan = create_sample_plan(stats.total, stats.positive, stats.negative)
|
| 494 |
+
async for batch in self.fetch_reviews_stratified(app_id, sample_plan):
|
| 495 |
+
yield batch
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
# Global service instance (singleton) shared across the application.
steam_service = SteamService()
|
backend/app/services/update_detection_service.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Update Detection Service — checks Steam News API for game updates.
|
| 3 |
+
|
| 4 |
+
Compares the latest news/patch date with the stored `last_game_update_at`
|
| 5 |
+
to detect games that have been recently updated.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import re
|
| 10 |
+
from datetime import datetime, timezone
|
| 11 |
+
from typing import Any, NamedTuple, cast
|
| 12 |
+
|
| 13 |
+
import httpx
|
| 14 |
+
|
| 15 |
+
from app.core.config import settings
|
| 16 |
+
from app.db.mongodb import mongodb
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"
|
| 21 |
+
|
| 22 |
+
# Matches two-segment versions: 1.2, v2.0, 0.6, 123.4
|
| 23 |
+
# Excludes three-segment (0.6.1) via negative lookahead, 4-digit years via \d{1,3},
|
| 24 |
+
# and sub-segments of longer versions (e.g. "6.1" within "0.6.1") via lookbehind.
|
| 25 |
+
VERSION_RE = re.compile(r'(?<!\d\.)\bv?\d{1,3}\.\d+\b(?!\.\d)')
|
| 26 |
+
|
| 27 |
+
# Phase 1 regex constants
|
| 28 |
+
RELEASE_PHRASE_RE = re.compile(
|
| 29 |
+
r'\b(out now|is out|is live|now live|now available|full release|'
|
| 30 |
+
r'leaving early access|out of early access)\b',
|
| 31 |
+
re.IGNORECASE
|
| 32 |
+
)
|
| 33 |
+
CONTENT_UPDATE_RE = re.compile(
|
| 34 |
+
r'\b(major update|content update|big update|biggest update)\b',
|
| 35 |
+
re.IGNORECASE
|
| 36 |
+
)
|
| 37 |
+
ACTION_WORD_RE = re.compile(
|
| 38 |
+
r'\b(update|patch|release|available|launch|live|out)\b',
|
| 39 |
+
re.IGNORECASE
|
| 40 |
+
)
|
| 41 |
+
HOTFIX_RE = re.compile(r'\b(hotfix|hot.?fix)\b', re.IGNORECASE)
|
| 42 |
+
BRANCH_RE = re.compile(
|
| 43 |
+
r'\b(experimental branch|experimental.{0,10}patch|experimental.{0,10}build|'
|
| 44 |
+
r'public.?test|pts build|beta branch|'
|
| 45 |
+
r'on experimental|for experimental)\b',
|
| 46 |
+
re.IGNORECASE
|
| 47 |
+
)
|
| 48 |
+
MAJOR_RELEASE_RE = re.compile(
|
| 49 |
+
r'\b(out now|is out|is live|now live|now available|full release|'
|
| 50 |
+
r'leaving early access|out of early access)\b',
|
| 51 |
+
re.IGNORECASE
|
| 52 |
+
)
|
| 53 |
+
MAJOR_CONTENT_RE = re.compile(
|
| 54 |
+
r'\b(major update|content update|big update|biggest update)\b',
|
| 55 |
+
re.IGNORECASE
|
| 56 |
+
)
|
| 57 |
+
ONE_ZERO_RE = re.compile(r'\b1\.0\b(?!\.\d)')
|
| 58 |
+
|
| 59 |
+
# Phase 2 regex constants
|
| 60 |
+
EVENT_FESTIVAL_RE = re.compile(
|
| 61 |
+
r'\b(festival|anniversary\s+event|community\s+event|'
|
| 62 |
+
r'in-game\s+event|roadmap|preview)\b',
|
| 63 |
+
re.IGNORECASE
|
| 64 |
+
)
|
| 65 |
+
UPDATE_OR_PATCH_RE = re.compile(r'\b(update|patch)\b', re.IGNORECASE)
|
| 66 |
+
NAMED_VERSION_RE = re.compile(r'\bV\d+\b') # case-sensitive: uppercase V only
|
| 67 |
+
UPDATE_WORD_RE = re.compile(r'\bupdate\b', re.IGNORECASE)
|
| 68 |
+
PATCH_WORD_RE = re.compile(r'\bpatch\b', re.IGNORECASE)
|
| 69 |
+
MAINT_LANGUAGE_RE = re.compile(
|
| 70 |
+
r'\b(fix(?:es|ed)?|bug\s*fix|improv(?:es?|ed|ements?)|stability|performance|tweak)\b',
|
| 71 |
+
re.IGNORECASE
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
_NEWS_MAX_PAGES = 5 # Max pages in incremental mode (5 * 5 = 25 items)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class NewsCheckResult(NamedTuple):
    """Outcome of scanning a game's Steam news feed for updates."""

    latest_update_date: datetime | None  # date of most recent update-related item
    is_major: bool  # whether any item qualifies as major
    major_date: datetime | None  # date of most recent major item; None if not major
    newest_seen_gid: str | None = None  # GID of newest news item (for cursor persistence)
    newest_seen_at: datetime | None = None  # timestamp of newest news item
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class UpdateDetectionService:
    """Detects game updates via Steam News API.

    Classifies news items as update-related / major using tag, feed-label and
    title heuristics, and maintains a per-game news cursor (gid + timestamp)
    so incremental checks only scan items newer than the last run.
    """

    def __init__(self, client: httpx.AsyncClient | None = None) -> None:
        # When no client is injected, one is created lazily and owned
        # (and eventually closed) by this service.
        self._client = client
        self._owns_client = client is None

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the HTTP client, creating an owned one on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=15.0)
        return self._client

    async def close(self) -> None:
        """Close the HTTP client, but only if this service created it."""
        if self._owns_client and self._client is not None:
            await self._client.aclose()
            self._client = None

    @staticmethod
    def _is_update_related(item: dict) -> bool:
        """Return True if news item is update-related.

        Conditions (any one is sufficient):
        A: 'patchnotes' in tags
        B: feedlabel == 'Product Update'
        C: title matches release-style phrases
        D: title matches large content update phrases
        E: title has a version number AND an action word
        F: named version (e.g. V70) AND the word 'update' in the title
        """
        tags = item.get("tags")
        # tags may arrive as a list or (defensively) as a string/None.
        if isinstance(tags, list):
            is_patch = "patchnotes" in tags
        else:
            is_patch = "patchnotes" in (tags or "")
        feedlabel = item.get("feedlabel") or ""
        if is_patch or feedlabel == "Product Update":
            return True

        # Conditions C/D/E/F: title-based signals — restricted to developer feed only.
        # Third-party news sites (GamingOnLinux etc.) can write about updates using
        # the same language, so we only trust these signals from the developer's own feed.
        if item.get("feedname") != "steam_community_announcements":
            return False

        title = item.get("title", "")
        if RELEASE_PHRASE_RE.search(title):
            return True
        if CONTENT_UPDATE_RE.search(title):
            return True
        if VERSION_RE.search(title) and ACTION_WORD_RE.search(title):
            return True
        # F: named version (V70) + "update" in title (developer feed only)
        if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title):
            return True

        return False

    @staticmethod
    def _is_major_update(item: dict) -> bool:
        """Return True if the news item represents a major update.

        Negative signals (blockers) are checked first:
        - hotfix keyword → not major
        - experimental branch / public test branch → not major
        - event/festival wording without 'update'/'patch' → not major
        - 'patch' plus maintenance language (fixes/stability/...) → not major

        Positive signals (any one is sufficient):
        - version number in title (VERSION_RE)
        - release language (MAJOR_RELEASE_RE)
        - standalone '1.0' (ONE_ZERO_RE)
        - large content phrases (MAJOR_CONTENT_RE)
        - named version (V70) plus the word 'update'
        """
        title = item.get("title", "")

        if HOTFIX_RE.search(title):
            return False
        if BRANCH_RE.search(title):
            return False
        if EVENT_FESTIVAL_RE.search(title) and not UPDATE_OR_PATCH_RE.search(title):
            return False
        if PATCH_WORD_RE.search(title) and MAINT_LANGUAGE_RE.search(title):
            return False

        if VERSION_RE.search(title):
            return True
        if MAJOR_RELEASE_RE.search(title):
            return True
        if ONE_ZERO_RE.search(title):
            return True
        if MAJOR_CONTENT_RE.search(title):
            return True
        if NAMED_VERSION_RE.search(title) and UPDATE_WORD_RE.search(title):
            return True

        return False

    @staticmethod
    def _collect_update_candidates(
        news_items: list[dict],
    ) -> tuple[datetime | None, datetime | None]:
        """Scan all items, return (latest_update_date, major_date).

        latest_update_date: max date of all update-related items (or None)
        major_date: max date of major items (or None if no major found)
        """
        latest_update_ts: int | None = None
        major_ts: int | None = None

        for item in news_items:
            if not UpdateDetectionService._is_update_related(item):
                continue
            ts = item.get("date") or 0
            # Items without a usable timestamp cannot be compared — skip.
            if not ts:
                continue
            if latest_update_ts is None or ts > latest_update_ts:
                latest_update_ts = ts
            if UpdateDetectionService._is_major_update(item):
                if major_ts is None or ts > major_ts:
                    major_ts = ts

        latest_update_date = (
            datetime.fromtimestamp(latest_update_ts, tz=timezone.utc)
            if latest_update_ts is not None
            else None
        )
        major_date = (
            datetime.fromtimestamp(major_ts, tz=timezone.utc)
            if major_ts is not None
            else None
        )
        return latest_update_date, major_date

    @staticmethod
    async def _fetch_news_page(
        client: httpx.AsyncClient,
        app_id: str,
        count: int,
        enddate: int | None = None,
    ) -> list[dict]:
        """Fetch a single page of news items from Steam API.

        Args:
            client: HTTP client to use.
            app_id: Steam application id.
            count: Number of news items to request.
            enddate: Optional upper timestamp bound for pagination.

        Returns [] on HTTP error or request failure.
        """
        params: dict[str, Any] = {
            "appid": app_id,
            "count": count,
            # maxlength=0: full contents are not needed, titles/tags suffice.
            "maxlength": 0,
        }
        if enddate is not None:
            params["enddate"] = enddate

        try:
            resp = await client.get(STEAM_NEWS_API_URL, params=params)
            if resp.status_code != 200:
                return []
            data = resp.json()
            return data.get("appnews", {}).get("newsitems", [])
        except (httpx.RequestError, ValueError, KeyError) as e:
            logger.debug(f"News page fetch failed for {app_id}: {e}")
            return []

    @staticmethod
    def _scan_batch_with_stopping(
        items: list[dict],
        last_seen_gid: str | None,
        last_seen_at_ts: int | None,
        refresh_cutoff_ts: int | None,
    ) -> tuple[list[dict], bool]:
        """Scan items (newest→oldest), collecting until a stop condition is met.

        Stop conditions (item is NOT included):
        - gid matches last_seen_gid
        - item date <= last_seen_at_ts
        - item date < refresh_cutoff_ts

        Returns (accepted_items, hit_stop).
        """
        accepted: list[dict] = []
        for item in items:
            gid = str(item.get("gid", ""))
            ts = item.get("date") or 0

            if last_seen_gid and gid and gid == last_seen_gid:
                return accepted, True
            if last_seen_at_ts is not None and ts and ts <= last_seen_at_ts:
                return accepted, True
            if refresh_cutoff_ts is not None and ts and ts < refresh_cutoff_ts:
                return accepted, True

            accepted.append(item)

        return accepted, False

    async def _get_latest_news_date(
        self,
        app_id: str,
        last_seen_gid: str | None = None,
        last_seen_at: datetime | None = None,
    ) -> NewsCheckResult:
        """Fetch and scan Steam news for update candidates.

        In initial mode (no cursor): fetches a single larger page.
        In incremental mode (cursor present): fetches small pages with
        pagination, stopping at the known cursor or the refresh-window
        boundary. The cursor is only advanced when the scan completed
        cleanly, so missed items are re-scanned next time.
        """
        client = await self._get_client()

        is_incremental = last_seen_gid is not None or last_seen_at is not None
        count = settings.news_incremental_count if is_incremental else settings.news_initial_count

        # Compute stop thresholds for incremental mode
        last_seen_at_ts: int | None = None
        refresh_cutoff_ts: int | None = None
        if is_incremental:
            last_seen_at_ts = int(last_seen_at.timestamp()) if last_seen_at else None
            now_ts = int(datetime.now(timezone.utc).timestamp())
            cutoff_ts = now_ts - (settings.news_refresh_window_hours * 3600)

            # If cursor is older than the refresh window (worker was down),
            # disable the time cutoff and scan to the cursor instead.
            # _NEWS_MAX_PAGES protects against unbounded pagination.
            if last_seen_at_ts is not None and last_seen_at_ts < cutoff_ts:
                refresh_cutoff_ts = None
            else:
                refresh_cutoff_ts = cutoff_ts

        all_accepted: list[dict] = []
        newest_gid: str | None = None
        newest_ts: int = 0
        scan_complete = False
        pages_fetched = 0
        enddate: int | None = None

        while True:
            items = await self._fetch_news_page(client, app_id, count, enddate)

            if not items:
                if pages_fetched == 0:
                    # First page empty (no news or HTTP error) — newest_gid stays None
                    pass
                # Pagination page empty → incomplete scan → don't update cursor
                break

            pages_fetched += 1

            # Track newest item (from first page only)
            if newest_gid is None:
                for item in items:
                    gid = str(item.get("gid", ""))
                    ts = item.get("date") or 0
                    if gid and ts:
                        newest_gid = gid
                        newest_ts = ts
                        break

            if is_incremental:
                accepted, hit_stop = self._scan_batch_with_stopping(
                    items, last_seen_gid, last_seen_at_ts, refresh_cutoff_ts
                )
                all_accepted.extend(accepted)

                if hit_stop:
                    scan_complete = True
                    break
                if len(items) < count:
                    scan_complete = True  # API has no more items
                    break
                if pages_fetched >= _NEWS_MAX_PAGES:
                    scan_complete = True  # page limit reached
                    break
                oldest_ts = items[-1].get("date") or 0
                if not oldest_ts:
                    break  # can't paginate → incomplete scan
                enddate = oldest_ts - 1
            else:
                # Initial mode: single fetch, always clean
                all_accepted.extend(items)
                scan_complete = True
                break

        latest_update_date, major_date = self._collect_update_candidates(all_accepted)

        # Advance the cursor only after a clean, complete scan.
        cursor_gid: str | None = None
        cursor_at: datetime | None = None
        if scan_complete and newest_gid:
            cursor_gid = newest_gid
            cursor_at = datetime.fromtimestamp(newest_ts, tz=timezone.utc)

        if latest_update_date is None:
            return NewsCheckResult(
                None, False, None,
                newest_seen_gid=cursor_gid,
                newest_seen_at=cursor_at,
            )

        return NewsCheckResult(
            latest_update_date=latest_update_date,
            is_major=major_date is not None,
            major_date=major_date,
            newest_seen_gid=cursor_gid,
            newest_seen_at=cursor_at,
        )

    async def check_for_updates(
        self, games: list[dict[str, Any]]
    ) -> list[dict[str, Any]]:
        """
        Check Steam News API for each game. Return games with confirmed major updates.

        Non-major patchnotes update last_game_update_at but do not trigger a schedule.
        Major updates are also propagated to the game's DLCs.
        """
        updated_games: list[dict[str, Any]] = []
        dlcs_by_parent: dict[str, list[dict[str, Any]]] = {}

        # Index DLCs by parent app id; DLCs themselves are never checked directly.
        for game in games:
            if game.get("app_type") == "dlc" and game.get("parent_appid"):
                dlcs_by_parent.setdefault(str(game["parent_appid"]), []).append(game)

        for game in games:
            app_id = str(game.get("appid", ""))
            if not app_id:
                continue

            if game.get("app_type") == "dlc":
                continue

            last_known = game.get("last_game_update_at")
            # Normalize last_known to datetime if it's a timestamp
            if last_known is not None and not isinstance(last_known, datetime):
                try:
                    last_known = datetime.fromtimestamp(float(last_known), tz=timezone.utc)
                except (ValueError, TypeError):
                    last_known = None

            result = await self._get_latest_news_date(
                app_id,
                last_seen_gid=game.get("last_seen_news_gid"),
                last_seen_at=game.get("last_seen_news_at"),
            )

            # Persist cursor before any early-continue — even if no updates found
            if result.newest_seen_gid:
                await mongodb.update_news_cursor(
                    app_id, result.newest_seen_gid, cast(datetime, result.newest_seen_at)
                )

            if result.latest_update_date is None:
                continue

            if last_known is None or result.latest_update_date > last_known:
                await mongodb.update_game_update_date(app_id, result.latest_update_date)

            if result.is_major:
                current_patch_at = game.get("current_patch_at")
                patch_date = cast(datetime, result.major_date)  # always not None when is_major=True
                if current_patch_at is None or patch_date > current_patch_at:
                    await mongodb.update_game_patch_date(app_id, patch_date)
                    updated_games.append({**game, "update_at": patch_date})

                    # Propagate the parent's new patch date to its DLCs as well.
                    for dlc in dlcs_by_parent.get(app_id, []):
                        dlc_appid = str(dlc.get("appid", ""))
                        if not dlc_appid:
                            continue

                        await mongodb.update_game_patch_date(dlc_appid, patch_date)
                        updated_games.append({**dlc, "update_at": patch_date})

        logger.info(
            f"Update detection: {len(updated_games)}/{len(games)} games have new updates"
        )
        return updated_games
|
backend/pytest.ini
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
testpaths = tests
|
| 3 |
+
python_files = test_*.py
|
| 4 |
+
python_functions = test_*
|
| 5 |
+
asyncio_mode = auto
|
| 6 |
+
addopts = -v --tb=short
|
backend/requirements.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web Framework
|
| 2 |
+
fastapi==0.109.0
|
| 3 |
+
uvicorn[standard]==0.27.0
|
| 4 |
+
sse-starlette==1.8.2
|
| 5 |
+
|
| 6 |
+
# Database
|
| 7 |
+
motor==3.3.2
|
| 8 |
+
pymongo==4.6.1
|
| 9 |
+
|
| 10 |
+
# Data Validation
|
| 11 |
+
pydantic==2.5.3
|
| 12 |
+
pydantic-settings==2.1.0
|
| 13 |
+
|
| 14 |
+
# HTTP Client
|
| 15 |
+
httpx==0.26.0
|
| 16 |
+
|
| 17 |
+
# AI/ML - Local Inference (ONNX Runtime only, no PyTorch needed at runtime)
|
| 18 |
+
numpy<2.0.0
|
| 19 |
+
transformers==4.37.2
|
| 20 |
+
optimum[onnxruntime]==1.16.2
|
| 21 |
+
huggingface-hub==0.20.3
|
| 22 |
+
|
| 23 |
+
# Rate Limiting
|
| 24 |
+
slowapi==0.1.9
|
| 25 |
+
|
| 26 |
+
# Utilities
|
| 27 |
+
python-dotenv==1.0.0
|
| 28 |
+
jieba==0.42.1
|
| 29 |
+
|
| 30 |
+
# Keyword Expansion (FastText)
|
| 31 |
+
gensim==4.3.3
|
| 32 |
+
|
| 33 |
+
# Code Quality
|
| 34 |
+
ruff==0.1.14
|
| 35 |
+
mypy==1.8.0
|
| 36 |
+
|
| 37 |
+
# Testing
|
| 38 |
+
pytest==7.4.4
|
| 39 |
+
pytest-asyncio==0.23.3
|
| 40 |
+
pytest-cov==4.1.0
|
| 41 |
+
anyio==4.12.1
|
| 42 |
+
zhconv==1.4.3
|
backend/scripts/smoke_news_cursor.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Smoke Test: Incremental Steam News Cursor Flow
|
| 3 |
+
|
| 4 |
+
Validates that UpdateDetectionService correctly uses cursor-based incremental
|
| 5 |
+
news fetching against the real Steam API.
|
| 6 |
+
|
| 7 |
+
Test game: Factorio (427520) — stable, always has news, uses patchnotes tags.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
cd /mnt/d/sentiment_summarizer/backend
|
| 11 |
+
../venv/bin/python scripts/smoke_news_cursor.py
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import asyncio
|
| 15 |
+
import sys
|
| 16 |
+
from datetime import datetime, timezone
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from unittest.mock import AsyncMock, patch
|
| 19 |
+
|
| 20 |
+
import httpx
|
| 21 |
+
|
| 22 |
+
# Ensure backend/app is importable
|
| 23 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 24 |
+
|
| 25 |
+
from app.services.update_detection_service import UpdateDetectionService # noqa: E402
|
| 26 |
+
|
| 27 |
+
TEST_APP_ID = "427520" # Factorio
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# ── helpers ──────────────────────────────────────────────────────────
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _ts() -> str:
|
| 34 |
+
return datetime.now(timezone.utc).strftime("%H:%M:%S")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _print(status: str, msg: str) -> None:
    """Print a timestamped status line, colorizing known status tags with ANSI codes."""
    colors = {"OK": "32", "FAIL": "31", "SKIP": "33", "INFO": "36"}
    code = colors.get(status)
    # Unknown statuses are printed verbatim, without any escape codes.
    tag = f"\033[{code}m{status}\033[0m" if code is not None else status
    print(f"[{_ts()}] [{tag}] {msg}")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class RecordingTransport(httpx.AsyncBaseTransport):
    """Pass-through transport that records each outgoing request's URL and query params."""

    def __init__(self) -> None:
        self.recorded: list[dict] = []
        self._inner = httpx.AsyncHTTPTransport()

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        # Capture the full URL and its query parameters, then delegate to
        # the real transport so the request actually goes out.
        self.recorded.append(
            {"url": str(request.url), "params": dict(request.url.params)}
        )
        return await self._inner.handle_async_request(request)

    async def aclose(self) -> None:
        await self._inner.aclose()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ── main ─────────────────────────────────────────────────────────────
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
async def run_smoke_test() -> int:
    """Run five checks of the incremental news-cursor flow against the live Steam API.

    Returns a process exit code: 0 when all checks pass (or are skipped because
    the Steam API is unreachable), 1 when any check fails.
    """
    print(f"\nSteam News Cursor Smoke Test — Factorio ({TEST_APP_ID})")
    print("=" * 60)
    failures = 0

    # ── Check 1: initial scan returns cursor fields ───────────────────
    _print("INFO", f"Check 1: initial scan for {TEST_APP_ID} (Factorio)")
    result_initial = None
    svc1 = UpdateDetectionService()
    try:
        # No cursor args: this is a cold-start scan.
        result_initial = await svc1._get_latest_news_date(TEST_APP_ID)
    finally:
        await svc1.close()

    if result_initial.newest_seen_gid is None:
        # All later checks depend on a cursor; treat "no news" as environmental.
        _print("SKIP", "No news items returned — Steam API may be rate-limiting or unreachable; skipping all checks")
        return 0

    c1_ok = True
    if not isinstance(result_initial.newest_seen_gid, str) or not result_initial.newest_seen_gid:
        _print("FAIL", f"newest_seen_gid is empty/non-string: {result_initial.newest_seen_gid!r}")
        c1_ok = False

    now = datetime.now(timezone.utc)
    if result_initial.newest_seen_at is None:
        _print("FAIL", "newest_seen_at is None")
        c1_ok = False
    elif not (
        # Sanity window: after 2020 and no further than one year into the future.
        datetime(2020, 1, 1, tzinfo=timezone.utc)
        <= result_initial.newest_seen_at
        <= datetime(now.year + 1, 1, 1, tzinfo=timezone.utc)
    ):
        _print("FAIL", f"newest_seen_at out of expected range: {result_initial.newest_seen_at!r}")
        c1_ok = False

    if c1_ok:
        _print(
            "OK",
            f"cursor GID={result_initial.newest_seen_gid}, "
            f"at={result_initial.newest_seen_at.isoformat()}",
        )
    else:
        failures += 1

    # NOTE(review): if Check 1 FAILed with newest_seen_at=None, cursor_at is
    # None here and the datetime comparison / .isoformat() in Check 3 would
    # raise TypeError/AttributeError — confirm the service guarantees
    # newest_seen_at whenever newest_seen_gid is set.
    cursor_gid = result_initial.newest_seen_gid
    cursor_at = result_initial.newest_seen_at

    # ── Check 2: incremental scan uses count=5 ────────────────────────
    _print("INFO", "Check 2: incremental scan uses count=5")
    transport = RecordingTransport()
    client = httpx.AsyncClient(transport=transport, timeout=15.0)
    svc2 = UpdateDetectionService(client=client)
    result_inc = None
    try:
        # Pass the cursor from Check 1 so the service takes the incremental path.
        result_inc = await svc2._get_latest_news_date(
            TEST_APP_ID, last_seen_gid=cursor_gid, last_seen_at=cursor_at
        )
    finally:
        await client.aclose()

    if not transport.recorded:
        _print("SKIP", "No requests recorded — Steam API may be unreachable")
    else:
        c2_ok = True
        for i, req in enumerate(transport.recorded):
            count_val = req["params"].get("count")
            enddate_val = req["params"].get("enddate", "n/a")
            # Incremental scans should request small pages (count=5).
            if str(count_val) != "5":
                _print("FAIL", f"Request {i + 1}: count={count_val!r}, expected '5'")
                c2_ok = False
            else:
                _print("INFO", f" Request {i + 1}: count=5 ✓ enddate={enddate_val}")
        if c2_ok:
            _print("OK", f"All {len(transport.recorded)} request(s) used count=5")
        else:
            failures += 1

    # ── Check 3: no items older than cursor boundary ──────────────────
    _print("INFO", "Check 3: incremental result respects cursor boundary")
    if result_inc is None:
        _print("SKIP", "No incremental result available")
    else:
        c3_ok = True
        if result_inc.latest_update_date is not None:
            # Anything at or before the cursor should have been filtered out.
            if result_inc.latest_update_date <= cursor_at:
                _print(
                    "FAIL",
                    f"latest_update_date {result_inc.latest_update_date.isoformat()} "
                    f"is not strictly newer than cursor {cursor_at.isoformat()}",
                )
                c3_ok = False
            else:
                _print(
                    "INFO",
                    f" latest_update_date={result_inc.latest_update_date.isoformat()} "
                    f"> cursor (new update found between scans)",
                )
        else:
            _print("INFO", " latest_update_date=None (no new updates since cursor) — expected")
        if c3_ok:
            _print("OK", "Cursor boundary respected")
        else:
            failures += 1

    # ── Check 4: latest_update_date / major_date invariants ──────────
    # is_major/major_date must be internally consistent with latest_update_date.
    _print("INFO", "Check 4: structural invariants on initial scan result")
    c4_ok = True
    if result_initial.latest_update_date is None:
        # No update at all implies no major update either.
        if result_initial.is_major or result_initial.major_date is not None:
            _print(
                "FAIL",
                f"latest_update_date=None but is_major={result_initial.is_major}, "
                f"major_date={result_initial.major_date!r}",
            )
            c4_ok = False
    elif result_initial.is_major:
        # A major update must carry a date no newer than the latest update.
        if result_initial.major_date is None:
            _print("FAIL", "is_major=True but major_date is None")
            c4_ok = False
        elif result_initial.major_date > result_initial.latest_update_date:
            _print(
                "FAIL",
                f"major_date {result_initial.major_date.isoformat()} "
                f"> latest_update_date {result_initial.latest_update_date.isoformat()}",
            )
            c4_ok = False
    else:
        # Non-major results must not carry a major_date.
        if result_initial.major_date is not None:
            _print("FAIL", f"is_major=False but major_date={result_initial.major_date!r}")
            c4_ok = False
    if c4_ok:
        _print(
            "OK",
            f"invariants hold: latest_update_date={result_initial.latest_update_date}, "
            f"is_major={result_initial.is_major}, major_date={result_initial.major_date}",
        )
    else:
        failures += 1

    # ── Check 5: check_for_updates end-to-end, mocked DB ─────────────
    # Real Steam API, but MongoDB is replaced with an AsyncMock so we can
    # assert on the cursor-persistence call without a live database.
    _print("INFO", "Check 5: check_for_updates end-to-end (mocked DB)")
    mock_mongodb = AsyncMock()
    svc5 = UpdateDetectionService()
    updated = None
    try:
        with patch("app.services.update_detection_service.mongodb", mock_mongodb):
            updated = await svc5.check_for_updates(
                [{"appid": TEST_APP_ID, "name": "Factorio"}]
            )
    finally:
        await svc5.close()

    c5_ok = True
    if not isinstance(updated, list):
        _print("FAIL", f"check_for_updates returned {type(updated).__name__}, expected list")
        c5_ok = False

    call_count = mock_mongodb.update_news_cursor.call_count
    if call_count == 0:
        # API may have failed between checks (swallowed internally by the service);
        # treat as skip — not a hard failure per the plan.
        _print("SKIP", "update_news_cursor not called — Steam API may have been unreachable for this call")
    elif call_count > 1:
        _print("FAIL", f"update_news_cursor called {call_count} times, expected 1")
        c5_ok = False
    else:
        # Exactly one persistence call: verify its (app_id, gid, datetime) shape.
        args = mock_mongodb.update_news_cursor.call_args[0]
        if not (
            isinstance(args[0], str)
            and isinstance(args[1], str)
            and isinstance(args[2], datetime)
        ):
            _print(
                "FAIL",
                f"update_news_cursor arg types wrong: "
                f"{[type(a).__name__ for a in args]} — expected (str, str, datetime)",
            )
            c5_ok = False
        else:
            _print(
                "OK",
                f"check_for_updates returned list; "
                f"update_news_cursor({args[0]!r}, {args[1]!r}, {args[2].isoformat()!r})",
            )
    if not c5_ok:
        failures += 1

    # ── Summary ───────────────────────────────────────────────────────
    print("=" * 60)
    if failures == 0:
        _print("OK", "All checks passed")
        return 0
    else:
        _print("FAIL", f"{failures} check(s) failed")
        return 1
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
if __name__ == "__main__":
    # Propagate the smoke test's result as the process exit code
    # (0 = passed or environment-skipped, 1 = at least one check failed).
    sys.exit(asyncio.run(run_smoke_test()))
|
backend/scripts/smoke_test.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Smoke Test — local verification of worker cycle and analysis pipeline.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
cd backend
|
| 6 |
+
python scripts/smoke_test.py analyze <appid> # run full analysis for a game
|
| 7 |
+
python scripts/smoke_test.py cycle # mini worker cycle (1 game)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import asyncio
|
| 12 |
+
import logging
|
| 13 |
+
import sys
|
| 14 |
+
import time
|
| 15 |
+
from datetime import datetime, timezone
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
# Ensure backend/app is importable
|
| 19 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 20 |
+
|
| 21 |
+
from app.core.config import settings # noqa: E402
|
| 22 |
+
from app.db.mongodb import mongodb # noqa: E402
|
| 23 |
+
from app.services.nlp_service import NLPService # noqa: E402
|
| 24 |
+
from app.services.steam_service import SteamService # noqa: E402
|
| 25 |
+
from app.services.update_detection_service import UpdateDetectionService # noqa: E402
|
| 26 |
+
from app.services.precache_service import PreCacheService # noqa: E402
|
| 27 |
+
from app.services.analysis_runner import run_full_analysis # noqa: E402
|
| 28 |
+
|
| 29 |
+
# Root logger config for this CLI: timestamped INFO-level lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger("smoke_test")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _ts() -> str:
|
| 37 |
+
return datetime.now(timezone.utc).strftime("%H:%M:%S")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _print(status: str, msg: str) -> None:
    """Print a timestamped line; wrap recognized statuses in ANSI color codes."""
    ansi = {
        "OK": "\033[32mOK\033[0m",
        "FAIL": "\033[31mFAIL\033[0m",
        "SKIP": "\033[33mSKIP\033[0m",
        "INFO": "\033[36mINFO\033[0m",
    }
    label = ansi[status] if status in ansi else status
    print(f"[{_ts()}] [{label}] {msg}")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ── analyze subcommand ──────────────────────────────────────────────
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
async def cmd_analyze(app_id: str) -> None:
    """Run the full analysis pipeline for one Steam app and print a summary.

    Connects to MongoDB, runs `run_full_analysis`, reports counts from the
    returned payload, then verifies the result was persisted to the cache.
    The Steam service and the DB connection are always closed on exit.

    Args:
        app_id: Steam app ID as a string (e.g. "730").
    """
    _print("INFO", f"Starting analysis for app_id={app_id}")
    _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... / DB: {settings.mongodb_db_name}")

    await mongodb.connect()

    steam_svc = SteamService()
    nlp_svc = NLPService()

    try:
        t0 = time.monotonic()
        # "smoke-<appid>" tags this run so it is identifiable in logs.
        result = await run_full_analysis(app_id, f"smoke-{app_id}", steam_svc, nlp_svc)
        elapsed = time.monotonic() - t0

        if result is None:
            _print("FAIL", "run_full_analysis returned None")
            return

        game = result.get("game", {})
        topics = result.get("topics", [])
        analyzed = result.get("analyzed_reviews", 0)
        highlights = result.get("general_highlights", [])

        _print("OK", f"Analysis complete in {elapsed:.1f}s")
        _print("OK", f" Game: {game.get('name', '?')} (appid {game.get('app_id', '?')})")
        _print("OK", f" Reviews analyzed: {analyzed}")
        _print("OK", f" Topics found: {len(topics)}")
        _print("OK", f" General highlights: {len(highlights)}")

        # Verify cache write: the analysis document should now exist in MongoDB.
        cached = await mongodb.get_cached_analysis(app_id)
        if cached:
            _print("OK", " Cache write verified — document found in MongoDB")
        else:
            _print("FAIL", " Cache write verification FAILED — no document in MongoDB")

    finally:
        # NOTE(review): nlp_svc is not closed here — confirm NLPService holds
        # no resources that require explicit cleanup.
        await steam_svc.close()
        await mongodb.disconnect()
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ── cycle subcommand ─────────────────────────────────────────────────
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
async def cmd_cycle() -> None:
    """Run a miniature worker cycle against the top-1 game in the database.

    Steps: fetch the top game, sanity-check `synced_at` datetime arithmetic,
    run update detection, bootstrap missing analyses, and process at most one
    due analysis. All services and the DB connection are closed on exit.
    """
    _print("INFO", "Starting mini worker cycle")
    _print("INFO", f"MongoDB: {settings.mongodb_url[:30]}... / DB: {settings.mongodb_db_name}")

    await mongodb.connect()

    steam_svc = SteamService()
    nlp_svc = NLPService()
    update_svc = UpdateDetectionService()

    try:
        # Step 1: Get top 1 game
        _print("INFO", "Step 1: Fetching top game by reviews...")
        top_games = await mongodb.get_top_games_by_reviews(1)
        if not top_games:
            _print("SKIP", "No games in DB — run game sync first or use 'analyze' subcommand")
            return

        game = top_games[0]
        app_id = str(game.get("appid", ""))
        name = game.get("name", "?")
        _print("OK", f" Top game: {name} (appid {app_id})")

        # Step 2: Test datetime comparison (the bug this patch fixes) —
        # subtracting a naive datetime from an aware one raises TypeError.
        _print("INFO", "Step 2: Testing synced_at datetime comparison...")
        synced_at = game.get("synced_at")
        if synced_at:
            try:
                delta = datetime.now(timezone.utc) - synced_at
                hours = delta.total_seconds() / 3600
                _print("OK", f" synced_at delta: {hours:.1f}h (tz={synced_at.tzinfo})")
            except TypeError as e:
                _print("FAIL", f" datetime subtraction failed: {e}")
                return
        else:
            _print("SKIP", " No synced_at field — game sync not run yet")

        # Step 3: Update detection (1 game)
        _print("INFO", "Step 3: Update detection...")
        t0 = time.monotonic()
        updated = await update_svc.check_for_updates([game])
        elapsed = time.monotonic() - t0
        _print("OK", f" Updates detected: {len(updated)} in {elapsed:.1f}s")

        # Step 4: Bootstrap missing analyses
        _print("INFO", "Step 4: Bootstrap missing analyses...")
        precache_svc = PreCacheService(steam_svc, nlp_svc)
        bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games)
        _print("OK", f" Bootstrapped: {bootstrapped}")

        # Step 5: Process due analyses (max 1)
        _print("INFO", "Step 5: Processing due analyses (max 1)...")
        orig = settings.precache_max_analyses_per_cycle
        # Temporarily limit to 1 — object.__setattr__ presumably bypasses a
        # frozen/validated settings model; confirm this is the intended
        # override mechanism. Restored in the finally below.
        object.__setattr__(settings, "precache_max_analyses_per_cycle", 1)
        try:
            executed = await precache_svc.process_due_analyses()
            _print("OK", f" Executed: {executed}")
        finally:
            object.__setattr__(settings, "precache_max_analyses_per_cycle", orig)

        _print("OK", "Mini cycle complete")

    finally:
        await update_svc.close()
        await steam_svc.close()
        await mongodb.disconnect()
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# ── main ─────────────────────────────────────────────────────────────
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def main() -> None:
    """CLI entry point: parse args and dispatch to the chosen subcommand."""
    parser = argparse.ArgumentParser(description="SentimentStream smoke test")
    subcommands = parser.add_subparsers(dest="command")

    analyze_parser = subcommands.add_parser("analyze", help="Run full analysis for a game")
    analyze_parser.add_argument("appid", help="Steam app ID (e.g. 730)")
    subcommands.add_parser("cycle", help="Run mini worker cycle (top 1 game)")

    args = parser.parse_args()

    if args.command == "cycle":
        asyncio.run(cmd_cycle())
    elif args.command == "analyze":
        asyncio.run(cmd_analyze(args.appid))
    else:
        # No (or unknown) subcommand: show usage and exit non-zero.
        parser.print_help()
        sys.exit(1)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
if __name__ == "__main__":
    # Allow running this file directly as a CLI script.
    main()
|
backend/worker_main.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Worker Main App — lightweight FastAPI for background game sync and pre-cache.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
GET /health — MongoDB ping, last cycle summary, cycle_running flag
|
| 6 |
+
POST /trigger — token-protected, starts a worker cycle as background task
|
| 7 |
+
GET /logs — token-protected, read structured log tail
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import asyncio
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
import uuid
|
| 14 |
+
from contextlib import asynccontextmanager
|
| 15 |
+
from datetime import datetime, timezone
|
| 16 |
+
from typing import Any
|
| 17 |
+
|
| 18 |
+
from fastapi import FastAPI, Query, Request
|
| 19 |
+
from fastapi.responses import JSONResponse
|
| 20 |
+
|
| 21 |
+
from app.core.config import settings
|
| 22 |
+
from app.core.worker_logging import (
|
| 23 |
+
AsyncTimingContext,
|
| 24 |
+
WORKER_LOG_WHITELIST,
|
| 25 |
+
log_structured,
|
| 26 |
+
read_log_tail,
|
| 27 |
+
resolve_log_path,
|
| 28 |
+
set_cycle_id,
|
| 29 |
+
setup_app_logging,
|
| 30 |
+
setup_structured_logger,
|
| 31 |
+
)
|
| 32 |
+
from app.db.mongodb import mongodb
|
| 33 |
+
from app.services.game_sync_service import GameSyncService
|
| 34 |
+
from app.services.nlp_service import NLPService
|
| 35 |
+
from app.services.precache_service import PreCacheService
|
| 36 |
+
from app.services.steam_service import SteamService
|
| 37 |
+
from app.services.priority_refresh_service import PriorityRefreshService
|
| 38 |
+
from app.services.update_detection_service import UpdateDetectionService
|
| 39 |
+
|
| 40 |
+
# Baseline stdout logging; structured logging is initialized later in lifespan().
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

# Cycle state — read by /health and /trigger, mutated by _run_cycle.
_cycle_running = False  # True while a worker cycle task is executing
_last_cycle_summary: dict[str, Any] = {}  # per-step summary of the most recent cycle
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Connect MongoDB on startup, disconnect on shutdown.

    Also installs the structured "worker" logger and app-level logging
    before the application starts serving.
    """
    await mongodb.connect()
    setup_structured_logger("worker")
    setup_app_logging()
    logger.info("Worker started — MongoDB connected, structured logging initialized")
    yield
    # NOTE(review): no try/finally around the yield — if shutdown is reached
    # via an exception, the disconnect below may be skipped; confirm intended.
    await mongodb.disconnect()
    logger.info("Worker shutting down")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ASGI app exposing /health, /trigger, and /logs; lifecycle managed by lifespan().
app = FastAPI(title="SentimentStream Worker", lifespan=lifespan)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@app.get("/health")
|
| 67 |
+
async def health():
|
| 68 |
+
"""Health check with cycle status."""
|
| 69 |
+
mongo_ok = False
|
| 70 |
+
try:
|
| 71 |
+
if mongodb.client:
|
| 72 |
+
await mongodb.client.admin.command("ping")
|
| 73 |
+
mongo_ok = True
|
| 74 |
+
except Exception:
|
| 75 |
+
pass
|
| 76 |
+
|
| 77 |
+
return {
|
| 78 |
+
"status": "ok" if mongo_ok else "degraded",
|
| 79 |
+
"mongodb": "connected" if mongo_ok else "disconnected",
|
| 80 |
+
"cycle_running": _cycle_running,
|
| 81 |
+
"last_cycle": _last_cycle_summary,
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _check_bearer_token(request: Request) -> bool:
    """Validate the Bearer token from the Authorization header.

    Uses a constant-time comparison so the token cannot be recovered
    byte-by-byte via response-timing differences. Fails closed when no
    token is configured or the header is missing/malformed.
    """
    import hmac  # local import keeps this fix self-contained

    auth = request.headers.get("Authorization", "")
    expected = settings.worker_trigger_token
    if not expected or not auth.startswith("Bearer "):
        return False
    # Encode both sides so compare_digest handles arbitrary str tokens.
    return hmac.compare_digest(auth[7:].encode("utf-8"), expected.encode("utf-8"))
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
@app.post("/trigger")
|
| 93 |
+
async def trigger(request: Request):
|
| 94 |
+
"""Token-protected trigger to start a worker cycle."""
|
| 95 |
+
global _cycle_running
|
| 96 |
+
|
| 97 |
+
if not _check_bearer_token(request):
|
| 98 |
+
return JSONResponse(status_code=401, content={"detail": "Unauthorized"})
|
| 99 |
+
|
| 100 |
+
if _cycle_running:
|
| 101 |
+
return JSONResponse(status_code=503, content={"detail": "Cycle already running"})
|
| 102 |
+
|
| 103 |
+
asyncio.create_task(_run_cycle())
|
| 104 |
+
return {"status": "started"}
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
@app.get("/logs")
|
| 108 |
+
async def get_logs(
|
| 109 |
+
request: Request,
|
| 110 |
+
lines: int = Query(default=100, ge=1, le=1000),
|
| 111 |
+
level: str | None = Query(default=None),
|
| 112 |
+
event: str | None = Query(default=None),
|
| 113 |
+
file: str = Query(default="worker"),
|
| 114 |
+
):
|
| 115 |
+
"""Token-protected endpoint to read structured log tail."""
|
| 116 |
+
if not _check_bearer_token(request):
|
| 117 |
+
return JSONResponse(status_code=401, content={"detail": "Unauthorized"})
|
| 118 |
+
|
| 119 |
+
log_path = resolve_log_path(file, WORKER_LOG_WHITELIST)
|
| 120 |
+
if log_path is None:
|
| 121 |
+
return JSONResponse(
|
| 122 |
+
status_code=400,
|
| 123 |
+
content={"detail": f"Unknown log file: '{file}'. Valid: {list(WORKER_LOG_WHITELIST.keys())}"},
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
entries = read_log_tail(log_path, lines=lines, level=level, event=event)
|
| 127 |
+
return {"entries": entries, "count": len(entries)}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
async def _run_cycle() -> None:
|
| 131 |
+
"""Execute a full worker cycle."""
|
| 132 |
+
global _cycle_running, _last_cycle_summary
|
| 133 |
+
_cycle_running = True
|
| 134 |
+
started = datetime.now(timezone.utc)
|
| 135 |
+
summary: dict[str, Any] = {"started_at": started.isoformat()}
|
| 136 |
+
|
| 137 |
+
cycle_id = uuid.uuid4().hex[:8]
|
| 138 |
+
set_cycle_id(cycle_id)
|
| 139 |
+
log_structured("cycle_start", cycle_id=cycle_id)
|
| 140 |
+
|
| 141 |
+
steam_svc = SteamService()
|
| 142 |
+
nlp_svc = NLPService()
|
| 143 |
+
game_sync_svc = GameSyncService()
|
| 144 |
+
priority_svc = PriorityRefreshService()
|
| 145 |
+
update_svc = UpdateDetectionService()
|
| 146 |
+
|
| 147 |
+
try:
|
| 148 |
+
# 1. Game sync (if enabled and not synced recently)
|
| 149 |
+
if settings.game_sync_enabled:
|
| 150 |
+
top_games = await mongodb.get_top_games_by_reviews(1)
|
| 151 |
+
last_synced = top_games[0].get("synced_at") if top_games else None
|
| 152 |
+
hours_since_sync = None
|
| 153 |
+
if last_synced:
|
| 154 |
+
delta = datetime.now(timezone.utc) - last_synced
|
| 155 |
+
hours_since_sync = delta.total_seconds() / 3600
|
| 156 |
+
|
| 157 |
+
if hours_since_sync is None or hours_since_sync > 20:
|
| 158 |
+
async with AsyncTimingContext() as t_sync:
|
| 159 |
+
logger.info("Starting game sync...")
|
| 160 |
+
upserted, modified = await game_sync_svc.sync_all_games()
|
| 161 |
+
summary["game_sync"] = {"upserted": upserted, "modified": modified}
|
| 162 |
+
log_structured("game_sync", elapsed_s=t_sync.elapsed_s,
|
| 163 |
+
detail=summary["game_sync"])
|
| 164 |
+
|
| 165 |
+
async with AsyncTimingContext() as t_details:
|
| 166 |
+
enriched = await game_sync_svc.sync_top_game_details()
|
| 167 |
+
summary["game_details"] = {"enriched": enriched}
|
| 168 |
+
log_structured("game_details", elapsed_s=t_details.elapsed_s,
|
| 169 |
+
detail=summary["game_details"])
|
| 170 |
+
else:
|
| 171 |
+
summary["game_sync"] = "skipped (recent)"
|
| 172 |
+
log_structured("game_sync", detail="skipped (recent)")
|
| 173 |
+
|
| 174 |
+
# ALWAYS enrich CN names if sync is enabled, even if main sync skipped
|
| 175 |
+
async with AsyncTimingContext() as t_cn:
|
| 176 |
+
cn_processed = await game_sync_svc.enrich_cn_names()
|
| 177 |
+
summary["cn_enrichment"] = {"processed": cn_processed}
|
| 178 |
+
log_structured("cn_enrichment", elapsed_s=t_cn.elapsed_s,
|
| 179 |
+
detail=summary["cn_enrichment"])
|
| 180 |
+
|
| 181 |
+
async with AsyncTimingContext() as t_app_types:
|
| 182 |
+
app_types_processed = await game_sync_svc.enrich_app_types()
|
| 183 |
+
summary["app_type_enrichment"] = {"processed": app_types_processed}
|
| 184 |
+
log_structured("app_type_enrichment", elapsed_s=t_app_types.elapsed_s,
|
| 185 |
+
detail=summary["app_type_enrichment"])
|
| 186 |
+
|
| 187 |
+
# 1b. Priority refresh
|
| 188 |
+
async with AsyncTimingContext() as t_priority:
|
| 189 |
+
priority_result = await priority_svc.refresh_priorities()
|
| 190 |
+
summary["priority_refresh"] = priority_result
|
| 191 |
+
log_structured("priority_refresh", elapsed_s=t_priority.elapsed_s, detail=priority_result)
|
| 192 |
+
|
| 193 |
+
# 2. Update detection
|
| 194 |
+
async with AsyncTimingContext() as t_update:
|
| 195 |
+
top_games = await mongodb.get_priority_games_for_analysis()
|
| 196 |
+
updated_games = await update_svc.check_for_updates(top_games)
|
| 197 |
+
summary["updates_detected"] = len(updated_games)
|
| 198 |
+
log_structured("update_detection", elapsed_s=t_update.elapsed_s,
|
| 199 |
+
detail={"updates_detected": len(updated_games)})
|
| 200 |
+
|
| 201 |
+
# 3. Create schedules for updated games
|
| 202 |
+
precache_svc = PreCacheService(steam_svc, nlp_svc)
|
| 203 |
+
|
| 204 |
+
async with AsyncTimingContext() as t_sched:
|
| 205 |
+
if updated_games:
|
| 206 |
+
await precache_svc.create_schedules_for_updates(updated_games)
|
| 207 |
+
log_structured("create_schedules", elapsed_s=t_sched.elapsed_s,
|
| 208 |
+
detail={"updated_games": len(updated_games) if updated_games else 0})
|
| 209 |
+
|
| 210 |
+
# 4. Bootstrap missing analyses
|
| 211 |
+
async with AsyncTimingContext() as t_boot:
|
| 212 |
+
bootstrapped = await precache_svc.bootstrap_missing_analyses(top_games)
|
| 213 |
+
summary["bootstrapped"] = bootstrapped
|
| 214 |
+
log_structured("bootstrap_missing", elapsed_s=t_boot.elapsed_s,
|
| 215 |
+
detail={"bootstrapped": bootstrapped})
|
| 216 |
+
|
| 217 |
+
# 5. Process due analyses
|
| 218 |
+
if settings.precache_enabled:
|
| 219 |
+
async with AsyncTimingContext() as t_analyses:
|
| 220 |
+
executed = await precache_svc.process_due_analyses()
|
| 221 |
+
summary["analyses_executed"] = executed
|
| 222 |
+
log_structured("process_due_analyses", elapsed_s=t_analyses.elapsed_s,
|
| 223 |
+
detail={"executed": executed})
|
| 224 |
+
else:
|
| 225 |
+
summary["precache"] = "disabled"
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
logger.error(f"Cycle error: {e}", exc_info=True)
|
| 229 |
+
summary["error"] = str(e)
|
| 230 |
+
log_structured("cycle_error", level=logging.ERROR, error=str(e))
|
| 231 |
+
finally:
|
| 232 |
+
await game_sync_svc.close()
|
| 233 |
+
await priority_svc.close()
|
| 234 |
+
await update_svc.close()
|
| 235 |
+
await steam_svc.close()
|
| 236 |
+
|
| 237 |
+
elapsed = (datetime.now(timezone.utc) - started).total_seconds()
|
| 238 |
+
summary["elapsed_seconds"] = round(elapsed, 1)
|
| 239 |
+
_last_cycle_summary = summary
|
| 240 |
+
_cycle_running = False
|
| 241 |
+
log_structured("cycle_end", elapsed_s=round(elapsed, 1),
|
| 242 |
+
detail=summary)
|
| 243 |
+
set_cycle_id(None)
|
| 244 |
+
logger.info(f"Cycle complete in {elapsed:.1f}s: {summary}")
|
scripts/benchmark_major_update.py
ADDED
|
@@ -0,0 +1,848 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Benchmark script for the major update detection heuristic.
|
| 4 |
+
|
| 5 |
+
Evaluates UpdateDetectionService._is_update_related, _collect_update_candidates,
|
| 6 |
+
and _is_major_update against a curated set of Steam games.
|
| 7 |
+
|
| 8 |
+
Three modes:
|
| 9 |
+
--discover Fetch news for all games (count=20 by default, matches
|
| 10 |
+
production) and display all items with classification
|
| 11 |
+
details. Use this to identify ground truth.
|
| 12 |
+
--evaluate Item-level evaluation: for each ItemCase, find the item
|
| 13 |
+
by gid and check if _is_update_related / _is_major_update
|
| 14 |
+
match expectations.
|
| 15 |
+
--evaluate-service Service-level evaluation: for each ServiceCase, run the
|
| 16 |
+
full selection pipeline and compare the outcome.
|
| 17 |
+
|
| 18 |
+
Both --evaluate and --evaluate-service run by default when no mode is specified.
|
| 19 |
+
|
| 20 |
+
Examples:
|
| 21 |
+
python scripts/benchmark_major_update.py --discover
|
| 22 |
+
python scripts/benchmark_major_update.py --discover --count 50
|
| 23 |
+
python scripts/benchmark_major_update.py --evaluate
|
| 24 |
+
python scripts/benchmark_major_update.py --evaluate-service
|
| 25 |
+
python scripts/benchmark_major_update.py # runs both evaluate modes
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
|
| 30 |
+
import argparse
|
| 31 |
+
import sys
|
| 32 |
+
from dataclasses import dataclass
|
| 33 |
+
from datetime import datetime, timezone
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
from typing import Literal
|
| 36 |
+
|
| 37 |
+
import httpx
|
| 38 |
+
|
| 39 |
+
# ── import project service ────────────────────────────────────────────────────
|
| 40 |
+
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
|
| 41 |
+
from app.services.update_detection_service import UpdateDetectionService # noqa: E402
|
| 42 |
+
|
| 43 |
+
STEAM_NEWS_API_URL = "https://api.steampowered.com/ISteamNews/GetNewsForApp/v2/"
|
| 44 |
+
|
| 45 |
+
# ── benchmark games ───────────────────────────────────────────────────────────
|
| 46 |
+
# (display name, Steam appid) pairs; each benchmark mode fetches the news
# window for every game listed here. appids are strings to match the news
# API query parameters.
GAMES: list[tuple[str, str]] = [
    ("Going Medieval", "1029780"),
    ("Timberborn", "1062090"),
    ("Hades II", "1145350"),
    ("Against the Storm", "1336490"),
    ("Valheim", "892970"),
    ("Manor Lords", "1363080"),
    ("Project Zomboid", "108600"),
    ("Dwarf Fortress", "975370"),
    ("Helldivers 2", "553850"),
    ("Deep Rock Galactic", "548430"),
    ("Lethal Company", "1966720"),
    ("Factorio", "427520"),
    ("Satisfactory", "526870"),
]
|
| 61 |
+
|
| 62 |
+
# ── ground truth structures ───────────────────────────────────────────────────
|
| 63 |
+
|
| 64 |
+
@dataclass
class ItemCase:
    """Per-item ground truth: is this specific event major?"""
    # Human-readable game name, used for grouping/printing results.
    game_name: str
    # Steam application id (string, as passed to the news API).
    appid: str
    # Steam news item gid; uniquely identifies the item within the fetched window.
    gid: str
    title: str  # for display
    # Expected heuristic verdict for this single news item.
    expected: Literal["major", "not_major", "ambiguous"]
    # Free-text justification for the expectation (shown on mismatch).
    reasoning: str
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@dataclass
class ServiceCase:
    """Per-game ground truth: what should the production code do?"""
    # Human-readable game name, used for grouping/printing results.
    game_name: str
    # Steam application id (string, as passed to the news API).
    appid: str
    expected_major: bool | None  # True / False / None = ambiguous
    # Free-text justification for the expectation (shown on mismatch).
    reasoning: str
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ── item-level ground truth ───────────────────────────────────────────────────
|
| 85 |
+
# Populated from --discover run on 2026-03-19.
|
| 86 |
+
# One ItemCase per hand-labeled news item, grouped by game. Each entry pins a
# specific gid from the discovery snapshot; --evaluate looks the gid up in the
# freshly fetched window and compares the heuristic's verdict to `expected`.
ITEM_CASES: list[ItemCase] = [
    # ── Going Medieval ────────────────────────────────────────────────────────
    ItemCase(
        game_name="Going Medieval",
        appid="1029780",
        gid="1826992588604105",
        title="Going Medieval is out now in 1.0!",
        expected="major",
        reasoning=(
            "1.0 full release out of Early Access — unambiguously major. "
            "Phase 1: RELEASE_PHRASE_RE matches 'is out now' → update-related. "
            "ONE_ZERO_RE matches '1.0' → major."
        ),
    ),
    ItemCase(
        game_name="Going Medieval",
        appid="1029780",
        gid="1827626365751261",
        title="Experimental Branch Patch (1.0.48)",
        expected="not_major",
        reasoning=(
            "Experimental branch incremental patch. Three-segment version (1.0.48) "
            "excluded by VERSION_RE. BRANCH_RE blocks major classification."
        ),
    ),
    ItemCase(
        game_name="Going Medieval",
        appid="1029780",
        gid="1827626365750723",
        title="Patch Notes (1.0.47)",
        expected="not_major",
        reasoning="Incremental stable patch, three-segment version. not_major is correct.",
    ),
    # ── Timberborn ────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Timberborn",
        appid="1062090",
        gid="1826992588592887",
        title="Timberborn 1.0 is live!",
        expected="major",
        reasoning=(
            "1.0 full release out of Early Access — unambiguously major. "
            "Phase 1: RELEASE_PHRASE_RE matches 'is live' → update-related. "
            "ONE_ZERO_RE matches '1.0' → major."
        ),
    ),
    ItemCase(
        game_name="Timberborn",
        appid="1062090",
        gid="1826992588603124",
        title="Patch notes 2026-03-17 (experimental)",
        expected="not_major",
        reasoning="Experimental branch date-based patch notes. No version number. not_major is correct.",
    ),
    # ── Hades II ──────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Hades II",
        appid="1145350",
        gid="1816215235360707",
        title="Hades II v1.0 Hotfix 3",
        expected="not_major",
        reasoning=(
            "A bugfix hotfix on top of the v1.0 launch — not a content update. "
            "Phase 1: HOTFIX_RE blocks major classification. Correct: not_major."
        ),
    ),
    ItemCase(
        game_name="Hades II",
        appid="1145350",
        gid="1811772772516846",
        title="Hades II v1.0 Hotfix 2",
        expected="not_major",
        reasoning="Same pattern: HOTFIX_RE blocks 'v1.0 Hotfix N' from being classified as major.",
    ),
    ItemCase(
        game_name="Hades II",
        appid="1145350",
        gid="1811772772248738",
        title="Hades II v1.0 Is Now Available!",
        expected="major",
        reasoning=(
            "v1.0 full launch — unambiguously major. "
            "Phase 1: RELEASE_PHRASE_RE matches 'Is Now Available' → update-related. "
            "No hotfix/branch blocker. VERSION_RE matches 'v1.0' → major."
        ),
    ),
    # ── Against the Storm ─────────────────────────────────────────────────────
    ItemCase(
        game_name="Against the Storm",
        appid="1336490",
        gid="1818752592135840",
        title="Demo Update 1.9.6",
        expected="not_major",
        reasoning=(
            "Demo game update, three-segment version 1.9.6. "
            "Service correctly classifies as not_major."
        ),
    ),
    ItemCase(
        game_name="Against the Storm",
        appid="1336490",
        gid="1816849002010836",
        title="Brineworks Update (1.9) available!",
        expected="major",
        reasoning=(
            "Named major content update with version 1.9. "
            "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
            "→ update-related. VERSION_RE → major."
        ),
    ),
    # ── Valheim ───────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Valheim",
        appid="892970",
        gid="1825093633184197",
        title="Patch 0.221.12",
        expected="not_major",
        reasoning="Three-segment maintenance patch. Correctly classified as not_major.",
    ),
    ItemCase(
        game_name="Valheim",
        appid="892970",
        gid="1809869179994587",
        title="Patch 0.221.4 (Public Test)",
        expected="not_major",
        reasoning="Public test branch three-segment patch. Correctly classified as not_major.",
    ),
    # ── Manor Lords ───────────────────────────────────────────────────────────
    ItemCase(
        game_name="Manor Lords",
        appid="1363080",
        gid="1827626365750540",
        title="Major Update #6: Battlefield Changes, New Map, and Family Based Progression",
        expected="major",
        reasoning=(
            "Developer-declared major content drop. "
            "Phase 1: CONTENT_UPDATE_RE matches 'Major Update' → update-related and major."
        ),
    ),
    ItemCase(
        game_name="Manor Lords",
        appid="1363080",
        gid="1826992588603500",
        title="New BETA version is available for testing (0.8.065)",
        expected="not_major",
        reasoning=(
            "Beta/testing build announcement, not a production major update. "
            "Current heuristic misses it entirely, which is acceptable for this benchmark case."
        ),
    ),
    # ── Project Zomboid ───────────────────────────────────────────────────────
    ItemCase(
        game_name="Project Zomboid",
        appid="108600",
        gid="1826992588590120",
        title="42.15.2 UNSTABLE HOTFIX Released",
        expected="not_major",
        reasoning=(
            "Unstable-branch hotfix. patchnotes tag makes it update-related, "
            "but HOTFIX_RE correctly blocks major classification."
        ),
    ),
    ItemCase(
        game_name="Project Zomboid",
        appid="108600",
        gid="1826362059930323",
        title="Build 42.15.0 Unstable Released",
        expected="not_major",
        reasoning=(
            "Unstable build release, not a production major update. "
            "Current heuristic does not classify it as update-related because the three-segment "
            "build number fails VERSION_RE."
        ),
    ),
    # ── Dwarf Fortress ────────────────────────────────────────────────────────
    ItemCase(
        game_name="Dwarf Fortress",
        appid="975370",
        gid="1826362059918689",
        title="Food fixes, AMA, community spotlight and more! Dwarf Fortress Patch 53.11",
        expected="not_major",
        reasoning=(
            "Maintenance patch with Dwarf Fortress' two-segment numbering scheme. "
            "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fixes' "
            "→ maintenance blocker fires before VERSION_RE → not_major."
        ),
    ),
    ItemCase(
        game_name="Dwarf Fortress",
        appid="975370",
        gid="1821288646585998",
        title="Aquatic portraits, Naked dwarf fix and more Dwarf Fortress Patch 53.10",
        expected="not_major",
        reasoning=(
            "Another maintenance patch under the same numbering scheme. "
            "Phase 2: PATCH_WORD_RE matches 'Patch'; MAINT_LANGUAGE_RE matches 'fix' "
            "→ maintenance blocker fires → not_major."
        ),
    ),
    # ── Helldivers 2 ──────────────────────────────────────────────────────────
    ItemCase(
        game_name="Helldivers 2",
        appid="553850",
        gid="1826992588603352",
        title="Machinery of Oppression: 6.1.0",
        expected="major",
        reasoning=(
            "Named content drop with new missions/enemies. This should count as a major update. "
            "Useful to test whether named major drops with three-segment versions are still found."
        ),
    ),
    ItemCase(
        game_name="Helldivers 2",
        appid="553850",
        gid="1826992588603981",
        title="Revealing our Machinery of Oppression Content Roadmap!",
        expected="not_major",
        reasoning=(
            "Roadmap/announcement post, not the update itself. Should not be treated as major."
        ),
    ),
    # ── Deep Rock Galactic ────────────────────────────────────────────────────
    ItemCase(
        game_name="Deep Rock Galactic",
        appid="548430",
        gid="1825727806720055",
        title="'Eight Years in Orbit' Anniversary Event is live now!",
        expected="not_major",
        reasoning=(
            "Live event announcement, not a game patch. "
            "Phase 2: EVENT_FESTIVAL_RE matches 'anniversary event'; no 'update'/'patch' in title "
            "→ UPDATE_OR_PATCH_RE guard fails → event blocker fires → not_major."
        ),
    ),
    ItemCase(
        game_name="Deep Rock Galactic",
        appid="548430",
        gid="1824644522847377",
        title="Lunar Festival 2026 is now live!",
        expected="not_major",
        reasoning=(
            "Seasonal event announcement, not a major patch/update. "
            "Phase 2: EVENT_FESTIVAL_RE matches 'festival'; no 'update'/'patch' → event blocker fires → not_major."
        ),
    ),
    # ── Lethal Company ────────────────────────────────────────────────────────
    ItemCase(
        game_name="Lethal Company",
        appid="1966720",
        gid="1800991756395986",
        title="V70 - The Incubating Update",
        expected="major",
        reasoning=(
            "Named major content update. "
            "Phase 2: NAMED_VERSION_RE matches 'V70'; UPDATE_WORD_RE matches 'Update' "
            "→ condition F makes it update-related; named version positive signal → major."
        ),
    ),
    ItemCase(
        game_name="Lethal Company",
        appid="1966720",
        gid="1801617199407807",
        title="V72 Bug fix patch",
        expected="not_major",
        reasoning=(
            "Small bug-fix patch. patchnotes tag makes it update-related. "
            "Phase 2: PATCH_WORD_RE matches 'patch'; MAINT_LANGUAGE_RE matches 'bug fix' "
            "→ maintenance blocker fires → not_major."
        ),
    ),
    # ── Factorio ──────────────────────────────────────────────────────────────
    ItemCase(
        game_name="Factorio",
        appid="427520",
        gid="1827626365752749",
        title="Version 2.0.76 released as stable",
        expected="not_major",
        reasoning=(
            "Stable maintenance patch under a three-segment versioning scheme. "
            "Useful as a clean true negative."
        ),
    ),
    # ── Satisfactory ──────────────────────────────────────────────────────────
    ItemCase(
        game_name="Satisfactory",
        appid="526870",
        gid="1826992588604352",
        title="Update 1.2 is out now on Experimental!",
        expected="not_major",
        reasoning=(
            "Experimental-branch release, not a production major update. "
            "Phase 2: extended BRANCH_RE matches 'on Experimental' → branch blocker fires → not_major."
        ),
    ),
    ItemCase(
        game_name="Satisfactory",
        appid="526870",
        gid="1825093633185794",
        title="Experimental Hotfix v1.1.3.1",
        expected="not_major",
        reasoning=(
            "Experimental hotfix on a three-segment version. Correct behavior is not_major."
        ),
    ),
]
|
| 391 |
+
|
| 392 |
+
# ── service-level ground truth ────────────────────────────────────────────────
|
| 393 |
+
# What SHOULD the production code do for this game given the current news window?
|
| 394 |
+
# Populated from --discover run on 2026-03-19.
|
| 395 |
+
# Phase 1 semantics: verdict based on is_major (major_date is not None), not on selected item title.
|
| 396 |
+
# One ServiceCase per benchmark game; --evaluate-service runs the full
# selection pipeline for the game's current news window and compares the
# resulting is_major verdict (major_date is not None) against expected_major.
# Fix: the Project Zomboid reasoning string was missing its verb
# ("These should update activity" → "These should count as update activity").
SERVICE_CASES: list[ServiceCase] = [
    ServiceCase(
        game_name="Going Medieval",
        appid="1029780",
        expected_major=True,
        reasoning=(
            "Game released 1.0 on 2026-03-17. Phase 1: 'is out now in 1.0!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Timberborn",
        appid="1062090",
        expected_major=True,
        reasoning=(
            "Game reached 1.0 on 2026-03-12. Phase 1: '1.0 is live!' matches "
            "RELEASE_PHRASE_RE → update-related. ONE_ZERO_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Hades II",
        appid="1145350",
        expected_major=True,
        reasoning=(
            "Game launched v1.0 on 2025-09-25. Phase 1: 'v1.0 Is Now Available!' matches "
            "RELEASE_PHRASE_RE → update-related (developer feed). VERSION_RE matches 'v1.0' → major. "
            "Subsequent hotfixes (v1.0 Hotfix 2, 3) are correctly blocked by HOTFIX_RE. "
            "major_date = v1.0 launch date, latest_update_date = most recent hotfix date. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Against the Storm",
        appid="1336490",
        expected_major=True,
        reasoning=(
            "'Brineworks Update (1.9) available!' is a named major content update. "
            "Phase 1: VERSION_RE matches '1.9' + ACTION_WORD_RE matches 'Update'/'available' "
            "→ update-related (developer feed). VERSION_RE → major. "
            "Expected: major_date is not None (TP)."
        ),
    ),
    ServiceCase(
        game_name="Valheim",
        appid="892970",
        expected_major=False,
        reasoning=(
            "Top items are three-segment maintenance patches. "
            "Correctly classified as not_major. TN."
        ),
    ),
    ServiceCase(
        game_name="Manor Lords",
        appid="1363080",
        expected_major=True,
        reasoning=(
            "Current window contains a clearly labeled 'Major Update #6' post. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Project Zomboid",
        appid="108600",
        expected_major=False,
        reasoning=(
            "Current window is dominated by unstable builds and hotfixes. "
            "These should count as update activity, but should not count as major releases."
        ),
    ),
    ServiceCase(
        game_name="Dwarf Fortress",
        appid="975370",
        expected_major=False,
        reasoning=(
            "Current window contains only maintenance patches (53.11/53.10/53.09 plus hotfixes). "
            "Phase 2: maintenance blocker (patch + fix language) correctly blocks all of them → no major_date."
        ),
    ),
    ServiceCase(
        game_name="Helldivers 2",
        appid="553850",
        expected_major=True,
        reasoning=(
            "Current window contains 'Machinery of Oppression: 6.1.0', a named content update. "
            "Expected: major_date is not None."
        ),
    ),
    ServiceCase(
        game_name="Lethal Company",
        appid="1966720",
        expected_major=True,
        reasoning=(
            "Current window contains 'V70 - The Incubating Update', a named major content drop, "
            "plus newer bug-fix patches. Phase 2: NAMED_VERSION_RE + UPDATE_WORD_RE detects V70 → major_date set."
        ),
    ),
    ServiceCase(
        game_name="Factorio",
        appid="427520",
        expected_major=False,
        reasoning=(
            "Current window contains only three-segment stable maintenance releases (2.0.x). "
            "Expected: not_major."
        ),
    ),
    ServiceCase(
        game_name="Satisfactory",
        appid="526870",
        expected_major=False,
        reasoning=(
            "Current window contains an experimental 1.2 rollout and experimental hotfixes. "
            "Phase 2: extended BRANCH_RE ('on Experimental') blocks the 1.2 rollout → no major_date."
        ),
    ),
]
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
# ── helpers ───────────────────────────────────────────────────────────────────
|
| 516 |
+
|
| 517 |
+
def _fmt_ts(ts: int | None) -> str:
|
| 518 |
+
if not ts:
|
| 519 |
+
return "—"
|
| 520 |
+
try:
|
| 521 |
+
return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
|
| 522 |
+
except (OSError, ValueError):
|
| 523 |
+
return "—"
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
def _fmt_dt(dt: datetime | None) -> str:
|
| 527 |
+
if dt is None:
|
| 528 |
+
return "—"
|
| 529 |
+
return dt.strftime("%Y-%m-%d")
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
def _trunc(s: str, n: int) -> str:
    """Clip *s* to its first *n* characters, appending an ellipsis when clipped."""
    if len(s) <= n:
        return s
    return s[:n] + "…"
|
| 534 |
+
|
| 535 |
+
|
| 536 |
+
def _fetch_news(client: httpx.Client, appid: str, count: int) -> list[dict]:
    """Fetch up to *count* news items for *appid* from the Steam news API.

    Best-effort: any HTTP error, transport failure, or malformed payload is
    reported to stderr and yields an empty list instead of raising.
    """
    query = {"appid": appid, "count": count, "maxlength": 0}
    try:
        resp = client.get(STEAM_NEWS_API_URL, params=query)
        if resp.status_code != 200:
            print(f" [WARN] HTTP {resp.status_code} for appid {appid}", file=sys.stderr)
            return []
        # `or []` guards against an explicit null `newsitems` in the payload.
        items = resp.json().get("appnews", {}).get("newsitems", [])
        return items or []
    except Exception as exc:  # broad by design: a single game must not abort the run
        print(f" [WARN] Request failed for appid {appid}: {exc}", file=sys.stderr)
        return []
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
# ── Mode 1: discover ──────────────────────────────────────────────────────────
|
| 553 |
+
|
| 554 |
+
def run_discover(count: int) -> None:
    """Fetch the latest *count* news items for every game in GAMES and print
    a per-item classification table plus the end-to-end service verdict.

    The output is meant for humans building ground truth (ITEM_CASES /
    SERVICE_CASES), not for automated checks.
    """
    if count != 20:
        print(f"NOTE: count={count} — beyond production window (prod uses count=20)\n")

    # Fixed column widths for the ASCII table below.
    col_idx = 4
    col_gid = 20
    col_date = 10
    col_title = 40
    col_fl = 16
    col_tags = 24
    col_ur = 9
    col_maj = 7

    header = (
        f"{'#':<{col_idx}} "
        f"{'gid':<{col_gid}} "
        f"{'date':<{col_date}} "
        f"{'title':<{col_title}} "
        f"{'feedlabel':<{col_fl}} "
        f"{'tags':<{col_tags}} "
        f"{'upd_rel?':<{col_ur}} "
        f"{'major?':<{col_maj}}"
    )
    sep = "-" * len(header)

    with httpx.Client(timeout=30.0) as client:
        for game_name, appid in GAMES:
            print(f"\n{'=' * len(header)}")
            print(f" {game_name} (appid={appid})")
            print(f"{'=' * len(header)}")
            print(header)
            print(sep)

            items = _fetch_news(client, appid, count)
            if not items:
                print(" (no items returned)")
                continue

            # One table row per news item, with both classifier decisions.
            for idx, item in enumerate(items, start=1):
                gid = str(item.get("gid") or "")[:col_gid]
                date_str = _fmt_ts(item.get("date"))
                title = _trunc(item.get("title", ""), col_title)
                feedlabel = _trunc(item.get("feedlabel") or "", col_fl)
                tags = _trunc(str(item.get("tags") or ""), col_tags)

                is_ur = UpdateDetectionService._is_update_related(item)
                is_maj = UpdateDetectionService._is_major_update(item)

                ur_str = "Yes" if is_ur else "No"
                maj_str = "Yes" if is_maj else "No"

                print(
                    f"{idx:<{col_idx}} "
                    f"{gid:<{col_gid}} "
                    f"{date_str:<{col_date}} "
                    f"{title:<{col_title}} "
                    f"{feedlabel:<{col_fl}} "
                    f"{tags:<{col_tags}} "
                    f"{ur_str:<{col_ur}} "
                    f"{maj_str:<{col_maj}}"
                )

            # Same aggregation the production service performs on the window.
            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items)
            print(f"\n >> latest_update_date: {_fmt_dt(latest_update_date)} | major_date: {_fmt_dt(major_date)}")
            verdict = "MAJOR" if major_date is not None else "not_major"
            print(f" >> Service result: {verdict}")
|
| 620 |
+
|
| 621 |
+
|
| 622 |
+
# ── Mode 2: evaluate (item-level) ─────────────────────────────────────────────
|
| 623 |
+
|
| 624 |
+
def run_evaluate() -> None:
    """Item-level evaluation of the heuristic against ITEM_CASES ground truth.

    Fetches the current news window for every appid referenced by a case,
    classifies each referenced item, and prints a PASS/FAIL table followed
    by precision/recall/accuracy and the concrete FP/FN cases.
    """
    if not ITEM_CASES:
        print("[evaluate] No item-level ground truth defined yet.")
        print(" Run --discover first, then populate ITEM_CASES in this script.")
        return

    # Build lookup: appid → {gid → item}
    gid_index: dict[str, dict[str, dict]] = {}
    needed_appids = {case.appid for case in ITEM_CASES}

    with httpx.Client(timeout=30.0) as client:
        for appid in needed_appids:
            items = _fetch_news(client, appid, count=20)
            gid_index[appid] = {str(item.get("gid", "")): item for item in items}

    tp = tn = fp = fn = amb = not_found = 0
    # rows is built in lockstep with ITEM_CASES: exactly one row per case.
    rows: list[tuple] = []

    for case in ITEM_CASES:
        item = gid_index.get(case.appid, {}).get(case.gid)
        if item is None:
            # The item aged out of the 20-item window; can't evaluate it.
            not_found += 1
            rows.append((case.game_name, case.title, "—", "—", "—", case.expected, "NOT FOUND"))
            continue

        is_ur = UpdateDetectionService._is_update_related(item)
        is_maj = UpdateDetectionService._is_major_update(item)

        # An item only counts as "major" if it passed the update-related gate.
        predicted = "major" if (is_ur and is_maj) else "not_major"
        expected = case.expected

        if expected == "ambiguous":
            verdict = "ambiguous"
            amb += 1
        elif predicted == expected:
            verdict = "PASS"
            if expected == "major":
                tp += 1
            else:
                tn += 1
        else:
            if predicted == "major" and expected == "not_major":
                verdict = "FAIL (FP)"
                fp += 1
            else:
                verdict = "FAIL (FN)"
                fn += 1

        rows.append((
            case.game_name,
            _trunc(case.title, 30),
            _fmt_ts(item.get("date")),
            str(item.get("tags", ""))[:20],
            item.get("feedlabel", "")[:16],
            expected,
            "Yes" if is_ur else "No",
            "Yes" if is_maj else "No",
            verdict,
        ))

    # Print report
    print("\n" + "=" * 110)
    print("REPORT A — Item-level classification")
    print("=" * 110)
    hdr = f"{'Game':<18} {'Title':<30} {'Date':<10} {'Tags':<20} {'FeedLabel':<16} {'Expected':<10} {'UpdRel?':<8} {'Major?':<7} Verdict"
    print(hdr)
    print("-" * 110)
    for row in rows:
        if len(row) == 7:
            # Short tuple shape emitted for NOT FOUND cases above.
            print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {'—':<20} {'—':<16} {row[5]:<10} {'—':<8} {'—':<7} {row[6]}")
        else:
            print(f"{row[0]:<18} {row[1]:<30} {row[2]:<10} {row[3]:<20} {row[4]:<16} {row[5]:<10} {row[6]:<8} {row[7]:<7} {row[8]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total cases : {len(ITEM_CASES)} | not found: {not_found} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # Fix: pair cases with their verdicts via zip — rows is built in lockstep
    # with ITEM_CASES. The previous ITEM_CASES.index(c) lookup was O(n²) and
    # always returned the first match when two cases compared equal.
    fps = [c for c, row in zip(ITEM_CASES, rows) if row[-1] == "FAIL (FP)"]
    fns = [c for c, row in zip(ITEM_CASES, rows) if row[-1] == "FAIL (FN)"]
    if fps:
        print("\nFalse Positives:")
        for c in fps:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
    if fns:
        print("\nFalse Negatives:")
        for c in fns:
            print(f" [{c.game_name}] {c.title!r} — {c.reasoning}")
|
| 717 |
+
|
| 718 |
+
|
| 719 |
+
# ── Mode 3: evaluate-service (end-to-end) ─────────────────────────────────────
|
| 720 |
+
|
| 721 |
+
def run_evaluate_service() -> None:
    """Service-level (end-to-end) evaluation against SERVICE_CASES.

    For each game, fetches the production-sized news window (20 items),
    runs the same candidate aggregation the service uses, and compares the
    "is there a major update?" answer against the expected label.
    """
    if not SERVICE_CASES:
        print("[evaluate-service] No service-level ground truth defined yet.")
        print(" Run --discover first, then populate SERVICE_CASES in this script.")
        return

    tp = tn = fp = fn = amb = 0
    # One row per case, in SERVICE_CASES order (relied upon when reporting
    # failures at the bottom via rows[idx]).
    rows: list[tuple] = []

    with httpx.Client(timeout=30.0) as client:
        for case in SERVICE_CASES:
            items = _fetch_news(client, case.appid, count=20)
            latest_update_date, major_date = UpdateDetectionService._collect_update_candidates(items)
            # The service's verdict: a major update exists iff a date was found.
            is_maj = major_date is not None

            latest_str = _fmt_dt(latest_update_date)
            major_str = _fmt_dt(major_date)
            maj_label = "Yes" if is_maj else "No"

            if case.expected_major is None:
                # Ground truth undecided — counted separately, not a failure.
                verdict = "ambiguous"
                amb += 1
            elif is_maj == case.expected_major:
                verdict = "PASS"
                if case.expected_major:
                    tp += 1
                else:
                    tn += 1
            else:
                if is_maj and not case.expected_major:
                    verdict = "FAIL (FP)"
                    fp += 1
                else:
                    verdict = "FAIL (FN)"
                    fn += 1

            rows.append((
                case.game_name,
                latest_str,
                major_str,
                maj_label,
                # None is falsy, so the inner conditional resolves the
                # ambiguous (None) label before False is assumed.
                "True" if case.expected_major else ("None" if case.expected_major is None else "False"),
                verdict,
            ))

    print("\n" + "=" * 100)
    print("REPORT B — Service-level (end-to-end)")
    print("=" * 100)
    hdr = f"{'Game':<18} {'LatestUpdate':<13} {'MajorDate':<11} {'Major?':<7} {'Expected':<9} Verdict"
    print(hdr)
    print("-" * 100)
    for row in rows:
        print(f"{row[0]:<18} {row[1]:<13} {row[2]:<11} {row[3]:<7} {row[4]:<9} {row[5]}")

    total = tp + tn + fp + fn
    print("\nSummary:")
    print(f" Total games : {len(SERVICE_CASES)} | ambiguous: {amb}")
    print(f" TP={tp} TN={tn} FP={fp} FN={fn}")
    if total > 0:
        prec = tp / (tp + fp) if (tp + fp) else float("nan")
        recall = tp / (tp + fn) if (tp + fn) else float("nan")
        acc = (tp + tn) / total
        print(f" Precision={prec:.2f} Recall={recall:.2f} Accuracy={acc:.2f}")

    # Spell out each failing game with the curator's reasoning.
    for idx, case in enumerate(SERVICE_CASES):
        verdict = rows[idx][5]
        if verdict.startswith("FAIL"):
            print(f"\n [{case.game_name}] {verdict} — {case.reasoning}")
|
| 789 |
+
|
| 790 |
+
|
| 791 |
+
# ── main ──────────────────────────────────────────────────────────────────────
|
| 792 |
+
|
| 793 |
+
def _parse_args() -> argparse.Namespace:
|
| 794 |
+
p = argparse.ArgumentParser(
|
| 795 |
+
description="Benchmark the major update detection heuristic against real Steam games."
|
| 796 |
+
)
|
| 797 |
+
p.add_argument(
|
| 798 |
+
"--discover",
|
| 799 |
+
action="store_true",
|
| 800 |
+
help="Fetch news for all games and display per-item classification details.",
|
| 801 |
+
)
|
| 802 |
+
p.add_argument(
|
| 803 |
+
"--evaluate",
|
| 804 |
+
action="store_true",
|
| 805 |
+
help="Run item-level evaluation against ITEM_CASES ground truth.",
|
| 806 |
+
)
|
| 807 |
+
p.add_argument(
|
| 808 |
+
"--evaluate-service",
|
| 809 |
+
action="store_true",
|
| 810 |
+
dest="evaluate_service",
|
| 811 |
+
help="Run service-level end-to-end evaluation against SERVICE_CASES ground truth.",
|
| 812 |
+
)
|
| 813 |
+
p.add_argument(
|
| 814 |
+
"--count",
|
| 815 |
+
type=int,
|
| 816 |
+
default=20,
|
| 817 |
+
help="Number of news items to fetch (default: 20, matches production). "
|
| 818 |
+
"Values > 20 are beyond the production window.",
|
| 819 |
+
)
|
| 820 |
+
return p.parse_args()
|
| 821 |
+
|
| 822 |
+
|
| 823 |
+
def main() -> int:
    """CLI entry point: dispatch to the selected benchmark mode(s).

    With no mode flags at all, both evaluation modes run (discover stays
    opt-in). Returns 0 as the process exit code.
    """
    args = _parse_args()

    discover = args.discover
    evaluate = args.evaluate
    eval_svc = args.evaluate_service

    # Default: run both evaluate modes when nothing is specified
    if not (discover or evaluate or eval_svc):
        evaluate = True
        eval_svc = True

    if discover:
        run_discover(count=args.count)
    if evaluate:
        run_evaluate()
    if eval_svc:
        run_evaluate_service()

    return 0
|
| 845 |
+
|
| 846 |
+
|
| 847 |
+
# Script entry point: exit with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/check_db_stats.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
import os
from motor.motor_asyncio import AsyncIOMotorClient
from dotenv import load_dotenv

# Load .env from the repo root or from the backend/ directory (whichever exists).
load_dotenv(".env")
load_dotenv("backend/.env")
|
| 9 |
+
|
| 10 |
+
async def check_stats():
    """Print coverage stats for Chinese names (`name_cn`) in the games collection.

    Reads MONGODB_URL / MONGODB_DB_NAME from the environment (populated by
    load_dotenv at import time). Errors are reported, never raised.
    """
    # Connection parameters from .env
    mongo_url = os.getenv("MONGODB_URL")
    db_name = os.getenv("MONGODB_DB_NAME", "sentimentSummary")

    if not mongo_url:
        print("ERROR: MONGODB_URL not found in .env file!")
        return

    # Show only the host part, keeping credentials out of the console.
    print(f"Connecting to MongoDB: {mongo_url.split('@')[-1]}...")

    client = None
    try:
        client = AsyncIOMotorClient(mongo_url)
        db = client[db_name]
        collection = db["games"]

        total = await collection.count_documents({})
        # name_cn must exist and not be null / empty / a stringified null.
        with_cn = await collection.count_documents({
            "name_cn": {"$exists": True, "$ne": None, "$nin": ["", "null", "None"]}
        })

        print("\n" + "="*30)
        print("DATABASE STATS")
        print("="*30)
        print(f"Total games: {total}")
        print(f"With Chinese: {with_cn}")

        if total > 0:
            percentage = (with_cn / total) * 100
            print(f"Coverage: {percentage:.2f}%")
            print("="*30)
    except Exception as e:
        print(f"ERROR: Could not connect or query DB: {e}")
    finally:
        # Bug fix: previously close() was only reached on success, leaking
        # the client whenever the connection or a query failed.
        if client is not None:
            client.close()
|
| 45 |
+
|
| 46 |
+
# Script entry point: run the async stats check on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(check_stats())
|
scripts/expand_keywords/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Keyword expansion toolkit using FastText.
|
| 3 |
+
|
| 4 |
+
This package provides tools to:
|
| 5 |
+
1. Fetch reviews from Steam games
|
| 6 |
+
2. Train FastText models on review corpus
|
| 7 |
+
3. Expand existing keyword dictionary with semantically similar words
|
| 8 |
+
"""
|
scripts/expand_keywords/__main__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Allow running as: python -m scripts.expand_keywords"""
|
| 2 |
+
|
| 3 |
+
from .main import main
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
main()
|
scripts/expand_keywords/config.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration for keyword expansion: game list and settings.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
# Base directories
|
| 8 |
+
BASE_DIR = Path(__file__).parent
|
| 9 |
+
DATA_DIR = BASE_DIR / "data"
|
| 10 |
+
REVIEWS_DIR = DATA_DIR / "reviews"
|
| 11 |
+
MODELS_DIR = DATA_DIR / "models"
|
| 12 |
+
OUTPUT_DIR = DATA_DIR / "output"
|
| 13 |
+
|
| 14 |
+
# Ensure directories exist
|
| 15 |
+
for dir_path in [REVIEWS_DIR, MODELS_DIR, OUTPUT_DIR]:
|
| 16 |
+
dir_path.mkdir(parents=True, exist_ok=True)
|
| 17 |
+
|
| 18 |
+
# Game list: (app_id, name, genre)
|
| 19 |
+
# Selected for variety across genres to get diverse vocabulary
|
| 20 |
+
GAMES: list[tuple[str, str, str]] = [
|
| 21 |
+
# Action RPG
|
| 22 |
+
("1245620", "Elden Ring", "action_rpg"),
|
| 23 |
+
("374320", "Dark Souls III", "action_rpg"),
|
| 24 |
+
# CRPG
|
| 25 |
+
("1086940", "Baldur's Gate 3", "crpg"),
|
| 26 |
+
("435150", "Divinity: Original Sin 2", "crpg"),
|
| 27 |
+
("1184370", "Pathfinder: Wrath of the Righteous", "crpg"),
|
| 28 |
+
# Open World RPG
|
| 29 |
+
("292030", "The Witcher 3", "open_world_rpg"),
|
| 30 |
+
("489830", "Skyrim Special Edition", "open_world_rpg"),
|
| 31 |
+
("1091500", "Cyberpunk 2077", "open_world_rpg"),
|
| 32 |
+
# FPS
|
| 33 |
+
("730", "Counter-Strike 2", "fps_competitive"),
|
| 34 |
+
("782330", "DOOM Eternal", "fps_single"),
|
| 35 |
+
("1237970", "Titanfall 2", "fps_single"),
|
| 36 |
+
# Survival
|
| 37 |
+
("892970", "Valheim", "survival"),
|
| 38 |
+
("252490", "Rust", "survival"),
|
| 39 |
+
("264710", "Subnautica", "survival"),
|
| 40 |
+
("242760", "The Forest", "survival"),
|
| 41 |
+
# Strategy
|
| 42 |
+
("289070", "Civilization VI", "strategy"),
|
| 43 |
+
("1142710", "Total War: Warhammer III", "strategy"),
|
| 44 |
+
("1466860", "Age of Empires IV", "strategy"),
|
| 45 |
+
# Roguelike
|
| 46 |
+
("1145360", "Hades", "roguelike"),
|
| 47 |
+
("588650", "Dead Cells", "roguelike"),
|
| 48 |
+
("646570", "Slay the Spire", "roguelike"),
|
| 49 |
+
# Metroidvania
|
| 50 |
+
("367520", "Hollow Knight", "metroidvania"),
|
| 51 |
+
("1057090", "Ori and the Will of the Wisps", "metroidvania"),
|
| 52 |
+
# Simulation
|
| 53 |
+
("255710", "Cities: Skylines", "simulation"),
|
| 54 |
+
("427520", "Factorio", "simulation"),
|
| 55 |
+
("526870", "Satisfactory", "simulation"),
|
| 56 |
+
# Horror
|
| 57 |
+
("1196590", "Resident Evil Village", "horror"),
|
| 58 |
+
("739630", "Phasmophobia", "horror"),
|
| 59 |
+
("381210", "Dead by Daylight", "horror"),
|
| 60 |
+
# Live Service
|
| 61 |
+
("1085660", "Destiny 2", "live_service"),
|
| 62 |
+
("230410", "Warframe", "live_service"),
|
| 63 |
+
("238960", "Path of Exile", "live_service"),
|
| 64 |
+
# Racing
|
| 65 |
+
("1551360", "Forza Horizon 5", "racing"),
|
| 66 |
+
# Story Driven
|
| 67 |
+
("1174180", "Red Dead Redemption 2", "story_driven"),
|
| 68 |
+
# Casual
|
| 69 |
+
("413150", "Stardew Valley", "casual"),
|
| 70 |
+
("105600", "Terraria", "casual"),
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
# Fetching settings
|
| 74 |
+
SETTINGS = {
|
| 75 |
+
# Review fetching
|
| 76 |
+
"reviews_per_game": 2700, # ~80k total across ~30 games
|
| 77 |
+
"batch_size": 100, # Steam API batch size
|
| 78 |
+
"sleep_between_batches": 1.5, # Seconds between API calls
|
| 79 |
+
"sleep_between_games": 5.0, # Longer pause between games
|
| 80 |
+
"min_review_length": 50, # Filter short reviews (chars)
|
| 81 |
+
"max_retries": 3, # Retry count on failure
|
| 82 |
+
"retry_base_delay": 10.0, # Base delay for exponential backoff
|
| 83 |
+
|
| 84 |
+
# Preprocessing
|
| 85 |
+
"phrase_min_count": 10, # Min occurrences for phrase detection
|
| 86 |
+
"phrase_threshold": 10.0, # Phrase detection threshold
|
| 87 |
+
|
| 88 |
+
# FastText training
|
| 89 |
+
"fasttext_vector_size": 150,
|
| 90 |
+
"fasttext_window": 5,
|
| 91 |
+
"fasttext_min_count": 5,
|
| 92 |
+
"fasttext_epochs": 10,
|
| 93 |
+
"fasttext_workers": 4,
|
| 94 |
+
|
| 95 |
+
# Expansion
|
| 96 |
+
"similarity_threshold": 0.55,
|
| 97 |
+
"max_suggestions_per_seed": 20,
|
| 98 |
+
"min_frequency": 10, # Min word frequency in corpus
|
| 99 |
+
"auto_approve_threshold": 0.70, # Score threshold for auto-approval
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
# Steam API endpoint
|
| 103 |
+
STEAM_REVIEWS_API = "https://store.steampowered.com/appreviews/{app_id}"
|
| 104 |
+
|
| 105 |
+
# Steam language setting for reviews
|
| 106 |
+
STEAM_REVIEW_LANGUAGE = "schinese" # schinese, english, tchinese, etc.
|
scripts/expand_keywords/expander.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Keyword dictionary expansion with exclusive category assignment.
|
| 3 |
+
|
| 4 |
+
Key principle: Each word can only belong to ONE category.
|
| 5 |
+
This prevents cross-contamination where a word like "unplayable"
|
| 6 |
+
might be counted in both Bugs and Performance categories.
|
| 7 |
+
|
| 8 |
+
Algorithm:
|
| 9 |
+
1. For each category: find candidate words similar to seed keywords
|
| 10 |
+
2. Collect ALL candidates in a global pool
|
| 11 |
+
3. Assign each word to the category with highest score
|
| 12 |
+
4. Filter by similarity threshold and frequency
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import logging
|
| 17 |
+
import math
|
| 18 |
+
from collections import defaultdict
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
from datetime import datetime
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
from gensim.models import FastText
|
| 24 |
+
|
| 25 |
+
from .config import OUTPUT_DIR, SETTINGS
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass
class Candidate:
    """A candidate word proposed for dictionary expansion."""

    word: str
    similarity: float
    frequency: int
    source_seeds: list[str] = field(default_factory=list)

    @property
    def score(self) -> float:
        """Blend of embedding similarity (70%) and log-scaled frequency (30%).

        The frequency term log10(freq + 1) / 5 maps typical corpus counts
        into roughly the 0–1 range before weighting.
        """
        freq_component = math.log10(max(self.frequency, 1) + 1) / 5
        return self.similarity * 0.7 + freq_component * 0.3

    def to_dict(self) -> dict:
        """Serialize for JSON export; underscores are shown as spaces."""
        display_word = self.word.replace("_", " ")
        return {
            "word": display_word,
            "similarity": round(self.similarity, 3),
            "frequency": self.frequency,
            "score": round(self.score, 3),
            "source_seeds": self.source_seeds,
        }
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class KeywordExpander:
|
| 61 |
+
"""
|
| 62 |
+
Expands keyword dictionary using trained FastText model.
|
| 63 |
+
|
| 64 |
+
Uses exclusive category assignment to prevent words
|
| 65 |
+
appearing in multiple categories.
|
| 66 |
+
"""
|
| 67 |
+
|
| 68 |
+
def __init__(
|
| 69 |
+
self,
|
| 70 |
+
model: FastText,
|
| 71 |
+
existing_keywords: dict[str, list[str]],
|
| 72 |
+
word_frequencies: dict[str, int],
|
| 73 |
+
similarity_threshold: float | None = None,
|
| 74 |
+
max_suggestions_per_seed: int | None = None,
|
| 75 |
+
min_frequency: int | None = None,
|
| 76 |
+
):
|
| 77 |
+
"""
|
| 78 |
+
Initialize expander.
|
| 79 |
+
|
| 80 |
+
Args:
|
| 81 |
+
model: Trained FastText model
|
| 82 |
+
existing_keywords: Current TOPIC_KEYWORDS dictionary
|
| 83 |
+
word_frequencies: Word frequency counts from corpus
|
| 84 |
+
similarity_threshold: Minimum similarity for candidates
|
| 85 |
+
max_suggestions_per_seed: Max similar words per seed
|
| 86 |
+
min_frequency: Minimum corpus frequency
|
| 87 |
+
"""
|
| 88 |
+
self.model = model
|
| 89 |
+
self.existing = existing_keywords
|
| 90 |
+
self.word_freq = word_frequencies
|
| 91 |
+
|
| 92 |
+
self.similarity_threshold = similarity_threshold or SETTINGS["similarity_threshold"]
|
| 93 |
+
self.max_suggestions = max_suggestions_per_seed or SETTINGS["max_suggestions_per_seed"]
|
| 94 |
+
self.min_frequency = min_frequency or SETTINGS["min_frequency"]
|
| 95 |
+
|
| 96 |
+
# Build set of all existing words (normalized)
|
| 97 |
+
self.existing_words: set[str] = set()
|
| 98 |
+
for words in existing_keywords.values():
|
| 99 |
+
for w in words:
|
| 100 |
+
self.existing_words.add(w.lower().replace(" ", "_"))
|
| 101 |
+
|
| 102 |
+
logger.info(f"Expander initialized with {len(self.existing_words)} existing keywords")
|
| 103 |
+
|
| 104 |
+
    def _find_candidates_for_category(
        self,
        category: str,
        seeds: list[str],
    ) -> dict[str, Candidate]:
        """
        Find candidate words for a single category.

        Queries the FastText model for each seed's nearest neighbours and
        keeps those that are new, similar enough, and frequent enough.

        Returns dict[word -> Candidate] with best similarity per word.

        NOTE(review): *category* is not used in this method's body — it seems
        to exist only for signature symmetry with callers; confirm before
        removing.
        """
        candidates: dict[str, Candidate] = {}

        for seed in seeds:
            # Normalize seed (e.g., "frame rate" -> "frame_rate")
            seed_normalized = seed.lower().replace(" ", "_")

            # Skip if seed not in vocabulary
            if seed_normalized not in self.model.wv:
                continue

            # Get similar words
            try:
                similar = self.model.wv.most_similar(
                    seed_normalized,
                    topn=self.max_suggestions,
                )
            except KeyError:
                continue

            for word, similarity in similar:
                # Skip existing words
                if word in self.existing_words:
                    continue

                # Skip below threshold
                if similarity < self.similarity_threshold:
                    continue

                # Check frequency
                freq = self.word_freq.get(word, 0)
                if freq < self.min_frequency:
                    continue

                # Update or add candidate
                if word in candidates:
                    # Keep higher similarity; record every seed that found it
                    if similarity > candidates[word].similarity:
                        candidates[word].similarity = similarity
                    candidates[word].source_seeds.append(seed)
                else:
                    candidates[word] = Candidate(
                        word=word,
                        similarity=similarity,
                        frequency=freq,
                        source_seeds=[seed],
                    )

        return candidates
|
| 162 |
+
|
| 163 |
+
def expand_all_exclusive(self) -> dict[str, list[Candidate]]:
|
| 164 |
+
"""
|
| 165 |
+
Expand all categories with exclusive assignment.
|
| 166 |
+
|
| 167 |
+
Each word is assigned only to the category where it has
|
| 168 |
+
the highest score.
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
Dict mapping category -> list of Candidates (sorted by score)
|
| 172 |
+
"""
|
| 173 |
+
logger.info("Starting exclusive expansion...")
|
| 174 |
+
|
| 175 |
+
# Step 1: Collect candidates from all categories
|
| 176 |
+
# Format: word -> [(category, Candidate), ...]
|
| 177 |
+
all_candidates: dict[str, list[tuple[str, Candidate]]] = defaultdict(list)
|
| 178 |
+
|
| 179 |
+
for category, seeds in self.existing.items():
|
| 180 |
+
category_candidates = self._find_candidates_for_category(category, seeds)
|
| 181 |
+
for word, candidate in category_candidates.items():
|
| 182 |
+
all_candidates[word].append((category, candidate))
|
| 183 |
+
|
| 184 |
+
logger.info(f"[{category}] Found {len(category_candidates)} raw candidates")
|
| 185 |
+
|
| 186 |
+
# Step 2: Assign each word to category with highest score
|
| 187 |
+
final_assignments: dict[str, list[Candidate]] = defaultdict(list)
|
| 188 |
+
|
| 189 |
+
for word, category_candidates in all_candidates.items():
|
| 190 |
+
# Find category with highest score
|
| 191 |
+
best_category, best_candidate = max(
|
| 192 |
+
category_candidates,
|
| 193 |
+
key=lambda x: x[1].score,
|
| 194 |
+
)
|
| 195 |
+
final_assignments[best_category].append(best_candidate)
|
| 196 |
+
|
| 197 |
+
# Step 3: Sort candidates in each category by score
|
| 198 |
+
for category in final_assignments:
|
| 199 |
+
final_assignments[category].sort(key=lambda c: c.score, reverse=True)
|
| 200 |
+
|
| 201 |
+
# Log results
|
| 202 |
+
total = sum(len(cands) for cands in final_assignments.values())
|
| 203 |
+
logger.info(f"Exclusive assignment complete: {total} total candidates")
|
| 204 |
+
|
| 205 |
+
for category, cands in sorted(final_assignments.items()):
|
| 206 |
+
logger.info(f" {category}: {len(cands)} candidates")
|
| 207 |
+
|
| 208 |
+
return dict(final_assignments)
|
| 209 |
+
|
| 210 |
+
    def export_candidates(
        self,
        path: Path | str | None = None,
        include_threshold_in_name: bool = False,
    ) -> Path:
        """
        Export candidates to JSON for manual review.

        Runs the full exclusive expansion and writes one JSON document with
        a metadata header plus per-category candidate lists.

        Args:
            path: Output path (default: output/candidates.json)
            include_threshold_in_name: Add threshold to filename for comparison

        Returns:
            Path to exported file
        """
        # Explicit path wins; otherwise pick a default filename, optionally
        # tagged with the threshold so multiple runs can be compared.
        if path:
            path = Path(path)
        elif include_threshold_in_name:
            path = OUTPUT_DIR / f"candidates_t{self.similarity_threshold:.2f}.json"
        else:
            path = OUTPUT_DIR / "candidates.json"

        results = self.expand_all_exclusive()

        export_data = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "similarity_threshold": self.similarity_threshold,
                "min_frequency": self.min_frequency,
                "total_candidates": sum(len(c) for c in results.values()),
            },
            "categories": {},
        }

        for category, candidates in sorted(results.items()):
            export_data["categories"][category] = [c.to_dict() for c in candidates]

        # ensure_ascii=False keeps Chinese keywords human-readable in the file.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        logger.info(f"Exported candidates to {path}")
        return path
|
| 252 |
+
|
| 253 |
+
def generate_keywords_py(
    self,
    output_path: Path | str | None = None,
    auto_approve_threshold: float | None = None,
) -> Path:
    """
    Generate new keywords.py with expanded dictionary.

    Words with score >= auto_approve_threshold are added directly.
    Words below threshold are added as comments for manual review.

    Args:
        output_path: Output path (default: output/keywords_expanded.py)
        auto_approve_threshold: Score threshold for auto-approval; falls
            back to SETTINGS["auto_approve_threshold"] when None

    Returns:
        Path to generated file
    """
    output_path = Path(output_path) if output_path else OUTPUT_DIR / "keywords_expanded.py"
    # NOTE(review): `or` also replaces an explicit 0.0 threshold with the
    # default — confirm that is intended.
    auto_approve = auto_approve_threshold or SETTINGS["auto_approve_threshold"]

    results = self.expand_all_exclusive()

    # Build the generated module line by line: header docstring first,
    # then the TOPIC_KEYWORDS dict opener.
    lines = [
        '"""',
        "Expanded keyword dictionary for game review topic detection.",
        f"Generated: {datetime.now().isoformat()}",
        f"Auto-approve threshold: {auto_approve}",
        '"""',
        "",
        "TOPIC_KEYWORDS = {",
    ]

    for category, seeds in self.existing.items():
        lines.append(f' "{category}": [')

        # Existing keywords
        lines.append(" # Existing")
        for seed in seeds:
            lines.append(f' "{seed}",')

        # New candidates
        candidates = results.get(category, [])
        if candidates:
            # Auto-approved
            auto_approved = [c for c in candidates if c.score >= auto_approve]
            if auto_approved:
                lines.append(f" # NEW (auto-approved, score >= {auto_approve})")
                for c in auto_approved:
                    # Multi-word candidates are stored with underscores.
                    word_display = c.word.replace("_", " ")
                    lines.append(f' "{word_display}", # score={c.score:.2f}')

            # Candidates requiring review
            review_needed = [c for c in candidates if c.score < auto_approve]
            if review_needed:
                lines.append(f" # CANDIDATES (score < {auto_approve}, require review)")
                for c in review_needed:
                    word_display = c.word.replace("_", " ")
                    # Emitted commented-out so a reviewer can opt them in.
                    lines.append(f' # "{word_display}", # score={c.score:.2f}')

        lines.append(" ],")
        lines.append("")

    lines.append("}")
    lines.append("")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    logger.info(f"Generated keywords file at {output_path}")
    return output_path
|
| 324 |
+
|
| 325 |
+
def get_expansion_stats(self) -> dict:
    """Get statistics about the expansion.

    Returns a dict with overall counts ("total_candidates",
    "auto_approved", "needs_review") plus the same breakdown per
    category under "by_category".
    """
    results = self.expand_all_exclusive()
    threshold = SETTINGS["auto_approve_threshold"]

    per_category: dict = {}
    total = approved = pending = 0

    for category, candidates in results.items():
        approved_count = sum(1 for c in candidates if c.score >= threshold)
        pending_count = len(candidates) - approved_count

        per_category[category] = {
            "total": len(candidates),
            "auto_approved": approved_count,
            "needs_review": pending_count,
        }
        total += len(candidates)
        approved += approved_count
        pending += pending_count

    return {
        "total_candidates": total,
        "auto_approved": approved,
        "needs_review": pending,
        "by_category": per_category,
    }
|
scripts/expand_keywords/fetcher.py
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Review fetcher with rate limiting and progress tracking.
|
| 3 |
+
|
| 4 |
+
Downloads reviews from Steam API with:
|
| 5 |
+
- Cursor-based pagination
|
| 6 |
+
- Sleep between requests to respect rate limits
|
| 7 |
+
- Progress persistence (JSONL per game + progress.json)
|
| 8 |
+
- Resume capability
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import asyncio
|
| 12 |
+
import json
|
| 13 |
+
import logging
|
| 14 |
+
from dataclasses import dataclass, field
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
import httpx
|
| 20 |
+
|
| 21 |
+
from .config import GAMES, REVIEWS_DIR, SETTINGS, STEAM_REVIEW_LANGUAGE, STEAM_REVIEWS_API
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class FetchProgress:
    """Progress tracking for a single game's review download."""

    app_id: str           # Steam app id (string form)
    name: str             # human-readable game name
    target: int           # how many reviews we aim to fetch
    fetched: int = 0      # how many have been fetched so far
    cursor: str = "*"     # Steam pagination cursor ("*" = start)
    completed: bool = False
    last_updated: str = ""  # ISO-8601 timestamp of the last update

    # Field names shared by to_dict()/from_dict(); the first three are
    # required when deserializing, the rest fall back to their defaults.
    def to_dict(self) -> dict:
        """Return a JSON-serializable snapshot of this record."""
        keys = (
            "app_id",
            "name",
            "target",
            "fetched",
            "cursor",
            "completed",
            "last_updated",
        )
        return {key: getattr(self, key) for key in keys}

    @classmethod
    def from_dict(cls, data: dict) -> "FetchProgress":
        """Rebuild a record from to_dict() output; raises KeyError if a
        required key (app_id/name/target) is missing."""
        kwargs = {key: data[key] for key in ("app_id", "name", "target")}
        defaults = {"fetched": 0, "cursor": "*", "completed": False, "last_updated": ""}
        kwargs.update({key: data.get(key, default) for key, default in defaults.items()})
        return cls(**kwargs)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@dataclass
class ReviewFetcher:
    """
    Fetches reviews from Steam with rate limiting.

    Features:
    - Async HTTP client with timeout
    - Exponential backoff on rate limiting
    - Progress persistence (resume capability)
    - JSONL output per game (one {"text": ...} object per line)
    """

    # Per-request HTTP timeout in seconds.
    timeout: float = 30.0
    # Where per-game FetchProgress records are persisted between runs.
    progress_file: Path = field(default_factory=lambda: REVIEWS_DIR / "progress.json")

    def __post_init__(self):
        # In-memory progress table keyed by Steam app id (string).
        self._progress: dict[str, FetchProgress] = {}
        self._load_progress()

    def _load_progress(self) -> None:
        """Load progress from file if exists."""
        if self.progress_file.exists():
            try:
                with open(self.progress_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                for app_id, progress_data in data.items():
                    self._progress[app_id] = FetchProgress.from_dict(progress_data)
                logger.info(f"Loaded progress for {len(self._progress)} games")
            except (json.JSONDecodeError, KeyError) as e:
                # A corrupt progress file means we start from scratch
                # instead of crashing the whole run.
                logger.warning(f"Failed to load progress: {e}")
                self._progress = {}

    def _save_progress(self) -> None:
        """Save progress to file."""
        data = {app_id: prog.to_dict() for app_id, prog in self._progress.items()}
        with open(self.progress_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def get_progress(self) -> dict[str, dict]:
        """Get current progress for all games."""
        return {app_id: prog.to_dict() for app_id, prog in self._progress.items()}

    def _get_reviews_file(self, app_id: str) -> Path:
        """Get path to reviews JSONL file for a game."""
        return REVIEWS_DIR / f"{app_id}.jsonl"

    def _append_reviews(self, app_id: str, reviews: list[str]) -> None:
        """Append reviews to JSONL file (one JSON object per line)."""
        reviews_file = self._get_reviews_file(app_id)
        with open(reviews_file, "a", encoding="utf-8") as f:
            for review in reviews:
                f.write(json.dumps({"text": review}, ensure_ascii=False) + "\n")

    def load_reviews(self, app_id: str) -> list[str]:
        """Load reviews from JSONL file; returns [] when none were fetched."""
        reviews_file = self._get_reviews_file(app_id)
        if not reviews_file.exists():
            return []

        reviews = []
        with open(reviews_file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                    reviews.append(data["text"])
                except (json.JSONDecodeError, KeyError):
                    # Skip truncated/corrupt lines (e.g. interrupted write).
                    continue
        return reviews

    def load_all_reviews(self) -> list[str]:
        """Load all reviews from all downloaded games."""
        all_reviews = []
        for app_id, _, _ in GAMES:
            reviews = self.load_reviews(app_id)
            all_reviews.extend(reviews)
        logger.info(f"Loaded {len(all_reviews)} total reviews")
        return all_reviews

    async def _fetch_batch(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        cursor: str,
        batch_size: int,
    ) -> tuple[list[str], str | None]:
        """Fetch a single batch of reviews.

        Returns:
            (reviews, next_cursor); ([], None) on HTTP or API failure.
        """
        url = STEAM_REVIEWS_API.format(app_id=app_id)
        params: dict[str, Any] = {
            "json": "1",
            "filter": "recent",  # "recent" has more reviews available than "all"
            "review_type": "all",
            "language": STEAM_REVIEW_LANGUAGE,
            "num_per_page": str(batch_size),
            "cursor": cursor,
            "purchase_type": "all",
        }

        try:
            response = await client.get(url, params=params)
            response.raise_for_status()
            data = response.json()
        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching reviews for {app_id}: {e}")
            return [], None

        if not data.get("success"):
            logger.warning(f"API returned success=false for {app_id}")
            return [], None

        reviews_data = data.get("reviews", [])
        min_length = SETTINGS["min_review_length"]

        # Keep only non-empty review bodies that meet the minimum length.
        reviews = [
            review.get("review", "").strip()
            for review in reviews_data
            if review.get("review") and len(review.get("review", "").strip()) >= min_length
        ]

        new_cursor = data.get("cursor")
        return reviews, new_cursor

    async def _fetch_with_backoff(
        self,
        client: httpx.AsyncClient,
        app_id: str,
        cursor: str,
        batch_size: int,
    ) -> tuple[list[str], str | None]:
        """Fetch with exponential backoff on failure.

        An empty batch that still carries a cursor is treated as a
        transient (possibly rate-limited) response and retried with
        exponentially growing delays; after max_retries we give up.
        """
        max_retries = SETTINGS["max_retries"]
        base_delay = SETTINGS["retry_base_delay"]

        for attempt in range(max_retries):
            reviews, new_cursor = await self._fetch_batch(client, app_id, cursor, batch_size)

            # Success, or a definitive failure (no cursor) — stop retrying.
            if reviews or new_cursor is None:
                return reviews, new_cursor

            # Empty reviews with cursor - might be rate limited
            delay = base_delay * (2 ** attempt)
            logger.warning(f"Empty response, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
            await asyncio.sleep(delay)

        return [], None

    async def fetch_game_reviews(
        self,
        app_id: str,
        name: str,
        target: int,
        resume: bool = True,
    ) -> int:
        """
        Fetch reviews for a single game.

        Args:
            app_id: Steam app id
            name: human-readable game name (for logging)
            target: number of reviews to aim for
            resume: continue from persisted cursor/count when available;
                when False (or no prior progress) the JSONL file is wiped
                and the fetch starts from the beginning

        Returns:
            Number of reviews fetched.
        """
        # Check if already completed
        if resume and app_id in self._progress:
            progress = self._progress[app_id]
            if progress.completed:
                logger.info(f"[{name}] Already completed ({progress.fetched} reviews)")
                return progress.fetched
            cursor = progress.cursor
            fetched = progress.fetched
        else:
            # Start fresh - clear existing file
            reviews_file = self._get_reviews_file(app_id)
            if reviews_file.exists():
                reviews_file.unlink()
            cursor = "*"
            fetched = 0

        # Initialize progress (overwrites any stale record but keeps the
        # resumed cursor/count computed above).
        self._progress[app_id] = FetchProgress(
            app_id=app_id,
            name=name,
            target=target,
            fetched=fetched,
            cursor=cursor,
        )

        batch_size = SETTINGS["batch_size"]
        sleep_between = SETTINGS["sleep_between_batches"]
        # Track cursors already seen to detect Steam pagination loops.
        seen_cursors: set[str] = set()

        logger.info(f"[{name}] Starting fetch: target={target}, already={fetched}")

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            while fetched < target:
                reviews, new_cursor = await self._fetch_with_backoff(
                    client, app_id, cursor, batch_size
                )

                if not reviews:
                    logger.warning(f"[{name}] No more reviews available")
                    break

                if new_cursor and new_cursor in seen_cursors:
                    logger.warning(f"[{name}] Cursor loop detected")
                    break

                if new_cursor:
                    seen_cursors.add(new_cursor)

                # Save reviews
                self._append_reviews(app_id, reviews)
                fetched += len(reviews)

                # Update progress (persisted after every batch so an
                # interrupted run can resume).
                self._progress[app_id].fetched = fetched
                self._progress[app_id].cursor = new_cursor or cursor
                self._progress[app_id].last_updated = datetime.now().isoformat()
                self._save_progress()

                logger.info(f"[{name}] Fetched {fetched}/{target} reviews")

                # "*" back from the API means pagination wrapped around.
                if not new_cursor or new_cursor == "*":
                    break

                cursor = new_cursor
                await asyncio.sleep(sleep_between)

        # Mark as completed (also when the API ran out of reviews before
        # the target was reached — there is nothing more to fetch).
        self._progress[app_id].completed = True
        self._progress[app_id].last_updated = datetime.now().isoformat()
        self._save_progress()

        logger.info(f"[{name}] Completed with {fetched} reviews")
        return fetched

    async def fetch_all(
        self,
        resume: bool = True,
        limit_games: int | None = None,
    ) -> dict[str, int]:
        """
        Fetch reviews for all configured games.

        Args:
            resume: Continue from previous progress
            limit_games: Limit number of games (for testing)

        Returns:
            Dict mapping app_id to number of reviews fetched
        """
        results: dict[str, int] = {}
        sleep_between_games = SETTINGS["sleep_between_games"]
        reviews_per_game = SETTINGS["reviews_per_game"]

        games = GAMES[:limit_games] if limit_games else GAMES

        for i, (app_id, name, genre) in enumerate(games):
            logger.info(f"Processing game {i + 1}/{len(games)}: {name} ({genre})")

            count = await self.fetch_game_reviews(
                app_id=app_id,
                name=name,
                target=reviews_per_game,
                resume=resume,
            )
            results[app_id] = count

            # Sleep between games (except for last one)
            if i < len(games) - 1:
                logger.info(f"Sleeping {sleep_between_games}s before next game...")
                await asyncio.sleep(sleep_between_games)

        total = sum(results.values())
        logger.info(f"Total reviews fetched: {total}")
        return results

    def get_stats(self) -> dict:
        """Get statistics about fetched reviews."""
        stats = {
            "games_total": len(GAMES),
            "games_completed": 0,
            "games_in_progress": 0,
            "reviews_total": 0,
            "reviews_per_game": {},
        }

        for app_id, name, _ in GAMES:
            reviews_file = self._get_reviews_file(app_id)
            if reviews_file.exists():
                # Count lines with a context manager so the file handle is
                # closed deterministically (previously the handle from a
                # bare open() was left for the GC to collect).
                with open(reviews_file, "r", encoding="utf-8") as f:
                    count = sum(1 for _ in f)
                stats["reviews_per_game"][name] = count
                stats["reviews_total"] += count

            # Every configured game lands in exactly one bucket, so
            # games_completed + games_in_progress == games_total.
            if app_id in self._progress and self._progress[app_id].completed:
                stats["games_completed"] += 1
            else:
                stats["games_in_progress"] += 1

        return stats
|
scripts/expand_keywords/keywords_base.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Słowa kluczowe do wykrywania tematów w recenzjach gier.
|
| 3 |
+
Używane w podejściu hybrydowym (Keywords + ML Sentiment).
|
| 4 |
+
|
| 5 |
+
Kategorie zostały dobrane na podstawie najczęstszych tematów
|
| 6 |
+
poruszanych w recenzjach gier na platformie Steam.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
TOPIC_KEYWORDS = {
|
| 10 |
+
# =========================================================================
|
| 11 |
+
# CORE GAMEPLAY
|
| 12 |
+
# =========================================================================
|
| 13 |
+
"Gameplay": [
|
| 14 |
+
# Podstawowe
|
| 15 |
+
"gameplay", "mechanics", "game mechanics", "core gameplay", "game loop",
|
| 16 |
+
"combat", "combat system", "fighting", "battle", "battles",
|
| 17 |
+
# Progresja
|
| 18 |
+
"progression", "leveling", "level up", "experience", "xp", "grind", "grinding",
|
| 19 |
+
"skill tree", "talent tree", "unlock", "unlocks", "unlockables",
|
| 20 |
+
# Misje i aktywności
|
| 21 |
+
"quests", "quest", "missions", "mission", "objectives", "side quests",
|
| 22 |
+
"main quest", "fetch quests", "puzzles", "puzzle", "exploration",
|
| 23 |
+
# Design
|
| 24 |
+
"game design", "level design", "map design", "pacing",
|
| 25 |
+
"balancing", "balanced", "unbalanced", "overpowered", "underpowered", "meta",
|
| 26 |
+
# Wrogowie
|
| 27 |
+
"enemies", "enemy", "bosses", "boss fights", "boss battle", "mobs",
|
| 28 |
+
# Ruch i umiejętności
|
| 29 |
+
"movement", "traversal", "parkour", "skills", "abilities", "powers",
|
| 30 |
+
"spells", "weapons", "weapon variety", "builds", "build variety",
|
| 31 |
+
],
|
| 32 |
+
|
| 33 |
+
"Fun": [
|
| 34 |
+
# Pozytywne
|
| 35 |
+
"fun", "enjoyable", "entertaining", "addictive", "addicting", "engaging",
|
| 36 |
+
"exciting", "thrilling", "satisfying", "rewarding", "immersive",
|
| 37 |
+
"masterpiece", "gem", "hidden gem", "must play", "must buy",
|
| 38 |
+
# Negatywne
|
| 39 |
+
"boring", "tedious", "repetitive", "monotonous", "dull", "bland",
|
| 40 |
+
"frustrating", "annoying", "unfun", "not fun", "waste of time",
|
| 41 |
+
"disappointing", "letdown", "overhyped", "overrated", "underrated",
|
| 42 |
+
],
|
| 43 |
+
|
| 44 |
+
"Difficulty": [
|
| 45 |
+
# Poziomy trudności
|
| 46 |
+
"difficulty", "easy", "normal", "hard", "very hard", "nightmare",
|
| 47 |
+
"easy mode", "hard mode", "difficulty settings", "difficulty options",
|
| 48 |
+
# Opisy trudności
|
| 49 |
+
"challenging", "too easy", "too hard", "too difficult", "punishing",
|
| 50 |
+
"forgiving", "casual", "hardcore", "souls-like", "soulslike",
|
| 51 |
+
"dark souls", "die a lot", "dying", "deaths", "unfair", "cheap deaths",
|
| 52 |
+
# Krzywa trudności
|
| 53 |
+
"learning curve", "steep learning curve", "skill ceiling", "skill floor",
|
| 54 |
+
"newcomer friendly", "beginner friendly", "accessible",
|
| 55 |
+
],
|
| 56 |
+
|
| 57 |
+
# =========================================================================
|
| 58 |
+
# TECHNICAL
|
| 59 |
+
# =========================================================================
|
| 60 |
+
"Performance": [
|
| 61 |
+
# Wydajność
|
| 62 |
+
"performance", "optimize", "optimized", "optimization", "well optimized",
|
| 63 |
+
"poorly optimized", "unoptimized", "runs well", "runs smooth", "runs poorly",
|
| 64 |
+
# FPS
|
| 65 |
+
"fps", "framerate", "frame rate", "frames per second", "60fps", "30fps",
|
| 66 |
+
"fps drops", "frame drops", "drops", "dips", "stuttering", "stutter",
|
| 67 |
+
"hitching", "micro stutter",
|
| 68 |
+
# Zasoby
|
| 69 |
+
"cpu", "gpu", "ram", "vram", "memory", "memory leak", "memory usage",
|
| 70 |
+
# Ładowanie
|
| 71 |
+
"loading", "loading times", "load times", "loading screens", "long loading",
|
| 72 |
+
# Stabilność
|
| 73 |
+
"smooth", "stable", "unstable", "lag", "lagging", "input lag",
|
| 74 |
+
],
|
| 75 |
+
|
| 76 |
+
"Bugs": [
|
| 77 |
+
# Ogólne
|
| 78 |
+
"bugs", "bug", "buggy", "glitch", "glitches", "glitchy",
|
| 79 |
+
"broken", "issues", "problems", "jank", "janky",
|
| 80 |
+
# Crashe
|
| 81 |
+
"crash", "crashes", "crashing", "crashed", "freeze", "freezing", "frozen",
|
| 82 |
+
"ctd", "crash to desktop", "black screen", "stuck",
|
| 83 |
+
# Konkretne bugi
|
| 84 |
+
"save bug", "save corruption", "corrupted save", "lost progress",
|
| 85 |
+
"clipping", "falling through", "invisible", "t-pose",
|
| 86 |
+
"softlock", "soft lock", "softlocked", "game breaking",
|
| 87 |
+
# Stan gry
|
| 88 |
+
"unplayable", "unfinished", "early access", "beta", "alpha",
|
| 89 |
+
"needs polish", "polished", "rough edges",
|
| 90 |
+
],
|
| 91 |
+
|
| 92 |
+
# =========================================================================
|
| 93 |
+
# AUDIO-VISUAL
|
| 94 |
+
# =========================================================================
|
| 95 |
+
"Graphics": [
|
| 96 |
+
# Ogólne
|
| 97 |
+
"graphics", "visuals", "visual", "graphic", "graphically",
|
| 98 |
+
"looks", "look", "looking", "looks good", "looks bad", "looks great",
|
| 99 |
+
# Styl
|
| 100 |
+
"art style", "art direction", "artstyle", "aesthetic", "stylized",
|
| 101 |
+
"realistic", "photorealistic", "cartoony", "anime", "pixel art", "retro",
|
| 102 |
+
# Techniczne
|
| 103 |
+
"textures", "texture", "models", "model", "animations", "animation",
|
| 104 |
+
"lighting", "lights", "shadows", "shadow", "reflections", "ray tracing",
|
| 105 |
+
"rendering", "shaders", "particle effects", "particles",
|
| 106 |
+
# Rozdzielczość
|
| 107 |
+
"resolution", "4k", "1440p", "1080p", "720p", "upscaling", "dlss", "fsr",
|
| 108 |
+
# Środowisko
|
| 109 |
+
"environments", "environment", "scenery", "landscapes", "world design",
|
| 110 |
+
"level of detail", "lod", "draw distance", "pop in", "pop-in",
|
| 111 |
+
# Oceny
|
| 112 |
+
"beautiful", "gorgeous", "stunning", "breathtaking", "pretty",
|
| 113 |
+
"ugly", "dated", "outdated", "aged", "old looking",
|
| 114 |
+
],
|
| 115 |
+
|
| 116 |
+
"Sound": [
|
| 117 |
+
# Muzyka
|
| 118 |
+
"music", "soundtrack", "ost", "score", "composer", "tracks",
|
| 119 |
+
"ambient", "ambient music", "battle music", "menu music",
|
| 120 |
+
# Głos
|
| 121 |
+
"voice", "voice acting", "voice actors", "voice over", "vo",
|
| 122 |
+
"voice lines", "dialogue", "dubbed", "dubbing", "lip sync",
|
| 123 |
+
# Efekty dźwiękowe
|
| 124 |
+
"sound", "sounds", "audio", "sfx", "sound effects", "sound design",
|
| 125 |
+
"footsteps", "gunshots", "explosions",
|
| 126 |
+
# Jakość
|
| 127 |
+
"atmosphere", "atmospheric", "immersive audio", "spatial audio",
|
| 128 |
+
"surround", "audio quality", "sound quality",
|
| 129 |
+
# Problemy
|
| 130 |
+
"audio bug", "audio glitch", "no sound", "sound cutting", "loud", "quiet",
|
| 131 |
+
],
|
| 132 |
+
|
| 133 |
+
# =========================================================================
|
| 134 |
+
# CONTENT & VALUE
|
| 135 |
+
# =========================================================================
|
| 136 |
+
"Content": [
|
| 137 |
+
# Długość
|
| 138 |
+
"hours", "hour", "length", "long", "short", "playtime", "play time",
|
| 139 |
+
"how long", "game length", "campaign length",
|
| 140 |
+
# Ilość contentu
|
| 141 |
+
"content", "lots of content", "lack of content", "thin", "meaty",
|
| 142 |
+
"activities", "things to do", "side content", "endgame", "end game",
|
| 143 |
+
"post game", "new game plus", "ng+",
|
| 144 |
+
# Replayability
|
| 145 |
+
"replay", "replay value", "replayability", "replayable",
|
| 146 |
+
"multiple endings", "different endings", "choices matter",
|
| 147 |
+
"multiple playthroughs", "completionist", "100%", "100 percent",
|
| 148 |
+
],
|
| 149 |
+
|
| 150 |
+
"Monetization": [
|
| 151 |
+
# Cena (ex-Price)
|
| 152 |
+
"price", "pricing", "cost", "costs", "priced",
|
| 153 |
+
"expensive", "overpriced", "cheap", "affordable",
|
| 154 |
+
"value", "worth", "worth it", "not worth", "bang for buck",
|
| 155 |
+
"value for money", "money well spent",
|
| 156 |
+
"sale", "discount", "on sale", "full price", "wait for sale",
|
| 157 |
+
"refund", "refunded", "steam sale",
|
| 158 |
+
"aaa price", "indie price", "budget", "premium",
|
| 159 |
+
"free to play", "f2p", "free",
|
| 160 |
+
# MTX (ex-Microtransactions)
|
| 161 |
+
"microtransactions", "microtransaction", "mtx", "monetization",
|
| 162 |
+
"in app purchases", "iap", "real money", "cash shop", "item shop",
|
| 163 |
+
"pay to win", "p2w", "pay2win", "paywall", "pay wall",
|
| 164 |
+
"pay to progress", "paying", "whale", "whales",
|
| 165 |
+
"loot box", "loot boxes", "lootbox", "gacha", "gambling",
|
| 166 |
+
"rng", "random", "chance",
|
| 167 |
+
"battle pass", "season pass", "battlepass", "seasons",
|
| 168 |
+
"premium currency", "gems", "coins", "points",
|
| 169 |
+
"cosmetics", "cosmetic", "skins", "skin", "outfits",
|
| 170 |
+
"dlc", "expansion", "expansions", "dlcs",
|
| 171 |
+
"cash grab", "money grab", "greedy", "predatory", "scam",
|
| 172 |
+
],
|
| 173 |
+
|
| 174 |
+
# =========================================================================
|
| 175 |
+
# MULTIPLAYER & COMMUNITY
|
| 176 |
+
# =========================================================================
|
| 177 |
+
"Multiplayer": [
|
| 178 |
+
# Tryby
|
| 179 |
+
"multiplayer", "multi-player", "online", "offline",
|
| 180 |
+
"co-op", "coop", "co op", "cooperative",
|
| 181 |
+
"pvp", "pve", "pvpve", "versus",
|
| 182 |
+
"singleplayer", "single player", "solo", "solo play",
|
| 183 |
+
# Matchmaking
|
| 184 |
+
"matchmaking", "queue", "queue times", "waiting",
|
| 185 |
+
"servers", "server", "dedicated servers", "p2p", "peer to peer",
|
| 186 |
+
"ping", "latency", "connection", "disconnects", "desync",
|
| 187 |
+
# Gracze
|
| 188 |
+
"players", "teammates", "team", "squad", "party",
|
| 189 |
+
"randoms", "random teammates", "lobbies", "lobby",
|
| 190 |
+
# Problemy
|
| 191 |
+
"cheaters", "cheater", "hackers", "hacker", "hacking", "cheating",
|
| 192 |
+
"aimbots", "wallhacks", "anticheat", "anti cheat",
|
| 193 |
+
"toxic", "toxicity", "griefing", "griefers",
|
| 194 |
+
],
|
| 195 |
+
|
| 196 |
+
"Community": [
|
| 197 |
+
# Społeczność
|
| 198 |
+
"community", "playerbase", "player base", "players",
|
| 199 |
+
"active", "dead game", "dead", "alive", "population",
|
| 200 |
+
# Modding
|
| 201 |
+
"mods", "mod", "modding", "mod support", "workshop",
|
| 202 |
+
"steam workshop", "nexus", "modders", "modded",
|
| 203 |
+
"custom content", "user generated",
|
| 204 |
+
# Deweloperzy (interakcja)
|
| 205 |
+
"devs", "developers", "dev team", "community manager",
|
| 206 |
+
"communication", "transparent", "listening",
|
| 207 |
+
# Społeczność graczy
|
| 208 |
+
"helpful", "friendly", "toxic community", "welcoming",
|
| 209 |
+
"guides", "wiki", "tutorials", "newbie friendly",
|
| 210 |
+
],
|
| 211 |
+
|
| 212 |
+
# =========================================================================
|
| 213 |
+
# CONTROLS & UI
|
| 214 |
+
# =========================================================================
|
| 215 |
+
"Controls": [
|
| 216 |
+
# Sterowanie
|
| 217 |
+
"controls", "control", "controlling", "control scheme",
|
| 218 |
+
"keybinds", "keybind", "key bindings", "rebind", "remapping",
|
| 219 |
+
# Urządzenia
|
| 220 |
+
"keyboard", "mouse", "kb+m", "kbm",
|
| 221 |
+
"controller", "gamepad", "joystick", "controller support",
|
| 222 |
+
"xbox controller", "ps controller", "dualsense",
|
| 223 |
+
# Responsywność
|
| 224 |
+
"responsive", "unresponsive", "clunky", "sluggish", "tight",
|
| 225 |
+
"smooth controls", "floaty", "heavy", "weighty",
|
| 226 |
+
# Celowanie
|
| 227 |
+
"aiming", "aim", "aim assist", "auto aim",
|
| 228 |
+
"camera", "camera controls", "camera angle",
|
| 229 |
+
],
|
| 230 |
+
|
| 231 |
+
"UI": [
|
| 232 |
+
# Interface
|
| 233 |
+
"ui", "user interface", "interface", "hud",
|
| 234 |
+
"menu", "menus", "main menu", "pause menu",
|
| 235 |
+
"ux", "user experience",
|
| 236 |
+
# Design UI
|
| 237 |
+
"clean ui", "cluttered", "minimalist", "intuitive",
|
| 238 |
+
"confusing", "overwhelming", "readable", "readable text",
|
| 239 |
+
# Elementy
|
| 240 |
+
"minimap", "map", "inventory", "crafting menu",
|
| 241 |
+
"skill menu", "quest log", "journal",
|
| 242 |
+
# Problemy
|
| 243 |
+
"font size", "text size", "too small", "can't read",
|
| 244 |
+
"navigation", "navigating",
|
| 245 |
+
],
|
| 246 |
+
|
| 247 |
+
# =========================================================================
|
| 248 |
+
# STORY & NARRATIVE
|
| 249 |
+
# =========================================================================
|
| 250 |
+
"Story": [
|
| 251 |
+
# Narracja
|
| 252 |
+
"story", "storyline", "plot", "narrative", "storytelling",
|
| 253 |
+
"writing", "written", "well written", "poorly written",
|
| 254 |
+
# Elementy fabularne
|
| 255 |
+
"characters", "character", "protagonist", "main character",
|
| 256 |
+
"villain", "antagonist", "npcs", "npc", "companions",
|
| 257 |
+
"dialogue", "dialogues", "conversations", "choices",
|
| 258 |
+
# Świat
|
| 259 |
+
"lore", "world building", "worldbuilding", "universe",
|
| 260 |
+
"setting", "backstory", "history",
|
| 261 |
+
# Emocje
|
| 262 |
+
"emotional", "emotions", "feels", "touching", "heartwarming",
|
| 263 |
+
"dark", "mature", "gritty", "lighthearted",
|
| 264 |
+
# Zakończenie
|
| 265 |
+
"ending", "endings", "conclusion", "finale",
|
| 266 |
+
"twist", "plot twist", "predictable", "unpredictable",
|
| 267 |
+
# Cutscenki
|
| 268 |
+
"cutscenes", "cutscene", "cinematics", "cinematic",
|
| 269 |
+
"script", "scripted", "linear", "open ended",
|
| 270 |
+
],
|
| 271 |
+
|
| 272 |
+
# =========================================================================
|
| 273 |
+
# DEVELOPER SUPPORT
|
| 274 |
+
# =========================================================================
|
| 275 |
+
"Support": [
|
| 276 |
+
# Aktualizacje
|
| 277 |
+
"updates", "update", "patch", "patches", "patched",
|
| 278 |
+
"hotfix", "hotfixes", "bug fixes", "fixed",
|
| 279 |
+
# Stan rozwoju
|
| 280 |
+
"abandoned", "dead", "no updates", "still updating",
|
| 281 |
+
"active development", "roadmap", "planned",
|
| 282 |
+
"early access", "full release", "1.0", "launch",
|
| 283 |
+
# Deweloperzy
|
| 284 |
+
"developer", "developers", "dev", "devs", "studio",
|
| 285 |
+
"indie dev", "indie developer", "aaa developer",
|
| 286 |
+
# Wsparcie
|
| 287 |
+
"support", "customer support", "response", "feedback",
|
| 288 |
+
"listening to feedback", "ignoring", "communication",
|
| 289 |
+
# Porty
|
| 290 |
+
"port", "ported", "console port", "pc port", "lazy port",
|
| 291 |
+
],
|
| 292 |
+
|
| 293 |
+
# =========================================================================
|
| 294 |
+
# PREDICTION & INTENT (NEW!)
|
| 295 |
+
# =========================================================================
|
| 296 |
+
"Retention": [
|
| 297 |
+
# Pozytywne (High Retention)
|
| 298 |
+
"addictive", "addicted", "can't stop playing", "hooked", "drug",
|
| 299 |
+
"thousands of hours", "hundreds of hours", "worth it", "worth every penny",
|
| 300 |
+
"buy it", "must buy", "highly recommend", "masterpiece", "goty",
|
| 301 |
+
"game of the year", "10/10", "best game", "favorite game",
|
| 302 |
+
# Negatywne (Churn)
|
| 303 |
+
"refund", "refunded", "refunding", "uninstalled", "uninstall", "delete",
|
| 304 |
+
"waste of money", "waste of time", "don't buy", "do not buy",
|
| 305 |
+
"regret", "regretting", "boring", "bored", "sleep", "sleepy",
|
| 306 |
+
"wait for sale", "not worth it", "cash grab", "scam",
|
| 307 |
+
],
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
# =============================================================================
# EXCLUSIONS (Context-aware filtering)
# =============================================================================
# Exclusion words - if any of these appear near a keyword,
# that keyword is ignored in the given context.
# Format: "keyword": ["nearby_word", "another_word"]
|
| 316 |
+
|
| 317 |
+
EXCLUSIONS = {
    # "fps" as a genre (FPS shooter) vs. performance (60 fps)
    "fps": ["genre", "shooter", "first person", "fps game", "fps genre"],
    # "free" as in no cost vs. the "free to play" business model
    "free": ["drm free", "bug free", "free roam", "free world"],
    # "control" as input controls vs. narrative uses of "control"
    "control": ["mind control", "control the world", "control freak"],
}
|
scripts/expand_keywords/main.py
ADDED
|
@@ -0,0 +1,447 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CLI for keyword expansion toolkit.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
# Fetch reviews from Steam (can be resumed)
|
| 6 |
+
python -m scripts.expand_keywords fetch --resume
|
| 7 |
+
|
| 8 |
+
# Train FastText model
|
| 9 |
+
python -m scripts.expand_keywords train
|
| 10 |
+
|
| 11 |
+
# Expand dictionary and export candidates
|
| 12 |
+
python -m scripts.expand_keywords expand --threshold 0.55
|
| 13 |
+
|
| 14 |
+
# Generate new keywords.py
|
| 15 |
+
python -m scripts.expand_keywords generate --auto-approve 0.7
|
| 16 |
+
|
| 17 |
+
# Run all steps
|
| 18 |
+
python -m scripts.expand_keywords run --resume
|
| 19 |
+
|
| 20 |
+
# Show statistics
|
| 21 |
+
python -m scripts.expand_keywords stats
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import argparse
|
| 25 |
+
import asyncio
|
| 26 |
+
import logging
|
| 27 |
+
import sys
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
|
| 30 |
+
# Add project root to path for imports
|
| 31 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent
|
| 32 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 33 |
+
|
| 34 |
+
from scripts.expand_keywords.config import GAMES, MODELS_DIR, OUTPUT_DIR, SETTINGS
|
| 35 |
+
from scripts.expand_keywords.expander import KeywordExpander
|
| 36 |
+
from scripts.expand_keywords.fetcher import ReviewFetcher
|
| 37 |
+
from scripts.expand_keywords.preprocessor import Preprocessor, extract_ngrams_from_keywords
|
| 38 |
+
from scripts.expand_keywords.trainer import FastTextTrainer
|
| 39 |
+
|
| 40 |
+
# Configure logging
|
| 41 |
+
logging.basicConfig(
|
| 42 |
+
level=logging.INFO,
|
| 43 |
+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
| 44 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
| 45 |
+
)
|
| 46 |
+
logger = logging.getLogger(__name__)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def load_existing_keywords() -> dict[str, list[str]]:
    """Load the existing TOPIC_KEYWORDS mapping from the backend keywords.py.

    Returns:
        Mapping of topic category name to its list of keyword strings.

    Raises:
        FileNotFoundError: If keywords.py does not exist at the expected path.
        ValueError: If the file defines no TOPIC_KEYWORDS.
    """
    keywords_path = PROJECT_ROOT / "backend" / "app" / "core" / "keywords.py"
    if not keywords_path.exists():
        raise FileNotFoundError(f"Keywords file not found: {keywords_path}")

    # Execute the module source in an isolated namespace and pull out
    # TOPIC_KEYWORDS. The file is trusted project code, so exec is
    # acceptable here (never do this with untrusted input).
    module_globals: dict = {}
    exec(keywords_path.read_text(encoding="utf-8"), module_globals)

    topic_keywords = module_globals.get("TOPIC_KEYWORDS")
    if not topic_keywords:
        raise ValueError("TOPIC_KEYWORDS not found in keywords.py")
    return topic_keywords
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
async def cmd_fetch(args: argparse.Namespace) -> None:
    """Fetch reviews from Steam.

    Args:
        args: Parsed CLI arguments; reads ``resume`` (continue from saved
            progress) and ``limit`` (cap the number of games, for testing).
    """
    logger.info("Starting review fetch...")

    fetcher = ReviewFetcher()

    # Show current progress (useful when resuming an interrupted fetch)
    stats = fetcher.get_stats()
    logger.info(f"Current stats: {stats['reviews_total']} reviews from {stats['games_completed']} games")

    await fetcher.fetch_all(
        resume=args.resume,
        limit_games=args.limit,
    )

    # Show final stats
    stats = fetcher.get_stats()
    logger.info(f"Final stats: {stats['reviews_total']} reviews from {stats['games_completed']} games")
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def cmd_train(args: argparse.Namespace) -> None:
    """Train the FastText model on previously fetched reviews.

    Loads frozen n-grams from the current dictionary, preprocesses the
    fetched corpus, then trains and saves the FastText model.
    """
    logger.info("Starting model training...")

    # Frozen n-grams from the current dictionary keep known multi-word
    # keywords intact during phrase detection.
    current_keywords = load_existing_keywords()
    frozen_ngrams = extract_ngrams_from_keywords(current_keywords)
    logger.info(f"Loaded {len(frozen_ngrams)} n-grams from existing dictionary")

    # Load the fetched review corpus
    corpus = ReviewFetcher().load_all_reviews()
    if not corpus:
        logger.error("No reviews found. Run 'fetch' first.")
        return
    logger.info(f"Loaded {len(corpus)} reviews")

    # Preprocess (tokenize + phrase detection) and persist the preprocessor
    prep = Preprocessor(existing_ngrams=frozen_ngrams)
    sentences = prep.preprocess_corpus(corpus)
    prep.save()

    # Train and persist the model
    trainer = FastTextTrainer()
    trainer.train(sentences)
    trainer.save()

    logger.info("Training complete!")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def cmd_expand(args: argparse.Namespace) -> None:
    """Expand the keyword dictionary and export candidate keywords.

    Loads the existing TOPIC_KEYWORDS, the saved preprocessor state and the
    trained FastText model, then runs KeywordExpander and writes candidate
    keywords to the output directory.

    Args:
        args: Parsed CLI arguments; reads ``threshold`` and, optionally,
            ``compare`` (absent when invoked via the ``run`` subcommand).
    """
    logger.info("Starting dictionary expansion...")

    # Load components
    keywords = load_existing_keywords()

    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # Expand
    expander = KeywordExpander(
        model=model,
        existing_keywords=keywords,
        word_frequencies=preprocessor.get_word_frequencies(),
        similarity_threshold=args.threshold,
    )

    # Export candidates (with threshold in filename if requested).
    # BUG FIX: the 'run' subparser does not define --compare, so accessing
    # args.compare directly raised AttributeError when called from cmd_run;
    # default to False when the attribute is missing.
    expander.export_candidates(
        include_threshold_in_name=getattr(args, "compare", False)
    )

    # Show stats
    stats = expander.get_expansion_stats()
    logger.info(f"Expansion complete: {stats['total_candidates']} candidates")
    logger.info(f"  Auto-approved: {stats['auto_approved']}")
    logger.info(f"  Needs review: {stats['needs_review']}")
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def cmd_compare(args: argparse.Namespace) -> None:
    """Compare multiple thresholds.

    Runs the expander once per threshold, exports each candidate set with
    the threshold embedded in the filename, and prints a summary table.

    Args:
        args: Parsed CLI arguments; reads ``thresholds`` (list of floats).
    """
    logger.info("Comparing thresholds...")

    # Load components (shared across all threshold runs)
    keywords = load_existing_keywords()

    preprocessor = Preprocessor()
    try:
        preprocessor.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    thresholds = args.thresholds
    results = []

    for threshold in thresholds:
        expander = KeywordExpander(
            model=model,
            existing_keywords=keywords,
            word_frequencies=preprocessor.get_word_frequencies(),
            similarity_threshold=threshold,
        )

        # Export with threshold in name (candidates_t<threshold>.json)
        expander.export_candidates(include_threshold_in_name=True)

        stats = expander.get_expansion_stats()
        results.append((threshold, stats))

    # Print comparison table
    print("\n" + "=" * 60)
    print("THRESHOLD COMPARISON")
    print("=" * 60)
    print(f"{'Threshold':<12} {'Total':<10} {'Auto-OK':<10} {'Review':<10}")
    print("-" * 60)

    for threshold, stats in results:
        print(f"{threshold:<12.2f} {stats['total_candidates']:<10} {stats['auto_approved']:<10} {stats['needs_review']:<10}")

    print("-" * 60)
    print(f"\nOutput files saved to: {OUTPUT_DIR}/")
    print("Compare candidates_t*.json to see differences.")
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def cmd_generate(args: argparse.Namespace) -> None:
    """Generate an expanded keywords.py from the trained model.

    Args:
        args: Parsed CLI arguments; reads ``auto_approve`` (similarity
            threshold above which candidates are accepted automatically).
    """
    logger.info("Generating expanded keywords.py...")

    current_keywords = load_existing_keywords()

    # Load persisted preprocessor state (word frequencies)
    prep = Preprocessor()
    try:
        prep.load()
    except FileNotFoundError:
        logger.error("Preprocessor not found. Run 'train' first.")
        return

    # Load the trained FastText model
    trainer = FastTextTrainer()
    try:
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    # Generate the new keywords.py file
    expander = KeywordExpander(
        model=model,
        existing_keywords=current_keywords,
        word_frequencies=prep.get_word_frequencies(),
    )
    output_path = expander.generate_keywords_py(
        auto_approve_threshold=args.auto_approve,
    )
    logger.info(f"Generated: {output_path}")
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
async def cmd_run(args: argparse.Namespace) -> None:
    """Run all steps: fetch, train, expand, generate.

    Args:
        args: Parsed CLI arguments from the ``run`` subparser (``resume``,
            ``limit``, ``threshold``, ``auto_approve``).
    """
    logger.info("Running complete pipeline...")

    # BUG FIX: the 'run' subparser defines no --compare flag (only 'expand'
    # does), yet cmd_expand reads args.compare. Default it here so the
    # pipeline does not crash with AttributeError.
    if not hasattr(args, "compare"):
        args.compare = False

    # Step 1: Fetch
    await cmd_fetch(args)

    # Step 2: Train
    cmd_train(args)

    # Step 3: Expand
    cmd_expand(args)

    # Step 4: Generate
    cmd_generate(args)

    logger.info("Pipeline complete!")
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def cmd_stats(args: argparse.Namespace) -> None:
    """Show statistics for fetch progress, the trained model, and expansion.

    Each section is printed only if its backing data exists on disk.
    """
    # Fetcher stats
    fetcher = ReviewFetcher()
    fetch_stats = fetcher.get_stats()

    print("\n=== Fetch Statistics ===")
    print(f"Games configured: {fetch_stats['games_total']}")
    print(f"Games completed: {fetch_stats['games_completed']}")
    print(f"Games in progress: {fetch_stats['games_in_progress']}")
    print(f"Total reviews: {fetch_stats['reviews_total']}")

    if fetch_stats["reviews_per_game"]:
        print("\nReviews per game:")
        for name, count in sorted(fetch_stats["reviews_per_game"].items()):
            print(f"  {name}: {count}")

    # Model stats (only if a trained model exists on disk)
    model_path = MODELS_DIR / "fasttext.model"
    if model_path.exists():
        print("\n=== Model Statistics ===")
        trainer = FastTextTrainer()
        model = trainer.load()
        print(f"Vocabulary size: {len(model.wv)}")

    # Expansion stats (if available)
    candidates_path = OUTPUT_DIR / "candidates.json"
    if candidates_path.exists():
        import json
        with open(candidates_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        print("\n=== Expansion Statistics ===")
        print(f"Total candidates: {data['metadata']['total_candidates']}")
        for cat, cands in data["categories"].items():
            print(f"  {cat}: {len(cands)}")
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def cmd_similar(args: argparse.Namespace) -> None:
    """Find words similar to a given word (for manual model testing).

    Args:
        args: Parsed CLI arguments; reads ``word`` and ``topn``.
    """
    trainer = FastTextTrainer()
    try:
        # Loading populates the trainer's model state used by get_similar.
        model = trainer.load()
    except FileNotFoundError:
        logger.error("Model not found. Run 'train' first.")
        return

    results = trainer.get_similar(args.word, topn=args.topn)

    if not results:
        print(f"Word '{args.word}' not found in vocabulary")
        return

    print(f"\nWords similar to '{args.word}':")
    for candidate, score in results:
        print(f"  {candidate}: {score:.3f}")
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def main():
    """Parse CLI arguments and dispatch to the selected subcommand."""
    parser = argparse.ArgumentParser(
        description="Keyword expansion toolkit using FastText",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # fetch command
    fetch_parser = subparsers.add_parser("fetch", help="Fetch reviews from Steam")
    fetch_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume from previous progress",
    )
    fetch_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )

    # train command (no extra options; return value intentionally unused)
    subparsers.add_parser("train", help="Train FastText model")

    # expand command
    expand_parser = subparsers.add_parser("expand", help="Expand dictionary")
    expand_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    expand_parser.add_argument(
        "--compare", "-c",
        action="store_true",
        help="Include threshold in output filename (for comparison)",
    )

    # compare command
    compare_parser = subparsers.add_parser("compare", help="Compare multiple thresholds")
    compare_parser.add_argument(
        "--thresholds", "-t",
        type=float,
        nargs="+",
        default=[0.45, 0.50, 0.55, 0.60, 0.65, 0.70],
        help="Thresholds to compare (default: 0.45 0.50 0.55 0.60 0.65 0.70)",
    )

    # generate command
    generate_parser = subparsers.add_parser("generate", help="Generate keywords.py")
    generate_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # run command (all steps)
    run_parser = subparsers.add_parser("run", help="Run all steps")
    run_parser.add_argument(
        "--resume", "-r",
        action="store_true",
        help="Resume fetch from previous progress",
    )
    run_parser.add_argument(
        "--limit", "-l",
        type=int,
        default=None,
        help="Limit number of games (for testing)",
    )
    run_parser.add_argument(
        "--threshold", "-t",
        type=float,
        default=SETTINGS["similarity_threshold"],
        help=f"Similarity threshold (default: {SETTINGS['similarity_threshold']})",
    )
    run_parser.add_argument(
        "--auto-approve", "-a",
        type=float,
        default=SETTINGS["auto_approve_threshold"],
        help=f"Auto-approve threshold (default: {SETTINGS['auto_approve_threshold']})",
    )

    # stats command (no extra options)
    subparsers.add_parser("stats", help="Show statistics")

    # similar command (for testing)
    similar_parser = subparsers.add_parser("similar", help="Find similar words")
    similar_parser.add_argument("word", help="Word to find similar words for")
    similar_parser.add_argument(
        "--topn", "-n",
        type=int,
        default=20,
        help="Number of results (default: 20)",
    )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    # Dispatch tables replace the long if/elif chain; argparse's
    # dest="command" guarantees args.command is one of these names.
    async_handlers = {
        "fetch": cmd_fetch,
        "run": cmd_run,
    }
    sync_handlers = {
        "train": cmd_train,
        "expand": cmd_expand,
        "compare": cmd_compare,
        "generate": cmd_generate,
        "stats": cmd_stats,
        "similar": cmd_similar,
    }

    if args.command in async_handlers:
        asyncio.run(async_handlers[args.command](args))
    else:
        sync_handlers[args.command](args)
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
if __name__ == "__main__":
|
| 447 |
+
main()
|
scripts/expand_keywords/preprocessor.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text preprocessing with n-gram detection using gensim.Phrases.
|
| 3 |
+
|
| 4 |
+
Pipeline:
|
| 5 |
+
1. Tokenization (jieba for Chinese, regex for English/mixed)
|
| 6 |
+
2. Build Phrases models (bigrams, trigrams)
|
| 7 |
+
3. Apply frozen n-grams from existing dictionary
|
| 8 |
+
4. Apply detected phrases
|
| 9 |
+
|
| 10 |
+
This ensures that multi-word concepts like "帧率" or "加载画面"
|
| 11 |
+
are treated as single tokens during FastText training.
|
| 12 |
+
|
| 13 |
+
For Chinese text:
|
| 14 |
+
- Uses jieba for word segmentation (Chinese has no spaces)
|
| 15 |
+
- Keeps English words intact (common in gaming reviews: fps, bug, dlc)
|
| 16 |
+
- Removes punctuation but preserves Chinese characters
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import logging
|
| 20 |
+
import pickle
|
| 21 |
+
import re
|
| 22 |
+
from collections import Counter
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
import jieba
|
| 26 |
+
from gensim.models import Phrases
|
| 27 |
+
from gensim.models.phrases import Phraser
|
| 28 |
+
|
| 29 |
+
from .config import MODELS_DIR, SETTINGS
|
| 30 |
+
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class Preprocessor:
|
| 35 |
+
"""
|
| 36 |
+
Text preprocessor with n-gram detection.
|
| 37 |
+
|
| 38 |
+
Uses gensim Phrases for automatic phrase detection plus
|
| 39 |
+
frozen n-grams from the existing keyword dictionary.
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
    def __init__(self, existing_ngrams: list[str] | None = None):
        """
        Initialize preprocessor.

        Args:
            existing_ngrams: Multi-word phrases from existing keywords.py
                (e.g., "frame rate", "loading screen"). Only entries
                containing a space are kept; they are "frozen" so phrase
                detection can never split them.
        """
        # Frozen n-grams stored as lowercase token tuples for O(1) lookup.
        self.frozen_ngrams: set[tuple[str, ...]] = set()
        if existing_ngrams:
            self.frozen_ngrams = self._normalize_ngrams(existing_ngrams)
            logger.info(f"Loaded {len(self.frozen_ngrams)} frozen n-grams")

        # Gensim phrase models; populated by build_phrase_models().
        self.bigram_model: Phraser | None = None
        self.trigram_model: Phraser | None = None
        # Token frequency counter (presumably filled during corpus
        # preprocessing — confirm against preprocess_corpus).
        self.word_frequencies: Counter = Counter()
|
| 58 |
+
|
| 59 |
+
def _normalize_ngrams(self, ngrams: list[str]) -> set[tuple[str, ...]]:
|
| 60 |
+
"""Convert n-grams to lowercase tuple format for fast lookup."""
|
| 61 |
+
result = set()
|
| 62 |
+
for ng in ngrams:
|
| 63 |
+
if " " in ng:
|
| 64 |
+
tokens = tuple(ng.lower().split())
|
| 65 |
+
result.add(tokens)
|
| 66 |
+
return result
|
| 67 |
+
|
| 68 |
+
    def tokenize(self, text: str) -> list[str]:
        """
        Tokenization for Chinese/mixed text using jieba.

        - Uses jieba for Chinese word segmentation
        - Keeps English words intact (common in gaming: fps, bug, dlc)
        - Removes punctuation (both Chinese and English)
        - Lowercases English text

        Args:
            text: Raw review text (Chinese, English, or mixed).

        Returns:
            List of non-empty, stripped tokens.
        """
        # Remove URLs
        text = re.sub(r'https?://\S+', ' ', text)

        # Remove punctuation (Chinese and English) but keep Chinese chars and alphanumeric.
        # CJK Unified Ideographs (U+4E00-U+9FFF) and Extension A (U+3400-U+4DBF) are preserved.
        text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbfa-zA-Z0-9\s]', ' ', text)

        # Lowercase English text
        text = text.lower()

        # Use jieba to segment Chinese text;
        # jieba handles mixed Chinese/English text well
        tokens = list(jieba.cut(text))

        # Filter: drop empty strings and whitespace-only tokens
        tokens = [t.strip() for t in tokens if t.strip()]

        return tokens
|
| 95 |
+
|
| 96 |
+
    def build_phrase_models(
        self,
        corpus: list[list[str]],
        min_count: int | None = None,
        threshold: float | None = None,
    ) -> None:
        """
        Build Phrases models for automatic n-gram detection.

        Args:
            corpus: List of tokenized documents
            min_count: Minimum phrase occurrences (default from settings)
            threshold: Scoring threshold (higher = fewer phrases)
        """
        min_count = min_count or SETTINGS["phrase_min_count"]
        threshold = threshold or SETTINGS["phrase_threshold"]

        logger.info(f"Building phrase models (min_count={min_count}, threshold={threshold})")

        # Build bigram model: "frame rate" -> "frame_rate"
        bigram_phrases = Phrases(
            corpus,
            min_count=min_count,
            threshold=threshold,
            delimiter="_",
        )
        self.bigram_model = Phraser(bigram_phrases)

        # Apply bigrams to create the input corpus for trigram detection
        bigram_corpus = [self.bigram_model[doc] for doc in corpus]

        # Build trigram model: "dark_souls like" -> "dark_souls_like"
        trigram_phrases = Phrases(
            bigram_corpus,
            min_count=min_count,
            threshold=threshold,
            delimiter="_",
        )
        self.trigram_model = Phraser(trigram_phrases)

        # Log how many phrases were detected at each level
        bigram_count = len(bigram_phrases.export_phrases())
        trigram_count = len(trigram_phrases.export_phrases())
        logger.info(f"Detected {bigram_count} bigrams, {trigram_count} trigrams")
|
| 140 |
+
|
| 141 |
+
def _apply_frozen_ngrams(self, tokens: list[str]) -> list[str]:
|
| 142 |
+
"""
|
| 143 |
+
Apply frozen n-grams from existing dictionary.
|
| 144 |
+
|
| 145 |
+
These are always joined, even if not detected by Phrases.
|
| 146 |
+
"""
|
| 147 |
+
result = []
|
| 148 |
+
i = 0
|
| 149 |
+
|
| 150 |
+
while i < len(tokens):
|
| 151 |
+
matched = False
|
| 152 |
+
|
| 153 |
+
# Try trigrams first (longer matches preferred)
|
| 154 |
+
if i + 2 < len(tokens):
|
| 155 |
+
trigram = (tokens[i], tokens[i + 1], tokens[i + 2])
|
| 156 |
+
if trigram in self.frozen_ngrams:
|
| 157 |
+
result.append("_".join(trigram))
|
| 158 |
+
i += 3
|
| 159 |
+
matched = True
|
| 160 |
+
|
| 161 |
+
# Try bigrams
|
| 162 |
+
if not matched and i + 1 < len(tokens):
|
| 163 |
+
bigram = (tokens[i], tokens[i + 1])
|
| 164 |
+
if bigram in self.frozen_ngrams:
|
| 165 |
+
result.append("_".join(bigram))
|
| 166 |
+
i += 2
|
| 167 |
+
matched = True
|
| 168 |
+
|
| 169 |
+
if not matched:
|
| 170 |
+
result.append(tokens[i])
|
| 171 |
+
i += 1
|
| 172 |
+
|
| 173 |
+
return result
|
| 174 |
+
|
| 175 |
+
def apply_phrases(self, tokens: list[str]) -> list[str]:
    """
    Merge multi-word phrases in a token list.

    Order of application:
    1. Frozen n-grams (seeded from the existing dictionary)
    2. Learned Phrases models (bigrams, then trigrams), when built
    """
    phrased = self._apply_frozen_ngrams(tokens)

    # Each model may be None when nothing has been trained/loaded yet.
    for model in (self.bigram_model, self.trigram_model):
        if model:
            phrased = list(model[phrased])

    return phrased
|
| 193 |
+
|
| 194 |
+
def preprocess_corpus(
    self,
    reviews: list[str],
    build_phrases: bool = True,
) -> list[list[str]]:
    """
    Run the full preprocessing pipeline.

    Args:
        reviews: Raw review texts
        build_phrases: Whether to build phrase models (skip if loading)

    Returns:
        List of tokenized documents with phrases applied
    """
    logger.info(f"Preprocessing {len(reviews)} reviews...")

    # Step 1: tokenize every review.
    docs = [self.tokenize(text) for text in reviews]
    logger.info("Tokenization complete")

    # Step 2: optionally learn the phrase models from this corpus.
    if build_phrases:
        self.build_phrase_models(docs)

    # Step 3: merge phrases and accumulate word frequencies.
    phrased_docs = []
    for doc in docs:
        with_phrases = self.apply_phrases(doc)
        self.word_frequencies.update(with_phrases)
        phrased_docs.append(with_phrases)

    logger.info(f"Vocabulary size: {len(self.word_frequencies)}")
    return phrased_docs
|
| 228 |
+
|
| 229 |
+
def get_word_frequencies(self) -> dict[str, int]:
    """Return the accumulated word-frequency counts as a plain dict."""
    return {word: count for word, count in self.word_frequencies.items()}
|
| 232 |
+
|
| 233 |
+
def save(self, path: Path | None = None) -> None:
    """Persist preprocessor state (phrase models, n-grams, frequencies)."""
    target = path or MODELS_DIR / "preprocessor.pkl"

    state = {
        "frozen_ngrams": self.frozen_ngrams,
        "bigram_model": self.bigram_model,
        "trigram_model": self.trigram_model,
        "word_frequencies": self.word_frequencies,
    }

    with open(target, "wb") as fh:
        pickle.dump(state, fh)

    logger.info(f"Saved preprocessor to {target}")
|
| 248 |
+
|
| 249 |
+
def load(self, path: Path | None = None) -> None:
    """Restore preprocessor state saved by :meth:`save`."""
    source = path or MODELS_DIR / "preprocessor.pkl"

    if not source.exists():
        raise FileNotFoundError(f"Preprocessor not found at {source}")

    with open(source, "rb") as fh:
        state = pickle.load(fh)

    self.frozen_ngrams = state["frozen_ngrams"]
    self.bigram_model = state["bigram_model"]
    self.trigram_model = state["trigram_model"]
    self.word_frequencies = state["word_frequencies"]

    logger.info(f"Loaded preprocessor from {source}")
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def extract_ngrams_from_keywords(keywords: dict[str, list[str]]) -> list[str]:
    """
    Extract multi-word phrases from keywords dictionary.

    Args:
        keywords: TOPIC_KEYWORDS dictionary from keywords.py

    Returns:
        List of multi-word phrases (e.g., ["frame rate", "loading screen"])
    """
    # A phrase is any keyword containing a space.
    return [
        phrase
        for category_words in keywords.values()
        for phrase in category_words
        if " " in phrase
    ]
|
scripts/expand_keywords/trainer.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastText model training.
|
| 3 |
+
|
| 4 |
+
FastText is preferred over Word2Vec because:
|
| 5 |
+
- Better handling of typos and misspellings (common in reviews)
|
| 6 |
+
- Can generate vectors for out-of-vocabulary words
|
| 7 |
+
- Uses character n-grams internally
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
from gensim.models import FastText
|
| 14 |
+
|
| 15 |
+
from .config import MODELS_DIR, SETTINGS
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class FastTextTrainer:
    """
    Trains FastText word embeddings on review corpus.

    FastText's character n-grams make it robust to typos and able to
    produce vectors for out-of-vocabulary words.
    """

    def __init__(
        self,
        vector_size: int | None = None,
        window: int | None = None,
        min_count: int | None = None,
        epochs: int | None = None,
        workers: int | None = None,
    ):
        """
        Initialize trainer with hyperparameters.

        Args:
            vector_size: Dimensionality of word vectors
            window: Context window size
            min_count: Minimum word frequency
            epochs: Number of training iterations
            workers: Number of worker threads

        Any parameter left as None falls back to its SETTINGS default.
        """
        # Explicit None checks: the previous "value or SETTINGS[...]"
        # pattern silently replaced falsy overrides with the defaults.
        self.vector_size = vector_size if vector_size is not None else SETTINGS["fasttext_vector_size"]
        self.window = window if window is not None else SETTINGS["fasttext_window"]
        self.min_count = min_count if min_count is not None else SETTINGS["fasttext_min_count"]
        self.epochs = epochs if epochs is not None else SETTINGS["fasttext_epochs"]
        self.workers = workers if workers is not None else SETTINGS["fasttext_workers"]

        # Set by train()/load(); None until then.
        self.model: FastText | None = None

    def train(self, sentences: list[list[str]]) -> FastText:
        """
        Train FastText model on tokenized sentences.

        Args:
            sentences: List of tokenized documents (output from preprocessor)

        Returns:
            Trained FastText model
        """
        logger.info(
            f"Training FastText model: "
            f"vector_size={self.vector_size}, window={self.window}, "
            f"min_count={self.min_count}, epochs={self.epochs}"
        )
        logger.info(f"Training on {len(sentences)} documents")

        self.model = FastText(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            epochs=self.epochs,
            workers=self.workers,
            sg=1,  # Skip-gram (better for semantic similarity)
            min_n=3,  # Minimum character n-gram length
            max_n=6,  # Maximum character n-gram length
        )

        vocab_size = len(self.model.wv)
        logger.info(f"Training complete. Vocabulary size: {vocab_size}")

        return self.model

    def save(self, path: Path | str | None = None) -> Path:
        """
        Save trained model.

        Args:
            path: Save path (default: models/fasttext.model)

        Returns:
            Path where model was saved

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model to save. Train first.")

        path = Path(path) if path else MODELS_DIR / "fasttext.model"
        self.model.save(str(path))
        logger.info(f"Saved model to {path}")
        return path

    def load(self, path: Path | str | None = None) -> FastText:
        """
        Load model from file.

        Args:
            path: Model path (default: models/fasttext.model)

        Returns:
            Loaded FastText model

        Raises:
            FileNotFoundError: If no model file exists at the path.
        """
        path = Path(path) if path else MODELS_DIR / "fasttext.model"

        if not path.exists():
            raise FileNotFoundError(f"Model not found at {path}")

        self.model = FastText.load(str(path))
        vocab_size = len(self.model.wv)
        logger.info(f"Loaded model from {path}. Vocabulary size: {vocab_size}")
        return self.model

    def get_similar(
        self,
        word: str,
        topn: int = 10,
    ) -> list[tuple[str, float]]:
        """
        Get most similar words to a given word.

        Args:
            word: Query word (spaces are normalized to underscores)
            topn: Number of results

        Returns:
            List of (word, similarity) tuples; empty list when the word
            cannot be resolved.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        # Normalize word (space to underscore for phrases)
        word_normalized = word.lower().replace(" ", "_")

        try:
            return self.model.wv.most_similar(word_normalized, topn=topn)
        except KeyError:
            logger.warning(f"Word '{word}' not in vocabulary")
            return []

    def get_similarity(self, word1: str, word2: str) -> float:
        """
        Get similarity between two words.

        Args:
            word1: First word
            word2: Second word

        Returns:
            Cosine similarity (-1 to 1); 0.0 when either word is missing.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("No model loaded. Train or load first.")

        # Same phrase normalization as get_similar.
        w1 = word1.lower().replace(" ", "_")
        w2 = word2.lower().replace(" ", "_")

        try:
            return float(self.model.wv.similarity(w1, w2))
        except KeyError as e:
            logger.warning(f"Word not in vocabulary: {e}")
            return 0.0

    def word_in_vocab(self, word: str) -> bool:
        """Check if word (normalized like get_similar) is in vocabulary."""
        if self.model is None:
            return False

        word_normalized = word.lower().replace(" ", "_")
        return word_normalized in self.model.wv

    def get_vocab_words(self) -> list[str]:
        """Get all words in vocabulary (empty when no model is loaded)."""
        if self.model is None:
            return []
        return list(self.model.wv.key_to_index.keys())
|