Spaces:
Running
Running
| """ | |
| Text preprocessing with n-gram detection using gensim.Phrases. | |
| Pipeline: | |
| 1. Tokenization (jieba for Chinese, regex for English/mixed) | |
| 2. Build Phrases models (bigrams, trigrams) | |
| 3. Apply frozen n-grams from existing dictionary | |
| 4. Apply detected phrases | |
| This ensures that multi-word concepts like "帧率" or "加载画面" | |
| are treated as single tokens during FastText training. | |
| For Chinese text: | |
| - Uses jieba for word segmentation (Chinese has no spaces) | |
| - Keeps English words intact (common in gaming reviews: fps, bug, dlc) | |
| - Removes punctuation but preserves Chinese characters | |
| """ | |
| import logging | |
| import pickle | |
| import re | |
| from collections import Counter | |
| from pathlib import Path | |
| import jieba | |
| from gensim.models import Phrases | |
| from gensim.models.phrases import Phraser | |
| from .config import MODELS_DIR, SETTINGS | |
| logger = logging.getLogger(__name__) | |
| class Preprocessor: | |
| """ | |
| Text preprocessor with n-gram detection. | |
| Uses gensim Phrases for automatic phrase detection plus | |
| frozen n-grams from the existing keyword dictionary. | |
| """ | |
| def __init__(self, existing_ngrams: list[str] | None = None): | |
| """ | |
| Initialize preprocessor. | |
| Args: | |
| existing_ngrams: Multi-word phrases from existing keywords.py | |
| (e.g., "frame rate", "loading screen") | |
| """ | |
| self.frozen_ngrams: set[tuple[str, ...]] = set() | |
| if existing_ngrams: | |
| self.frozen_ngrams = self._normalize_ngrams(existing_ngrams) | |
| logger.info(f"Loaded {len(self.frozen_ngrams)} frozen n-grams") | |
| self.bigram_model: Phraser | None = None | |
| self.trigram_model: Phraser | None = None | |
| self.word_frequencies: Counter = Counter() | |
| def _normalize_ngrams(self, ngrams: list[str]) -> set[tuple[str, ...]]: | |
| """Convert n-grams to lowercase tuple format for fast lookup.""" | |
| result = set() | |
| for ng in ngrams: | |
| if " " in ng: | |
| tokens = tuple(ng.lower().split()) | |
| result.add(tokens) | |
| return result | |
| def tokenize(self, text: str) -> list[str]: | |
| """ | |
| Tokenization for Chinese/mixed text using jieba. | |
| - Uses jieba for Chinese word segmentation | |
| - Keeps English words intact (common in gaming: fps, bug, dlc) | |
| - Removes punctuation (both Chinese and English) | |
| - Lowercases English text | |
| """ | |
| # Remove URLs | |
| text = re.sub(r'https?://\S+', ' ', text) | |
| # Remove punctuation (Chinese and English) but keep Chinese chars and alphanumeric | |
| # Chinese punctuation: 。!?,、;:""''()【】《》 | |
| text = re.sub(r'[^\u4e00-\u9fff\u3400-\u4dbfa-zA-Z0-9\s]', ' ', text) | |
| # Lowercase English text | |
| text = text.lower() | |
| # Use jieba to segment Chinese text | |
| # jieba handles mixed Chinese/English text well | |
| tokens = list(jieba.cut(text)) | |
| # Filter: remove empty strings and single spaces | |
| tokens = [t.strip() for t in tokens if t.strip()] | |
| return tokens | |
| def build_phrase_models( | |
| self, | |
| corpus: list[list[str]], | |
| min_count: int | None = None, | |
| threshold: float | None = None, | |
| ) -> None: | |
| """ | |
| Build Phrases models for automatic n-gram detection. | |
| Args: | |
| corpus: List of tokenized documents | |
| min_count: Minimum phrase occurrences (default from settings) | |
| threshold: Scoring threshold (higher = fewer phrases) | |
| """ | |
| min_count = min_count or SETTINGS["phrase_min_count"] | |
| threshold = threshold or SETTINGS["phrase_threshold"] | |
| logger.info(f"Building phrase models (min_count={min_count}, threshold={threshold})") | |
| # Build bigram model: "frame rate" -> "frame_rate" | |
| bigram_phrases = Phrases( | |
| corpus, | |
| min_count=min_count, | |
| threshold=threshold, | |
| delimiter="_", | |
| ) | |
| self.bigram_model = Phraser(bigram_phrases) | |
| # Apply bigramy to create input for trigram detection | |
| bigram_corpus = [self.bigram_model[doc] for doc in corpus] | |
| # Build trigram model: "dark_souls like" -> "dark_souls_like" | |
| trigram_phrases = Phrases( | |
| bigram_corpus, | |
| min_count=min_count, | |
| threshold=threshold, | |
| delimiter="_", | |
| ) | |
| self.trigram_model = Phraser(trigram_phrases) | |
| # Log detected phrases | |
| bigram_count = len(bigram_phrases.export_phrases()) | |
| trigram_count = len(trigram_phrases.export_phrases()) | |
| logger.info(f"Detected {bigram_count} bigrams, {trigram_count} trigrams") | |
| def _apply_frozen_ngrams(self, tokens: list[str]) -> list[str]: | |
| """ | |
| Apply frozen n-grams from existing dictionary. | |
| These are always joined, even if not detected by Phrases. | |
| """ | |
| result = [] | |
| i = 0 | |
| while i < len(tokens): | |
| matched = False | |
| # Try trigrams first (longer matches preferred) | |
| if i + 2 < len(tokens): | |
| trigram = (tokens[i], tokens[i + 1], tokens[i + 2]) | |
| if trigram in self.frozen_ngrams: | |
| result.append("_".join(trigram)) | |
| i += 3 | |
| matched = True | |
| # Try bigrams | |
| if not matched and i + 1 < len(tokens): | |
| bigram = (tokens[i], tokens[i + 1]) | |
| if bigram in self.frozen_ngrams: | |
| result.append("_".join(bigram)) | |
| i += 2 | |
| matched = True | |
| if not matched: | |
| result.append(tokens[i]) | |
| i += 1 | |
| return result | |
| def apply_phrases(self, tokens: list[str]) -> list[str]: | |
| """ | |
| Apply phrase models and frozen n-grams to tokens. | |
| Order: | |
| 1. Frozen n-grams (from existing dictionary) | |
| 2. Automatic Phrases (bigrams then trigrams) | |
| """ | |
| # Apply frozen n-grams first | |
| tokens = self._apply_frozen_ngrams(tokens) | |
| # Apply automatic phrase models | |
| if self.bigram_model: | |
| tokens = list(self.bigram_model[tokens]) | |
| if self.trigram_model: | |
| tokens = list(self.trigram_model[tokens]) | |
| return tokens | |
| def preprocess_corpus( | |
| self, | |
| reviews: list[str], | |
| build_phrases: bool = True, | |
| ) -> list[list[str]]: | |
| """ | |
| Full preprocessing pipeline. | |
| Args: | |
| reviews: Raw review texts | |
| build_phrases: Whether to build phrase models (skip if loading) | |
| Returns: | |
| List of tokenized documents with phrases applied | |
| """ | |
| logger.info(f"Preprocessing {len(reviews)} reviews...") | |
| # Step 1: Tokenize all reviews | |
| tokenized = [self.tokenize(review) for review in reviews] | |
| logger.info("Tokenization complete") | |
| # Step 2: Build phrase models | |
| if build_phrases: | |
| self.build_phrase_models(tokenized) | |
| # Step 3: Apply phrases and count frequencies | |
| processed = [] | |
| for tokens in tokenized: | |
| phrased = self.apply_phrases(tokens) | |
| processed.append(phrased) | |
| self.word_frequencies.update(phrased) | |
| logger.info(f"Vocabulary size: {len(self.word_frequencies)}") | |
| return processed | |
| def get_word_frequencies(self) -> dict[str, int]: | |
| """Get word frequency dictionary.""" | |
| return dict(self.word_frequencies) | |
| def save(self, path: Path | None = None) -> None: | |
| """Save preprocessor state (phrase models, frequencies).""" | |
| path = path or MODELS_DIR / "preprocessor.pkl" | |
| data = { | |
| "frozen_ngrams": self.frozen_ngrams, | |
| "bigram_model": self.bigram_model, | |
| "trigram_model": self.trigram_model, | |
| "word_frequencies": self.word_frequencies, | |
| } | |
| with open(path, "wb") as f: | |
| pickle.dump(data, f) | |
| logger.info(f"Saved preprocessor to {path}") | |
| def load(self, path: Path | None = None) -> None: | |
| """Load preprocessor state.""" | |
| path = path or MODELS_DIR / "preprocessor.pkl" | |
| if not path.exists(): | |
| raise FileNotFoundError(f"Preprocessor not found at {path}") | |
| with open(path, "rb") as f: | |
| data = pickle.load(f) | |
| self.frozen_ngrams = data["frozen_ngrams"] | |
| self.bigram_model = data["bigram_model"] | |
| self.trigram_model = data["trigram_model"] | |
| self.word_frequencies = data["word_frequencies"] | |
| logger.info(f"Loaded preprocessor from {path}") | |
def extract_ngrams_from_keywords(keywords: dict[str, list[str]]) -> list[str]:
    """
    Extract multi-word phrases from keywords dictionary.

    Args:
        keywords: TOPIC_KEYWORDS dictionary from keywords.py

    Returns:
        List of multi-word phrases (e.g., ["frame rate", "loading screen"]),
        in dictionary iteration order. Single-word keywords are skipped.
    """
    # A space in the keyword marks it as a multi-word phrase.
    return [
        word
        for category_words in keywords.values()
        for word in category_words
        if " " in word
    ]