"""
preprocessing.py — Mongolian NLP preprocessing pipeline.

Two distinct modes, called from the analysis router on the SAME raw text:

    preprocess_nlp(text)  → for NER + Sentiment Analysis
        Goal: give BERT maximum linguistic context.
        Keeps punctuation, restores capitalisation, protects name structure.
        Does NOT remove stopwords — grammar words help sentiment polarity.

    preprocess_tm(text)   → for Topic Modeling (BERTopic)
        Goal: give BERTopic clean content-bearing tokens only.
        Aggressive: lowercase, strip all punctuation, remove stopwords.
        Keeps compound name hyphens as single tokens (бат-эрдэнэ).

Changes from the original:
    - protect_names() now handles BOTH uppercase (А.Бат) and lowercase
      (б.амар) social-media initials, and handles compound surnames with
      hyphens (А.Бат-Эрдэнэ)
    - clean_basic() now removes hashtags/mentions and BMP symbol/emoji
      blocks (U+20D0-U+2BFF etc.) before deep cleaning — the original
      passed these through to clean_deep where they were handled
      inconsistently
    - clean_deep() regex narrowed — the original [А-Яа-яӨөҮүЁё-]+ allowed a
      trailing hyphen to absorb the next word. Name protection now happens
      in clean_basic (via _protect_names) so clean_deep never sees raw
      А.Бат forms at all
    - capitalize_for_ner() is a new function that restores sentence-start
      capitals and capitalises the initial letter in lowercase name
      patterns, fixing the core problem where б.амар wouldn't be tagged
      as PER
    - remove_stopwords() now also filters single-character tokens
      (д, т, н etc.) and matches multi-word stopwords ("та нар") as phrases
    - preprocess_dual() added — returns both NLP and TM forms in one call
    - add_stopwords() added — lets main.py inject KB stopwords at startup
    - zero-width / directional format characters (ZWJ etc.) are stripped
      together with emoji so emoji sequences leave no invisible residue
    - clean_deep() drops hyphens/underscores that do not join two
      alphanumeric characters ("--", "бат-", "___")
"""

import re
import unicodedata
from typing import Dict, Iterable, List, Optional, Set, Tuple

# ---------------------------------------------------------------------------
# Compiled patterns
# ---------------------------------------------------------------------------

MONGOLIAN_PATTERN = re.compile(r"[А-Яа-яӨөҮүЁё]")

# Case-insensitive so that HTTPS://… and WWW.… are recognised as URLs too.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)

HASHTAG_MENTION = re.compile(r"[@#]\S+")

# BMP symbol/emoji blocks to remove.
# Intentionally EXCLUDES:
#   U+2000-U+206F  General Punctuation (— – … " " ' •)
#   U+20A0-U+20CF  Currency Symbols (₮)
# because the original range \u2000-\u27FF was removing these.
BMP_EMOJI = re.compile(
    r"[\u20D0-\u20FF"   # Combining Diacritical Marks for Symbols (⃣ base)
    r"\u2100-\u27FF"    # Symbol blocks: letterlike → dingbats
    r"\u2900-\u2BFF"    # Supplemental Arrows, Misc Math Symbols
    r"\uFE00-\uFEFF"    # Variation Selectors, specials
    r"\uFF00-\uFFEF]"   # Halfwidth/Fullwidth Forms
)

SUPPLEMENTARY_EMOJI = re.compile(r"[\U00010000-\U0010FFFF]")

# Invisible format characters that live inside the (deliberately preserved)
# General Punctuation block: zero-width space/joiners, directional marks and
# embeddings, word joiner and invisible operators. They glue emoji sequences
# together (U+200D) and are not matched by \s, so without this pattern they
# would survive emoji removal and whitespace normalisation.
INVISIBLE_FORMAT = re.compile(r"[\u200B-\u200F\u202A-\u202E\u2060-\u2064]")

# Sentiment-bearing emoji → neutral text markers (NLP mode only).
# [LAUGH] is intentionally ambiguous: 😂/🤣 are frequently sarcastic in
# Mongolian social media. Replacing with a literal sentiment word would be
# wrong half the time. Instead we pass [LAUGH] to BERT and let it infer
# polarity from the surrounding tokens.
EMOJI_SENTIMENT: Dict[str, str] = {
    # Ambiguous laughing
    "😂": "[LAUGH]",
    "🤣": "[LAUGH]",
    "😅": "[LAUGH]",
    # Positive — love / warmth
    "❤": "[LOVE]",
    "🥰": "[LOVE]",
    "😍": "[LOVE]",
    "💕": "[LOVE]",
    "💗": "[LOVE]",
    "💖": "[LOVE]",
    "💝": "[LOVE]",
    # Positive — excitement / energy (🔥✨🤩 dominate your dataset)
    "🔥": "[EXCITED]",
    "✨": "[EXCITED]",
    "🤩": "[EXCITED]",
    "🎉": "[EXCITED]",
    "👏": "[EXCITED]",
    # Positive — approval / gratitude
    "👍": "[POSITIVE]",
    "🙏": "[GRATEFUL]",
    # Negative — anger
    "😡": "[ANGRY]",
    "🤬": "[ANGRY]",
    "😤": "[ANGRY]",
    # Negative — sadness
    "😢": "[SAD]",
    "😭": "[SAD]",
    "💔": "[SAD]",
    # Negative — disapproval
    "👎": "[NEGATIVE]",
}

# Uppercase Mongolian initial: А.Бат-Эрдэнэ, Б.Сувдаа
MN_NAME_UPPER = re.compile(
    r"\b([А-ЯӨҮЁ])\.\s*"
    r"([А-Яа-яӨөҮүЁё][а-яөүёa-z]+"
    r"(?:-[А-Яа-яӨөҮүЁё][а-яөүёa-z]+)*)"
)

# Lowercase initial: б.амар, о.батзориг (very common in social media)
# ORIGINAL CODE HAD NO HANDLING FOR THIS — it only matched [А-ЯӨҮЁ]
# NOTE(review): \s* also lets a sentence-final one-letter word followed by a
# lowercase sentence ("… ч. тэгээд") match as a name — confirm this trade-off
# is acceptable for the dataset.
MN_NAME_LOWER = re.compile(
    r"\b([а-яөүё])\.\s*"
    r"([а-яөүёa-z]+"
    r"(?:-[а-яөүёa-z]+)*)"
)

# Protected form А_Бат-Эрдэнэ — matched by _restore_names()
MN_NAME_PROTECTED = re.compile(
    r"\b([А-ЯӨҮЁ])_([А-Яа-яӨөҮүЁё][а-яөүёa-z]+(?:-[А-Яа-яӨөҮүЁё][а-яөүёa-z]+)*)\b"
)

# Sentence break: whitespace after . ! ? that is followed by ANY Cyrillic
# letter (upper or lower — the U+0400-U+04FF block covers both) or by an
# uppercase Latin letter.
SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[\u0400-\u04FFA-Z])")

# Protected token whose name part still starts lowercase: Б_амар
_PROTECTED_LOWER_NAME = re.compile(r"([А-ЯӨҮЁ])_([а-яөүё])([а-яөүё]*)")

# Lowercased protected token in TM mode: а_бат-эрдэнэ
_TM_INITIAL_PREFIX = re.compile(
    r"\b[а-яөүё]_([а-яөүёa-z]+(?:-[а-яөүёa-z]+)*)\b"
)

# Everything outside Latin, Mongolian Cyrillic, digits, whitespace, _ and -
_NON_WHITELISTED = re.compile(r"[^A-Za-zА-Яа-яӨөҮүЁё0-9\s_\-]")

# Runs of - or _ that are NOT flanked by an alphanumeric on both sides
# ([^\W_] = "word character that is not underscore").
_DANGLING_JOINER = re.compile(r"(?<![^\W_])[-_]+|[-_]+(?![^\W_])")

_WHITESPACE = re.compile(r"\s+")

# ---------------------------------------------------------------------------
# Stopwords
# ---------------------------------------------------------------------------

# Entries containing a space ("ш дээ", "та нар") are phrases; they are
# matched as consecutive tokens by Preprocessor._remove_stopwords().
MONGOLIAN_STOPWORDS: Set[str] = {
    "ба", "бас", "бол", "бөгөөд", "байна", "байгаа", "байсан", "бсан",
    "бхаа", "бн", "бна", "байх", "юм", "биш", "бгаа", "бдаг", "байдаг",
    "бхоо", "бх",
    "энэ", "тэр", "эдгээр", "тэдгээр", "үүн", "үүнд", "үүнээс", "үүний",
    "үүнтэй", "түүн", "түүнд", "түүнээс", "түүний", "түүнтэй", "тийм",
    "ийм", "чинь", "минь", "билээ", "шүү",
    "би", "чи", "та", "бид", "тэд", "миний", "чиний", "таны", "бидний",
    "тэдний", "над", "надад", "надаас", "чамд", "чамаас", "танд", "танаас",
    "өөр", "өөрөө", "өөрийн", "өөрт", "өөрөөс",
    "гэж", "гэх", "гэсэн", "гэжээ", "гэв", "гэвч", "гээд", "гнээ", "гэнэ",
    "гээ",
    "л", "ч", "уу", "үү", "юу", "яаж", "яагаад", "хаана", "хэзээ", "хэн",
    "ямар", "ямарч", "яах", "вэ", "бэ", "бээ",
    "болон", "мөн", "эсвэл", "гэхдээ", "харин",
    "дээр", "доор", "дотор", "гадна", "хойно", "өмнө", "руу", "рүү",
    "аас", "ээс", "оос", "өөс", "тай", "тэй", "той", "д", "т", "нь",
    "аа", "ээ", "оо", "өө",
    "бай", "болно", "болох", "болсон",
    "их", "бага", "маш", "тун", "нэлээд", "шиг",
    "шд", "н", "шдэ", "шдээ", "шт", "штэ", "штээ", "ш дээ", "ш тээ",
    "бз", "биз", "дээ", "даа", "юмаа", "аан", "хө", "тэ", "тээ", "гш",
    "ммхн", "сдаа", "сда", "хаха", "кк",
    "хийх", "авах", "өгөх", "очих", "ирэх",
    "ын", "ийн", "ний", "ийг", "ууд", "үүд",
    "та нар",
}


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _normalize_unicode(text: str) -> str:
    """NFC-normalize so visually identical Mongolian characters match regexes."""
    return unicodedata.normalize("NFC", text)


def _normalise_stopwords(words: Optional[Iterable[str]]) -> Set[str]:
    """
    Lowercase and whitespace-normalise externally supplied stopwords.

    A bare string is treated as ONE stopword (iterating it would add every
    character separately). Non-string and blank entries are skipped.
    Internal whitespace is collapsed so phrases compare token by token.
    """
    if not words:
        return set()
    if isinstance(words, str):
        words = [words]
    cleaned = (
        " ".join(w.lower().split()) for w in words if isinstance(w, str)
    )
    return {w for w in cleaned if w}


def _remove_emoji(text: str, convert_sentiment: bool = False) -> str:
    """
    Remove BMP symbol blocks, supplementary-plane emoji and invisible
    format characters.

    If convert_sentiment=True (NLP mode): replace known sentiment emoji with
    text markers BEFORE stripping so the signal survives into BERT.
    Conversion must happen first because the regex replacements below would
    otherwise erase the emoji before we can read them.
    """
    if convert_sentiment:
        for emoji, marker in EMOJI_SENTIMENT.items():
            text = text.replace(emoji, f" {marker} ")
    text = BMP_EMOJI.sub(" ", text)
    text = SUPPLEMENTARY_EMOJI.sub(" ", text)
    # Zero-width joiners etc. carry no content; drop them outright so a
    # removed emoji sequence (👨‍👩‍👧) collapses to plain whitespace.
    text = INVISIBLE_FORMAT.sub("", text)
    return text


def _protect_names(text: str) -> str:
    r"""
    Convert Mongolian name patterns to protected underscore form before any
    cleaning strips the dots or hyphens.

    WHY UNDERSCORE:
        The character-whitelist in clean_deep preserves [_] and [-], so both
        the initial-name join AND the compound-surname hyphen survive.

    WHAT CHANGED FROM ORIGINAL:
        Original clean_deep had:
            re.sub(r'\b([А-ЯӨҮЁ])\.\s*([А-Яа-яӨөҮүЁё-]+)', r'\1_\2', text)
        Problems:
        1. Only runs INSIDE clean_deep, after clean_basic already ran.
           If the text came through preprocess_nlp (which never calls
           clean_deep), name dots were never protected.
        2. [А-Яа-яӨөҮүЁё-]+ has a trailing hyphen IN the character class
           which means it matches the hyphen character anywhere, including
           at the end of a word, potentially absorbing the next token.
        3. Only matched uppercase initials — б.амар was completely missed.

        This version runs _protect_names() inside clean_basic() so it fires
        for BOTH NLP and TM pipelines, before any stripping occurs.
    """

    def _title_parts(name: str) -> str:
        # Capitalise every hyphen-separated component: бат-эрдэнэ → Бат-Эрдэнэ
        return "-".join(part.capitalize() for part in name.split("-"))

    def _replace_upper(m: "re.Match") -> str:
        return f"{m.group(1)}_{_title_parts(m.group(2))}"

    def _replace_lower(m: "re.Match") -> str:
        # Capitalize the single-letter initial and each name part
        return f"{m.group(1).upper()}_{_title_parts(m.group(2))}"

    text = MN_NAME_UPPER.sub(_replace_upper, text)
    text = MN_NAME_LOWER.sub(_replace_lower, text)
    return text


def _restore_names(text: str) -> str:
    """Undo protection: А_Бат-Эрдэнэ → А.Бат-Эрдэнэ (NLP mode only)."""
    return MN_NAME_PROTECTED.sub(lambda m: f"{m.group(1)}.{m.group(2)}", text)


def _capitalize_for_ner(text: str) -> str:
    """
    Heuristic capitalisation for NER on social-media Mongolian text.

    WHY THIS IS NEEDED:
        Davlan/bert-base-multilingual-cased-ner-hrl is a CASED model — it
        uses capitalisation as a primary signal to identify proper nouns.
        Mongolian social media is frequently written entirely lowercase.
        Without this step, "монгол улсын ерөнхийлөгч х.баттулга" will not
        tag х.баттулга as PER because the model sees it as an ordinary
        lowercase word.

    WHAT THIS DOES:
        1. Capitalises the first character of each detected sentence.
        2. Capitalises the name component inside protected tokens:
           Б_амар → Б_Амар (the initial is already uppercase from
           _protect_names, but the name itself may still be lowercase if it
           came from the lower pattern and capitalize() didn't fire — this
           is a safety pass).

    WHAT THIS DOES NOT DO:
        - Does NOT blindly capitalise all words (that would confuse common
          nouns)
    """
    # s[:1] is "" for an empty segment, so empty input passes through.
    sentences = [s[:1].upper() + s[1:] for s in SENTENCE_BOUNDARY.split(text)]
    text = " ".join(sentences)

    # Fix any protected token where name part is still lowercase
    return _PROTECTED_LOWER_NAME.sub(
        lambda m: f"{m.group(1)}_{m.group(2).upper()}{m.group(3)}",
        text,
    )


# ---------------------------------------------------------------------------
# Main class
# ---------------------------------------------------------------------------

class Preprocessor:
    """
    Text preprocessing pipeline for Mongolian social media data.

    Initialise once at app startup with stopwords from the knowledge base:

        kb = KnowledgeBase()
        preprocessor = Preprocessor(extra_stopwords=kb.get_stopwords())

    Then in the analysis router, call preprocess_dual() per document:

        nlp_text, tm_text = preprocessor.preprocess_dual(raw_text)
        entities  = ner_engine.recognize(nlp_text)
        sentiment = sentiment_analyzer.analyze(nlp_text)
        tm_texts.append(tm_text)

    After all documents:

        if len(tm_texts) >= 10:   # BERTopic minimum — skip below this
            topic_results, summary = topic_modeler.fit_transform(tm_texts)
    """

    def __init__(self, extra_stopwords: Optional[List[str]] = None):
        """Start from MONGOLIAN_STOPWORDS and add any extra_stopwords."""
        self.stopwords: Set[str] = MONGOLIAN_STOPWORDS.copy()
        self.stopwords.update(_normalise_stopwords(extra_stopwords))

    def add_stopwords(self, words: List[str]) -> None:
        """
        Inject additional stopwords at runtime (e.g. after admin adds one).

        Call preprocessor.add_stopwords(kb.get_stopwords()) when the admin
        saves a new stopword — takes effect on the next analysis request
        without restarting the server. Entries are lowercased and
        whitespace-normalised; blank or non-string entries are ignored.
        """
        self.stopwords.update(_normalise_stopwords(words))

    def is_mongolian(self, text: str) -> bool:
        """Return True when text is a str containing a Mongolian Cyrillic letter."""
        return isinstance(text, str) and bool(MONGOLIAN_PATTERN.search(text))

    # ------------------------------------------------------------------
    # clean_basic
    # ------------------------------------------------------------------

    def clean_basic(self, text: str, replace_url: bool = True,
                    convert_emoji: bool = False) -> str:
        """
        Light surface cleaning.

        CHANGES FROM ORIGINAL:
            Original: only handled URLs and whitespace normalisation.
            Updated:
            1. Unicode NFC normalisation added (first step — must precede
               regex)
            2. _protect_names() called here so it fires for BOTH pipelines.
               Original had protection only inside clean_deep() which is
               only called in TM mode — NLP mode had no name protection.
            3. Hashtag/mention removal added. Original left @user and #tag
               in the text; in TM mode these became bare tokens like
               "монгол" (from #монгол) with artificially inflated frequency.
            4. BMP emoji removal added via _remove_emoji(). Original only
               removed supplementary-plane emoji and only inside
               clean_deep().
            5. convert_emoji added: when True (NLP mode) known sentiment
               emoji are converted to text markers before stripping so the
               signal reaches BERT. When False (TM mode) emoji are stripped
               directly.

        Args:
            text: Raw input; anything that is not a str yields "".
            replace_url: True = replace with [URL] token (NLP needs the
                signal that a URL was present). False = remove entirely
                (TM — URL adds no topic).
            convert_emoji: True = sentiment emoji → [LAUGH]/[LOVE]/etc.
                False = strip all emoji (TM mode default).

        Returns:
            The cleaned text with whitespace collapsed and trimmed.
        """
        if not isinstance(text, str):
            return ""

        text = _normalize_unicode(text)
        text = _protect_names(text)   # must be before any dot/hyphen stripping

        text = URL_PATTERN.sub("[URL]" if replace_url else "", text)
        text = HASHTAG_MENTION.sub("", text)
        text = _remove_emoji(text, convert_sentiment=convert_emoji)

        return _WHITESPACE.sub(" ", text).strip()

    # ------------------------------------------------------------------
    # clean_deep
    # ------------------------------------------------------------------

    def clean_deep(self, text: str) -> str:
        r"""
        Aggressive symbol/punctuation removal for TM mode.

        CHANGES FROM ORIGINAL:
            Original had THREE issues inside this function:

            1. Name protection regex:
                   r'\b([А-ЯӨҮЁ])\.\s*([А-Яа-яӨөҮүЁё-]+)'
               The character class [А-Яа-яӨөҮүЁё-]+ includes a hyphen at the
               END of the class which makes it match ANY hyphen, including
               standalone hyphens at word boundaries. This could join the
               protected name to the next word.
               FIX: Name protection is fully removed from here — it now
               happens in _protect_names() called inside clean_basic(). By
               the time clean_deep() runs, А.Бат-Эрдэнэ is already
               А_Бат-Эрдэнэ and the character whitelist below preserves
               both _ and -.

            2. Uppercase-only matching: the original only protected
               [А-ЯӨҮЁ] initials. Lowercase б.амар was left unprotected.
               FIX: Handled by _protect_names() as above.

            3. Emoji removal: original had
                   re.sub(r'[\U00010000-\U0010ffff]', '', text)
               here, missing BMP symbols.
               FIX: Moved to _remove_emoji() inside clean_basic() which
               runs first.

        What this function now does:
            - Remove the [URL] placeholder if still present (replaced by a
              space so neighbouring words are not glued together)
            - Apply character whitelist: keep Mongolian Cyrillic, Latin,
              digits, spaces, underscores (protected name joins), and
              hyphens (compound surname separators)
            - Drop hyphen/underscore runs that do not sit between two
              alphanumeric characters ("--", "бат-", "___") — they are
              punctuation, not joiners, and would become junk tokens
            - Normalise whitespace

        Returns "" for non-str input.
        """
        if not isinstance(text, str):
            return ""

        text = text.replace("[URL]", " ")

        # Character whitelist — everything outside this becomes a space
        #   _  preserved: А_Бат protected form
        #   -  preserved: Бат-Эрдэнэ compound name
        text = _NON_WHITELISTED.sub(" ", text)
        text = _DANGLING_JOINER.sub(" ", text)

        return _WHITESPACE.sub(" ", text).strip()

    # ------------------------------------------------------------------
    # Stopword removal
    # ------------------------------------------------------------------

    def _stop_phrase_index(self) -> Dict[str, List[Tuple[str, ...]]]:
        """
        Map first token → multi-word stopword phrases starting with it.

        Rebuilt whenever the size of self.stopwords changes, so phrases
        added through add_stopwords() or directly to the set are picked up.
        """
        size = len(self.stopwords)
        cached = getattr(self, "_phrase_cache", None)
        if cached is not None and cached[0] == size:
            return cached[1]

        index: Dict[str, List[Tuple[str, ...]]] = {}
        for entry in self.stopwords:
            parts = tuple(entry.split())
            if len(parts) > 1:
                index.setdefault(parts[0], []).append(parts)
        self._phrase_cache = (size, index)
        return index

    def _remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from whitespace-tokenised text.

        - Multi-word stopwords ("та нар") are matched as consecutive tokens;
          the original membership test on single tokens could never match
          them.
        - Tokens of length 1 are always dropped. Single-character Mongolian
          tokens (д, т, н, ч, л, etc.) are case inflections and particles
          written as separate words in informal text; they are effectively
          stopwords and pollute topic model vocabulary.
        - Comparison is case-insensitive; surviving tokens keep their
          original casing.

        Returns "" for non-str input.
        """
        if not isinstance(text, str):
            return ""

        tokens = text.split()
        lowered = [t.lower() for t in tokens]
        phrases = self._stop_phrase_index()

        kept: List[str] = []
        i = 0
        while i < len(tokens):
            # Longest stopword phrase starting at this position, if any.
            span = max(
                (
                    len(p)
                    for p in phrases.get(lowered[i], ())
                    if tuple(lowered[i:i + len(p)]) == p
                ),
                default=0,
            )
            if span:
                i += span
                continue
            if len(tokens[i]) > 1 and lowered[i] not in self.stopwords:
                kept.append(tokens[i])
            i += 1
        return " ".join(kept)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def preprocess_nlp(self, text: str) -> str:
        """
        Preprocessing for NER and Sentiment Analysis.

        Pipeline:
            clean_basic → capitalize_for_ner → restore_names

        Intentionally skips: clean_deep, lowercasing, stopword removal.
        Returns "" for non-str input.
        """
        if not isinstance(text, str):
            return ""

        text = self.clean_basic(text, replace_url=True, convert_emoji=True)
        text = _capitalize_for_ner(text)
        return _restore_names(text)

    def preprocess_tm(self, text: str) -> str:
        """
        Preprocessing for Topic Modeling (BERTopic).

        Pipeline:
            clean_basic → clean_deep → lowercase → remove_stopwords
            → strip initial prefix

        Why strip the initial prefix in TM mode:
            After lowercasing, А_Бат-Эрдэнэ becomes а_бат-эрдэнэ. The single
            letter а adds nothing to topic clusters — the meaningful token
            is бат-эрдэнэ (the surname, treated as one compound token). The
            regex below strips the initial and underscore, keeping the name.

        Returns "" for non-str input.
        """
        if not isinstance(text, str):
            return ""

        text = self.clean_basic(text, replace_url=False)
        text = self.clean_deep(text)
        text = text.lower()
        text = self._remove_stopwords(text)

        # Strip single-letter initial prefix: а_бат-эрдэнэ → бат-эрдэнэ
        text = _TM_INITIAL_PREFIX.sub(r"\1", text)

        return _WHITESPACE.sub(" ", text).strip()

    def preprocess_dual(self, text: str) -> Tuple[str, str]:
        """
        Return both NLP and TM forms in one call.

        Convenience wrapper for the router:
            nlp_text, tm_text = preprocessor.preprocess_dual(raw)

        Each mode still runs its own clean_basic() pass because the two
        modes use different URL/emoji settings.
        """
        return self.preprocess_nlp(text), self.preprocess_tm(text)

    def split_sentences(self, text: str) -> List[str]:
        """
        Split NLP-preprocessed text into sentences for chunked NER.

        Useful when a document exceeds BERT's 512-token limit.
        Returns [] for non-str input, matching the other public methods'
        tolerance of bad input.
        """
        if not isinstance(text, str):
            return []
        parts = (p.strip() for p in SENTENCE_BOUNDARY.split(text))
        return [p for p in parts if p]

    def preprocess_batch(self, texts: List[str],
                         mode: str = "nlp") -> List[str]:
        """
        Preprocess a list of texts in the given mode ("nlp" or "tm").

        Any mode other than exactly "tm" is handled as NLP mode.
        Returns a list of the same length.
        """
        fn = self.preprocess_tm if mode == "tm" else self.preprocess_nlp
        return [fn(t) for t in texts]