| """ | |
| preprocessing.py — Mongolian NLP preprocessing pipeline. | |
| Two distinct modes, called from the analysis router on the SAME raw text: | |
| preprocess_nlp(text) → for NER + Sentiment Analysis | |
| Goal: give BERT maximum linguistic context. | |
| Keeps punctuation, restores capitalisation, protects name structure. | |
| Does NOT remove stopwords — grammar words help sentiment polarity. | |
| preprocess_tm(text) → for Topic Modeling (BERTopic) | |
| Goal: give BERTopic clean content-bearing tokens only. | |
| Aggressive: lowercase, strip all punctuation, remove stopwords. | |
| Keeps compound name hyphens as single tokens (бат-эрдэнэ). | |
| Changes from the original: | |
| - protect_names() now handles BOTH uppercase (А.Бат) and lowercase (б.амар) | |
| social-media initials, and handles compound surnames with hyphens (А.Бат-Эрдэнэ) | |
| - clean_basic() now removes hashtags/mentions and BMP emoji (U+2000-U+2BFF etc.) | |
| before deep cleaning — the original passed these through to clean_deep | |
| where they were handled inconsistently | |
| - clean_deep() regex narrowed — original [А-Яа-яӨөҮүЁё-]+ allowed a trailing | |
| hyphen to absorb the next word. Name protection now happens in clean_basic | |
| (via _protect_names) so clean_deep never sees raw А.Бат forms at all | |
| - capitalize_for_ner() is a new function that restores sentence-start | |
| capitals and capitalises the initial letter in lowercase name patterns, | |
| fixing the core problem where б.амар wouldn't be tagged as PER | |
| - remove_stopwords() now also filters single-character tokens (д, т, н etc.) | |
| - preprocess_dual() added — returns both NLP and TM forms in one call | |
| - add_stopwords() added — lets main.py inject KB stopwords at startup | |
| """ | |
| import re | |
| import unicodedata | |
| from typing import List, Optional, Set, Tuple | |
| # --------------------------------------------------------------------------- | |
| # Compiled patterns | |
| # --------------------------------------------------------------------------- | |
| MONGOLIAN_PATTERN = re.compile(r"[А-Яа-яӨөҮүЁё]") | |
| URL_PATTERN = re.compile(r"https?://\S+|www\.\S+") | |
| HASHTAG_MENTION = re.compile(r"[@#]\S+") | |
| # BMP symbol/emoji blocks to remove. | |
| # Intentionally EXCLUDES: | |
| # U+2000-U+206F General Punctuation (— – … " " ' •) | |
| # U+20A0-U+20CF Currency Symbols (₮) | |
| # because the original range \u2000-\u27FF was removing these. | |
| BMP_EMOJI = re.compile( | |
| r"[\u20D0-\u20FF" # Combining Diacritical Marks for Symbols (⃣ base) | |
| r"\u2100-\u27FF" # Symbol blocks: letterlike → dingbats | |
| r"\u2900-\u2BFF" # Supplemental Arrows, Misc Math Symbols | |
| r"\uFE00-\uFEFF" # Variation Selectors, specials | |
| r"\uFF00-\uFFEF]" # Halfwidth/Fullwidth Forms | |
| ) | |
| SUPPLEMENTARY_EMOJI = re.compile(r"[\U00010000-\U0010FFFF]") | |
| # Sentiment-bearing emoji → neutral text markers (NLP mode only). | |
| # [LAUGH] is intentionally ambiguous: 😂/🤣 are frequently sarcastic in | |
| # Mongolian social media. Replacing with a literal sentiment word would be | |
| # wrong half the time. Instead we pass [LAUGH] to BERT and let it infer | |
| # polarity from the surrounding tokens. | |
| EMOJI_SENTIMENT: dict = { | |
| # Ambiguous laughing | |
| "😂": "[LAUGH]", "🤣": "[LAUGH]", "😅": "[LAUGH]", | |
| # Positive — love / warmth | |
| "❤": "[LOVE]", "🥰": "[LOVE]", "😍": "[LOVE]", | |
| "💕": "[LOVE]", "💗": "[LOVE]", "💖": "[LOVE]", "💝": "[LOVE]", | |
| # Positive — excitement / energy (🔥✨🤩 dominate your dataset) | |
| "🔥": "[EXCITED]", "✨": "[EXCITED]", "🤩": "[EXCITED]", | |
| "🎉": "[EXCITED]", "👏": "[EXCITED]", | |
| # Positive — approval / gratitude | |
| "👍": "[POSITIVE]", | |
| "🙏": "[GRATEFUL]", | |
| # Negative — anger | |
| "😡": "[ANGRY]", "🤬": "[ANGRY]", "😤": "[ANGRY]", | |
| # Negative — sadness | |
| "😢": "[SAD]", "😭": "[SAD]", "💔": "[SAD]", | |
| # Negative — disapproval | |
| "👎": "[NEGATIVE]", | |
| } | |
| # Uppercase Mongolian initial: А.Бат-Эрдэнэ, Б.Сувдаа | |
| MN_NAME_UPPER = re.compile( | |
| r"\b([А-ЯӨҮЁ])\.\s*" | |
| r"([А-Яа-яӨөҮүЁё][а-яөүёa-z]+" | |
| r"(?:-[А-Яа-яӨөҮүЁё][а-яөүёa-z]+)*)" | |
| ) | |
| # Lowercase initial: б.амар, о.батзориг (very common in social media) | |
| # ORIGINAL CODE HAD NO HANDLING FOR THIS — it only matched [А-ЯӨҮЁ] | |
| MN_NAME_LOWER = re.compile( | |
| r"\b([а-яөүё])\.\s*" | |
| r"([а-яөүёa-z]+" | |
| r"(?:-[а-яөүёa-z]+)*)" | |
| ) | |
| # Protected form А_Бат-Эрдэнэ — matched by restore_names() | |
| MN_NAME_PROTECTED = re.compile( | |
| r"\b([А-ЯӨҮЁ])_([А-Яа-яӨөҮүЁё][а-яөүёa-z]+(?:-[А-Яа-яӨөҮүЁё][а-яөүёa-z]+)*)\b" | |
| ) | |
| SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[А-ЯӨҮЁ\u0400-\u04FF]|[A-Z])") | |
| # --------------------------------------------------------------------------- | |
| # Stopwords | |
| # --------------------------------------------------------------------------- | |
| MONGOLIAN_STOPWORDS: Set[str] = { | |
| "ба", "бас", "бол", "бөгөөд", "байна", "байгаа", "байсан", "бсан", | |
| "бхаа", "бн", "бна", "байх", "юм", "биш", "бгаа", "бдаг", "байдаг", | |
| "бхоо", "бх", | |
| "энэ", "тэр", "эдгээр", "тэдгээр", "үүн", "үүнд", "үүнээс", "үүний", | |
| "үүнтэй", "түүн", "түүнд", "түүнээс", "түүний", "түүнтэй", | |
| "тийм", "ийм", "чинь", "минь", "билээ", "шүү", | |
| "би", "чи", "та", "бид", "тэд", "миний", "чиний", "таны", "бидний", | |
| "тэдний", "над", "надад", "надаас", "чамд", "чамаас", "танд", "танаас", | |
| "өөр", "өөрөө", "өөрийн", "өөрт", "өөрөөс", | |
| "гэж", "гэх", "гэсэн", "гэжээ", "гэв", "гэвч", "гээд", "гнээ", | |
| "гэнэ", "гээ", | |
| "л", "ч", "уу", "үү", "юу", "яаж", "яагаад", | |
| "хаана", "хэзээ", "хэн", "ямар", "ямарч", "яах", "вэ", "бэ", "бээ", | |
| "болон", "мөн", "эсвэл", "гэхдээ", "харин", | |
| "дээр", "доор", "дотор", "гадна", "хойно", "өмнө", | |
| "руу", "рүү", "аас", "ээс", "оос", "өөс", "тай", "тэй", "той", | |
| "д", "т", "нь", "аа", "ээ", "оо", "өө", | |
| "бай", "болно", "болох", "болсон", | |
| "их", "бага", "маш", "тун", "нэлээд", "шиг", | |
| "шд", "н", "шдэ", "шдээ", "шт", "штэ", "штээ", "ш дээ", "ш тээ", | |
| "бз", "биз", "дээ", "даа", "юмаа", "аан", "хө", "тэ", "тээ", | |
| "гш", "ммхн", "сдаа", "сда", "хаха", "кк", | |
| "гэх", "хийх", "авах", "өгөх", "очих", "ирэх", | |
| "ын", "ийн", "ний", "ийг", "ууд", "үүд", | |
| "та нар", | |
| } | |
| # --------------------------------------------------------------------------- | |
| # Internal helpers | |
| # --------------------------------------------------------------------------- | |
| def _normalize_unicode(text: str) -> str: | |
| """NFC-normalize so visually identical Mongolian characters match regexes.""" | |
| return unicodedata.normalize("NFC", text) | |
def _remove_emoji(text: str, convert_sentiment: bool = False) -> str:
    """
    Remove BMP symbol blocks and supplementary-plane emoji.

    If convert_sentiment=True (NLP mode): replace known sentiment emoji with
    text markers BEFORE stripping so the signal survives into BERT.
    Conversion must happen first because the regex replacements below would
    otherwise erase the emoji before we can read them.

    Args:
        text: input string (already NFC-normalised by clean_basic).
        convert_sentiment: True in NLP mode, False in TM mode.

    Returns:
        Text with emoji removed (and optionally converted to markers);
        may contain extra spaces — caller normalises whitespace.
    """
    if convert_sentiment:
        for emoji, marker in EMOJI_SENTIMENT.items():
            # Pad with spaces so the marker never fuses with adjacent tokens.
            text = text.replace(emoji, f" {marker} ")
    text = BMP_EMOJI.sub(" ", text)
    text = SUPPLEMENTARY_EMOJI.sub(" ", text)
    return text
def _protect_names(text: str) -> str:
    """
    Convert Mongolian name patterns to protected underscore form before
    any cleaning strips the dots or hyphens: А.Бат-Эрдэнэ → А_Бат-Эрдэнэ,
    б.амар → Б_Амар.

    WHY UNDERSCORE:
        The character-whitelist in clean_deep preserves [_] and [-], so both
        the initial-name join AND the compound-surname hyphen survive.

    WHAT CHANGED FROM ORIGINAL:
        Original clean_deep had:
            re.sub(r'\\b([А-ЯӨҮЁ])\\.\\s*([А-Яа-яӨөҮүЁё-]+)', r'\\1_\\2', text)
        Problems:
        1. Only ran INSIDE clean_deep, after clean_basic already ran.
           If the text came through preprocess_nlp (which never calls
           clean_deep), name dots were never protected.
        2. [А-Яа-яӨөҮүЁё-]+ has a hyphen IN the character class, which
           matches the hyphen character anywhere — including at the end of
           a word — potentially absorbing the next token.
        3. Only matched uppercase initials — б.амар was completely missed.
        This version runs inside clean_basic() so it fires for BOTH NLP and
        TM pipelines, before any stripping occurs.
    """
    def _replace_upper(m: re.Match) -> str:
        initial = m.group(1)
        # Capitalise each hyphen-separated surname part: бат-эрдэнэ → Бат-Эрдэнэ
        name = "-".join(p.capitalize() for p in m.group(2).split("-"))
        return f"{initial}_{name}"

    def _replace_lower(m: re.Match) -> str:
        # Capitalise the single-letter initial and each name part
        initial = m.group(1).upper()
        name = "-".join(p.capitalize() for p in m.group(2).split("-"))
        return f"{initial}_{name}"

    text = MN_NAME_UPPER.sub(_replace_upper, text)
    text = MN_NAME_LOWER.sub(_replace_lower, text)
    return text
def _restore_names(text: str) -> str:
    """Undo protection: А_Бат-Эрдэнэ → А.Бат-Эрдэнэ (NLP mode only)."""
    return MN_NAME_PROTECTED.sub(lambda m: f"{m.group(1)}.{m.group(2)}", text)
def _capitalize_for_ner(text: str) -> str:
    """
    Heuristic capitalisation for NER on social-media Mongolian text.

    WHY THIS IS NEEDED:
        Davlan/bert-base-multilingual-cased-ner-hrl is a CASED model — it uses
        capitalisation as a primary signal to identify proper nouns. Mongolian
        social media is frequently written entirely lowercase. Without this
        step, "монгол улсын ерөнхийлөгч х.баттулга" will not tag х.баттулга
        as PER because the model sees it as an ordinary lowercase word.

    WHAT THIS DOES:
        1. Capitalises the first word of each detected sentence.
        2. Capitalises the name component inside protected tokens:
           Б_амар → Б_Амар (the initial is already uppercase from
           _protect_names, but the name itself may still be lowercase if it
           came from the lower pattern and capitalize() didn't fire — this
           is a safety pass).

    WHAT THIS DOES NOT DO:
        - Does NOT blindly capitalise all words (that would confuse common
          nouns).
    """
    sentences = SENTENCE_BOUNDARY.split(text)
    # Guard `if s`: split can yield empty strings; s[0] would raise on them.
    sentences = [s[0].upper() + s[1:] if s else s for s in sentences]
    text = " ".join(sentences)
    # Fix any protected token where the name part is still lowercase.
    text = re.sub(
        r"([А-ЯӨҮЁ])_([а-яөүё])([а-яөүё]*)",
        lambda m: f"{m.group(1)}_{m.group(2).upper()}{m.group(3)}",
        text,
    )
    return text
# ---------------------------------------------------------------------------
# Main class
# ---------------------------------------------------------------------------
class Preprocessor:
    """
    Text preprocessing pipeline for Mongolian social media data.

    Initialise once at app startup with stopwords from the knowledge base:

        kb = KnowledgeBase()
        preprocessor = Preprocessor(extra_stopwords=kb.get_stopwords())

    Then in the analysis router, call preprocess_dual() per document:

        nlp_text, tm_text = preprocessor.preprocess_dual(raw_text)
        entities = ner_engine.recognize(nlp_text)
        sentiment = sentiment_analyzer.analyze(nlp_text)
        tm_texts.append(tm_text)

    After all documents:

        if len(tm_texts) >= 10:  # BERTopic minimum — skip below this
            topic_results, summary = topic_modeler.fit_transform(tm_texts)
    """

    def __init__(self, extra_stopwords: Optional[List[str]] = None):
        # Copy so per-instance additions never mutate the module-level set.
        self.stopwords: Set[str] = MONGOLIAN_STOPWORDS.copy()
        if extra_stopwords:
            self.stopwords.update(w.lower().strip() for w in extra_stopwords)

    def add_stopwords(self, words: List[str]) -> None:
        """
        Inject additional stopwords at runtime (e.g. after admin adds one).

        Call preprocessor.add_stopwords(kb.get_stopwords()) when the admin
        saves a new stopword — takes effect on the next analysis request
        without restarting the server.
        """
        self.stopwords.update(w.lower().strip() for w in words)

    def is_mongolian(self, text: str) -> bool:
        """True if *text* is a str containing at least one Mongolian-Cyrillic letter."""
        return isinstance(text, str) and bool(MONGOLIAN_PATTERN.search(text))

    # ------------------------------------------------------------------
    # clean_basic
    # ------------------------------------------------------------------
    def clean_basic(self, text: str, replace_url: bool = True,
                    convert_emoji: bool = False) -> str:
        """
        Light surface cleaning, shared by both pipelines.

        CHANGES FROM ORIGINAL:
            Original: only handled URLs and whitespace normalisation.
            Updated:
            1. Unicode NFC normalisation added (first step — must precede regex)
            2. _protect_names() called here so it fires for BOTH pipelines.
               Original had protection only inside clean_deep() which is only
               called in TM mode — NLP mode had no name protection at all.
            3. Hashtag/mention removal added. Original left @user and #tag
               in the text; in TM mode these became bare tokens like "монгол"
               (from #монгол) with artificially inflated frequency.
            4. BMP emoji removal added via _remove_emoji(). Original only
               removed supplementary-plane emoji and only inside clean_deep().
            5. convert_emoji added: when True (NLP mode) known sentiment emoji
               are converted to text markers before stripping so the signal
               reaches BERT. When False (TM mode) emoji are stripped directly.

        Args:
            replace_url: True = replace with [URL] token (NLP needs the
                         signal that a URL was present).
                         False = remove entirely (TM — URL adds no topic).
            convert_emoji: True = sentiment emoji → [LAUGH]/[LOVE]/etc.
                           False = strip all emoji (TM mode default).
        """
        if not isinstance(text, str):
            return ""
        text = _normalize_unicode(text)
        text = _protect_names(text)  # must be before any dot/hyphen stripping
        if replace_url:
            text = URL_PATTERN.sub("[URL]", text)
        else:
            text = URL_PATTERN.sub("", text)
        text = HASHTAG_MENTION.sub("", text)
        text = _remove_emoji(text, convert_sentiment=convert_emoji)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    # ------------------------------------------------------------------
    # clean_deep
    # ------------------------------------------------------------------
    def clean_deep(self, text: str) -> str:
        """
        Aggressive symbol/punctuation removal for TM mode.

        CHANGES FROM ORIGINAL:
            Original had THREE issues inside this function:
            1. Name protection regex: r'\\b([А-ЯӨҮЁ])\\.\\s*([А-Яа-яӨөҮүЁё-]+)'
               The character class [А-Яа-яӨөҮүЁё-]+ includes a hyphen at the
               END of the class which makes it match ANY hyphen, including
               standalone hyphens at word boundaries. This could join the
               protected name to the next word.
               FIX: Name protection fully removed from here — it now happens
               in _protect_names() inside clean_basic(). By the time
               clean_deep() runs, А.Бат-Эрдэнэ is already А_Бат-Эрдэнэ and
               the character whitelist below preserves both _ and -.
            2. Uppercase-only matching: the original only protected [А-ЯӨҮЁ]
               initials. Lowercase б.амар was left unprotected.
               FIX: Handled by _protect_names() as above.
            3. Emoji removal: original had re.sub(r'[\\U00010000-\\U0010ffff]', '', text)
               here, missing BMP symbols.
               FIX: Moved to _remove_emoji() inside clean_basic() which runs first.

        What this function now does:
            - Remove [URL] placeholder if still present
            - Apply character whitelist: keep Mongolian Cyrillic, Latin, digits,
              spaces, underscores (protected name joins), and hyphens
              (compound surname separators inside protected names)
            - Normalise whitespace
        """
        if not isinstance(text, str):
            return ""
        # Remove [URL] placeholder
        text = re.sub(r"\[URL\]", "", text)
        # Character whitelist — everything outside this becomes a space
        #   _ preserved: А_Бат protected form
        #   - preserved: Бат-Эрдэнэ compound name (inside protected token)
        text = re.sub(
            r"[^A-Za-zА-Яа-яӨөҮүЁё0-9\s_\-]",
            " ",
            text,
        )
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    # ------------------------------------------------------------------
    # Stopword removal
    # ------------------------------------------------------------------
    def _remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from lowercased text.

        CHANGES FROM ORIGINAL:
        1. Added len(w) > 1 filter. Single-character Mongolian tokens (д, т,
           н, ч, л, etc.) are case inflections and particles written as
           separate words in informal text. They are effectively stopwords
           and pollute topic model vocabulary. The original left them in.
        2. Multi-word stopwords ("та нар", "ш дээ", "ш тээ") are now removed
           at string level BEFORE tokenisation. The original kept them in
           the set but matched only individual split() tokens, so a phrase
           entry could never fire.
        """
        if not isinstance(text, str):
            return ""
        # Phrase pass: whole-word match only (no partial-token bleed).
        for phrase in (s for s in self.stopwords if " " in s):
            text = re.sub(rf"(?<!\S){re.escape(phrase)}(?!\S)", " ", text)
        words = text.split()
        return " ".join(
            w for w in words
            if len(w) > 1 and w.lower() not in self.stopwords
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def preprocess_nlp(self, text: str) -> str:
        """
        Preprocessing for NER and Sentiment Analysis.

        Pipeline: clean_basic → _capitalize_for_ner → _restore_names
        Intentionally skips: clean_deep, lowercasing, stopword removal.
        """
        if not isinstance(text, str):
            return ""
        text = self.clean_basic(text, replace_url=True, convert_emoji=True)
        text = _capitalize_for_ner(text)
        text = _restore_names(text)
        return text

    def preprocess_tm(self, text: str) -> str:
        """
        Preprocessing for Topic Modeling (BERTopic).

        Pipeline: clean_basic → clean_deep → lowercase → _remove_stopwords
                  → strip initial prefix

        Why strip the initial prefix in TM mode:
            After lowercasing, А_Бат-Эрдэнэ becomes а_бат-эрдэнэ.
            The single letter а adds nothing to topic clusters — the meaningful
            token is бат-эрдэнэ (the surname, treated as one compound token).
            The regex below strips the initial and underscore, keeping the name.
        """
        if not isinstance(text, str):
            return ""
        text = self.clean_basic(text, replace_url=False)
        text = self.clean_deep(text)
        text = text.lower()
        text = self._remove_stopwords(text)
        # Strip single-letter initial prefix: а_бат-эрдэнэ → бат-эрдэнэ
        text = re.sub(
            r"\b[а-яөүё]_([а-яөүёa-z]+(?:-[а-яөүёa-z]+)*)\b",
            r"\1",
            text,
        )
        return re.sub(r"\s+", " ", text).strip()

    def preprocess_dual(self, text: str) -> Tuple[str, str]:
        """
        Return both NLP and TM forms in one call.

        Use this in the router to avoid processing the same text twice:
            nlp_text, tm_text = preprocessor.preprocess_dual(raw)
        """
        return self.preprocess_nlp(text), self.preprocess_tm(text)

    def split_sentences(self, text: str) -> List[str]:
        """
        Split NLP-preprocessed text into sentences for chunked NER.
        Useful when a document exceeds BERT's 512-token limit.
        """
        parts = SENTENCE_BOUNDARY.split(text)
        return [p.strip() for p in parts if p.strip()]

    def preprocess_batch(self, texts: List[str], mode: str = "nlp") -> List[str]:
        """
        Preprocess a list of texts in the given mode ("nlp" or "tm").
        Returns a list of the same length.
        """
        fn = self.preprocess_tm if mode == "tm" else self.preprocess_nlp
        return [fn(t) for t in texts]