"""
preprocessing.py — Mongolian NLP preprocessing pipeline.

Two distinct modes, called from the analysis router on the SAME raw text:

  preprocess_nlp(text)  →  for NER + Sentiment Analysis
    Goal: give BERT maximum linguistic context.
    Keeps punctuation, restores capitalisation, protects name structure.
    Does NOT remove stopwords — grammar words help sentiment polarity.

  preprocess_tm(text)   →  for Topic Modeling (BERTopic)
    Goal: give BERTopic clean content-bearing tokens only.
    Aggressive: lowercase, strip all punctuation, remove stopwords.
    Keeps compound name hyphens as single tokens (бат-эрдэнэ).

Changes from the original:
  - _protect_names() now handles BOTH uppercase (А.Бат) and lowercase (б.амар)
    social-media initials, and compound surnames with hyphens (А.Бат-Эрдэнэ)
  - clean_basic() now removes hashtags/mentions and BMP symbol-block emoji
    (U+20D0-U+2BFF plus related blocks; see BMP_EMOJI) before deep cleaning —
    the original passed these through to clean_deep where they were handled
    inconsistently
  - clean_deep() regex narrowed — original [А-Яа-яӨөҮүЁё-]+ allowed a trailing
    hyphen to absorb the next word. Name protection now happens in clean_basic
    (via _protect_names) so clean_deep never sees raw А.Бат forms at all
  - _capitalize_for_ner() is a new function that restores sentence-start
    capitals and capitalises the initial letter in lowercase name patterns,
    fixing the core problem where б.амар wouldn't be tagged as PER
  - _remove_stopwords() now also filters single-character tokens (д, т, н etc.)
  - preprocess_dual() added — returns both NLP and TM forms in one call
  - add_stopwords() added — lets main.py inject KB stopwords at startup
"""

import re
import unicodedata
from typing import List, Optional, Set, Tuple


# ---------------------------------------------------------------------------
# Compiled patterns
# ---------------------------------------------------------------------------

MONGOLIAN_PATTERN = re.compile(r"[А-Яа-яӨөҮүЁё]")
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
HASHTAG_MENTION = re.compile(r"[@#]\S+")

# BMP symbol/emoji blocks to remove.
# Intentionally EXCLUDES:
#   U+2000-U+206F  General Punctuation (—  –  …  “  ”  ’  •)
#   U+20A0-U+20CF  Currency Symbols    (₮)
# because the original range \u2000-\u27FF was removing these.
BMP_EMOJI = re.compile(
    r"[\u20D0-\u20FF"   # Combining Diacritical Marks for Symbols (⃣ base)
    r"\u2100-\u27FF"    # Symbol blocks: letterlike → dingbats
    r"\u2900-\u2BFF"    # Supplemental Arrows, Misc Math Symbols
    r"\uFE00-\uFEFF"    # Variation Selectors, specials
    r"\uFF00-\uFFEF]"   # Halfwidth/Fullwidth Forms
)
SUPPLEMENTARY_EMOJI = re.compile(r"[\U00010000-\U0010FFFF]")

# Sentiment-bearing emoji → neutral text markers (NLP mode only).
# [LAUGH] is intentionally ambiguous: 😂/🤣 are frequently sarcastic in
# Mongolian social media. Replacing with a literal sentiment word would be
# wrong half the time. Instead we pass [LAUGH] to BERT and let it infer
# polarity from the surrounding tokens.
EMOJI_SENTIMENT: dict = {
    # Ambiguous laughing
    "😂": "[LAUGH]", "🤣": "[LAUGH]", "😅": "[LAUGH]",
    # Positive — love / warmth
    "❤": "[LOVE]", "🥰": "[LOVE]", "😍": "[LOVE]",
    "💕": "[LOVE]", "💗": "[LOVE]", "💖": "[LOVE]", "💝": "[LOVE]",
    # Positive — excitement / energy  (🔥✨🤩 dominate the dataset)
    "🔥": "[EXCITED]", "✨": "[EXCITED]", "🤩": "[EXCITED]",
    "🎉": "[EXCITED]", "👏": "[EXCITED]",
    # Positive — approval / gratitude
    "👍": "[POSITIVE]",
    "🙏": "[GRATEFUL]",
    # Negative — anger
    "😡": "[ANGRY]", "🤬": "[ANGRY]", "😤": "[ANGRY]",
    # Negative — sadness
    "😢": "[SAD]", "😭": "[SAD]", "💔": "[SAD]",
    # Negative — disapproval
    "👎": "[NEGATIVE]",
}
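
# Illustrative (sample input, not from the original file): with
# convert_sentiment=True, _remove_emoji("гоё юм 😂🔥") replaces the emoji with
# padded " [LAUGH] " / " [EXCITED] " markers; clean_basic later collapses the
# extra whitespace around them.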

# Uppercase Mongolian initial: А.Бат-Эрдэнэ, Б.Сувдаа
MN_NAME_UPPER = re.compile(
    r"\b([А-ЯӨҮЁ])\.\s*"
    r"([А-Яа-яӨөҮүЁё][а-яөүёa-z]+"
    r"(?:-[А-Яа-яӨөҮүЁё][а-яөүёa-z]+)*)"
)

# Lowercase initial: б.амар, о.батзориг  (very common in social media)
# ORIGINAL CODE HAD NO HANDLING FOR THIS — it only matched [А-ЯӨҮЁ]
MN_NAME_LOWER = re.compile(
    r"\b([а-яөүё])\.\s*"
    r"([а-яөүёa-z]+"
    r"(?:-[а-яөүёa-z]+)*)"
)

# Protected form А_Бат-Эрдэнэ — matched by restore_names()
MN_NAME_PROTECTED = re.compile(
    r"\b([А-ЯӨҮЁ])_([А-Яа-яӨөҮүЁё][а-яөүёa-z]+(?:-[А-Яа-яӨөҮүЁё][а-яөүёa-z]+)*)\b"
)
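
# Illustrative matches (sample strings, added for clarity):
#   MN_NAME_UPPER:     "А.Бат-Эрдэнэ" → groups ("А", "Бат-Эрдэнэ")
#   MN_NAME_LOWER:     "б.амар"       → groups ("б", "амар")
#   MN_NAME_PROTECTED: "А_Бат-Эрдэнэ" → groups ("А", "Бат-Эрдэнэ")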

SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+(?=[А-ЯӨҮЁ\u0400-\u04FF]|[A-Z])")


# ---------------------------------------------------------------------------
# Stopwords
# ---------------------------------------------------------------------------

MONGOLIAN_STOPWORDS: Set[str] = {
    "ба", "бас", "бол", "бөгөөд", "байна", "байгаа", "байсан", "бсан",
    "бхаа", "бн", "бна", "байх", "юм", "биш", "бгаа", "бдаг", "байдаг",
    "бхоо", "бх",
    "энэ", "тэр", "эдгээр", "тэдгээр", "үүн", "үүнд", "үүнээс", "үүний",
    "үүнтэй", "түүн", "түүнд", "түүнээс", "түүний", "түүнтэй",
    "тийм", "ийм", "чинь", "минь", "билээ", "шүү",
    "би", "чи", "та", "бид", "тэд", "миний", "чиний", "таны", "бидний",
    "тэдний", "над", "надад", "надаас", "чамд", "чамаас", "танд", "танаас",
    "өөр", "өөрөө", "өөрийн", "өөрт", "өөрөөс",
    "гэж", "гэх", "гэсэн", "гэжээ", "гэв", "гэвч", "гээд", "гнээ",
    "гэнэ", "гээ",
    "л", "ч", "уу", "үү", "юу", "яаж", "яагаад",
    "хаана", "хэзээ", "хэн", "ямар", "ямарч", "яах", "вэ", "бэ", "бээ",
    "болон", "мөн", "эсвэл", "гэхдээ", "харин",
    "дээр", "доор", "дотор", "гадна", "хойно", "өмнө",
    "руу", "рүү", "аас", "ээс", "оос", "өөс", "тай", "тэй", "той",
    "д", "т", "нь", "аа", "ээ", "оо", "өө",
    "бай", "болно", "болох", "болсон",
    "их", "бага", "маш", "тун", "нэлээд", "шиг",
    "шд", "н", "шдэ", "шдээ", "шт", "штэ", "штээ", "ш дээ", "ш тээ",
    "бз", "биз", "дээ", "даа", "юмаа", "аан", "хө", "тэ", "тээ",
    "гш", "ммхн", "сдаа", "сда", "хаха", "кк",
    "гэх", "хийх", "авах", "өгөх", "очих", "ирэх",
    "ын", "ийн", "ний", "ийг", "ууд", "үүд",
    "та нар",
}


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _normalize_unicode(text: str) -> str:
    """NFC-normalize so visually identical Mongolian characters match regexes."""
    return unicodedata.normalize("NFC", text)


def _remove_emoji(text: str, convert_sentiment: bool = False) -> str:
    """
    Remove BMP symbol blocks and supplementary-plane emoji.

    If convert_sentiment=True (NLP mode): replace known sentiment emoji with
    text markers BEFORE stripping so the signal survives into BERT.
    Conversion must happen first because the regex replacements below would
    otherwise erase the emoji before we can read them.
    """
    if convert_sentiment:
        for emoji, marker in EMOJI_SENTIMENT.items():
            text = text.replace(emoji, f" {marker} ")
    text = BMP_EMOJI.sub(" ", text)
    text = SUPPLEMENTARY_EMOJI.sub(" ", text)
    return text


def _protect_names(text: str) -> str:
    """
    Convert Mongolian name patterns to protected underscore form before
    any cleaning strips the dots or hyphens.

    WHY UNDERSCORE:
      The character-whitelist in clean_deep preserves [_] and [-], so both
      the initial-name join AND the compound-surname hyphen survive.

    WHAT CHANGED FROM ORIGINAL:
      Original clean_deep had:
        re.sub(r'\b([А-ЯӨҮЁ])\.\s*([А-Яа-яӨөҮүЁё-]+)', r'\1_\2', text)
      Problems:
        1. Only runs INSIDE clean_deep, after clean_basic already ran.
           If the text came through preprocess_nlp (which never calls
           clean_deep), name dots were never protected.
        2. [А-Яа-яӨөҮүЁё-]+ has a trailing hyphen IN the character class
           which means it matches the hyphen character anywhere, including
           at the end of a word, potentially absorbing the next token.
        3. Only matched uppercase initials — б.амар was completely missed.

      This version runs _protect_names() inside clean_basic() so it fires
      for BOTH NLP and TM pipelines, before any stripping occurs.
    """
    def _replace_upper(m: re.Match) -> str:
        initial = m.group(1)
        name = "-".join(p.capitalize() for p in m.group(2).split("-"))
        return f"{initial}_{name}"

    def _replace_lower(m: re.Match) -> str:
        # Capitalize the single-letter initial and each name part
        initial = m.group(1).upper()
        name = "-".join(p.capitalize() for p in m.group(2).split("-"))
        return f"{initial}_{name}"

    text = MN_NAME_UPPER.sub(_replace_upper, text)
    text = MN_NAME_LOWER.sub(_replace_lower, text)
    return text
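
# Illustrative replacements (sample strings, not from the original file):
#   _protect_names("А.Бат-Эрдэнэ") == "А_Бат-Эрдэнэ"
#   _protect_names("б.амар")       == "Б_Амар"   # initial and name capitalised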


def _restore_names(text: str) -> str:
    """Undo protection: А_Бат-Эрдэнэ → А.Бат-Эрдэнэ (NLP mode only)."""
    return MN_NAME_PROTECTED.sub(lambda m: f"{m.group(1)}.{m.group(2)}", text)


def _capitalize_for_ner(text: str) -> str:
    """
    Heuristic capitalisation for NER on social-media Mongolian text.

    WHY THIS IS NEEDED:
      Davlan/bert-base-multilingual-cased-ner-hrl is a CASED model — it uses
      capitalisation as a primary signal to identify proper nouns. Mongolian
      social media is frequently written entirely lowercase. Without this step,
      "монгол улсын ерөнхийлөгч х.баттулга" will not tag х.баттулга as PER
      because the model sees it as an ordinary lowercase word.

    WHAT THIS DOES:
      1. Capitalises the first word of each detected sentence.
      2. Capitalises the name component inside protected tokens:
         Б_амар → Б_Амар (the initial is already uppercase from _protect_names,
         but the name itself may still be lowercase if it came from the lower
         pattern and capitalize() didn't fire — this is a safety pass).

    WHAT THIS DOES NOT DO:
      - Does NOT blindly capitalise all words (that would confuse common nouns)
    """
    sentences = SENTENCE_BOUNDARY.split(text)
    sentences = [s[0].upper() + s[1:] if s else s for s in sentences]
    text = " ".join(sentences)

    # Fix any protected token where name part is still lowercase
    text = re.sub(
        r"([А-ЯӨҮЁ])_([а-яөүё])([а-яөүё]*)",
        lambda m: f"{m.group(1)}_{m.group(2).upper()}{m.group(3)}",
        text,
    )
    return text
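
# Illustrative (sample input): _capitalize_for_ner("сайн уу. баярлалаа")
# returns "Сайн уу. Баярлалаа"; the boundary lookahead accepts lowercase
# Cyrillic, so all-lowercase social media text still gets sentence splits.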


# ---------------------------------------------------------------------------
# Main class
# ---------------------------------------------------------------------------

class Preprocessor:
    """
    Text preprocessing pipeline for Mongolian social media data.

    Initialise once at app startup with stopwords from the knowledge base:

        kb = KnowledgeBase()
        preprocessor = Preprocessor(extra_stopwords=kb.get_stopwords())

    Then in the analysis router, call preprocess_dual() per document:

        nlp_text, tm_text = preprocessor.preprocess_dual(raw_text)
        entities  = ner_engine.recognize(nlp_text)
        sentiment = sentiment_analyzer.analyze(nlp_text)
        tm_texts.append(tm_text)

    After all documents:
        if len(tm_texts) >= 10:  # BERTopic minimum — skip below this
            topic_results, summary = topic_modeler.fit_transform(tm_texts)
    """

    def __init__(self, extra_stopwords: Optional[List[str]] = None):
        self.stopwords: Set[str] = MONGOLIAN_STOPWORDS.copy()
        if extra_stopwords:
            self.stopwords.update(w.lower().strip() for w in extra_stopwords)

    def add_stopwords(self, words: List[str]) -> None:
        """
        Inject additional stopwords at runtime (e.g. after admin adds one).
        Call preprocessor.add_stopwords(kb.get_stopwords()) when the admin
        saves a new stopword — takes effect on the next analysis request
        without restarting the server.
        """
        self.stopwords.update(w.lower().strip() for w in words)

    def is_mongolian(self, text: str) -> bool:
        return isinstance(text, str) and bool(MONGOLIAN_PATTERN.search(text))

    # ------------------------------------------------------------------
    # clean_basic
    # ------------------------------------------------------------------

    def clean_basic(self, text: str, replace_url: bool = True,
                    convert_emoji: bool = False) -> str:
        """
        Light surface cleaning.

        CHANGES FROM ORIGINAL:
          Original: only handled URLs and whitespace normalisation.
          Updated:
            1. Unicode NFC normalisation added (first step — must precede regex)
            2. _protect_names() called here so it fires for BOTH pipelines.
               Original had protection only inside clean_deep() which is only
               called in TM mode — NLP mode had no name protection at all.
            3. Hashtag/mention removal added. Original left @user and #tag
               in the text; in TM mode these became bare tokens like "монгол"
               (from #монгол) with artificially inflated frequency.
            4. BMP emoji removal added via _remove_emoji(). Original only
               removed supplementary-plane emoji and only inside clean_deep().
            5. convert_emoji added: when True (NLP mode) known sentiment emoji
               are converted to text markers before stripping so the signal
               reaches BERT. When False (TM mode) emoji are stripped directly.

        Args:
            replace_url:    True  = replace with [URL] token (NLP needs the
                                    signal that a URL was present).
                            False = remove entirely (TM — URL adds no topic).
            convert_emoji:  True  = sentiment emoji → [LAUGH]/[LOVE]/etc.
                            False = strip all emoji (TM mode default).
        """
        if not isinstance(text, str):
            return ""

        text = _normalize_unicode(text)
        text = _protect_names(text)          # must be before any dot/hyphen stripping

        if replace_url:
            text = URL_PATTERN.sub("[URL]", text)
        else:
            text = URL_PATTERN.sub("", text)

        text = HASHTAG_MENTION.sub("", text)
        text = _remove_emoji(text, convert_sentiment=convert_emoji)
        text = re.sub(r"\s+", " ", text).strip()
        return text
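
    # Illustrative (sample input, not from the original file):
    #   clean_basic("үзээрэй https://example.mn #шинэ 😂",
    #               replace_url=True, convert_emoji=True)
    #   == "үзээрэй [URL] [LAUGH]"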

    # ------------------------------------------------------------------
    # clean_deep
    # ------------------------------------------------------------------

    def clean_deep(self, text: str) -> str:
        """
        Aggressive symbol/punctuation removal for TM mode.

        CHANGES FROM ORIGINAL:
          Original had THREE issues inside this function:
          1. Name protection regex: r'\b([А-ЯӨҮЁ])\.\s*([А-Яа-яӨөҮүЁё-]+)'
             The character class [А-Яа-яӨөҮүЁё-]+ includes a hyphen at the
             END of the class which makes it match ANY hyphen, including
             standalone hyphens at word boundaries. This could join the
             protected name to the next word.
             FIX: Name protection is fully removed from here — it now happens
             in _protect_names() called inside clean_basic(). By the time
             clean_deep() runs, А.Бат-Эрдэнэ is already А_Бат-Эрдэнэ and
             the character whitelist below preserves both _ and -.

          2. Uppercase-only matching: the original only protected [А-ЯӨҮЁ]
             initials. Lowercase б.амар was left unprotected.
             FIX: Handled by _protect_names() as above.

          3. Emoji removal: original had re.sub(r'[\U00010000-\U0010ffff]', '', text)
             here, missing BMP symbols.
             FIX: Moved to _remove_emoji() inside clean_basic() which runs first.

        What this function now does:
          - Remove [URL] placeholder if still present
          - Apply character whitelist: keep Mongolian Cyrillic, Latin, digits,
            spaces, underscores (protected name joins), and hyphens
            (compound surname separators inside protected names)
          - Normalise whitespace
        """
        if not isinstance(text, str):
            return ""

        # Remove [URL] placeholder
        text = re.sub(r"\[URL\]", "", text)

        # Character whitelist — everything outside this becomes a space
        # _ preserved: А_Бат protected form
        # - preserved: Бат-Эрдэнэ compound name (inside protected token)
        text = re.sub(
            r"[^A-Za-zА-Яа-яӨөҮүЁё0-9\s_\-]",
            " ",
            text,
        )
        text = re.sub(r"\s+", " ", text)
        return text.strip()
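
    # Illustrative (sample input): clean_deep("А_Бат-Эрдэнэ: «шинэ мэдээ»!!!")
    # returns "А_Бат-Эрдэнэ шинэ мэдээ"; underscore and hyphen survive the
    # whitelist, every other punctuation character becomes whitespace.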

    # ------------------------------------------------------------------
    # Stopword removal
    # ------------------------------------------------------------------

    def _remove_stopwords(self, text: str) -> str:
        """
        Remove stopwords from lowercased text.

        CHANGE FROM ORIGINAL:
        Added len(w) > 1 filter. Single-character Mongolian tokens (д, т, н,
        ч, л, etc.) are case inflections and particles written as separate
        words in informal text. They are effectively stopwords and pollute
        topic model vocabulary. The original code left them in.
        """
        if not isinstance(text, str):
            return ""
        words = text.split()
        return " ".join(
            w for w in words
            if len(w) > 1 and w.lower() not in self.stopwords
        )
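
    # Illustrative (sample input): _remove_stopwords("энэ их гоё байна шүү")
    # returns "гоё": the other tokens are stopwords, and the len(w) > 1 filter
    # additionally drops single-character particles such as "ч" or "л".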

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def preprocess_nlp(self, text: str) -> str:
        """
        Preprocessing for NER and Sentiment Analysis.

        Pipeline: clean_basic → capitalize_for_ner → restore_names
        Intentionally skips: clean_deep, lowercasing, stopword removal
        """
        if not isinstance(text, str):
            return ""
        text = self.clean_basic(text, replace_url=True, convert_emoji=True)
        text = _capitalize_for_ner(text)
        text = _restore_names(text)
        return text

    def preprocess_tm(self, text: str) -> str:
        """
        Preprocessing for Topic Modeling (BERTopic).

        Pipeline: clean_basic → clean_deep → lowercase → remove_stopwords
                  → strip initial prefix

        Why strip the initial prefix in TM mode:
          After lowercasing, А_Бат-Эрдэнэ becomes а_бат-эрдэнэ.
          The single letter а adds nothing to topic clusters — the meaningful
          token is бат-эрдэнэ (the surname, treated as one compound token).
          The regex below strips the initial and underscore, keeping the name.
        """
        if not isinstance(text, str):
            return ""
        text = self.clean_basic(text, replace_url=False)
        text = self.clean_deep(text)
        text = text.lower()
        text = self._remove_stopwords(text)
        # Strip single-letter initial prefix: а_бат-эрдэнэ → бат-эрдэнэ
        text = re.sub(
            r"\b[а-яөүё]_([а-яөүёa-z]+(?:-[а-яөүёa-z]+)*)\b",
            r"\1",
            text,
        )
        return re.sub(r"\s+", " ", text).strip()
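
    # Illustrative (sample input, not from the original file):
    #   preprocess_tm("Өнөөдөр А.Бат-Эрдэнэ ирлээ") == "өнөөдөр бат-эрдэнэ ирлээ"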

    def preprocess_dual(self, text: str) -> Tuple[str, str]:
        """
        Return both NLP and TM forms in one call.

        Use this in the router to avoid processing the same text twice:
            nlp_text, tm_text = preprocessor.preprocess_dual(raw)
        """
        return self.preprocess_nlp(text), self.preprocess_tm(text)

    def split_sentences(self, text: str) -> List[str]:
        """
        Split NLP-preprocessed text into sentences for chunked NER.
        Useful when a document exceeds BERT's 512-token limit.
        """
        parts = SENTENCE_BOUNDARY.split(text)
        return [p.strip() for p in parts if p.strip()]

    def preprocess_batch(self, texts: List[str], mode: str = "nlp") -> List[str]:
        """
        Preprocess a list of texts in the given mode ("nlp" or "tm").
        Returns a list of the same length.
        """
        fn = self.preprocess_tm if mode == "tm" else self.preprocess_nlp
        return [fn(t) for t in texts]
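

if __name__ == "__main__":
    # Minimal smoke test (illustrative sentence, not from any real dataset).
    pre = Preprocessor()
    raw = "монгол улсын ерөнхийлөгч х.баттулга 😂 https://example.mn #монгол"
    nlp_text, tm_text = pre.preprocess_dual(raw)
    print("NLP:", nlp_text)  # "Монгол улсын ерөнхийлөгч Х.Баттулга [LAUGH] [URL]"
    print("TM: ", tm_text)   # "монгол улсын ерөнхийлөгч баттулга"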