""" check_preprocessing.py — Manual diagnostic for the Mongolian preprocessing pipeline. Run from inside NLP-intelligence/: python check_preprocessing.py Each test prints: INPUT → NLP OUTPUT | TM OUTPUT Then flags anything that looks wrong. """ from nlp_core.preprocessing import Preprocessor p = Preprocessor() # --------------------------------------------------------------------------- # Test cases: (label, raw_input, what_to_check) # --------------------------------------------------------------------------- CASES = [ # ── Name protection ──────────────────────────────────────────────────── ("uppercase initial", "Д.Гантулга УИХ-ын гишүүн байна.", "NLP: name should be Д.Гантулга (dot restored). TM: initial stripped → гантулга or гантулга"), ("lowercase initial (social media)", "өнөөдөр б.амар ирэхгүй байна гэсэн", "NLP: б.амар → Б.Амар (capitalized). TM: initial stripped, амар kept"), ("compound surname", "А.Бат-Эрдэнэ сайдаар томилогдлоо.", "NLP: А.Бат-Эрдэнэ stays as one token with dot. TM: бат-эрдэнэ as one hyphenated token"), # ── Capitalization for NER ───────────────────────────────────────────── ("all lowercase sentence", "монгол улсын ерөнхийлөгч х.баттулга өнөөдөр хэлэв", "NLP: 'монгол' → 'Монгол', х.баттулга → Х.Баттулга"), # ── Hashtags and mentions ────────────────────────────────────────────── ("hashtag and mention", "@МонголТВ #монголулс Улаанбаатар хотод мэдээ гарлаа", "NLP: @МонголТВ and #монголулс stripped. TM: same."), # ── URLs ─────────────────────────────────────────────────────────────── ("URL handling", "Дэлгэрэнгүй мэдээллийг https://montsame.mn/news/123 хаягаас үзнэ үү", "NLP: URL → [URL] token. TM: URL removed entirely."), # ── Emoji ────────────────────────────────────────────────────────────── ("emoji sentiment markers", "Маш сайн байна 😊🔥 Улаанбаатар хотод ирлээ ✅", "NLP: 🔥→[EXCITED], unknown 😊 stripped. TM: all emoji stripped."), ("sarcastic laugh emoji", "Засгийн газрын шийдвэр маш сайн байна 😂😂", "NLP: 😂→[LAUGH] (ambiguous, BERT infers from context). TM: stripped."), ("negative emoji", "Энэ бол огт зөв биш 😡💔 нийтлэл байна", "NLP: 😡→[ANGRY] 💔→[SAD]. TM: stripped."), ("togrog symbol preserved", "Энэ бараа 50,000₮ байна — маш үнэтэй", "NLP: ₮ and — preserved (were wrongly removed before). TM: stripped by clean_deep."), # ── Stopword removal (TM only) ───────────────────────────────────────── ("stopword removal in TM", "энэ бол маш сайн санаа юм байна", "NLP: ALL words kept. TM: энэ бол маш юм байна removed, 'сайн санаа' should remain"), # ── Punctuation preservation (NLP only) ─────────────────────────────── ("punctuation in NLP", "Тийм үү? Та хаанаас ирсэн бэ. Монгол улсаас.", "NLP: punctuation kept. TM: punctuation stripped."), # ── Real social media style ──────────────────────────────────────────── ("real social media post", "яах вэ дээ шдэ 😂 @найз #хөгжилтэй монгол хүн л гэж бодогдоод байна", "NLP: slang particles kept, emoji/tags stripped. TM: шдэ, яах, вэ, дээ, л, гэж removed"), ("mixed mongolian english", "Today Монгол улсын ерөнхийлөгч made an announcement. #politics", "NLP: English words kept, Mongolian capitalized. TM: cleaned."), ] # --------------------------------------------------------------------------- # Runner # --------------------------------------------------------------------------- RESET = "\033[0m" BOLD = "\033[1m" YELLOW = "\033[33m" CYAN = "\033[36m" GREEN = "\033[32m" RED = "\033[31m" def run(): print(f"\n{BOLD}=== PREPROCESSING DIAGNOSTIC ==={RESET}\n") issues = [] for label, raw, hint in CASES: nlp_out = p.preprocess_nlp(raw) tm_out = p.preprocess_tm(raw) print(f"{BOLD}{CYAN}[{label}]{RESET}") print(f" {YELLOW}IN :{RESET} {raw}") print(f" {GREEN}NLP:{RESET} {nlp_out}") print(f" {GREEN}TM :{RESET} {tm_out}") print(f" {YELLOW}CHECK:{RESET} {hint}") # ── Automatic sanity checks ────────────────────────────────────── case_issues = [] # NLP: should not be empty if not nlp_out.strip(): case_issues.append("NLP output is EMPTY") # TM: should not be empty (unless all stopwords) if not tm_out.strip(): case_issues.append("TM output is EMPTY (may be okay if all stopwords)") # NLP: URL should become [URL] if "https://" in raw and "[URL]" not in nlp_out: case_issues.append("URL not replaced with [URL] in NLP mode") # TM: URL should be fully removed if "https://" in raw and ("https://" in tm_out or "[URL]" in tm_out): case_issues.append("URL not fully removed in TM mode") # NLP: hashtag/mention should be stripped if "@" in nlp_out or (any(c in raw for c in "@#") and "#" in nlp_out): case_issues.append("Hashtag or mention still present in NLP output") # NLP: if input had uppercase initial name like Д.Гантулга, it should survive import re upper_names = re.findall(r"[А-ЯӨҮЁ]\.[А-Яа-яӨөҮүЁё]", raw) for name in upper_names: initial = name[0] if initial + "." not in nlp_out: case_issues.append(f"Name initial {name!r} lost dot in NLP output → got: {nlp_out}") # NLP: first word of sentence should be capitalized first_word = nlp_out.split()[0] if nlp_out.split() else "" if first_word and first_word[0].islower(): case_issues.append(f"First word '{first_word}' not capitalized in NLP output") if case_issues: for issue in case_issues: print(f" {RED}⚠ ISSUE: {issue}{RESET}") issues.extend([(label, i) for i in case_issues]) else: print(f" {GREEN}✓ No automatic issues detected{RESET}") print() # ── Summary ───────────────────────────────────────────────────────── print(f"{BOLD}=== SUMMARY ==={RESET}") if issues: print(f"{RED}{len(issues)} issue(s) found:{RESET}") for label, issue in issues: print(f" [{label}] {issue}") else: print(f"{GREEN}All automatic checks passed. Review the outputs above manually.{RESET}") print() if __name__ == "__main__": run()