"""
check_preprocessing.py — Manual diagnostic for the Mongolian preprocessing pipeline.

Run from inside NLP-intelligence/:

    python check_preprocessing.py

For each test case the script prints the raw input, the NLP-mode output, the
TM-mode output, and a hint describing what to check by eye. It then flags
anything its automatic sanity checks consider wrong and ends with a summary.
"""
from nlp_core.preprocessing import Preprocessor
# Single shared Preprocessor instance; run() calls its preprocess_nlp() and
# preprocess_tm() methods on every test case.
p = Preprocessor()
# ---------------------------------------------------------------------------
# Diagnostic inputs. Every entry is a 3-tuple:
#     (label, raw_input, hint describing what to verify in the output)
# ---------------------------------------------------------------------------
CASES = [
    # --- Name protection ---------------------------------------------------
    (
        "uppercase initial",
        "Д.Гантулга УИХ-ын гишүүн байна.",
        "NLP: name should be Д.Гантулга (dot restored). TM: initial stripped → гантулга or гантулга",
    ),
    (
        "lowercase initial (social media)",
        "өнөөдөр б.амар ирэхгүй байна гэсэн",
        "NLP: б.амар → Б.Амар (capitalized). TM: initial stripped, амар kept",
    ),
    (
        "compound surname",
        "А.Бат-Эрдэнэ сайдаар томилогдлоо.",
        "NLP: А.Бат-Эрдэнэ stays as one token with dot. TM: бат-эрдэнэ as one hyphenated token",
    ),
    # --- Capitalization for NER --------------------------------------------
    (
        "all lowercase sentence",
        "монгол улсын ерөнхийлөгч х.баттулга өнөөдөр хэлэв",
        "NLP: 'монгол' → 'Монгол', х.баттулга → Х.Баттулга",
    ),
    # --- Hashtags and mentions ---------------------------------------------
    (
        "hashtag and mention",
        "@МонголТВ #монголулс Улаанбаатар хотод мэдээ гарлаа",
        "NLP: @МонголТВ and #монголулс stripped. TM: same.",
    ),
    # --- URLs --------------------------------------------------------------
    (
        "URL handling",
        "Дэлгэрэнгүй мэдээллийг https://montsame.mn/news/123 хаягаас үзнэ үү",
        "NLP: URL → [URL] token. TM: URL removed entirely.",
    ),
    # --- Emoji -------------------------------------------------------------
    (
        "emoji sentiment markers",
        "Маш сайн байна 😊🔥 Улаанбаатар хотод ирлээ ✅",
        "NLP: 🔥→[EXCITED], unknown 😊 stripped. TM: all emoji stripped.",
    ),
    (
        "sarcastic laugh emoji",
        "Засгийн газрын шийдвэр маш сайн байна 😂😂",
        "NLP: 😂→[LAUGH] (ambiguous, BERT infers from context). TM: stripped.",
    ),
    (
        "negative emoji",
        "Энэ бол огт зөв биш 😡💔 нийтлэл байна",
        "NLP: 😡→[ANGRY] 💔→[SAD]. TM: stripped.",
    ),
    (
        "togrog symbol preserved",
        "Энэ бараа 50,000₮ байна — маш үнэтэй",
        "NLP: ₮ and — preserved (were wrongly removed before). TM: stripped by clean_deep.",
    ),
    # --- Stopword removal (TM only) ----------------------------------------
    (
        "stopword removal in TM",
        "энэ бол маш сайн санаа юм байна",
        "NLP: ALL words kept. TM: энэ бол маш юм байна removed, 'сайн санаа' should remain",
    ),
    # --- Punctuation preservation (NLP only) -------------------------------
    (
        "punctuation in NLP",
        "Тийм үү? Та хаанаас ирсэн бэ. Монгол улсаас.",
        "NLP: punctuation kept. TM: punctuation stripped.",
    ),
    # --- Real social media style -------------------------------------------
    (
        "real social media post",
        "яах вэ дээ шдэ 😂 @найз #хөгжилтэй монгол хүн л гэж бодогдоод байна",
        "NLP: slang particles kept, emoji/tags stripped. TM: шдэ, яах, вэ, дээ, л, гэж removed",
    ),
    (
        "mixed mongolian english",
        "Today Монгол улсын ерөнхийлөгч made an announcement. #politics",
        "NLP: English words kept, Mongolian capitalized. TM: cleaned.",
    ),
]
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
# ANSI SGR escape sequences used to style the terminal report.
# Text attributes:
BOLD = "\033[1m"
RESET = "\033[0m"  # clears every attribute/color set before it
# Foreground colors:
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
CYAN = "\033[36m"
def run():
    """Run every entry of CASES through both preprocessing modes and report.

    For each ``(label, raw, hint)`` case this prints the raw input, the
    result of ``p.preprocess_nlp(raw)``, the result of
    ``p.preprocess_tm(raw)`` and the manual-review hint, then applies a set
    of automatic sanity checks and prints any issue they raise.  After all
    cases a summary lists every flagged issue together with its case label.

    Returns:
        None. All results are written to stdout.
    """
    # Imported and compiled once per run instead of once per test case.
    import re

    # Uppercase Cyrillic initial, a dot, then any Cyrillic letter (e.g. "Д.Г").
    initial_name_re = re.compile(r"[А-ЯӨҮЁ]\.[А-Яа-яӨөҮүЁё]")

    print(f"\n{BOLD}=== PREPROCESSING DIAGNOSTIC ==={RESET}\n")
    issues = []

    for label, raw, hint in CASES:
        nlp_out = p.preprocess_nlp(raw)
        tm_out = p.preprocess_tm(raw)

        print(f"{BOLD}{CYAN}[{label}]{RESET}")
        print(f" {YELLOW}IN :{RESET} {raw}")
        print(f" {GREEN}NLP:{RESET} {nlp_out}")
        print(f" {GREEN}TM :{RESET} {tm_out}")
        print(f" {YELLOW}CHECK:{RESET} {hint}")

        # Automatic sanity checks for this case.
        case_issues = []

        # Neither output should be blank (a blank TM output can be
        # legitimate when the whole input consists of stopwords).
        if not nlp_out.strip():
            case_issues.append("NLP output is EMPTY")
        if not tm_out.strip():
            case_issues.append("TM output is EMPTY (may be okay if all stopwords)")

        # URLs: NLP mode replaces them with a [URL] token, TM mode drops them.
        if "https://" in raw:
            if "[URL]" not in nlp_out:
                case_issues.append("URL not replaced with [URL] in NLP mode")
            if "https://" in tm_out or "[URL]" in tm_out:
                case_issues.append("URL not fully removed in TM mode")

        # Hashtags/mentions must not survive NLP mode. A leftover '#' is
        # only reported when the raw input contained '@' or '#'.
        raw_has_tag = any(c in raw for c in "@#")
        if "@" in nlp_out or (raw_has_tag and "#" in nlp_out):
            case_issues.append("Hashtag or mention still present in NLP output")

        # An uppercase initial such as "Д." present in the input must still
        # appear, dot included, in the NLP output.
        for name in initial_name_re.findall(raw):
            if name[0] + "." not in nlp_out:
                case_issues.append(
                    f"Name initial {name!r} lost dot in NLP output → got: {nlp_out}"
                )

        # The first word of the NLP output should not start in lowercase.
        words = nlp_out.split()
        first_word = words[0] if words else ""
        if first_word and first_word[0].islower():
            case_issues.append(f"First word '{first_word}' not capitalized in NLP output")

        if case_issues:
            for issue in case_issues:
                print(f" {RED}⚠ ISSUE: {issue}{RESET}")
            issues.extend((label, issue) for issue in case_issues)
        else:
            print(f" {GREEN}✓ No automatic issues detected{RESET}")
        print()

    # Summary of everything flagged across all cases.
    print(f"{BOLD}=== SUMMARY ==={RESET}")
    if issues:
        print(f"{RED}{len(issues)} issue(s) found:{RESET}")
        for label, issue in issues:
            print(f" [{label}] {issue}")
    else:
        print(f"{GREEN}All automatic checks passed. Review the outputs above manually.{RESET}")
    print()


if __name__ == "__main__":
    run()