File size: 8,128 Bytes
e1c327f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
"""
check_preprocessing.py — Manual diagnostic for the Mongolian preprocessing pipeline.

Run from inside NLP-intelligence/:
    python check_preprocessing.py

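For each test case the script prints the raw input, the NLP-mode output, the
TM-mode output and a hint describing what to verify by hand. It then flags
anything that fails the automatic sanity checks and prints a summary at the end.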
"""

import re

from nlp_core.preprocessing import Preprocessor

# Shared Preprocessor instance; run() calls its preprocess_nlp() and
# preprocess_tm() methods on every test case.
p = Preprocessor()

# ---------------------------------------------------------------------------
# Test cases: (label, raw_input, what_to_check)
#   label         - short name printed as the heading of the case
#   raw_input     - text passed to both preprocessing modes (NLP and TM)
#   what_to_check - hint printed next to the outputs for manual review
# ---------------------------------------------------------------------------
CASES = [
    # ── Name protection ────────────────────────────────────────────────────
    ("uppercase initial",
     "Д.Гантулга УИХ-ын гишүүн байна.",
     "NLP: name should be Д.Гантулга (dot restored). TM: initial stripped → гантулга or гантулга"),

    ("lowercase initial (social media)",
     "өнөөдөр б.амар ирэхгүй байна гэсэн",
     "NLP: б.амар → Б.Амар (capitalized). TM: initial stripped, амар kept"),

    ("compound surname",
     "А.Бат-Эрдэнэ сайдаар томилогдлоо.",
     "NLP: А.Бат-Эрдэнэ stays as one token with dot. TM: бат-эрдэнэ as one hyphenated token"),

    # ── Capitalization for NER ─────────────────────────────────────────────
    ("all lowercase sentence",
     "монгол улсын ерөнхийлөгч х.баттулга өнөөдөр хэлэв",
     "NLP: 'монгол' → 'Монгол', х.баттулга → Х.Баттулга"),

    # ── Hashtags and mentions ──────────────────────────────────────────────
    ("hashtag and mention",
     "@МонголТВ #монголулс Улаанбаатар хотод мэдээ гарлаа",
     "NLP: @МонголТВ and #монголулс stripped. TM: same."),

    # ── URLs ───────────────────────────────────────────────────────────────
    ("URL handling",
     "Дэлгэрэнгүй мэдээллийг https://montsame.mn/news/123 хаягаас үзнэ үү",
     "NLP: URL → [URL] token. TM: URL removed entirely."),

    # ── Emoji ──────────────────────────────────────────────────────────────
    ("emoji sentiment markers",
     "Маш сайн байна 😊🔥 Улаанбаатар хотод ирлээ ✅",
     "NLP: 🔥→[EXCITED], unknown 😊 stripped. TM: all emoji stripped."),

    ("sarcastic laugh emoji",
     "Засгийн газрын шийдвэр маш сайн байна 😂😂",
     "NLP: 😂→[LAUGH] (ambiguous, BERT infers from context). TM: stripped."),

    ("negative emoji",
     "Энэ бол огт зөв биш 😡💔 нийтлэл байна",
     "NLP: 😡→[ANGRY] 💔→[SAD]. TM: stripped."),

    ("togrog symbol preserved",
     "Энэ бараа 50,000₮ байна — маш үнэтэй",
     "NLP: ₮ and — preserved (were wrongly removed before). TM: stripped by clean_deep."),

    # ── Stopword removal (TM only) ─────────────────────────────────────────
    ("stopword removal in TM",
     "энэ бол маш сайн санаа юм байна",
     "NLP: ALL words kept. TM: энэ бол маш юм байна removed, 'сайн санаа' should remain"),

    # ── Punctuation preservation (NLP only) ───────────────────────────────
    ("punctuation in NLP",
     "Тийм үү? Та хаанаас ирсэн бэ. Монгол улсаас.",
     "NLP: punctuation kept. TM: punctuation stripped."),

    # ── Real social media style ────────────────────────────────────────────
    ("real social media post",
     "яах вэ дээ шдэ 😂 @найз #хөгжилтэй монгол хүн л гэж бодогдоод байна",
     "NLP: slang particles kept, emoji/tags stripped. TM: шдэ, яах, вэ, дээ, л, гэж removed"),

    ("mixed mongolian english",
     "Today Монгол улсын ерөнхийлөгч made an announcement. #politics",
     "NLP: English words kept, Mongolian capitalized. TM: cleaned."),
]

# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
# ANSI escape sequences used to style the terminal report, ordered by code.
RESET = "\033[0m"  # clears every attribute set by the codes below
BOLD = "\033[1m"
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
CYAN = "\033[36m"

# Matches an uppercase Cyrillic initial, a dot, and the first letter of the
# name that follows (e.g. the "Д.Г" in "Д.Гантулга"). Compiled once here
# instead of being looked up again for every test case.
_INITIAL_NAME_RE = re.compile(r"[А-ЯӨҮЁ]\.[А-Яа-яӨөҮүЁё]")


def _collect_issues(raw: str, nlp_out: str, tm_out: str) -> list:
    """Return the automatic sanity-check failures for one test case.

    Args:
        raw: The unprocessed input text.
        nlp_out: Output of the NLP-mode preprocessing for ``raw``.
        tm_out: Output of the TM-mode preprocessing for ``raw``.

    Returns:
        A list of human-readable issue messages, empty when every check passes.
    """
    found = []

    # NLP: should not be empty
    if not nlp_out.strip():
        found.append("NLP output is EMPTY")

    # TM: should not be empty (unless all stopwords)
    if not tm_out.strip():
        found.append("TM output is EMPTY (may be okay if all stopwords)")

    has_url = "https://" in raw

    # NLP: URL should become [URL]
    if has_url and "[URL]" not in nlp_out:
        found.append("URL not replaced with [URL] in NLP mode")

    # TM: URL should be fully removed
    if has_url and ("https://" in tm_out or "[URL]" in tm_out):
        found.append("URL not fully removed in TM mode")

    # NLP: hashtag/mention should be stripped. A leftover "@" is always
    # reported; a leftover "#" only when the input contained "@" or "#".
    if "@" in nlp_out or (any(c in raw for c in "@#") and "#" in nlp_out):
        found.append("Hashtag or mention still present in NLP output")

    # NLP: if input had uppercase initial name like Д.Гантулга, it should survive
    for name in _INITIAL_NAME_RE.findall(raw):
        if name[0] + "." not in nlp_out:
            found.append(f"Name initial {name!r} lost dot in NLP output → got: {nlp_out}")

    # NLP: first word of sentence should be capitalized
    words = nlp_out.split()
    if words and words[0][0].islower():
        found.append(f"First word '{words[0]}' not capitalized in NLP output")

    return found


def run() -> None:
    """Run every entry of CASES through both preprocessing modes and report.

    For each case the raw input, the result of ``p.preprocess_nlp`` and the
    result of ``p.preprocess_tm`` are printed together with the manual-review
    hint, followed by any automatic sanity-check failures. A summary of all
    failures is printed at the end. Both outputs are assumed to be strings.
    """
    print(f"\n{BOLD}=== PREPROCESSING DIAGNOSTIC ==={RESET}\n")
    issues = []

    for label, raw, hint in CASES:
        nlp_out = p.preprocess_nlp(raw)
        tm_out = p.preprocess_tm(raw)

        print(f"{BOLD}{CYAN}[{label}]{RESET}")
        print(f"  {YELLOW}IN :{RESET}  {raw}")
        print(f"  {GREEN}NLP:{RESET}  {nlp_out}")
        print(f"  {GREEN}TM :{RESET}  {tm_out}")
        print(f"  {YELLOW}CHECK:{RESET} {hint}")

        # ── Automatic sanity checks ──────────────────────────────────────
        case_issues = _collect_issues(raw, nlp_out, tm_out)

        if case_issues:
            for issue in case_issues:
                print(f"  {RED}⚠ ISSUE: {issue}{RESET}")
            # Remember which case each issue came from for the summary.
            issues.extend((label, issue) for issue in case_issues)
        else:
            print(f"  {GREEN}✓ No automatic issues detected{RESET}")

        print()

    # ── Summary ─────────────────────────────────────────────────────────
    print(f"{BOLD}=== SUMMARY ==={RESET}")
    if issues:
        print(f"{RED}{len(issues)} issue(s) found:{RESET}")
        for label, issue in issues:
            print(f"  [{label}] {issue}")
    else:
        print(f"{GREEN}All automatic checks passed. Review the outputs above manually.{RESET}")

    print()

# Run the diagnostic only when this file is executed as a script.
if __name__ == "__main__":
    run()