File size: 5,972 Bytes
e1c327f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
silver_label.py — Auto-label sumbee social media data with the current NER model.

Produces two CoNLL files:
  data/silver_high.conll   — sentences where ALL entities scored >= CONF_THRESHOLD
                             Safe to add to training directly (still review a sample)
  data/silver_review.conll — sentences with at least one low-confidence entity
                             Must be manually corrected before using for training

Run from NLP-intelligence/:
    python scripts/silver_label.py
    python scripts/silver_label.py --limit 500   # quick test on first 500 rows
"""

import argparse
import csv
import os
import re
import sys
from typing import List, Tuple

# Make the project root importable so `nlp_core` resolves whether this script
# is launched from scripts/ or from the project root.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from nlp_core.ner_engine import NEREngine
from nlp_core.preprocessing import Preprocessor

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Input CSV, relative to scripts/ (main() falls back to a project-root path).
SUMBEE_CSV   = os.path.join("..", "preprocessing", "sumbee_master_dataset.csv")
# Output CoNLL files, relative to the project root.
OUT_HIGH     = os.path.join("data", "silver_high.conll")
OUT_REVIEW   = os.path.join("data", "silver_review.conll")
CONF_THRESHOLD = 0.85        # entities below this trigger "review" bucket
# Any Cyrillic letter, including the Mongolian-specific Өө/Үү (and Ёё).
MN_PATTERN   = re.compile(r"[А-Яа-яӨөҮүЁё]")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def is_mongolian(text: str) -> bool:
    """True if *text* contains at least one Cyrillic letter (incl. Өө/Үү/Ёё)."""
    match = MN_PATTERN.search(text)
    return match is not None


def word_offsets(text: str) -> List[Tuple[int, int, str]]:
    """Return (start, end, word) for each whitespace-separated token.

    Offsets are character positions into *text*; tokens are maximal runs
    of non-whitespace, exactly as ``str.split`` would produce them.
    """
    return [(m.start(), m.end(), m.group())
            for m in re.finditer(r"\S+", text)]


def align_to_conll(preprocessed: str, entities) -> List[Tuple[str, str]]:
    """
    Map NER entity spans (char offsets) back to individual tokens.
    Returns list of (word, BIO-label) pairs.
    """
    # Compute (start, end, token) for each whitespace-separated token.
    token_spans = []
    cursor = 0
    for tok in preprocessed.split():
        begin = preprocessed.find(tok, cursor)
        token_spans.append((begin, begin + len(tok), tok))
        cursor = begin + len(tok)

    labels = ["O"] * len(token_spans)

    for ent in entities:
        inside = False  # becomes True after the B- token of this entity
        for idx, (tok_begin, tok_end, _tok) in enumerate(token_spans):
            # A token belongs to the entity if their char ranges overlap.
            if tok_begin < ent.end and tok_end > ent.start:
                prefix = "I" if inside else "B"
                labels[idx] = f"{prefix}-{ent.entity_group}"
                inside = True

    return [(tok, tag) for (_, _, tok), tag in zip(token_spans, labels)]


def to_conll_block(pairs: List[Tuple[str, str]]) -> str:
    """Format (word, label) pairs as a CoNLL block (blank-line separated)."""
    return "\n".join(f"{word} O O {tag}" for word, tag in pairs)


def min_entity_score(entities) -> float:
    """Return the lowest confidence among *entities*, or 1.0 when empty."""
    return min((ent.score for ent in entities), default=1.0)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main(limit: int = None):
    """Auto-label Mongolian sumbee rows with the current NER model.

    Writes two CoNLL files under the project's data/ directory:
    high-confidence silver data (all entity scores >= CONF_THRESHOLD)
    and a needs-review bucket for everything else.

    Args:
        limit: if given, stop after this many Mongolian rows (quick tests).
    """
    preprocessor = Preprocessor()
    ner = NEREngine()

    # Locate the source CSV: first relative to this script, then relative
    # to the project root (one directory further up).
    csv_path = os.path.join(os.path.dirname(__file__), SUMBEE_CSV)
    if not os.path.exists(csv_path):
        csv_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                                "..", "preprocessing", "sumbee_master_dataset.csv")

    print(f"Reading sumbee data from {csv_path}")
    rows = []
    with open(csv_path, encoding="utf-8") as f:
        for row in csv.DictReader(f):
            if is_mongolian(row["Text"]):
                rows.append(row["Text"])
            if limit and len(rows) >= limit:
                break

    print(f"Mongolian rows to label: {len(rows)}")

    high_blocks = []
    review_blocks = []
    skipped = 0

    for i, raw in enumerate(rows):
        if i % 100 == 0:
            print(f"  {i}/{len(rows)} ...", end="\r")

        preprocessed = preprocessor.preprocess_nlp(raw)
        if not preprocessed.strip():
            skipped += 1
            continue

        try:
            entities = ner.recognize(preprocessed)
        except Exception as exc:
            # Best-effort: one bad row must not kill the whole run, but the
            # failure is surfaced instead of being silently discarded.
            print(f"\n  NER failed on row {i}: {exc}")
            skipped += 1
            continue

        pairs = align_to_conll(preprocessed, entities)
        if not pairs:
            skipped += 1
            continue

        block = to_conll_block(pairs)

        if min_entity_score(entities) >= CONF_THRESHOLD:
            high_blocks.append(block)
        else:
            # Prefix a comment line so the reviewer knows which entities to check.
            low_ents = [f"{e.word}({e.entity_group},{e.score:.2f})"
                        for e in entities if e.score < CONF_THRESHOLD]
            review_blocks.append(f"# REVIEW: {', '.join(low_ents)}\n{block}")

    print(f"\nDone. High-confidence: {len(high_blocks)} | "
          f"Needs review: {len(review_blocks)} | Skipped: {skipped}")

    # Write outputs relative to the project root (one level above scripts/),
    # reusing the configured OUT_* paths instead of re-hardcoding them.
    base = os.path.dirname(os.path.dirname(__file__))
    high_path = os.path.join(base, OUT_HIGH)
    review_path = os.path.join(base, OUT_REVIEW)
    os.makedirs(os.path.dirname(high_path), exist_ok=True)

    with open(high_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(high_blocks))
    print(f"Saved: {high_path}")

    with open(review_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(review_blocks))
    print(f"Saved: {review_path}")
    print(f"\nNext step: review {review_path} manually, then run scripts/merge_train.py")


if __name__ == "__main__":
    # CLI entry point: optional --limit caps how many Mongolian rows to label.
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=None,
                        help="Process only first N Mongolian rows (default: all)")
    args = parser.parse_args()
    main(args.limit)