""" silver_label.py — Auto-label sumbee social media data with the current NER model. Produces two CoNLL files: data/silver_high.conll — sentences where ALL entities scored >= CONF_THRESHOLD Safe to add to training directly (still review a sample) data/silver_review.conll — sentences with at least one low-confidence entity Must be manually corrected before using for training Run from NLP-intelligence/: python scripts/silver_label.py python scripts/silver_label.py --limit 500 # quick test on first 500 rows """ import argparse import csv import os import re import sys from typing import List, Tuple sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from nlp_core.ner_engine import NEREngine from nlp_core.preprocessing import Preprocessor # --------------------------------------------------------------------------- # Config # --------------------------------------------------------------------------- SUMBEE_CSV = os.path.join("..", "preprocessing", "sumbee_master_dataset.csv") OUT_HIGH = os.path.join("data", "silver_high.conll") OUT_REVIEW = os.path.join("data", "silver_review.conll") CONF_THRESHOLD = 0.85 # entities below this trigger "review" bucket MN_PATTERN = re.compile(r"[А-Яа-яӨөҮүЁё]") # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def is_mongolian(text: str) -> bool: return bool(MN_PATTERN.search(text)) def word_offsets(text: str) -> List[Tuple[int, int, str]]: """Return (start, end, word) for each whitespace-separated token.""" result = [] pos = 0 for word in text.split(): start = text.find(word, pos) end = start + len(word) result.append((start, end, word)) pos = end return result def align_to_conll(preprocessed: str, entities) -> List[Tuple[str, str]]: """ Map NER entity spans (char offsets) back to individual tokens. Returns list of (word, BIO-label) pairs. """ offsets = word_offsets(preprocessed) labels = ["O"] * len(offsets) for ent in entities: e_start, e_end, e_type = ent.start, ent.end, ent.entity_group first = True for i, (ws, we, _) in enumerate(offsets): # token overlaps with entity span if ws < e_end and we > e_start: labels[i] = f"B-{e_type}" if first else f"I-{e_type}" first = False return [(word, lbl) for (_, _, word), lbl in zip(offsets, labels)] def to_conll_block(pairs: List[Tuple[str, str]]) -> str: """Format (word, label) pairs as a CoNLL block (blank-line separated).""" lines = [f"{word} O O {label}" for word, label in pairs] return "\n".join(lines) def min_entity_score(entities) -> float: if not entities: return 1.0 return min(e.score for e in entities) # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main(limit: int = None): preprocessor = Preprocessor() ner = NEREngine() csv_path = os.path.join(os.path.dirname(__file__), SUMBEE_CSV) if not os.path.exists(csv_path): # try relative from project root csv_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "..", "preprocessing", "sumbee_master_dataset.csv") print(f"Reading sumbee data from {csv_path}") rows = [] with open(csv_path, encoding="utf-8") as f: for row in csv.DictReader(f): if is_mongolian(row["Text"]): rows.append(row["Text"]) if limit and len(rows) >= limit: break print(f"Mongolian rows to label: {len(rows)}") high_blocks = [] review_blocks = [] skipped = 0 for i, raw in enumerate(rows): if i % 100 == 0: print(f" {i}/{len(rows)} ...", end="\r") preprocessed = preprocessor.preprocess_nlp(raw) if not preprocessed.strip(): skipped += 1 continue try: entities = ner.recognize(preprocessed) except Exception as e: skipped += 1 continue pairs = align_to_conll(preprocessed, entities) if not pairs: skipped += 1 continue block = to_conll_block(pairs) min_score = min_entity_score(entities) if min_score >= CONF_THRESHOLD: high_blocks.append(block) else: # Add a comment line so reviewer knows which entities to check low_ents = [f"{e.word}({e.entity_group},{e.score:.2f})" for e in entities if e.score < CONF_THRESHOLD] review_blocks.append(f"# REVIEW: {', '.join(low_ents)}\n{block}") print(f"\nDone. High-confidence: {len(high_blocks)} | " f"Needs review: {len(review_blocks)} | Skipped: {skipped}") # Write outputs (relative to project root, so run from NLP-intelligence/) base = os.path.dirname(os.path.dirname(__file__)) high_path = os.path.join(base, "data", "silver_high.conll") review_path = os.path.join(base, "data", "silver_review.conll") with open(high_path, "w", encoding="utf-8") as f: f.write("\n\n".join(high_blocks)) print(f"Saved: {high_path}") with open(review_path, "w", encoding="utf-8") as f: f.write("\n\n".join(review_blocks)) print(f"Saved: {review_path}") print(f"\nNext step: review {review_path} manually, then run scripts/merge_train.py") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--limit", type=int, default=None, help="Process only first N Mongolian rows (default: all)") args = parser.parse_args() main(args.limit)