Spaces:
Running
Running
File size: 5,972 Bytes
e1c327f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | """
silver_label.py — Auto-label sumbee social media data with the current NER model.
Produces two CoNLL files:
data/silver_high.conll — sentences where ALL entities scored >= CONF_THRESHOLD
Safe to add to training directly (still review a sample)
data/silver_review.conll — sentences with at least one low-confidence entity
Must be manually corrected before using for training
Run from NLP-intelligence/:
python scripts/silver_label.py
python scripts/silver_label.py --limit 500 # quick test on first 500 rows
"""
import argparse
import csv
import os
import re
import sys
from typing import List, Tuple
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from nlp_core.ner_engine import NEREngine
from nlp_core.preprocessing import Preprocessor
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Location of the input CSV, relative to the directory containing this script
# (main() joins it onto os.path.dirname(__file__)).
SUMBEE_CSV = os.path.join("..", "preprocessing", "sumbee_master_dataset.csv")
# Relative paths of the two CoNLL output files described in the module docstring.
OUT_HIGH = os.path.join("data", "silver_high.conll")
OUT_REVIEW = os.path.join("data", "silver_review.conll")
# Minimum per-entity score for a sentence to count as high-confidence.
CONF_THRESHOLD = 0.85 # entities below this trigger "review" bucket
# Single-character class: Cyrillic А-Я / а-я plus Ө, ө, Ү, ү, Ё, ё.
# is_mongolian() treats any text containing one of these letters as Mongolian.
MN_PATTERN = re.compile(r"[А-Яа-яӨөҮүЁё]")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def is_mongolian(text: str) -> bool:
    """Report whether *text* contains at least one character matched by MN_PATTERN."""
    match = MN_PATTERN.search(text)
    return match is not None
def word_offsets(text: str) -> List[Tuple[int, int, str]]:
    """
    Locate every whitespace-separated token inside *text*.

    Returns one (start, end, word) tuple per token, in order of appearance,
    where text[start:end] == word.
    """
    spans: List[Tuple[int, int, str]] = []
    cursor = 0
    for token in text.split():
        # Search from the end of the previous token so repeated words
        # resolve to their own occurrence rather than the first one.
        begin = text.index(token, cursor)
        cursor = begin + len(token)
        spans.append((begin, cursor, token))
    return spans
def align_to_conll(preprocessed: str, entities) -> List[Tuple[str, str]]:
    """
    Project character-level entity spans onto whitespace tokens.

    Every token whose character range overlaps an entity span is tagged with
    that entity's group: ``B-`` for the first overlapping token and ``I-`` for
    the ones after it. Tokens touched by no entity keep the label ``O``.
    Entities are applied in the order given, so a later entity overwrites an
    earlier one on any token they share.

    Each entity must expose ``start``, ``end`` and ``entity_group``.
    Returns the (word, BIO-label) pairs in token order.
    """
    tokens = word_offsets(preprocessed)
    tags = ["O"] * len(tokens)
    for entity in entities:
        span_start = entity.start
        span_end = entity.end
        group = entity.entity_group
        prefix = "B"
        for idx, (tok_start, tok_end, _word) in enumerate(tokens):
            # Skip tokens that lie entirely before or after the entity span.
            if tok_end <= span_start or tok_start >= span_end:
                continue
            tags[idx] = f"{prefix}-{group}"
            prefix = "I"
    return [(tok[2], tag) for tok, tag in zip(tokens, tags)]
def to_conll_block(pairs: List[Tuple[str, str]]) -> str:
    """
    Render (word, label) pairs as one CoNLL sentence block.

    Each pair becomes a line of the form ``<word> O O <label>``; lines are
    joined with newlines and no trailing newline is added.
    """
    rendered = []
    for token, tag in pairs:
        rendered.append("{} O O {}".format(token, tag))
    return "\n".join(rendered)
def min_entity_score(entities) -> float:
    """
    Return the lowest ``score`` among *entities*.

    A falsy *entities* value (for example an empty list) yields 1.0.
    """
    if entities:
        scores = [entity.score for entity in entities]
        return min(scores)
    return 1.0
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _read_mongolian_rows(csv_path: str, limit: int = None) -> List[str]:
    """Collect the "Text" column of every CSV row that is_mongolian() accepts.

    Stops early once *limit* rows have been collected; a falsy *limit*
    (None or 0) means no cap.
    """
    rows = []
    # newline="" lets the csv module itself handle line breaks that occur
    # inside quoted fields.
    with open(csv_path, encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            text = row["Text"]
            # DictReader fills missing trailing columns with None; skip those
            # rows instead of passing None to the regex.
            if text and is_mongolian(text):
                rows.append(text)
                if limit and len(rows) >= limit:
                    break
    return rows


def _write_blocks(path: str, blocks: List[str]) -> None:
    """Write CoNLL blocks to *path*, blank-line separated, creating its directory."""
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(blocks))
    print(f"Saved: {path}")


def main(limit: int = None):
    """
    Label the sumbee CSV with the current NER model and write two CoNLL files.

    Each Mongolian row is preprocessed, run through the NER engine and
    converted to a CoNLL block. Blocks whose lowest entity score is at least
    CONF_THRESHOLD go to OUT_HIGH; the rest go to OUT_REVIEW, prefixed with a
    "# REVIEW:" comment line listing the low-scoring entities. Rows that
    preprocess to nothing, fail in the NER engine, or yield no tokens are
    counted as skipped.

    Args:
        limit: process only the first *limit* Mongolian rows; None means all.
    """
    # Report only the first few NER failures so a systematic error is visible
    # without flooding the terminal.
    max_reported_errors = 5

    preprocessor = Preprocessor()
    ner = NEREngine()

    # Resolve against the absolute script location so results do not depend
    # on the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)

    csv_path = os.path.join(script_dir, SUMBEE_CSV)
    if not os.path.exists(csv_path):
        # try relative from project root
        csv_path = os.path.join(project_root,
                                "..", "preprocessing", "sumbee_master_dataset.csv")
    print(f"Reading sumbee data from {csv_path}")

    rows = _read_mongolian_rows(csv_path, limit)
    print(f"Mongolian rows to label: {len(rows)}")

    high_blocks = []
    review_blocks = []
    skipped = 0
    failed = 0
    for i, raw in enumerate(rows):
        if i % 100 == 0:
            print(f" {i}/{len(rows)} ...", end="\r")
        preprocessed = preprocessor.preprocess_nlp(raw)
        if not preprocessed.strip():
            skipped += 1
            continue
        try:
            entities = ner.recognize(preprocessed)
        except Exception as e:
            # Best-effort: one bad row must not abort the run, but the
            # failure should not be invisible either.
            skipped += 1
            failed += 1
            if failed <= max_reported_errors:
                print(f"\nWarning: NER failed on row {i}: {e!r}", file=sys.stderr)
            continue
        pairs = align_to_conll(preprocessed, entities)
        if not pairs:
            skipped += 1
            continue
        block = to_conll_block(pairs)
        if min_entity_score(entities) >= CONF_THRESHOLD:
            high_blocks.append(block)
        else:
            # Add a comment line so reviewer knows which entities to check
            low_ents = [f"{e.word}({e.entity_group},{e.score:.2f})"
                        for e in entities if e.score < CONF_THRESHOLD]
            review_blocks.append(f"# REVIEW: {', '.join(low_ents)}\n{block}")

    print(f"\nDone. High-confidence: {len(high_blocks)} | "
          f"Needs review: {len(review_blocks)} | Skipped: {skipped}")
    if failed:
        print(f"Warning: NER raised an exception on {failed} row(s) "
              f"(included in Skipped)", file=sys.stderr)

    # Outputs live under the project root (the parent of this script's directory).
    high_path = os.path.join(project_root, OUT_HIGH)
    review_path = os.path.join(project_root, OUT_REVIEW)
    _write_blocks(high_path, high_blocks)
    _write_blocks(review_path, review_blocks)
    print(f"\nNext step: review {review_path} manually, then run scripts/merge_train.py")
if __name__ == "__main__":
    # Command-line entry point: the only option is an optional cap on rows.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Process only first N Mongolian rows (default: all)",
    )
    main(cli.parse_args().limit)
|