# NLP-intelligence/scripts/fix_labels.py
# Repository page header: Nomio4640 · "NER finetune" · e1c327f
"""
fix_labels.py — Auto-correct known labeling errors in data/train_merged.conll.

Fixes applied:
  1. Sequence errors: an I-X tag that does not directly follow a B-X or I-X
     tag of the same type is converted to B-X.
  2. Definite wrong labels: common words incorrectly tagged as entities are
     reset to O.
  3. Systematic silver-label error: томилолт I-PER → O.

Run from NLP-intelligence/:
    python scripts/fix_labels.py

Output: data/train_final.conll
"""
import os
import sys
# Known-bad (word, label) pairs.
# Key: lowercased surface form. Value: the set of labels that are considered
# wrong for that word and are reset to "O". Any other label carried by the
# same word is left untouched.
ALWAYS_O = {
    # --- Verbs mislabeled as entities ---
    "байна": {"B-PER"},            # "is/are"
    "байгаа": {"I-MISC"},          # "being"
    "хийж": {"B-PER", "B-LOC"},    # verb "doing"

    # --- Particles and pronouns mislabeled as entities ---
    "юм": {"I-MISC"},              # particle
    "бол": {"I-MISC"},             # copula "is"
    "нэг": {"I-MISC"},             # "one"
    "би": {"I-MISC"},              # pronoun "I"
    "ямар": {"B-PER"},             # interrogative "what kind"
    "та": {"B-PER"},               # pronoun "you"
    "сарын": {"B-PER"},            # "of the month"
    "мөн": {"B-LOC"},              # adverb "also"
    "манай": {"B-LOC"},            # possessive "our" (not a location)

    # --- Bare number ---
    "2": {"I-PER"},

    # --- Systematic silver-label error: "assignment/delegation" is not a person ---
    "томилолт": {"I-PER"},
}
def fix_block(tokens, always_o=None):
    """Return a corrected copy of one sentence's (word, label) pairs.

    Two corrections are applied to every token, in this order:

    1. Known wrong labels: when the lowercased word is a key of the lookup
       table and the token's label is in that entry's set, the label
       becomes "O".
    2. Sequence repair: an "I-X" label (after step 1) that does not directly
       follow a "B-X" or "I-X" label of the same type becomes "B-X". The
       start of the sentence counts as following "O".

    Args:
        tokens: Iterable of (word, label) pairs for one sentence.
        always_o: Optional mapping of lowercased word -> set of labels to
            reset to "O". Defaults to the module-level ALWAYS_O table.

    Returns:
        A new list of (word, label) tuples; the input is not modified.
    """
    table = ALWAYS_O if always_o is None else always_o
    result = []
    prev_label = "O"
    for word, label in tokens:
        fixed = label

        # Fix 1: reset labels that are known to be wrong for this word.
        if label in table.get(word.lower(), ()):
            fixed = "O"

        # Fix 2: an I-X tag is only valid as a continuation of B-X / I-X.
        # Compare against the *corrected* previous label so that a token
        # reset to "O" above also breaks the entity that followed it.
        if fixed.startswith("I-"):
            etype = fixed[2:]
            if prev_label not in (f"B-{etype}", f"I-{etype}"):
                fixed = f"B-{etype}"

        result.append((word, fixed))
        prev_label = fixed
    return result
def _count_fixes(before, after):
    """Return (wrong_label_fixes, sequence_fixes) between two token lists."""
    wrong = 0
    seq = 0
    for (_, old), (_, new) in zip(before, after):
        if old == new:
            continue
        if old.startswith("I-") and new.startswith("B-"):
            seq += 1
        else:
            wrong += 1
    return wrong, seq


def main():
    """Read data/train_merged.conll, fix its labels, write data/train_final.conll.

    Paths are resolved relative to the parent of this script's directory.
    Sentences are separated by blank lines or lines starting with "#". Each
    token line must have at least 4 whitespace-separated columns; the first
    is taken as the word and the last as the label. Output lines are written
    as "<word> O O <label>", with a blank line after every sentence.

    Exits with status 1 (message on stderr) when the input file is missing.
    """
    # abspath makes the location independent of the current working directory
    # even when __file__ is a bare relative name.
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    src = os.path.join(base, "data", "train_merged.conll")
    dst = os.path.join(base, "data", "train_final.conll")
    if not os.path.exists(src):
        sys.exit(f"ERROR: {src} not found")

    raw_blocks = []
    skipped = 0
    with open(src, encoding="utf-8") as f:
        current_raw = []
        for line in f:
            line = line.rstrip()
            # NOTE(review): any line starting with "#" is treated as a
            # sentence boundary, so a token whose word begins with "#"
            # is dropped as well — confirm this is intended.
            if line == "" or line.startswith("#"):
                if current_raw:
                    raw_blocks.append(current_raw)
                    current_raw = []
                continue
            parts = line.split()
            if len(parts) >= 4:
                current_raw.append((parts[0], parts[-1]))
            else:
                skipped += 1
        # The file may not end with a blank line; keep the trailing sentence.
        if current_raw:
            raw_blocks.append(current_raw)

    # Fix and count every sentence through one code path so the trailing
    # sentence contributes to the statistics like all the others.
    fixed_count = 0
    seq_fixed = 0
    out_blocks = []
    for tokens in raw_blocks:
        fixed = fix_block(tokens)
        wrong, seq = _count_fixes(tokens, fixed)
        fixed_count += wrong
        seq_fixed += seq
        out_blocks.append(fixed)

    with open(dst, "w", encoding="utf-8") as f:
        for block in out_blocks:
            f.writelines(f"{word} O O {label}\n" for word, label in block)
            f.write("\n")

    if skipped:
        print(f"WARNING: skipped {skipped} line(s) with fewer than 4 columns")
    print(f"Wrong-label fixes: {fixed_count}")
    print(f"Sequence fixes (I→B): {seq_fixed}")
    print(f"Sentences written: {len(out_blocks)}")
    print(f"Saved → {dst}")
    print("\nUse data/train_final.conll for Colab fine-tuning.")
# Run the label-fixing pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()