Spaces:
Running
Running
| """ | |
| fix_labels.py — Auto-correct known labeling errors in train_merged.conll. | |
| Fixes applied: | |
| 1. Sequence errors: I-X without preceding B-X or I-X → convert to B-X | |
| 2. Definite wrong labels: common words incorrectly tagged as entities | |
| 3. Systematic silver-label error: томилолт I-PER → O | |
| Run from NLP-intelligence/: | |
| python scripts/fix_labels.py | |
| Output: data/train_final.conll | |
| """ | |
| import os | |
| import sys | |
| # Words that are NEVER entities in any context | |
| ALWAYS_O = { | |
| # Verbs wrongly tagged as entities | |
| "байна": {"B-PER"}, # "is/are" | |
| "байгаа": {"I-MISC"}, # "being" | |
| "хийж": {"B-PER", "B-LOC"}, # verb "doing" | |
| # Particles / pronouns wrongly tagged | |
| "юм": {"I-MISC"}, # particle | |
| "бол": {"I-MISC"}, # copula "is" | |
| "нэг": {"I-MISC"}, # "one" | |
| "би": {"I-MISC"}, # pronoun "I" | |
| "ямар": {"B-PER"}, # interrogative "what kind" | |
| "та": {"B-PER"}, # pronoun "you" | |
| "сарын": {"B-PER"}, # "of the month" | |
| "мөн": {"B-LOC"}, # adverb "also" | |
| "манай": {"B-LOC"}, # possessive "our" — not a location | |
| # Number | |
| "2": {"I-PER"}, | |
| # Systematic silver error: "assignment/delegation" ≠ person | |
| "томилолт": {"I-PER"}, | |
| } | |
| def fix_block(tokens): | |
| """ | |
| tokens: list of (word, label) | |
| Returns fixed list of (word, label). | |
| """ | |
| result = [] | |
| prev_label = "O" | |
| prev_type = None | |
| for word, label in tokens: | |
| fixed = label | |
| # Fix 1: wrong labels for specific words | |
| key = word.lower() | |
| if key in ALWAYS_O and label in ALWAYS_O[key]: | |
| fixed = "O" | |
| # Fix 2: I-X without matching B-X or I-X before it → B-X | |
| if fixed.startswith("I-"): | |
| etype = fixed[2:] | |
| if prev_label == "O" or ( | |
| prev_label.startswith("B-") and prev_label[2:] != etype | |
| ) or ( | |
| prev_label.startswith("I-") and prev_label[2:] != etype | |
| ): | |
| fixed = f"B-{etype}" | |
| result.append((word, fixed)) | |
| prev_label = fixed | |
| prev_type = fixed[2:] if "-" in fixed else None | |
| return result | |
| def main(): | |
| base = os.path.dirname(os.path.dirname(__file__)) | |
| src = os.path.join(base, "data", "train_merged.conll") | |
| dst = os.path.join(base, "data", "train_final.conll") | |
| if not os.path.exists(src): | |
| print(f"ERROR: {src} not found"); sys.exit(1) | |
| fixed_count = 0 | |
| seq_fixed = 0 | |
| out_blocks = [] | |
| with open(src, encoding="utf-8") as f: | |
| current_raw = [] | |
| for line in f: | |
| line = line.rstrip() | |
| if line == "" or line.startswith("#"): | |
| if current_raw: | |
| tokens = current_raw | |
| fixed = fix_block(tokens) | |
| # Count changes | |
| for (_, ol), (_, nl) in zip(tokens, fixed): | |
| if ol != nl: | |
| if ol.startswith("I-") and nl.startswith("B-"): | |
| seq_fixed += 1 | |
| else: | |
| fixed_count += 1 | |
| out_blocks.append(fixed) | |
| current_raw = [] | |
| else: | |
| parts = line.split() | |
| if len(parts) >= 4: | |
| current_raw.append((parts[0], parts[-1])) | |
| if current_raw: | |
| out_blocks.append(fix_block(current_raw)) | |
| with open(dst, "w", encoding="utf-8") as f: | |
| for block in out_blocks: | |
| for word, label in block: | |
| f.write(f"{word} O O {label}\n") | |
| f.write("\n") | |
| print(f"Wrong-label fixes: {fixed_count}") | |
| print(f"Sequence fixes (I→B): {seq_fixed}") | |
| print(f"Sentences written: {len(out_blocks)}") | |
| print(f"Saved → {dst}") | |
| print(f"\nUse data/train_final.conll for Colab fine-tuning.") | |
| if __name__ == "__main__": | |
| main() | |