"""
merge_train.py — Merge original train.txt with silver-labeled sumbee data.

Run AFTER reviewing data/silver_review.conll manually. The corrected review
sentences are read from data/silver_review_done.conll.

Usage:
    python scripts/merge_train.py                    # merges high + original
    python scripts/merge_train.py --include-review   # also includes review file
                                                     # (only after manual correction)
    python scripts/merge_train.py --seed 7           # different shuffle order

Output: data/train_merged.conll (use this for Colab fine-tuning)
"""

import argparse
import os
import random


def read_conll_blocks(path: str):
    """Read a CoNLL file and return its sentences as a list of strings.

    Sentences are separated by blank (or whitespace-only) lines. Each returned
    block is the sentence's lines joined with "\\n", with trailing whitespace
    removed from every line.

    Any line that starts with "#" is skipped. Note that this also drops a token
    row whose token is literally "#".

    Args:
        path: Path of the CoNLL file to read.

    Returns:
        A list with one string per non-empty sentence, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    blocks = []
    current = []
    # "utf-8-sig" decodes plain UTF-8 unchanged but swallows a leading BOM,
    # which editors on Windows may add when the review file is hand-corrected.
    # Without it the BOM would stick to the first token or hide a "#" comment.
    with open(path, encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.rstrip()
            if line.startswith("#"):
                # strip comment lines from review file
                continue
            if line:
                current.append(line)
            elif current:
                # A blank line closes the sentence collected so far.
                blocks.append("\n".join(current))
                current = []
    # The file may end without a trailing blank line.
    if current:
        blocks.append("\n".join(current))
    return blocks


def main(include_review: bool = False, seed: int = 42, base_dir=None) -> int:
    """Merge the original and silver-labeled data into one shuffled file.

    Reads data/train.txt and data/silver_high.conll (and, optionally,
    data/silver_review_done.conll), shuffles all sentences with a fixed seed
    and writes them to data/train_merged.conll, separated by blank lines.

    Args:
        include_review: Also merge data/silver_review_done.conll. A missing
            review file only produces a warning.
        seed: Seed for the shuffle; the same seed and inputs give the same
            output order.
        base_dir: Project root that contains the "data" directory. Defaults to
            the parent of the directory holding this script.

    Returns:
        0 on success, 1 if a required input file is missing.
    """
    if base_dir is None:
        # abspath first: a relative __file__ would otherwise collapse to ""
        # and resolve against the current working directory.
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    data_dir = os.path.join(base_dir, "data")
    original = os.path.join(data_dir, "train.txt")
    silver_high = os.path.join(data_dir, "silver_high.conll")
    silver_review = os.path.join(data_dir, "silver_review_done.conll")
    output = os.path.join(data_dir, "train_merged.conll")

    if not os.path.exists(original):
        print(f"ERROR: {original} not found")
        return 1
    if not os.path.exists(silver_high):
        print(f"ERROR: {silver_high} not found — run silver_label.py first")
        return 1

    print("Reading original train.txt ...")
    orig_blocks = read_conll_blocks(original)
    print(f"  {len(orig_blocks)} sentences")

    print("Reading silver_high.conll ...")
    high_blocks = read_conll_blocks(silver_high)
    print(f"  {len(high_blocks)} sentences")

    all_blocks = orig_blocks + high_blocks

    if include_review:
        if not os.path.exists(silver_review):
            print(f"WARNING: {silver_review} not found, skipping")
        else:
            # Report the file that is actually read.
            review_name = os.path.basename(silver_review)
            print(f"Reading {review_name} (assuming manually corrected) ...")
            review_blocks = read_conll_blocks(silver_review)
            print(f"  {len(review_blocks)} sentences")
            all_blocks += review_blocks

    # A private generator gives the same order as random.seed(seed) followed
    # by random.shuffle(...), without reseeding the process-wide RNG.
    random.Random(seed).shuffle(all_blocks)

    with open(output, "w", encoding="utf-8") as f:
        f.write("\n\n".join(all_blocks))
        f.write("\n")

    print(f"\nMerged {len(all_blocks)} sentences → {output}")
    print("Upload this file to Google Drive for Colab fine-tuning.")
    return 0


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--include-review",
        action="store_true",
        help="Also include silver_review_done.conll (only after manual correction)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed used to shuffle the merged sentences (default: 42)",
    )
    args = parser.parse_args()
    # Propagate main()'s status so a missing input yields a non-zero exit code.
    raise SystemExit(main(include_review=args.include_review, seed=args.seed))