""" Converts all raw dataset formats into unified JSONL training format. Output schema per line: {"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"} Datasets handled: - FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json - W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json - JFLEG: data/raw/jfleg/*.src + *.ref* Run: python scripts/preprocess_data.py """ import json import os from pathlib import Path def apply_bea19_edits(text: str, edits_block: list) -> str: """ Apply BEA-2019 character-level edits to produce corrected text. edits_block format: [annotator_id, [(start, end, replacement, [error_type]), ...]] We use the first annotator's corrections. Edits are applied in reverse order to preserve character offsets. """ if not edits_block or len(edits_block) == 0: return text # Take first annotator's edits annotator_edits = edits_block[0][1] # Sort by start position descending to apply from end to preserve offsets sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True) result = text for edit in sorted_edits: start = edit[0] end = edit[1] replacement = edit[2] # Skip null replacements (no correction needed) and noop edits if replacement is None: continue result = result[:start] + replacement + result[end:] return result def process_bea19_json(json_path: str, source_name: str, out_file): """ Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS). Each line is a JSON object with 'text' and 'edits' fields. Produces (input=original, target=corrected) pairs. """ count = 0 with open(json_path) as f: for line in f: line = line.strip() if not line: continue obj = json.loads(line) original = obj["text"] edits = obj.get("edits", []) corrected = apply_bea19_edits(original, edits) # Only include if there were actual corrections if original.strip() != corrected.strip() and corrected.strip(): out_file.write(json.dumps({ "input": original, "target": corrected, "source": source_name, }) + "\n") count += 1 return count def process_fce(raw_dir: str, out_file) -> int: """Process all FCE JSON files.""" total = 0 json_dir = Path(raw_dir) / "json" if not json_dir.exists(): print(f" ⚠ FCE directory not found: {json_dir}") return 0 for json_file in sorted(json_dir.glob("*.json")): n = process_bea19_json(str(json_file), "fce", out_file) print(f" {json_file.name}: {n} pairs") total += n return total def process_wi_locness(raw_dir: str, out_file) -> int: """Process all W&I+LOCNESS JSON files.""" total = 0 json_dir = Path(raw_dir) / "json" if not json_dir.exists(): print(f" ⚠ W&I+LOCNESS directory not found: {json_dir}") return 0 for json_file in sorted(json_dir.glob("*.json")): n = process_bea19_json(str(json_file), "wi_locness", out_file) print(f" {json_file.name}: {n} pairs") total += n return total def process_jfleg(raw_dir: str, out_file) -> int: """ JFLEG: .src files (original) and .ref0..ref3 (4 human corrections). Each reference becomes a separate training pair. """ total = 0 src_files = list(Path(raw_dir).glob("*.src")) if not src_files: print(f" ⚠ JFLEG directory empty or not found: {raw_dir}") return 0 for src_file in src_files: refs = [src_file.with_suffix(f".ref{i}") for i in range(4)] with open(src_file) as sf: src_lines = sf.readlines() for ref_path in refs: if ref_path.exists(): with open(ref_path) as rf: ref_lines = rf.readlines() for src, ref in zip(src_lines, ref_lines): src, ref = src.strip(), ref.strip() if src and ref and src != ref: out_file.write(json.dumps({ "input": src, "target": ref, "source": "jfleg", }) + "\n") total += 1 return total def create_splits(train_path: str, val_ratio: float = 0.1): """Split train.jsonl into train and val sets.""" import random random.seed(42) with open(train_path) as f: lines = f.readlines() random.shuffle(lines) val_size = int(len(lines) * val_ratio) val_lines = lines[:val_size] train_lines = lines[val_size:] with open(train_path, "w") as f: f.writelines(train_lines) val_path = train_path.replace("train.jsonl", "val.jsonl") with open(val_path, "w") as f: f.writelines(val_lines) # Also create a small test split from val test_size = min(len(val_lines) // 2, 500) test_lines = val_lines[:test_size] test_path = train_path.replace("train.jsonl", "test.jsonl") with open(test_path, "w") as f: f.writelines(test_lines) return len(train_lines), len(val_lines), len(test_lines) def main(): os.makedirs("data/processed", exist_ok=True) print("=== Preprocessing datasets into unified JSONL ===\n") total = 0 with open("data/processed/train.jsonl", "w") as out: # FCE print("Processing FCE...") n = process_fce("data/raw/fce", out) print(f" Total FCE: {n} pairs\n") total += n # W&I+LOCNESS print("Processing W&I+LOCNESS...") n = process_wi_locness("data/raw/wi+locness", out) print(f" Total W&I+LOCNESS: {n} pairs\n") total += n # JFLEG print("Processing JFLEG...") n = process_jfleg("data/raw/jfleg", out) print(f" Total JFLEG: {n} pairs\n") total += n print(f"Total examples in train.jsonl: {total}") # Create train/val/test splits print("\nSplitting into train/val/test...") n_train, n_val, n_test = create_splits("data/processed/train.jsonl") print(f" Train: {n_train} | Val: {n_val} | Test: {n_test}") print("\n✓ Preprocessing complete.") print(" data/processed/train.jsonl") print(" data/processed/val.jsonl") print(" data/processed/test.jsonl") if __name__ == "__main__": main()