"""
Converts all raw dataset formats into a unified JSONL training format.

Output schema per line:
    {"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"}

Datasets handled:
    - FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json
    - W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json
    - JFLEG: data/raw/jfleg/*.src + *.ref*

Run: python scripts/preprocess_data.py
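
Example output line (illustrative values, not drawn from the datasets):
    {"input": "He go to school.", "target": "He goes to school.", "source": "jfleg"}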
"""
import json
import os
import random
from pathlib import Path

def apply_bea19_edits(text: str, edits_block: list) -> str:
    """
    Apply BEA-2019 character-offset edits to produce the corrected text.

    edits_block is a list of per-annotator entries:
        [annotator_id, [[start, end, replacement, ...], ...]]
    We use the first annotator's corrections. Edits are applied in
    descending start order so earlier character offsets stay valid.
    """
    if not edits_block:
        return text
    # Take the first annotator's edit list.
    annotator_edits = edits_block[0][1]
    # Sort by start offset, descending, so applying one edit never shifts
    # the offsets of the edits still to come.
    sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True)
    result = text
    for edit in sorted_edits:
        start, end, replacement = edit[0], edit[1], edit[2]
        # Skip detection-only / noop edits, which carry a null replacement.
        if replacement is None:
            continue
        result = result[:start] + replacement + result[end:]
    return result
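
# Illustrative call (hypothetical sentence and offsets, not dataset data):
#   apply_bea19_edits("He go to school.", [[0, [[3, 5, "goes"]]]])
#   -> "He goes to school."
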
def process_bea19_json(json_path: str, source_name: str, out_file) -> int:
    """
    Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
    Each line is a JSON object with 'text' and 'edits' fields.
    Produces (input=original, target=corrected) pairs.
    """
    count = 0
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            original = obj["text"]
            edits = obj.get("edits", [])
            corrected = apply_bea19_edits(original, edits)
            # Only include if there were actual corrections
            if original.strip() != corrected.strip() and corrected.strip():
                out_file.write(json.dumps({
                    "input": original,
                    "target": corrected,
                    "source": source_name,
                }) + "\n")
                count += 1
    return count
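
# A raw BEA-2019 JSON line looks roughly like this (abridged; the real files
# also carry metadata fields):
#   {"text": "He go to school.", "edits": [[0, [[3, 5, "goes"]]]]}
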
def process_fce(raw_dir: str, out_file) -> int:
    """Process all FCE JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f" ⚠ FCE directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "fce", out_file)
        print(f" {json_file.name}: {n} pairs")
        total += n
    return total

def process_wi_locness(raw_dir: str, out_file) -> int:
    """Process all W&I+LOCNESS JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f" ⚠ W&I+LOCNESS directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "wi_locness", out_file)
        print(f" {json_file.name}: {n} pairs")
        total += n
    return total

def process_jfleg(raw_dir: str, out_file) -> int:
    """
    JFLEG: .src files (original) and .ref0-.ref3 (four human corrections).
    Each reference becomes a separate training pair.
    """
    total = 0
    src_files = list(Path(raw_dir).glob("*.src"))
    if not src_files:
        print(f" ⚠ JFLEG directory empty or not found: {raw_dir}")
        return 0
    for src_file in src_files:
        refs = [src_file.with_suffix(f".ref{i}") for i in range(4)]
        with open(src_file) as sf:
            src_lines = sf.readlines()
        for ref_path in refs:
            if not ref_path.exists():
                continue
            with open(ref_path) as rf:
                ref_lines = rf.readlines()
            for src, ref in zip(src_lines, ref_lines):
                src, ref = src.strip(), ref.strip()
                # Skip empty lines and identity pairs (reference == source).
                if src and ref and src != ref:
                    out_file.write(json.dumps({
                        "input": src,
                        "target": ref,
                        "source": "jfleg",
                    }) + "\n")
                    total += 1
    return total
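
# Expected JFLEG layout (illustrative): data/raw/jfleg/dev.src with
# dev.ref0 ... dev.ref3 alongside it; line i of each .ref file is a human
# correction of line i of the matching .src file.
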
def create_splits(train_path: str, val_ratio: float = 0.1):
    """Split train.jsonl into disjoint train/val/test sets."""
    random.seed(42)
    with open(train_path) as f:
        lines = f.readlines()
    random.shuffle(lines)
    val_size = int(len(lines) * val_ratio)
    val_lines = lines[:val_size]
    train_lines = lines[val_size:]
    # Carve the test split (capped at 500 lines) out of val so the
    # three sets stay disjoint.
    test_size = min(len(val_lines) // 2, 500)
    test_lines = val_lines[:test_size]
    val_lines = val_lines[test_size:]
    with open(train_path, "w") as f:
        f.writelines(train_lines)
    val_path = train_path.replace("train.jsonl", "val.jsonl")
    with open(val_path, "w") as f:
        f.writelines(val_lines)
    test_path = train_path.replace("train.jsonl", "test.jsonl")
    with open(test_path, "w") as f:
        f.writelines(test_lines)
    return len(train_lines), len(val_lines), len(test_lines)
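
# Worked example with the defaults (illustrative counts): 10,000 input lines
# -> 1,000 val candidates, of which min(1,000 // 2, 500) = 500 move to test,
# leaving 9,000 train / 500 val / 500 test lines.
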
def main():
    os.makedirs("data/processed", exist_ok=True)
    print("=== Preprocessing datasets into unified JSONL ===\n")
    total = 0
    with open("data/processed/train.jsonl", "w") as out:
        # FCE
        print("Processing FCE...")
        n = process_fce("data/raw/fce", out)
        print(f" Total FCE: {n} pairs\n")
        total += n

        # W&I+LOCNESS
        print("Processing W&I+LOCNESS...")
        n = process_wi_locness("data/raw/wi+locness", out)
        print(f" Total W&I+LOCNESS: {n} pairs\n")
        total += n

        # JFLEG
        print("Processing JFLEG...")
        n = process_jfleg("data/raw/jfleg", out)
        print(f" Total JFLEG: {n} pairs\n")
        total += n

    print(f"Total examples in train.jsonl: {total}")

    # Create train/val/test splits
    print("\nSplitting into train/val/test...")
    n_train, n_val, n_test = create_splits("data/processed/train.jsonl")
    print(f" Train: {n_train} | Val: {n_val} | Test: {n_test}")

    print("\n✓ Preprocessing complete.")
    print(" data/processed/train.jsonl")
    print(" data/processed/val.jsonl")
    print(" data/processed/test.jsonl")


if __name__ == "__main__":
    main()