"""
Converts all raw dataset formats into a unified JSONL training format.

Output schema per line:
    {"input": "...", "target": "...", "source": "fce|wi_locness|jfleg|synthetic"}

Datasets handled:
    - FCE v2.1 (BEA-2019 format): data/raw/fce/json/*.json
    - W&I+LOCNESS v2.1 (BEA-2019 format): data/raw/wi+locness/json/*.json
    - JFLEG: data/raw/jfleg/*.src + *.ref*

Run: python scripts/preprocess_data.py
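
Example output line (illustrative values, not drawn from the datasets):
    {"input": "He go to school.", "target": "He goes to school.", "source": "jfleg"}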
"""
import json
import os
import random
from pathlib import Path

def apply_bea19_edits(text: str, edits_block: list) -> str:
    """
    Apply BEA-2019 character-offset edits to produce the corrected text.

    edits_block is a list of per-annotator entries:
        [annotator_id, [[start, end, replacement, ...], ...]]
    We use the first annotator's corrections. Edits are applied in
    descending start order so earlier character offsets stay valid.
    """
    if not edits_block:
        return text
    # Take the first annotator's edit list.
    annotator_edits = edits_block[0][1]
    # Sort by start offset, descending, so applying one edit never shifts
    # the offsets of the edits still to come.
    sorted_edits = sorted(annotator_edits, key=lambda e: e[0], reverse=True)
    result = text
    for edit in sorted_edits:
        start, end, replacement = edit[0], edit[1], edit[2]
        # Skip detection-only / noop edits, which carry a null replacement.
        if replacement is None:
            continue
        result = result[:start] + replacement + result[end:]
    return result
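
# Illustrative call (hypothetical sentence and offsets, not dataset data):
#   apply_bea19_edits("He go to school.", [[0, [[3, 5, "goes"]]]])
#   -> "He goes to school."
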
def process_bea19_json(json_path: str, source_name: str, out_file) -> int:
    """
    Process a BEA-2019 format JSON file (FCE or W&I+LOCNESS).
    Each line is a JSON object with 'text' and 'edits' fields.
    Produces (input=original, target=corrected) pairs.
    """
    count = 0
    with open(json_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            original = obj["text"]
            edits = obj.get("edits", [])
            corrected = apply_bea19_edits(original, edits)
            # Only include if there were actual corrections
            if original.strip() != corrected.strip() and corrected.strip():
                out_file.write(json.dumps({
                    "input": original,
                    "target": corrected,
                    "source": source_name,
                }) + "\n")
                count += 1
    return count
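
# A raw BEA-2019 JSON line looks roughly like this (abridged; the real files
# also carry metadata fields):
#   {"text": "He go to school.", "edits": [[0, [[3, 5, "goes"]]]]}
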
def process_fce(raw_dir: str, out_file) -> int:
    """Process all FCE JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f" ⚠ FCE directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "fce", out_file)
        print(f" {json_file.name}: {n} pairs")
        total += n
    return total

def process_wi_locness(raw_dir: str, out_file) -> int:
    """Process all W&I+LOCNESS JSON files."""
    total = 0
    json_dir = Path(raw_dir) / "json"
    if not json_dir.exists():
        print(f" ⚠ W&I+LOCNESS directory not found: {json_dir}")
        return 0
    for json_file in sorted(json_dir.glob("*.json")):
        n = process_bea19_json(str(json_file), "wi_locness", out_file)
        print(f" {json_file.name}: {n} pairs")
        total += n
    return total

def process_jfleg(raw_dir: str, out_file) -> int:
    """
    JFLEG: .src files (original) and .ref0-.ref3 (four human corrections).
    Each reference becomes a separate training pair.
    """
    total = 0
    src_files = list(Path(raw_dir).glob("*.src"))
    if not src_files:
        print(f" ⚠ JFLEG directory empty or not found: {raw_dir}")
        return 0
    for src_file in src_files:
        refs = [src_file.with_suffix(f".ref{i}") for i in range(4)]
        with open(src_file) as sf:
            src_lines = sf.readlines()
        for ref_path in refs:
            if not ref_path.exists():
                continue
            with open(ref_path) as rf:
                ref_lines = rf.readlines()
            for src, ref in zip(src_lines, ref_lines):
                src, ref = src.strip(), ref.strip()
                # Skip empty lines and identity pairs (reference == source).
                if src and ref and src != ref:
                    out_file.write(json.dumps({
                        "input": src,
                        "target": ref,
                        "source": "jfleg",
                    }) + "\n")
                    total += 1
    return total
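
# Expected JFLEG layout (illustrative): data/raw/jfleg/dev.src with
# dev.ref0 ... dev.ref3 alongside it; line i of each .ref file is a human
# correction of line i of the matching .src file.
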
def create_splits(train_path: str, val_ratio: float = 0.1):
    """Split train.jsonl into disjoint train/val/test sets."""
    random.seed(42)
    with open(train_path) as f:
        lines = f.readlines()
    random.shuffle(lines)
    val_size = int(len(lines) * val_ratio)
    val_lines = lines[:val_size]
    train_lines = lines[val_size:]
    # Carve the test split (capped at 500 lines) out of val so the
    # three sets stay disjoint.
    test_size = min(len(val_lines) // 2, 500)
    test_lines = val_lines[:test_size]
    val_lines = val_lines[test_size:]
    with open(train_path, "w") as f:
        f.writelines(train_lines)
    val_path = train_path.replace("train.jsonl", "val.jsonl")
    with open(val_path, "w") as f:
        f.writelines(val_lines)
    test_path = train_path.replace("train.jsonl", "test.jsonl")
    with open(test_path, "w") as f:
        f.writelines(test_lines)
    return len(train_lines), len(val_lines), len(test_lines)
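
# Worked example with the defaults (illustrative counts): 10,000 input lines
# -> 1,000 val candidates, of which min(1,000 // 2, 500) = 500 move to test,
# leaving 9,000 train / 500 val / 500 test lines.
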
def main():
    os.makedirs("data/processed", exist_ok=True)
    print("=== Preprocessing datasets into unified JSONL ===\n")
    total = 0
    with open("data/processed/train.jsonl", "w") as out:
        # FCE
        print("Processing FCE...")
        n = process_fce("data/raw/fce", out)
        print(f" Total FCE: {n} pairs\n")
        total += n

        # W&I+LOCNESS
        print("Processing W&I+LOCNESS...")
        n = process_wi_locness("data/raw/wi+locness", out)
        print(f" Total W&I+LOCNESS: {n} pairs\n")
        total += n

        # JFLEG
        print("Processing JFLEG...")
        n = process_jfleg("data/raw/jfleg", out)
        print(f" Total JFLEG: {n} pairs\n")
        total += n

    print(f"Total examples in train.jsonl: {total}")

    # Create train/val/test splits
    print("\nSplitting into train/val/test...")
    n_train, n_val, n_test = create_splits("data/processed/train.jsonl")
    print(f" Train: {n_train} | Val: {n_val} | Test: {n_test}")

    print("\n✓ Preprocessing complete.")
    print(" data/processed/train.jsonl")
    print(" data/processed/val.jsonl")
    print(" data/processed/test.jsonl")


if __name__ == "__main__":
    main()