| """ |
| MASH Stage 1: Data Preparation |
| - Load raw training pairs (human PS/Supp + AI paraphrased versions) |
| - Filter for quality (word count, length ratio) |
| - Split into train/val sets |
| - Save in format ready for Style-SFT and DPO |
| """ |
|
|
| import json |
| import random |
| import os |
| from pathlib import Path |
|
|
def load_raw_data(path: str) -> list:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        The parsed records, in file order. Whitespace-only lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing empty line) carries no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records
|
|
def filter_data(
    data: list,
    min_words: int = 50,
    max_words: int = 800,
    min_ratio: float = 0.5,
    max_ratio: float = 2.0,
    min_chars: int = 100,
) -> list:
    """Filter for quality samples.

    A record is kept only when all of the following hold:
      - both ``human_words`` and ``ai_words`` lie in [min_words, max_words];
      - ``ai_words / human_words`` lies in [min_ratio, max_ratio];
      - both ``human_text`` and ``ai_text`` have at least ``min_chars``
        characters after stripping surrounding whitespace.

    Args:
        data: Records (dicts) with the keys ``human_words``, ``ai_words``,
            ``human_text`` and ``ai_text``.
        min_words: Inclusive lower bound on either word count.
        max_words: Inclusive upper bound on either word count.
        min_ratio: Inclusive lower bound on the AI/human word-count ratio.
        max_ratio: Inclusive upper bound on the AI/human word-count ratio.
        min_chars: Minimum stripped character length of either text.

    Returns:
        A new list with the records that passed, in their original order.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    filtered = []
    for d in data:
        hw = d['human_words']
        aw = d['ai_words']

        if hw < min_words or aw < min_words:
            continue
        if hw > max_words or aw > max_words:
            continue

        # Guard the division; a zero human word count yields ratio 0, which
        # is rejected whenever min_ratio is positive.
        ratio = aw / hw if hw > 0 else 0
        if ratio < min_ratio or ratio > max_ratio:
            continue

        human_len = len(d['human_text'].strip())
        ai_len = len(d['ai_text'].strip())
        if human_len < min_chars or ai_len < min_chars:
            continue

        filtered.append(d)
    return filtered
|
|
def prepare_sft_data(data: list) -> list:
    """
    Reshape filtered pairs into records for Style-injection SFT.

    Each output record carries:
    - id: the source record's ``essay_id``
    - type: the source record's ``type``
    - tier: the source record's ``tier``, or 'unknown' when absent
    - input_text: the AI text (model input)
    - human_text: the human text (style-transfer target)
    - ai_text: the AI text again (reconstruction target)
    """
    return [
        {
            'id': row['essay_id'],
            'type': row['type'],
            'tier': row.get('tier', 'unknown'),
            'input_text': row['ai_text'],
            'human_text': row['human_text'],
            'ai_text': row['ai_text'],
        }
        for row in data
    ]
|
|
def split_data(data: list, val_ratio: float = 0.1, seed: int = 42) -> tuple:
    """Split into train and validation sets, stratified by type.

    Records are grouped by their ``type`` value. Each group is shuffled and
    its first ``max(1, int(len(group) * val_ratio))`` records go to
    validation, the rest to training. Both result lists are shuffled again
    so the types are interleaved.

    Every type present in ``data`` is split; records whose type is neither
    'ps' nor 'supp' are no longer silently discarded. 'ps' and 'supp' are
    always processed first, in that order, so data containing only those two
    types produces the same split for a given seed as before.

    Args:
        data: Records (dicts) with a hashable ``type`` key.
        val_ratio: Fraction of each type group sent to validation. Any
            non-empty group contributes at least one validation record.
        seed: Seed for the private random generator used for shuffling.

    Returns:
        A ``(train_data, val_data)`` tuple of lists.

    Raises:
        KeyError: If a record has no ``type`` key.
    """
    # A private generator keeps the split reproducible without reseeding the
    # process-wide random state that other code may rely on.
    rng = random.Random(seed)

    # Pre-seed the two known types so their processing order is fixed;
    # further types follow in first-seen order.
    groups = {'ps': [], 'supp': []}
    for d in data:
        groups.setdefault(d['type'], []).append(d)

    train_data = []
    val_data = []
    for group in groups.values():
        rng.shuffle(group)
        val_size = max(1, int(len(group) * val_ratio))
        val_data.extend(group[:val_size])
        train_data.extend(group[val_size:])

    rng.shuffle(train_data)
    rng.shuffle(val_data)

    return train_data, val_data
|
|
def save_jsonl(data: list, path: str):
    """Write records to ``path`` as JSON Lines, one JSON value per line.

    Args:
        data: JSON-serializable records to write, in order.
        path: Destination file; it is created or overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If a record is not JSON-serializable.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly rather than in the locale's default
    # encoding, which may be unable to represent them.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(d, ensure_ascii=False) + '\n' for d in data)
|
|
def main():
    """Run Stage 1 end to end: load, filter, reshape, split, and save.

    Reads the raw pairs from a fixed input path, prints the sample counts at
    each step, and writes train.jsonl, val.jsonl and all.jsonl into a fixed
    output directory (created if missing).
    """
    from collections import Counter

    raw_path = '/home/ubuntu/experiment/training_pairs_v3_final.jsonl'
    output_dir = '/home/ubuntu/mash_training/data'
    os.makedirs(output_dir, exist_ok=True)

    print("Loading raw data...")
    raw_records = load_raw_data(raw_path)
    print(f" Raw samples: {len(raw_records)}")

    print("Filtering data...")
    kept_records = filter_data(raw_records)
    print(f" After filtering: {len(kept_records)}")

    print("Preparing SFT data...")
    sft_records = prepare_sft_data(kept_records)

    print("Splitting into train/val...")
    train_split, val_split = split_data(sft_records)
    print(f" Train: {len(train_split)}")
    print(f" Val: {len(val_split)}")

    # Per-type counts show whether both splits cover each essay type.
    train_type_counts = dict(Counter(rec['type'] for rec in train_split))
    val_type_counts = dict(Counter(rec['type'] for rec in val_split))
    print(f" Train types: {train_type_counts}")
    print(f" Val types: {val_type_counts}")

    outputs = (
        ('train.jsonl', train_split),
        ('val.jsonl', val_split),
        ('all.jsonl', sft_records),
    )
    for filename, records in outputs:
        save_jsonl(records, os.path.join(output_dir, filename))

    print(f"\nData saved to {output_dir}/")
    print(" train.jsonl - for training")
    print(" val.jsonl - for validation")
    print(" all.jsonl - complete dataset")
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|