"""
MASH Stage 1: Data Preparation
- Load raw training pairs (human PS/Supp + AI paraphrased versions)
- Filter for quality (word count, length ratio)
- Split into train/val sets
- Save in format ready for Style-SFT and DPO
"""
import json
import os
import random
from collections import Counter


def load_raw_data(path: str) -> list:
    """Load newline-delimited JSON records from a .jsonl file."""
    data = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines (e.g. a trailing newline)
            data.append(json.loads(line))
    return data
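
# The steps below assume each input record carries at least the fields
# sketched here (inferred from the keys accessed in filter_data and
# prepare_sft_data; the actual file may contain more):
#
#   {
#     "essay_id": "...",        # unique id
#     "type": "ps" | "supp",    # personal statement vs. supplemental essay
#     "tier": "...",            # optional quality tier
#     "human_text": "...",      # original human-written essay
#     "ai_text": "...",         # AI-paraphrased version
#     "human_words": 312,       # pre-computed word counts
#     "ai_words": 298
#   }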

def filter_data(data: list, min_words: int = 50, max_words: int = 800) -> list:
    """Filter for quality samples."""
    filtered = []
    for d in data:
        hw = d['human_words']
        aw = d['ai_words']
        # Both texts should be within a reasonable word-count range
        if hw < min_words or aw < min_words:
            continue
        if hw > max_words or aw > max_words:
            continue
        # Length ratio should be reasonable (the AI version shouldn't be
        # much shorter or longer than the human original)
        ratio = aw / hw if hw > 0 else 0
        if ratio < 0.5 or ratio > 2.0:
            continue
        # Texts should not be empty or trivially short
        if len(d['human_text'].strip()) < 100 or len(d['ai_text'].strip()) < 100:
            continue
        filtered.append(d)
    return filtered
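
# A diagnostic sketch, not called by main(): tallies how many samples each
# rule in filter_data above would reject, which helps when tuning
# min_words/max_words. It mirrors the same rules, in the same order, and
# assumes the same record fields; the function name is ours, not the
# original pipeline's.
def filter_report(data: list, min_words: int = 50, max_words: int = 800) -> dict:
    """Count how many samples each filter rule would reject."""
    reasons = {'too_short': 0, 'too_long': 0, 'bad_ratio': 0, 'empty_text': 0, 'kept': 0}
    for d in data:
        hw, aw = d['human_words'], d['ai_words']
        if hw < min_words or aw < min_words:
            reasons['too_short'] += 1
        elif hw > max_words or aw > max_words:
            reasons['too_long'] += 1
        elif not (0.5 <= aw / hw <= 2.0):  # hw >= min_words here, so hw > 0
            reasons['bad_ratio'] += 1
        elif len(d['human_text'].strip()) < 100 or len(d['ai_text'].strip()) < 100:
            reasons['empty_text'] += 1
        else:
            reasons['kept'] += 1
    return reasons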

def prepare_sft_data(data: list) -> list:
    """
    Prepare data for style-injection SFT.

    Each sample has:
      - input_text: AI text (model input)
      - human_text: human text (target for the style-transfer task)
      - ai_text:    AI text (target for the reconstruction task)
    """
    sft_data = []
    for d in data:
        sft_data.append({
            'id': d['essay_id'],
            'type': d['type'],
            'tier': d.get('tier', 'unknown'),
            'input_text': d['ai_text'],
            'human_text': d['human_text'],
            'ai_text': d['ai_text'],
        })
    return sft_data
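
# The module docstring promises data "ready for Style-SFT and DPO", but only
# the SFT format is built here. Below is a minimal sketch of a DPO-style
# preference format; the exact field names depend on the downstream DPO
# trainer, and treating human text as "chosen" and AI text as "rejected" is
# our assumption, not something main() relies on. Not called by main().
def prepare_dpo_data(data: list) -> list:
    """Sketch: build (prompt, chosen, rejected) preference triples."""
    dpo_data = []
    for d in data:
        dpo_data.append({
            'id': d['essay_id'],
            'prompt': d['ai_text'],      # text to be rewritten in human style
            'chosen': d['human_text'],   # preferred completion (human style)
            'rejected': d['ai_text'],    # dispreferred completion (AI style)
        })
    return dpo_data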

def split_data(data: list, val_ratio: float = 0.1, seed: int = 42) -> tuple:
    """Split into train and validation sets, stratified by essay type."""
    random.seed(seed)
    # Separate by type so both splits keep the ps/supp mix
    ps_data = [d for d in data if d['type'] == 'ps']
    supp_data = [d for d in data if d['type'] == 'supp']
    random.shuffle(ps_data)
    random.shuffle(supp_data)
    # Guarantee at least one validation sample per type
    ps_val_size = max(1, int(len(ps_data) * val_ratio))
    supp_val_size = max(1, int(len(supp_data) * val_ratio))
    val_data = ps_data[:ps_val_size] + supp_data[:supp_val_size]
    train_data = ps_data[ps_val_size:] + supp_data[supp_val_size:]
    random.shuffle(train_data)
    random.shuffle(val_data)
    return train_data, val_data

def save_jsonl(data: list, path: str):
    """Write records as one JSON object per line."""
    with open(path, 'w') as f:
        for d in data:
            f.write(json.dumps(d, ensure_ascii=False) + '\n')

def main():
    raw_path = '/home/ubuntu/experiment/training_pairs_v3_final.jsonl'
    output_dir = '/home/ubuntu/mash_training/data'
    os.makedirs(output_dir, exist_ok=True)

    # Load and filter
    print("Loading raw data...")
    raw_data = load_raw_data(raw_path)
    print(f" Raw samples: {len(raw_data)}")
    print("Filtering data...")
    filtered = filter_data(raw_data)
    print(f" After filtering: {len(filtered)}")

    # Prepare SFT format
    print("Preparing SFT data...")
    sft_data = prepare_sft_data(filtered)

    # Split
    print("Splitting into train/val...")
    train_data, val_data = split_data(sft_data)
    print(f" Train: {len(train_data)}")
    print(f" Val: {len(val_data)}")

    # Type distribution per split
    train_types = Counter(d['type'] for d in train_data)
    val_types = Counter(d['type'] for d in val_data)
    print(f" Train types: {dict(train_types)}")
    print(f" Val types: {dict(val_types)}")

    # Save
    save_jsonl(train_data, os.path.join(output_dir, 'train.jsonl'))
    save_jsonl(val_data, os.path.join(output_dir, 'val.jsonl'))
    save_jsonl(sft_data, os.path.join(output_dir, 'all.jsonl'))
    print(f"\nData saved to {output_dir}/")
    print(" train.jsonl - for training")
    print(" val.jsonl - for validation")
    print(" all.jsonl - complete dataset")


if __name__ == '__main__':
    main()
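
# Expected invocation (paths are hard-coded in main above):
#   python prepare_data.py
# This writes train.jsonl, val.jsonl, and all.jsonl into
# /home/ubuntu/mash_training/data. The script filename here is an
# assumption; adjust it to wherever this file actually lives.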