File size: 4,147 Bytes
69da0bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
MASH Stage 1: Data Preparation
- Load raw training pairs (human PS/Supp + AI paraphrased versions)
- Filter for quality (word count, length ratio)
- Split into train/val sets
- Save in format ready for Style-SFT and DPO
"""

import json
import random
import os
from pathlib import Path

def load_raw_data(path: str) -> list:
    """Load a JSON Lines file into a list of parsed records.

    Every non-blank line of the file is parsed with ``json.loads`` and the
    results are returned in file order.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Decode as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to be UTF-8, and the essays may contain non-ASCII text.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a stray newline at the end of the file)
            # hold no record and would make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

def filter_data(data: list, min_words: int = 50, max_words: int = 800,
                min_ratio: float = 0.5, max_ratio: float = 2.0,
                min_chars: int = 100) -> list:
    """Filter for quality samples.

    A sample is kept only when all of the following hold:
    - both 'human_words' and 'ai_words' lie within [min_words, max_words];
    - the ratio ai_words / human_words lies within [min_ratio, max_ratio];
    - both 'human_text' and 'ai_text', stripped of surrounding whitespace,
      are at least min_chars characters long.

    Args:
        data: Sample dicts providing the keys 'human_words', 'ai_words',
            'human_text' and 'ai_text'.
        min_words: Smallest accepted word count for either text.
        max_words: Largest accepted word count for either text.
        min_ratio: Smallest accepted AI-to-human word-count ratio.
        max_ratio: Largest accepted AI-to-human word-count ratio.
        min_chars: Smallest accepted stripped character length of either text.

    Returns:
        A new list with the samples that passed every check, in input order.

    Raises:
        KeyError: If a sample lacks one of the required keys.
    """
    kept = []
    for sample in data:
        human_words = sample['human_words']
        ai_words = sample['ai_words']
        # Both texts should be within reasonable range
        if human_words < min_words or ai_words < min_words:
            continue
        if human_words > max_words or ai_words > max_words:
            continue
        # Length ratio should be reasonable (AI version shouldn't be too
        # different). A zero human word count yields ratio 0 instead of a
        # ZeroDivisionError.
        ratio = ai_words / human_words if human_words > 0 else 0
        if ratio < min_ratio or ratio > max_ratio:
            continue
        # Text should not be empty or too short
        if len(sample['human_text'].strip()) < min_chars:
            continue
        if len(sample['ai_text'].strip()) < min_chars:
            continue
        kept.append(sample)
    return kept

def prepare_sft_data(data: list) -> list:
    """
    Convert filtered pairs into records for Style-injection SFT.

    Every output record carries these keys, in this order:
    - id: the source record's 'essay_id'
    - type: copied from the source record
    - tier: the source record's 'tier', or 'unknown' when it is absent
    - input_text: the AI text (model input)
    - human_text: the human text (for the style transfer task)
    - ai_text: the AI text again (for the reconstruction task)
    """
    return [
        {
            'id': pair['essay_id'],
            'type': pair['type'],
            'tier': pair.get('tier', 'unknown'),
            'input_text': pair['ai_text'],
            'human_text': pair['human_text'],
            'ai_text': pair['ai_text'],
        }
        for pair in data
    ]

def split_data(data: list, val_ratio: float = 0.1, seed: int = 42) -> tuple:
    """Split into train and validation sets, stratified by type.

    Samples are grouped by their 'type' value. Each group is shuffled and
    its first ``max(1, int(len(group) * val_ratio))`` samples go to the
    validation set; the rest go to the training set. Both result lists are
    shuffled once more before being returned.

    Because of the ``max(1, ...)`` floor, every non-empty group contributes
    at least one validation sample, so a group holding a single sample ends
    up entirely in the validation set.

    Args:
        data: Sample dicts, each providing a hashable 'type' value.
        val_ratio: Fraction of each type group assigned to validation.
        seed: Seed for the shuffles; the same seed and input give the same
            split.

    Returns:
        A ``(train_data, val_data)`` tuple of lists.

    Raises:
        KeyError: If a sample has no 'type' key.
    """
    # A private generator keeps the split reproducible without reseeding the
    # process-wide `random` state that other code may be relying on.
    rng = random.Random(seed)

    # 'ps' and 'supp' are registered first so they are always processed in
    # that order; any other type follows in first-seen order instead of
    # being silently dropped from both splits.
    groups = {'ps': [], 'supp': []}
    for sample in data:
        groups.setdefault(sample['type'], []).append(sample)

    train_data = []
    val_data = []
    for group in groups.values():
        rng.shuffle(group)
        val_size = max(1, int(len(group) * val_ratio))
        val_data.extend(group[:val_size])
        train_data.extend(group[val_size:])

    # Mix the types together so neither output is ordered by type.
    rng.shuffle(train_data)
    rng.shuffle(val_data)

    return train_data, val_data

def save_jsonl(data: list, path: str):
    """Write records to a JSON Lines file, one JSON document per line.

    Any existing file at ``path`` is overwritten. Non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable records to write, in order.
        path: Destination file path.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If a record is not JSON-serializable.
    """
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may be unable
    # to encode them.
    with open(path, 'w', encoding='utf-8') as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

def main(raw_path: str = '/home/ubuntu/experiment/training_pairs_v3_final.jsonl',
         output_dir: str = '/home/ubuntu/mash_training/data'):
    """Run the data preparation pipeline end to end.

    Loads the raw pairs, filters them, converts them to the SFT record
    format, splits them into train/validation sets and writes three files
    into ``output_dir``: ``train.jsonl``, ``val.jsonl`` and ``all.jsonl``
    (every SFT record before splitting). Progress and counts are printed
    to stdout.

    Args:
        raw_path: JSONL file holding the raw training pairs.
        output_dir: Directory for the output files; created if missing.
    """
    from collections import Counter

    os.makedirs(output_dir, exist_ok=True)

    # Load and filter
    print("Loading raw data...")
    raw_data = load_raw_data(raw_path)
    print(f"  Raw samples: {len(raw_data)}")

    print("Filtering data...")
    filtered = filter_data(raw_data)
    print(f"  After filtering: {len(filtered)}")

    # Prepare SFT format
    print("Preparing SFT data...")
    sft_data = prepare_sft_data(filtered)

    # Split
    print("Splitting into train/val...")
    train_data, val_data = split_data(sft_data)
    print(f"  Train: {len(train_data)}")
    print(f"  Val: {len(val_data)}")

    # Type distribution, to confirm both splits contain each essay type
    train_types = Counter(d['type'] for d in train_data)
    val_types = Counter(d['type'] for d in val_data)
    print(f"  Train types: {dict(train_types)}")
    print(f"  Val types: {dict(val_types)}")

    # Save
    save_jsonl(train_data, os.path.join(output_dir, 'train.jsonl'))
    save_jsonl(val_data, os.path.join(output_dir, 'val.jsonl'))
    save_jsonl(sft_data, os.path.join(output_dir, 'all.jsonl'))

    print(f"\nData saved to {output_dir}/")
    print("  train.jsonl - for training")
    print("  val.jsonl   - for validation")
    print("  all.jsonl   - complete dataset")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()