catninja123 committed
Commit f821a72 · verified · 1 parent: eda78f2

Upload src/merge_pairs.py with huggingface_hub

Files changed (1)
src/merge_pairs.py +161 -0
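For reference, an upload commit like this one is typically produced with huggingface_hub's HfApi.upload_file, whose default commit message is exactly "Upload <path> with huggingface_hub". A minimal sketch — the repo id and repo type below are assumptions, not taken from this commit:

# Hypothetical reproduction of this upload; only the file paths are real.
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` by default
api.upload_file(
    path_or_fileobj="src/merge_pairs.py",   # local file to upload
    path_in_repo="src/merge_pairs.py",      # destination path in the repo
    repo_id="catninja123/mash-training",    # hypothetical repo id
    repo_type="dataset",                    # assumption; could be "model"
)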
src/merge_pairs.py ADDED
@@ -0,0 +1,161 @@
"""
Merge Gemini and Grok AI paraphrase pairs into unified training data.

1. Load deep-cleaned human texts (from human_texts_clean.jsonl)
2. Match with existing Gemini AI pairs (from training_pairs_clean.jsonl)
3. Match with new Grok AI pairs (from grok_pairs.jsonl)
4. For each human text, create pairs with both AI versions
5. Split into train/val and save
"""

import json
import random
from collections import Counter

HUMAN_CLEAN = '/home/ubuntu/mash_training/data/human_texts_clean.jsonl'
GEMINI_PAIRS = '/home/ubuntu/experiment/training_pairs_clean.jsonl'
GROK_PAIRS = '/home/ubuntu/mash_training/data/grok_pairs.jsonl'
OUTPUT_TRAIN = '/home/ubuntu/mash_training/data/train.jsonl'
OUTPUT_VAL = '/home/ubuntu/mash_training/data/val.jsonl'
OUTPUT_ALL = '/home/ubuntu/mash_training/data/all.jsonl'


def load_jsonl(path):
    """Read a JSONL file, skipping blank lines; warn and return [] if missing."""
    data = []
    try:
        with open(path) as f:
            for line in f:
                line = line.strip()
                if line:
                    data.append(json.loads(line))
    except FileNotFoundError:
        print(f" WARNING: {path} not found")
    return data


def main():
    # Load clean human texts (these are the canonical versions)
    human_data = load_jsonl(HUMAN_CLEAN)
    human_by_id = {d['essay_id']: d for d in human_data}
    print(f"Clean human texts: {len(human_data)}")

    # Load Gemini pairs
    gemini_raw = load_jsonl(GEMINI_PAIRS)
    gemini_by_id = {d['essay_id']: d for d in gemini_raw}
    print(f"Gemini pairs (raw): {len(gemini_raw)}")

    # Load Grok pairs
    grok_raw = load_jsonl(GROK_PAIRS)
    grok_by_id = {d['essay_id']: d for d in grok_raw}
    print(f"Grok pairs: {len(grok_raw)}")

    # Build unified training pairs
    all_pairs = []
    stats = {
        'gemini_matched': 0,
        'grok_matched': 0,
        'both_matched': 0,
        'neither': 0,
    }

    for eid, human in human_by_id.items():
        has_gemini = eid in gemini_by_id
        has_grok = eid in grok_by_id

        if has_gemini and has_grok:
            stats['both_matched'] += 1
        elif has_gemini:
            stats['gemini_matched'] += 1
        elif has_grok:
            stats['grok_matched'] += 1
        else:
            stats['neither'] += 1
            continue

        # Use clean human text as the canonical version
        clean_human_text = human['human_text']

        if has_gemini:
            gemini_ai = gemini_by_id[eid]['ai_text']
            # Validate: AI text should be a reasonable length
            if len(gemini_ai.split()) >= 20:
                all_pairs.append({
                    'essay_id': eid,
                    'type': human['type'],
                    'tier': human.get('tier', 'unknown'),
                    'year': human.get('year', 'unknown'),
                    'input_text': gemini_ai,
                    'human_text': clean_human_text,
                    'ai_text': gemini_ai,
                    'ai_model': 'gemini-2.5-flash',
                })

        if has_grok:
            grok_ai = grok_by_id[eid]['ai_text']
            if len(grok_ai.split()) >= 20:
                all_pairs.append({
                    'essay_id': eid,
                    'type': human['type'],
                    'tier': human.get('tier', 'unknown'),
                    'year': human.get('year', 'unknown'),
                    'input_text': grok_ai,
                    'human_text': clean_human_text,
                    'ai_text': grok_ai,
                    'ai_model': 'grok-3-mini-fast',
                })

    print("\nMatching stats:")
    print(f" Both Gemini+Grok: {stats['both_matched']}")
    print(f" Gemini only: {stats['gemini_matched']}")
    print(f" Grok only: {stats['grok_matched']}")
    print(f" Neither: {stats['neither']}")
    print(f" Total training pairs: {len(all_pairs)}")

    # Model distribution
    model_dist = Counter(p['ai_model'] for p in all_pairs)
    print(f"\nModel distribution: {dict(model_dist)}")

    # Type distribution
    type_dist = Counter(p['type'] for p in all_pairs)
    print(f"Type distribution: {dict(type_dist)}")

    # Split into train/val (stratified by type)
    random.seed(42)

    ps_pairs = [p for p in all_pairs if p['type'] == 'ps']
    supp_pairs = [p for p in all_pairs if p['type'] == 'supp']

    random.shuffle(ps_pairs)
    random.shuffle(supp_pairs)

    # Hold out ~10% of each type for validation (at least one pair per type)
    ps_val_size = max(1, int(len(ps_pairs) * 0.1))
    supp_val_size = max(1, int(len(supp_pairs) * 0.1))

    val_data = ps_pairs[:ps_val_size] + supp_pairs[:supp_val_size]
    train_data = ps_pairs[ps_val_size:] + supp_pairs[supp_val_size:]

    random.shuffle(train_data)
    random.shuffle(val_data)

    print(f"\nTrain: {len(train_data)}")
    print(f"Val: {len(val_data)}")

    # Save
    def save_jsonl(data, path):
        with open(path, 'w') as f:
            for d in data:
                f.write(json.dumps(d, ensure_ascii=False) + '\n')

    save_jsonl(train_data, OUTPUT_TRAIN)
    save_jsonl(val_data, OUTPUT_VAL)
    save_jsonl(all_pairs, OUTPUT_ALL)

    print("\nSaved to:")
    print(f" {OUTPUT_TRAIN}")
    print(f" {OUTPUT_VAL}")
    print(f" {OUTPUT_ALL}")


if __name__ == '__main__':
    main()
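
Based on the fields the script reads, the three input files presumably contain records shaped like the following. This is a sketch inferred from the code — only the keys are grounded in the script; every value here is made up:

# One line of human_texts_clean.jsonl: keyed by 'essay_id', with the clean
# human text plus 'type', and optional 'tier' / 'year'.
human_record = {
    "essay_id": "ps_0001",   # hypothetical id
    "human_text": "...",     # clean human-written essay text
    "type": "ps",            # 'ps' or 'supp' (used for the stratified split)
    "tier": "unknown",       # optional; the script defaults missing values to 'unknown'
    "year": "unknown",       # optional; same default
}

# One line of training_pairs_clean.jsonl or grok_pairs.jsonl: the AI
# paraphrase for the same essay_id (kept only if it is >= 20 words).
ai_record = {
    "essay_id": "ps_0001",
    "ai_text": "...",
}

The script takes no arguments: running `python src/merge_pairs.py` reads the hard-coded input paths and writes train.jsonl, val.jsonl, and all.jsonl next to them.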