Upload src/merge_pairs.py with huggingface_hub
src/merge_pairs.py (ADDED, +161 -0)
@@ -0,0 +1,161 @@
"""
Merge Gemini and Grok AI paraphrase pairs into unified training data.

1. Load deep-cleaned human texts (from human_texts_clean.jsonl)
2. Match with existing Gemini AI pairs (from training_pairs_clean.jsonl)
3. Match with new Grok AI pairs (from grok_pairs.jsonl)
4. For each human text, create pairs with both AI versions
5. Split into train/val and save
"""
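# Expected input record shapes, inferred from the field accesses below:
#   human_texts_clean.jsonl: {"essay_id": ..., "human_text": ..., "type": ...,
#                             optionally "tier" and "year"}
#   training_pairs_clean.jsonl and grok_pairs.jsonl: {"essay_id": ..., "ai_text": ...}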

import json
import random
from collections import Counter

HUMAN_CLEAN = '/home/ubuntu/mash_training/data/human_texts_clean.jsonl'
GEMINI_PAIRS = '/home/ubuntu/experiment/training_pairs_clean.jsonl'
GROK_PAIRS = '/home/ubuntu/mash_training/data/grok_pairs.jsonl'
OUTPUT_TRAIN = '/home/ubuntu/mash_training/data/train.jsonl'
OUTPUT_VAL = '/home/ubuntu/mash_training/data/val.jsonl'
OUTPUT_ALL = '/home/ubuntu/mash_training/data/all.jsonl'


def load_jsonl(path):
    data = []
    try:
        with open(path) as f:
            for line in f:
                line = line.strip()
                if line:
                    data.append(json.loads(line))
    except FileNotFoundError:
        print(f"  WARNING: {path} not found")
    return data

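# load_jsonl deliberately returns [] for a missing file, so the merge can
# proceed with whichever of the Gemini/Grok sources actually exist on disk.
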
def main():
    # Load clean human texts (these are the canonical versions)
    human_data = load_jsonl(HUMAN_CLEAN)
    human_by_id = {d['essay_id']: d for d in human_data}
    print(f"Clean human texts: {len(human_data)}")

    # Load Gemini pairs
    gemini_raw = load_jsonl(GEMINI_PAIRS)
    gemini_by_id = {d['essay_id']: d for d in gemini_raw}
    print(f"Gemini pairs (raw): {len(gemini_raw)}")

    # Load Grok pairs
    grok_raw = load_jsonl(GROK_PAIRS)
    grok_by_id = {d['essay_id']: d for d in grok_raw}
    print(f"Grok pairs: {len(grok_raw)}")

    # Build unified training pairs
    all_pairs = []
    stats = {
        'gemini_matched': 0,
        'grok_matched': 0,
        'both_matched': 0,
        'neither': 0,
    }

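    # Each essay contributes up to two pairs (one per AI model); both reuse
    # the same deep-cleaned human text as the rewrite target.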
    for eid, human in human_by_id.items():
        has_gemini = eid in gemini_by_id
        has_grok = eid in grok_by_id

        if has_gemini and has_grok:
            stats['both_matched'] += 1
        elif has_gemini:
            stats['gemini_matched'] += 1
        elif has_grok:
            stats['grok_matched'] += 1
        else:
            stats['neither'] += 1
            continue

        # Use clean human text as the canonical version
        clean_human_text = human['human_text']

        if has_gemini:
            gemini_ai = gemini_by_id[eid]['ai_text']
            # Validate: AI text should be reasonable length
            if len(gemini_ai.split()) >= 20:
                all_pairs.append({
                    'essay_id': eid,
                    'type': human['type'],
                    'tier': human.get('tier', 'unknown'),
                    'year': human.get('year', 'unknown'),
                    'input_text': gemini_ai,
                    'human_text': clean_human_text,
                    'ai_text': gemini_ai,
                    'ai_model': 'gemini-2.5-flash',
                })

        if has_grok:
            grok_ai = grok_by_id[eid]['ai_text']
            if len(grok_ai.split()) >= 20:
                all_pairs.append({
                    'essay_id': eid,
                    'type': human['type'],
                    'tier': human.get('tier', 'unknown'),
                    'year': human.get('year', 'unknown'),
                    'input_text': grok_ai,
                    'human_text': clean_human_text,
                    'ai_text': grok_ai,
                    'ai_model': 'grok-3-mini-fast',
                })

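    # NOTE: 'input_text' mirrors 'ai_text' in every pair; downstream training
    # presumably reads 'input_text' as the source and 'human_text' as the target.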
    print("\nMatching stats:")
    print(f"  Both Gemini+Grok: {stats['both_matched']}")
    print(f"  Gemini only: {stats['gemini_matched']}")
    print(f"  Grok only: {stats['grok_matched']}")
    print(f"  Neither: {stats['neither']}")
    print(f"  Total training pairs: {len(all_pairs)}")

    # Model distribution
    model_dist = Counter(p['ai_model'] for p in all_pairs)
    print(f"\nModel distribution: {dict(model_dist)}")

    # Type distribution
    type_dist = Counter(p['type'] for p in all_pairs)
    print(f"Type distribution: {dict(type_dist)}")

    # Split into train/val (stratified by type)
    random.seed(42)

    ps_pairs = [p for p in all_pairs if p['type'] == 'ps']
    supp_pairs = [p for p in all_pairs if p['type'] == 'supp']

    random.shuffle(ps_pairs)
    random.shuffle(supp_pairs)

    ps_val_size = max(1, int(len(ps_pairs) * 0.1))
    supp_val_size = max(1, int(len(supp_pairs) * 0.1))

    val_data = ps_pairs[:ps_val_size] + supp_pairs[:supp_val_size]
    train_data = ps_pairs[ps_val_size:] + supp_pairs[supp_val_size:]

    random.shuffle(train_data)
    random.shuffle(val_data)

    print(f"\nTrain: {len(train_data)}")
    print(f"Val: {len(val_data)}")

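    # Sanity check on the 10% split above: e.g. 200 'ps' + 150 'supp' pairs
    # would hold out 20 + 15 = 35 validation pairs, preserving the type ratio.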
    # Save
    def save_jsonl(data, path):
        with open(path, 'w') as f:
            for d in data:
                f.write(json.dumps(d, ensure_ascii=False) + '\n')

    save_jsonl(train_data, OUTPUT_TRAIN)
    save_jsonl(val_data, OUTPUT_VAL)
    save_jsonl(all_pairs, OUTPUT_ALL)

    print("\nSaved to:")
    print(f"  {OUTPUT_TRAIN}")
    print(f"  {OUTPUT_VAL}")
    print(f"  {OUTPUT_ALL}")


if __name__ == '__main__':
    main()
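
For reference, a minimal sketch of reading the emitted JSONL downstream. The record fields come from the pair dicts built in main() above, and the path matches OUTPUT_TRAIN; the variable names and the preview logic are illustrative assumptions, not part of this commit:

import json
from collections import Counter

# Load the merged training split written by merge_pairs.py.
with open('/home/ubuntu/mash_training/data/train.jsonl') as f:
    pairs = [json.loads(line) for line in f if line.strip()]

# Each record holds: essay_id, type ('ps' or 'supp'), tier, year, input_text
# (the AI paraphrase), human_text (the rewrite target), ai_text, ai_model.
print(Counter(p['ai_model'] for p in pairs))
if pairs:
    print(pairs[0]['ai_model'], '->', pairs[0]['human_text'][:80])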