Upload src/dataset_dpo.py with huggingface_hub
src/dataset_dpo.py
ADDED
@@ -0,0 +1,135 @@
"""
MASH DPO Dataset - Preference pairs for Direct Preference Optimization

Preference pairs:
- prompt: instruction + AI text
- chosen: human-written text (naturally passes GPTZero)
- rejected: SFT model output (detected as AI by GPTZero)

For the initial DPO round, we use the training data's human texts as "chosen"
and the AI texts (which the SFT model was trained to approximate) as a proxy
for "rejected". This works because:
1. The SFT model's outputs are stylistically similar to the AI inputs
2. The human texts represent the target distribution we want to reach
3. DPO will push the model away from AI-like patterns toward human-like ones
"""

import json
import random

import torch
from torch.utils.data import Dataset


# Same instruction templates as SFT v4
INSTRUCTIONS = [
    "Rewrite the following AI-generated {type} essay in a natural, authentic human voice. Preserve the original meaning and key details while making the writing sound genuinely human-written:\n\n{text}",
    "Transform this AI-written {type} essay into natural human writing. Keep the same ideas and details but make it sound like a real person wrote it:\n\n{text}",
    "Convert the following machine-generated {type} essay to sound authentically human. Maintain the core content while adopting a genuine, personal writing style:\n\n{text}",
    "Rewrite this {type} essay to remove all traces of AI writing. The output should read as if written by a real student, preserving the original meaning:\n\n{text}",
    "Make the following AI-generated {type} essay sound human-written. Keep the same content and structure but use natural, authentic language:\n\n{text}",
]

TYPE_NAMES = {
    'ps': 'personal statement',
    'supp': 'supplemental',
}


class DPODataset(Dataset):
    """
    Dataset for DPO training.

    Each sample contains:
    - input_ids / attention_mask: tokenized instruction + AI text (encoder input)
    - chosen_labels / chosen_attention_mask: tokenized human text (preferred output)
    - rejected_labels / rejected_attention_mask: tokenized AI text (dispreferred output)
    """

    def __init__(self, data_path: str, tokenizer,
                 max_input_len: int = 512, max_target_len: int = 512):
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

        self.examples = []
        with open(data_path) as f:
            for line in f:
                d = json.loads(line)
                essay_type = d.get('type', 'supp')
                type_name = TYPE_NAMES.get(essay_type, essay_type)

                # For DPO: chosen=human, rejected=AI (the original AI text).
                # The AI text serves as a proxy for what the SFT model would generate.
                self.examples.append({
                    'ai_text': d.get('ai_text', d.get('input_text', '')),
                    'human_text': d['human_text'],
                    'type_name': type_name,
                })

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]

        # Build instruction prompt (same as SFT)
        template = random.choice(INSTRUCTIONS)
        prompt_text = template.format(
            type=ex['type_name'],
            text=ex['ai_text'],
        )

        # Tokenize prompt (encoder input)
        prompt_enc = self.tokenizer(
            prompt_text,
            max_length=self.max_input_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # Tokenize chosen (human text)
        chosen_enc = self.tokenizer(
            text_target=ex['human_text'],
            max_length=self.max_target_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # Tokenize rejected (the original AI text, not the instruction-wrapped prompt)
        rejected_enc = self.tokenizer(
            text_target=ex['ai_text'],
            max_length=self.max_target_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # Build labels (replace pad with -100 so padding is ignored by the loss;
        # assumes pad_token_id != eos_token_id, as with T5-style tokenizers)
        chosen_labels = chosen_enc['input_ids'].squeeze(0).clone()
        chosen_labels[chosen_labels == self.tokenizer.pad_token_id] = -100

        rejected_labels = rejected_enc['input_ids'].squeeze(0).clone()
        rejected_labels[rejected_labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': prompt_enc['input_ids'].squeeze(0),
            'attention_mask': prompt_enc['attention_mask'].squeeze(0),
            'chosen_labels': chosen_labels,
            'rejected_labels': rejected_labels,
            'chosen_attention_mask': (chosen_enc['input_ids'].squeeze(0) != self.tokenizer.pad_token_id).long(),
            'rejected_attention_mask': (rejected_enc['input_ids'].squeeze(0) != self.tokenizer.pad_token_id).long(),
        }


def dpo_collate_fn(batch):
    """Collate function for DPO dataset: stack per-sample tensors into a batch."""
    return {
        'input_ids': torch.stack([b['input_ids'] for b in batch]),
        'attention_mask': torch.stack([b['attention_mask'] for b in batch]),
        'chosen_labels': torch.stack([b['chosen_labels'] for b in batch]),
        'rejected_labels': torch.stack([b['rejected_labels'] for b in batch]),
        'chosen_attention_mask': torch.stack([b['chosen_attention_mask'] for b in batch]),
        'rejected_attention_mask': torch.stack([b['rejected_attention_mask'] for b in batch]),
    }