catninja123 committed on
Commit 899504c · verified · 1 Parent(s): 5694ce7

Upload src/train_dpo.py with huggingface_hub

Files changed (1)
  1. src/train_dpo.py +375 -0
src/train_dpo.py ADDED
@@ -0,0 +1,375 @@
+ """
+ MASH Stage 3: DPO Alignment with GPTZero as Reward
+
+ 1. Use SFT model to generate paraphrases for training data
+ 2. Score each paraphrase with GPTZero API
+ 3. Construct preference pairs:
+    - chosen = human text (passes as human)
+    - rejected = model output that GPTZero detects as AI
+ 4. Train with DPO loss
+
+ GPTZero API is only called during data construction (~50-100 queries),
+ NOT during training itself.
+ """
+
+ import os
+ import sys
+ import json
+ import time
+ import argparse
+ import requests
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+ from torch.optim import AdamW
+ from torch.optim.lr_scheduler import CosineAnnealingLR
+
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+ from model import StyleBART
+ from dataset import MASHDPODataset, dpo_collate_fn
+
+
+ # ============================================================
+ # GPTZero API Integration
+ # ============================================================
+
+ GPTZERO_API_KEY = os.environ.get('GPTZERO_API_KEY', '')
+ GPTZERO_API_URL = 'https://api.gptzero.me/v2/predict/text'
+
+ def query_gptzero(text: str) -> dict:
+     """
+     Query GPTZero API for AI detection score.
+     Returns: {'ai_prob': float, 'human_prob': float, 'class': str}
+     """
+     if not GPTZERO_API_KEY:
+         raise ValueError("GPTZERO_API_KEY not set")
+
+     headers = {
+         'x-api-key': GPTZERO_API_KEY,
+         'Content-Type': 'application/json',
+     }
+     payload = {
+         'document': text,
+         'version': '2024-04-04',
+     }
+
+     for attempt in range(3):
+         try:
+             resp = requests.post(GPTZERO_API_URL, json=payload, headers=headers, timeout=30)
+             resp.raise_for_status()
+             result = resp.json()
+             doc = result.get('documents', [{}])[0]
+             return {
+                 'ai_prob': doc.get('completely_generated_prob', 0),
+                 'human_prob': 1 - doc.get('completely_generated_prob', 0),
+                 'class': doc.get('predicted_class', 'unknown'),
+             }
+         except Exception as e:
+             if attempt < 2:
+                 time.sleep(2 ** attempt)
+             else:
+                 print(f"GPTZero API error: {e}")
+                 return {'ai_prob': 0.5, 'human_prob': 0.5, 'class': 'error'}
+
+
+ # ============================================================
+ # DPO Data Construction
+ # ============================================================
+
+ def construct_dpo_data(sft_model_path: str, train_data_path: str,
+                        output_path: str, device: str = 'cuda',
+                        max_samples: int = 500, ai_threshold: float = 0.5):
+     """
+     Construct DPO preference pairs using SFT model + GPTZero.
+
+     For each sample:
+     1. Generate paraphrase with SFT model (human style)
+     2. Query GPTZero
+     3. If detected as AI → use as rejected; human text = chosen
+     4. If detected as human → skip (model already succeeds)
+     """
+     print(f"Loading SFT model from {sft_model_path}...")
+     model = StyleBART.load_pretrained(sft_model_path, device=device)
+     model = model.to(device)
+     model.eval()
+
+     # Load training data
+     raw_data = []
+     with open(train_data_path) as f:
+         for line in f:
+             raw_data.append(json.loads(line))
+
+     # Sample subset for DPO construction
+     import random
+     random.shuffle(raw_data)
+     raw_data = raw_data[:max_samples]
+
+     dpo_pairs = []
+     n_queried = 0
+     n_rejected = 0
+
+     print(f"Constructing DPO pairs from {len(raw_data)} samples...")
+
+     for i, d in enumerate(raw_data):
+         essay_type = d['type']
+         style_key = f'human_{essay_type}'
+
+         # Tokenize input
+         inputs = model.tokenizer(
+             d['input_text'],
+             max_length=512, truncation=True,
+             return_tensors='pt',
+         ).to(device)
+
+         # Generate with human style
+         with torch.no_grad():
+             generated = model.generate_text(
+                 inputs['input_ids'],
+                 inputs['attention_mask'],
+                 style_keys=[style_key],
+                 max_length=512, num_beams=4,
+             )
+
+         gen_text = model.tokenizer.decode(generated[0], skip_special_tokens=True)
+
+         # Query GPTZero
+         result = query_gptzero(gen_text)
+         n_queried += 1
+
+         if result['ai_prob'] > ai_threshold:
+             # Model failed to evade → good rejected sample
+             dpo_pairs.append({
+                 'input_text': d['input_text'],
+                 'chosen_text': d['human_text'],
+                 'rejected_text': gen_text,
+                 'style_key': style_key,
+                 'essay_type': essay_type,
+                 'gptzero_ai_prob': result['ai_prob'],
+             })
+             n_rejected += 1
+
+         if (i + 1) % 10 == 0:
+             print(f" [{i+1}/{len(raw_data)}] Queried: {n_queried}, "
+                   f"Rejected (usable): {n_rejected}")
+
+     # Save DPO data
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+     with open(output_path, 'w') as f:
+         for pair in dpo_pairs:
+             f.write(json.dumps(pair, ensure_ascii=False) + '\n')
+
+     print(f"\nDPO data construction complete:")
+     print(f" Total queried: {n_queried}")
+     print(f" Usable rejected pairs: {n_rejected}")
+     print(f" Rejection rate: {n_rejected/max(n_queried,1)*100:.1f}%")
+     print(f" Saved to: {output_path}")
+
+     return dpo_pairs
+
+
+ # ============================================================
+ # DPO Training
+ # ============================================================
+
+ def compute_dpo_loss(model, batch, device, beta=0.1, ref_model=None):
+     """
+     Compute DPO loss.
+
+     L_DPO = -E[log σ(β · (log π(y_w|x) - log π_ref(y_w|x))
+                    - β · (log π(y_l|x) - log π_ref(y_l|x)))]
+     """
+     input_ids = batch['input_ids'].to(device)
+     attention_mask = batch['attention_mask'].to(device)
+     chosen_labels = batch['chosen_labels'].to(device)
+     rejected_labels = batch['rejected_labels'].to(device)
+     style_keys = batch['style_keys']
+
+     # Compute log probs for chosen
+     chosen_outputs = model(input_ids, attention_mask, chosen_labels, style_keys)
+     chosen_logits = chosen_outputs.logits
+     chosen_log_probs = compute_sequence_log_probs(chosen_logits, chosen_labels)
+
+     # Compute log probs for rejected
+     rejected_outputs = model(input_ids, attention_mask, rejected_labels, style_keys)
+     rejected_logits = rejected_outputs.logits
+     rejected_log_probs = compute_sequence_log_probs(rejected_logits, rejected_labels)
+
+     # Reference model log probs (frozen SFT model)
+     if ref_model is not None:
+         with torch.no_grad():
+             ref_chosen_outputs = ref_model(input_ids, attention_mask, chosen_labels, style_keys)
+             ref_chosen_log_probs = compute_sequence_log_probs(ref_chosen_outputs.logits, chosen_labels)
+
+             ref_rejected_outputs = ref_model(input_ids, attention_mask, rejected_labels, style_keys)
+             ref_rejected_log_probs = compute_sequence_log_probs(ref_rejected_outputs.logits, rejected_labels)
+     else:
+         ref_chosen_log_probs = chosen_log_probs.detach()
+         ref_rejected_log_probs = rejected_log_probs.detach()
+
+     # DPO loss
+     chosen_rewards = beta * (chosen_log_probs - ref_chosen_log_probs)
+     rejected_rewards = beta * (rejected_log_probs - ref_rejected_log_probs)
+
+     loss = -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
+
+     # Metrics
+     with torch.no_grad():
+         reward_margin = (chosen_rewards - rejected_rewards).mean().item()
+         accuracy = (chosen_rewards > rejected_rewards).float().mean().item()
+
+     return loss, {
+         'loss': loss.item(),
+         'reward_margin': reward_margin,
+         'accuracy': accuracy,
+     }
+
+
+ def compute_sequence_log_probs(logits, labels):
+     """Compute per-sequence average log probability."""
+     shift_logits = logits[..., :-1, :].contiguous()
+     shift_labels = labels[..., 1:].contiguous()
+
+     log_probs = F.log_softmax(shift_logits, dim=-1)
+
+     # Gather log probs for actual tokens
+     token_log_probs = log_probs.gather(-1, shift_labels.clamp(min=0).unsqueeze(-1)).squeeze(-1)
+
+     # Mask padding (-100)
+     mask = (shift_labels != -100).float()
+
+     # Average log prob per sequence
+     seq_log_probs = (token_log_probs * mask).sum(dim=-1) / mask.sum(dim=-1).clamp(min=1)
+
+     return seq_log_probs
+
+
+ def train_dpo(model, ref_model, train_loader, optimizer, scheduler,
+               device, beta=0.1):
+     """Train one epoch of DPO."""
+     model.train()
+     total_metrics = {'loss': 0, 'reward_margin': 0, 'accuracy': 0}
+     n_batches = 0
+
+     for batch in train_loader:
+         loss, metrics = compute_dpo_loss(model, batch, device, beta, ref_model)
+
+         optimizer.zero_grad()
+         loss.backward()
+         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+         optimizer.step()
+         scheduler.step()
+
+         for k in total_metrics:
+             total_metrics[k] += metrics[k]
+         n_batches += 1
+
+     return {k: v / n_batches for k, v in total_metrics.items()}
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--mode', choices=['construct', 'train', 'both'], default='both')
+     parser.add_argument('--sft_model_path', default='checkpoints/sft/best')
+     parser.add_argument('--train_data', default='data/train.jsonl')
+     parser.add_argument('--dpo_data', default='data/dpo_pairs.jsonl')
+     parser.add_argument('--output_dir', default='checkpoints/dpo')
+     parser.add_argument('--batch_size', type=int, default=4)
+     parser.add_argument('--epochs', type=int, default=3)
+     parser.add_argument('--lr', type=float, default=1e-5)
+     parser.add_argument('--beta', type=float, default=0.1)
+     parser.add_argument('--max_dpo_samples', type=int, default=500)
+     parser.add_argument('--ai_threshold', type=float, default=0.5)
+     parser.add_argument('--seed', type=int, default=42)
+     args = parser.parse_args()
+
+     torch.manual_seed(args.seed)
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     print(f"Device: {device}")
+
+     # Stage 3a: Construct DPO data
+     if args.mode in ['construct', 'both']:
+         construct_dpo_data(
+             sft_model_path=args.sft_model_path,
+             train_data_path=args.train_data,
+             output_path=args.dpo_data,
+             device=str(device),
+             max_samples=args.max_dpo_samples,
+             ai_threshold=args.ai_threshold,
+         )
+
+     # Stage 3b: DPO Training
+     if args.mode in ['train', 'both']:
+         # Check DPO data exists
+         if not os.path.exists(args.dpo_data):
+             print(f"ERROR: DPO data not found at {args.dpo_data}")
+             print("Run with --mode construct first")
+             return
+
+         # Load DPO model (initialized from SFT)
+         print(f"\nLoading DPO model from {args.sft_model_path}...")
+         model = StyleBART.load_pretrained(args.sft_model_path, device=str(device))
+         model = model.to(device)
+
+         # Load reference model (frozen SFT)
+         print("Loading reference model (frozen)...")
+         ref_model = StyleBART.load_pretrained(args.sft_model_path, device=str(device))
+         ref_model = ref_model.to(device)
+         ref_model.eval()
+         for p in ref_model.parameters():
+             p.requires_grad = False
+
+         # Dataset
+         dpo_dataset = MASHDPODataset(
+             args.dpo_data, model.tokenizer,
+             max_input_len=512, max_target_len=512,
+         )
+         dpo_loader = DataLoader(
+             dpo_dataset, batch_size=args.batch_size,
+             shuffle=True, collate_fn=dpo_collate_fn,
+         )
+
+         print(f"DPO training pairs: {len(dpo_dataset)}")
+
+         # Optimizer
+         optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
+         total_steps = len(dpo_loader) * args.epochs
+         scheduler = CosineAnnealingLR(optimizer, T_max=total_steps, eta_min=1e-7)
+
+         # Training
+         os.makedirs(args.output_dir, exist_ok=True)
+         best_margin = -float('inf')
+
+         print(f"\n{'='*60}")
+         print(f"Starting DPO Training")
+         print(f" Epochs: {args.epochs}")
+         print(f" Beta: {args.beta}")
+         print(f" LR: {args.lr}")
+         print(f"{'='*60}\n")
+
+         for epoch in range(1, args.epochs + 1):
+             t0 = time.time()
+
+             metrics = train_dpo(
+                 model, ref_model, dpo_loader, optimizer, scheduler,
+                 device, beta=args.beta,
+             )
+
+             elapsed = time.time() - t0
+
+             print(f"Epoch {epoch}/{args.epochs} ({elapsed:.0f}s)")
+             print(f" Loss: {metrics['loss']:.4f}")
+             print(f" Reward margin: {metrics['reward_margin']:.4f}")
+             print(f" Accuracy: {metrics['accuracy']:.2%}")
+
+             if metrics['reward_margin'] > best_margin:
+                 best_margin = metrics['reward_margin']
+                 model.save_pretrained(os.path.join(args.output_dir, 'best'))
+                 print(f" ★ New best model saved")
+
+         # Save final
+         model.save_pretrained(os.path.join(args.output_dir, 'final'))
+         print(f"\nDPO training complete! Models saved to {args.output_dir}/")
+
+
+ if __name__ == '__main__':
+     main()
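
For reference, a minimal standalone sketch of the objective that compute_dpo_loss reduces to once per-sequence log-probabilities are available. The tensor values below are made up for illustration; only beta matches the script's default.

import torch
import torch.nn.functional as F

beta = 0.1  # same default as --beta above

# Made-up per-sequence average log-probs for a batch of 3 preference pairs
chosen_log_probs = torch.tensor([-1.0, -2.0, -1.5])        # log pi(y_w|x), policy
rejected_log_probs = torch.tensor([-3.0, -1.8, -2.5])      # log pi(y_l|x), policy
ref_chosen_log_probs = torch.tensor([-1.2, -2.1, -1.6])    # log pi_ref(y_w|x)
ref_rejected_log_probs = torch.tensor([-2.8, -1.9, -2.4])  # log pi_ref(y_l|x)

chosen_rewards = beta * (chosen_log_probs - ref_chosen_log_probs)
rejected_rewards = beta * (rejected_log_probs - ref_rejected_log_probs)

# L_DPO = -E[log sigmoid(chosen_rewards - rejected_rewards)];
# it drops below log(2) once the policy widens the chosen/rejected margin
# relative to the reference model.
loss = -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
accuracy = (chosen_rewards > rejected_rewards).float().mean()
print(f"loss={loss.item():.4f}  accuracy={accuracy.item():.2f}")

Note that compute_sequence_log_probs averages token log-probabilities over each sequence rather than summing them, so the effective scale of beta does not grow with target length.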