#!/usr/bin/env python3
"""
Training script for GoodGlinda-7B

Simplified reproduction skeleton - I ran this for 72 hours straight on my
i7-12700 + RTX 4060/5070 Ti Overclocked and Undervoltaged. At hour 14, this
threw OOM errors until I fixed the 83°C thermal throttling with a paste
replacement. Advised is to use Watercooled setup.
"""
import argparse

import torch
import deepspeed  # NOTE(review): not referenced directly below - presumably kept for the HF Trainer integration; confirm before removing
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType


def main():
    """Load the 4-bit quantized base model, attach LoRA adapters, and build
    the HF TrainingArguments from CLI options.

    CLI options:
        --model_name  HF model id to load (default: Qwen/Qwen2.5-7B-Instruct)
        --output_dir  checkpoint/output directory (default: ./output)
        --deepspeed   path to a DeepSpeed JSON config, or None to disable

    NOTE(review): this skeleton stops short of instantiating a Trainer or
    calling .train() - it only prepares the model and config, as the final
    prints acknowledge.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="Qwen/Qwen2.5-7B-Instruct")
    parser.add_argument("--output_dir", type=str, default="./output")
    parser.add_argument("--deepspeed", type=str, default=None)
    args = parser.parse_args()

    # 4-bit NF4 with double quantization to fit the 8GB 4060.
    # FIX: the original passed load_in_4bit / bnb_4bit_* as bare kwargs to
    # from_pretrained; transformers expects them wrapped in a
    # BitsAndBytesConfig handed over as quantization_config (the bare
    # bnb_4bit_* kwargs are not recognized there).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        # FIX: compute dtype was unset; align it with the bf16 training run
        # so dequantized matmuls don't fall back to fp32.
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load base model. The 5070 Ti handles the heavier loads but sits idle
    # 30% of the time waiting for the 4060.
    # NOTE(review): the original comment claimed DeepSpeed ZeRO-2 handles the
    # asymmetric VRAM (8GB + 16GB) via device_map="auto", but per HF docs
    # device_map="auto" and DeepSpeed ZeRO are mutually exclusive - confirm
    # which mechanism is actually sharding the model before a multi-GPU run.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # LoRA adapters for the verification heads (local at layer 7, arbitration
    # at 14, global at 28). Rank 128 OOM'd on the 4060, so rank is 64.
    lora_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)

    # Tokenizer setup: causal LMs often ship without a pad token, so reuse EOS.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Training arguments.
    # I wasted two days on pipeline parallelism before switching to ZeRO-2.
    # This config ran for 72 hours straight with 50,000 samples distilled
    # from DeepSeek-V2.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,  # effective batch size 4 per device
        learning_rate=2e-4,
        warmup_steps=500,
        logging_steps=10,
        save_steps=500,
        bf16=True,
        deepspeed=args.deepspeed,
        gradient_checkpointing=True,
        optim="adamw_torch",
    )

    print("Model loaded. Ready for training.")
    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
    print("Warning: This is a simplified skeleton. I trained for 72h on 50k samples.")
    print("Watch your thermals. I hit 83°C at hour 14 and had to repaste.")


if __name__ == "__main__":
    main()