"""
D1337 CIPHER - Custom Training Script
=====================================
Optimized QLoRA training for 31B model on 4x L40S (192GB VRAM)

Brand: D1337 SOVEREIGN LABS
Model: GLM-4.7-Flash-abliterated (31B) -> D1337 CIPHER

Based on TRL docs: https://huggingface.co/docs/trl/main/en/sft_trainer
"""

import os
import torch
from datasets import load_dataset
from transformers import BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

def _print_banner(*lines, leading_blank=False) -> None:
    """Print ``lines`` framed by two 60-character ``=`` rules.

    Args:
        *lines: Text lines printed between the rules, one per line.
        leading_blank: When true, the opening rule is preceded by a newline
            so the banner is visually separated from earlier output.
    """
    rule = "=" * 60
    print(("\n" if leading_blank else "") + rule)
    for line in lines:
        print(line)
    print(rule)


def _report_gpus() -> None:
    """Print name and total memory of each CUDA device, or a warning if none."""
    if not torch.cuda.is_available():
        # Deliberately only a warning: the run is allowed to continue.
        print("WARNING: No GPU!")
        return

    gpu_count = torch.cuda.device_count()
    print(f"GPUs: {gpu_count}")
    for i in range(gpu_count):
        props = torch.cuda.get_device_properties(i)
        print(f"  GPU {i}: {props.name} ({props.total_memory / (1024**3):.1f} GB)")


def _build_quantization_config():
    """Return the 4-bit NF4 BitsAndBytesConfig (double quant, bf16 compute)."""
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )


def _build_lora_config():
    """Return the LoRA adapter config (r=32, alpha=64) for causal-LM training."""
    return LoraConfig(
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        # Adapters are attached to the attention and MLP projection layers.
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )


def _build_sft_config(hub_model_id, hf_token, bnb_config):
    """Return the SFTConfig holding every training, logging and hub setting.

    Args:
        hub_model_id: Hub repository id the trained model is published under.
        hf_token: Hugging Face token, or a falsy value when unavailable;
            pushing to the hub is enabled only when a token is present.
        bnb_config: Quantization config forwarded through
            ``model_init_kwargs`` for use when the model is loaded.
    """
    return SFTConfig(
        output_dir="./d1337-cipher-output",

        # Training: batch size 1 per device with 8 accumulation steps.
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",

        # Optimization
        bf16=True,
        gradient_checkpointing=True,
        max_grad_norm=1.0,

        # Logging / checkpointing: log every step, checkpoint every 50 steps,
        # keep at most 2 checkpoints, no external experiment tracker.
        logging_steps=1,
        save_steps=50,
        save_total_limit=2,
        report_to="none",

        # Hub
        push_to_hub=bool(hf_token),
        hub_model_id=hub_model_id,
        hub_token=hf_token,
        hub_private_repo=True,

        # SFT specific
        max_length=2048,
        packing=False,

        # Model init kwargs for quantization; used because the model is
        # handed to the trainer as a repository id rather than an instance.
        model_init_kwargs={
            "quantization_config": bnb_config,
            "device_map": "auto",
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,
        },
    )


def _report_trainable_params(model) -> None:
    """Print trainable vs. total parameter counts of ``model`` and their ratio."""
    trainable = 0
    total = 0
    # Single pass over the parameters instead of one pass per counter.
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count

    # Guard the ratio so a parameter-less model cannot raise ZeroDivisionError.
    percent = 100 * trainable / total if total else 0.0
    print(f"Trainable: {trainable:,} / {total:,} ({percent:.2f}%)")


def main() -> None:
    """Run the D1337 CIPHER QLoRA fine-tuning job end to end.

    Steps, in order: print a banner, read ``HF_TOKEN`` from the environment,
    report the visible GPUs, load the training split of the dataset, build the
    quantization / LoRA / SFT configs, create the ``SFTTrainer``, train, save
    the model to the output directory, and push it to the hub when a token is
    available.

    Raises:
        ValueError: If the training split contains no samples.
    """
    _print_banner("D1337 CIPHER - Training", "D1337 SOVEREIGN LABS")

    # Config
    BASE_MODEL = "huihui-ai/Huihui-GLM-4.7-Flash-abliterated"
    DATASET = "pacman1337/d1337-cipher-dataset"
    OUTPUT_MODEL = "pacman1337/d1337-cipher-v1"

    # The token's presence decides whether the result is pushed to the hub.
    hf_token = os.environ.get("HF_TOKEN")
    print(f"HF_TOKEN: {'Found' if hf_token else 'Not found'}")

    _report_gpus()

    # Load dataset
    print(f"\nLoading dataset: {DATASET}")
    dataset = load_dataset(DATASET, split="train")
    num_samples = len(dataset)
    print(f"Dataset: {num_samples} samples")
    if num_samples == 0:
        # Fail before the expensive model load rather than saving (and
        # possibly publishing) a model that was never trained on anything.
        raise ValueError(f"Dataset {DATASET!r} has no training samples")

    bnb_config = _build_quantization_config()
    peft_config = _build_lora_config()
    sft_config = _build_sft_config(OUTPUT_MODEL, hf_token, bnb_config)

    # The model is passed as a repository id; loading is left to SFTTrainer,
    # which receives the quantization settings via sft_config.
    print(f"\nLoading model: {BASE_MODEL}")
    trainer = SFTTrainer(
        model=BASE_MODEL,
        args=sft_config,
        train_dataset=dataset,
        peft_config=peft_config,
    )

    _report_trainable_params(trainer.model)

    _print_banner("TRAINING STARTED", leading_blank=True)
    trainer.train()

    # Save
    print("\nSaving model...")
    trainer.save_model()

    if hf_token:
        # NOTE(review): push_to_hub is already enabled in the SFTConfig when a
        # token is present, so save_model() may have uploaded already; confirm
        # whether this explicit push is still needed.
        print(f"Pushing to hub: {OUTPUT_MODEL}")
        trainer.push_to_hub()

    _print_banner("TRAINING COMPLETE!", f"Model: {OUTPUT_MODEL}", leading_blank=True)


# Run the training job only when this file is executed as a script, so that
# importing the module does not start a training run.
if __name__ == "__main__":
    main()