""" D1337 CIPHER - Custom Training Script ===================================== Optimized QLoRA training for 31B model on 4x L40S (192GB VRAM) Brand: D1337 SOVEREIGN LABS Model: GLM-4.7-Flash-abliterated (31B) -> D1337 CIPHER Based on TRL docs: https://huggingface.co/docs/trl/main/en/sft_trainer """ import os import torch from datasets import load_dataset from transformers import BitsAndBytesConfig from peft import LoraConfig from trl import SFTTrainer, SFTConfig def main(): print("=" * 60) print("D1337 CIPHER - Training") print("D1337 SOVEREIGN LABS") print("=" * 60) # Config BASE_MODEL = "huihui-ai/Huihui-GLM-4.7-Flash-abliterated" DATASET = "pacman1337/d1337-cipher-dataset" OUTPUT_MODEL = "pacman1337/d1337-cipher-v1" # Get HF token hf_token = os.environ.get("HF_TOKEN", None) print(f"HF_TOKEN: {'Found' if hf_token else 'Not found'}") # Check GPU if torch.cuda.is_available(): print(f"GPUs: {torch.cuda.device_count()}") for i in range(torch.cuda.device_count()): props = torch.cuda.get_device_properties(i) print(f" GPU {i}: {props.name} ({props.total_memory / (1024**3):.1f} GB)") else: print("WARNING: No GPU!") # Load dataset print(f"\nLoading dataset: {DATASET}") dataset = load_dataset(DATASET, split="train") print(f"Dataset: {len(dataset)} samples") # QLoRA config (4-bit quantization) bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, ) # LoRA config peft_config = LoraConfig( r=32, lora_alpha=64, lora_dropout=0.05, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], bias="none", task_type="CAUSAL_LM", ) # SFT Config - all training args here sft_config = SFTConfig( output_dir="./d1337-cipher-output", # Training num_train_epochs=3, per_device_train_batch_size=1, gradient_accumulation_steps=8, learning_rate=2e-4, weight_decay=0.01, warmup_ratio=0.1, lr_scheduler_type="cosine", # Optimization bf16=True, gradient_checkpointing=True, max_grad_norm=1.0, # Logging logging_steps=1, save_steps=50, save_total_limit=2, report_to="none", # Hub push_to_hub=True if hf_token else False, hub_model_id=OUTPUT_MODEL, hub_token=hf_token, hub_private_repo=True, # SFT specific max_length=2048, packing=False, # Model init kwargs for quantization model_init_kwargs={ "quantization_config": bnb_config, "device_map": "auto", "trust_remote_code": True, "torch_dtype": torch.bfloat16, }, ) # Create trainer - SFTTrainer handles everything print(f"\nLoading model: {BASE_MODEL}") trainer = SFTTrainer( model=BASE_MODEL, args=sft_config, train_dataset=dataset, peft_config=peft_config, ) # Print trainable params trainable = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad) total = sum(p.numel() for p in trainer.model.parameters()) print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)") # Train print("\n" + "=" * 60) print("TRAINING STARTED") print("=" * 60) trainer.train() # Save print("\nSaving model...") trainer.save_model() if hf_token: print(f"Pushing to hub: {OUTPUT_MODEL}") trainer.push_to_hub() print("\n" + "=" * 60) print("TRAINING COMPLETE!") print(f"Model: {OUTPUT_MODEL}") print("=" * 60) if __name__ == "__main__": main()