# Hugging Face Spaces page status residue (extraction artifact, not part of the script):
# Spaces: Runtime error
| """ | |
| D1337 CIPHER - Custom Training Script | |
| ===================================== | |
| Optimized QLoRA training for 31B model on 4x L40S (192GB VRAM) | |
| Brand: D1337 SOVEREIGN LABS | |
| Model: GLM-4.7-Flash-abliterated (31B) -> D1337 CIPHER | |
| Based on TRL docs: https://huggingface.co/docs/trl/main/en/sft_trainer | |
| """ | |
| import os | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import BitsAndBytesConfig | |
| from peft import LoraConfig | |
| from trl import SFTTrainer, SFTConfig | |
def main():
    """Fine-tune the D1337 CIPHER model with QLoRA via TRL's ``SFTTrainer``.

    Loads the abliterated GLM-4.7-Flash base model in 4-bit (NF4) precision,
    attaches LoRA adapters to the attention and MLP projections, trains on the
    d1337-cipher dataset, and pushes the result to the Hugging Face Hub when
    an ``HF_TOKEN`` environment variable is present.

    Side effects: downloads the model and dataset, writes checkpoints to
    ``./d1337-cipher-output``, and may upload to the Hub. Returns ``None``.
    """
    print("=" * 60)
    print("D1337 CIPHER - Training")
    print("D1337 SOVEREIGN LABS")
    print("=" * 60)

    # Repo identifiers: base checkpoint, training data, and output model repo.
    BASE_MODEL = "huihui-ai/Huihui-GLM-4.7-Flash-abliterated"
    DATASET = "pacman1337/d1337-cipher-dataset"
    OUTPUT_MODEL = "pacman1337/d1337-cipher-v1"

    # Hub token is optional; without it we train locally and skip the push.
    hf_token = os.environ.get("HF_TOKEN")
    print(f"HF_TOKEN: {'Found' if hf_token else 'Not found'}")

    # Report available GPUs so a mis-scheduled run is obvious in the logs.
    if torch.cuda.is_available():
        print(f"GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            print(f" GPU {i}: {props.name} ({props.total_memory / (1024**3):.1f} GB)")
    else:
        print("WARNING: No GPU!")

    # Load the SFT dataset. NOTE(review): assumes rows are in a format
    # SFTTrainer can tokenize directly (e.g. "messages" or "text" columns)
    # -- confirm against the dataset repo.
    print(f"\nLoading dataset: {DATASET}")
    dataset = load_dataset(DATASET, split="train")
    print(f"Dataset: {len(dataset)} samples")

    # QLoRA 4-bit NF4 quantization: bf16 compute with double quantization to
    # fit the 31B base model into the available VRAM.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    # LoRA adapters on every attention and MLP projection.
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

    # All training hyperparameters live in SFTConfig (TRL's TrainingArguments
    # subclass).
    sft_config = SFTConfig(
        output_dir="./d1337-cipher-output",
        # Schedule: effective batch size = 1 sample x 8 accumulation steps
        # per device.
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=2e-4,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        # Memory/throughput: bf16 math plus gradient checkpointing.
        bf16=True,
        gradient_checkpointing=True,
        max_grad_norm=1.0,
        # Logging / checkpointing.
        logging_steps=1,
        save_steps=50,
        save_total_limit=2,
        report_to="none",
        # Hub upload only when a token is available
        # (was the non-idiomatic `True if hf_token else False`).
        push_to_hub=bool(hf_token),
        hub_model_id=OUTPUT_MODEL,
        hub_token=hf_token,
        hub_private_repo=True,
        # SFT-specific: truncate sequences to 2048 tokens; no packing.
        max_length=2048,
        packing=False,
        # Forwarded to AutoModelForCausalLM.from_pretrained by SFTTrainer
        # when `model` is given as a repo id string.
        model_init_kwargs={
            "quantization_config": bnb_config,
            "device_map": "auto",
            "trust_remote_code": True,
            "torch_dtype": torch.bfloat16,
        },
    )

    # SFTTrainer loads the model from the repo id and applies the PEFT config.
    print(f"\nLoading model: {BASE_MODEL}")
    trainer = SFTTrainer(
        model=BASE_MODEL,
        args=sft_config,
        train_dataset=dataset,
        peft_config=peft_config,
    )

    # Sanity check: with LoRA only a small fraction should be trainable.
    trainable = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in trainer.model.parameters())
    print(f"Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

    print("\n" + "=" * 60)
    print("TRAINING STARTED")
    print("=" * 60)
    trainer.train()

    # Persist the adapter locally, then push when credentials exist.
    print("\nSaving model...")
    trainer.save_model()
    if hf_token:
        print(f"Pushing to hub: {OUTPUT_MODEL}")
        trainer.push_to_hub()

    print("\n" + "=" * 60)
    print("TRAINING COMPLETE!")
    print(f"Model: {OUTPUT_MODEL}")
    print("=" * 60)


if __name__ == "__main__":
    main()