import subprocess
import sys

# Install dependencies at runtime (quietly); check=True surfaces install failures early
subprocess.run([sys.executable, "-m", "pip", "install", "peft", "bitsandbytes", "-q"], check=True)

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

print("šŸ”„ D1337 CIPHER - L40S x4 TRAINING")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("āš ļø WARNING: No GPU detected! Training will be VERY slow on CPU.")

# Model - name exactly as listed on the official Hugging Face page
model_name = "huihui-ai/Huihui-GLM-4.7-Flash-abliterated"
print(f"\nšŸ”„ Loading: {model_name}")

# Tokenizer; fall back to EOS as the pad token if none is defined
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization, matching the official example,
# so the 31B model fits in GPU memory
print("Loading model with 4-bit quantization (31B params)...")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load the model. The official example passes dtype= (the newer transformers
# name for the older torch_dtype= keyword).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=quant_config,
)
print("āœ… Huihui-GLM-4.7-Flash-abliterated loaded!")

# LoRA for parameter-efficient fine-tuning on top of the 4-bit base
print("\nSetting up LoRA...")
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    # GLM-style projection names; verify against model.named_modules() for this checkpoint
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load dataset
print("\nLoading dataset...")
dataset = load_dataset("Desorden1337/d1337-cipher-dataset", split="train")
print(f"Dataset size: {len(dataset)} samples")

# Tokenize for causal LM: labels mirror input_ids, but padding positions are
# masked to -100 so no loss is computed on pad tokens. Masking via the
# attention mask (rather than pad_token_id) keeps the real EOS label intact,
# since pad_token may be aliased to eos_token above.
def tokenize(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=2048)
    tokens["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

# Training args - tuned for 4x L40S (effective batch = 2 per device x 8 accumulation steps)
training_args = TrainingArguments(
    output_dir="./d1337-cipher",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=1,
    save_steps=25,
    save_total_limit=2,
    bf16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    push_to_hub=True,
    hub_model_id="Desorden1337/d1337-cipher-v1",
    hub_private_repo=True,
    report_to="none",
)

# Train. Note: newer transformers releases prefer processing_class=tokenizer;
# tokenizer= still works but is deprecated.
print("\nšŸš€ STARTING TRAINING...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)
trainer.train()

print("\nšŸ“¤ Pushing to Hub...")
trainer.push_to_hub()

print("\nāœ… TRAINING COMPLETE! Model: Desorden1337/d1337-cipher-v1")
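
# --- Optional post-training smoke test: a minimal sketch, not part of the
# original script. Assumes the adapter pushed above is available at
# Desorden1337/d1337-cipher-v1 (private, so the session must be authenticated).
# Best run in a fresh session so the base model is not held in memory twice;
# the prompt below is a placeholder.
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=quant_config,
)
# Attach the trained LoRA adapter from the Hub on top of the 4-bit base
ft_model = PeftModel.from_pretrained(base, "Desorden1337/d1337-cipher-v1")
ft_model.eval()

inputs = tokenizer("Hello", return_tensors="pt").to(ft_model.device)
with torch.no_grad():
    out = ft_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))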