Commit 3e8f1b5 by Desorden1337
Fix: use dtype= like official example for GLM model
import subprocess
import sys
# Install dependencies at runtime
subprocess.run([sys.executable, "-m", "pip", "install", "peft", "bitsandbytes", "-q"])
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os
print("🔥 D1337 CIPHER - L40S x4 TRAINING")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("⚠️ WARNING: No GPU detected! Training will be VERY slow on CPU.")
# Model - EXACTLY from official HuggingFace page
model_name = "huihui-ai/Huihui-GLM-4.7-Flash-abliterated"
print(f"\n🔥 Loading: {model_name}")
# Tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# 4-bit quantization - EXACTLY from official example
print("Loading model with 4-bit quantization (31B params)...")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
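# Rough memory math (an estimate, not measured): ~31B params at ~4 bits each is
# about 15-16 GB of weights plus quantization overhead, so the base model should
# fit comfortably within the 4x L40S (48 GB each) budget.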
# Load model - the official example passes dtype=, not the older torch_dtype=
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,  # OFFICIAL EXAMPLE USES dtype=
    device_map="auto",
    trust_remote_code=True,
    quantization_config=quant_config,
)
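# Note: with load_in_4bit, the quantized weights use NF4 and matmuls run in
# bnb_4bit_compute_dtype; dtype= here mainly sets the precision of the
# remaining non-quantized modules (e.g. layer norms, embeddings).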
print("✅ Huihui-GLM-4.7-Flash-abliterated loaded!")
# LoRA for efficient fine-tuning
print("\nSetting up LoRA...")
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
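# Assumption: these target module names follow ChatGLM-style naming
# (query_key_value, dense, ...). If get_peft_model raises a ValueError about
# missing target modules on this checkpoint, list the actual names with
#   print([n for n, _ in model.named_modules()])
# and adjust target_modules (some GLM ports use q_proj/k_proj/v_proj/o_proj).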
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Load dataset
print("\nLoading dataset...")
dataset = load_dataset("Desorden1337/d1337-cipher-dataset", split="train")
print(f"Dataset size: {len(dataset)} samples")
# Tokenize
def tokenize(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=2048)
    # Mask padding positions with -100 so the loss is computed only on real tokens
    tokens["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens
dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)
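# Optional sanity check (a quick sketch): each example should come out padded or
# truncated to exactly 2048 tokens, with labels masked on padding positions.
sample = dataset[0]
assert len(sample["input_ids"]) == 2048
assert all(lab == -100 for lab, m in zip(sample["labels"], sample["attention_mask"]) if m == 0)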
# Training args - optimized for L40S x4
training_args = TrainingArguments(
    output_dir="./d1337-cipher",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=1,
    save_steps=25,
    save_total_limit=2,
    bf16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    push_to_hub=True,
    hub_model_id="Desorden1337/d1337-cipher-v1",
    hub_private_repo=True,
    report_to="none",
)
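# Note: device_map="auto" shards the model across the 4 GPUs (model parallelism,
# a single data stream), so the effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 8 = 16,
# not multiplied by the GPU count.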
# Train
print("\n🚀 STARTING TRAINING...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,  # newer transformers deprecate this in favor of processing_class=
)
trainer.train()
print("\n📤 Pushing to Hub...")
trainer.push_to_hub()
print("\n✅ TRAINING COMPLETE! Model: Desorden1337/d1337-cipher-v1")