# /// script
# requires-python = ">=3.10"
# dependencies = [
# "torch>=2.0.0",
# "transformers @ git+https://github.com/huggingface/transformers.git",
# "trl>=0.12.0",
# "peft>=0.7.0",
# "accelerate>=0.24.0",
# "datasets",
# "trackio",
# "bitsandbytes",
# ]
# ///
"""
Fine-tune GLM-4.7-Flash on Unblinded Mastery dataset for QA and instruction following.
Using TRL SFTTrainer with LoRA on H100.
"""
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import gc
import trackio
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
# --- Configuration --------------------------------------------------------
# Hub ids: base model to fine-tune, training dataset, and destination repo.
MODEL_NAME = "zai-org/GLM-4.7-Flash"
DATASET_NAME = "LordNeel/unblinded-mastery-sharegpt"
OUTPUT_MODEL = "LordNeel/GLM-4.7-Flash-Unblinded-Mastery"
print("=" * 60)
print("GLM-4.7-Flash Fine-tuning for Unblinded Mastery")
print("=" * 60)
# Load dataset (the Hub repo's "train" split only).
print("\nLoading dataset...")
dataset = load_dataset(DATASET_NAME, split="train")
print(f"Dataset loaded: {len(dataset)} examples")
# Create train/eval split: hold out 5% for evaluation; fixed seed keeps the
# split reproducible across runs.
print("Creating train/eval split...")
dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]
print(f" Train: {len(train_dataset)} examples")
print(f" Eval: {len(eval_dataset)} examples")
# 4-bit quantization config for memory efficiency (QLoRA-style setup: NF4
# weight quantization, bf16 compute, double-quantized quant constants).
print("\nSetting up 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Some causal-LM tokenizers ship without a pad token; reuse EOS so
    # padded batches can be built.
    tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer loaded. Vocab size: {len(tokenizer)}")
# Load model with 4-bit quantization
print("\nLoading model with 4-bit quantization...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    # NOTE(review): recent transformers releases rename `torch_dtype` to
    # `dtype`; since this script pins transformers@git, confirm the kwarg
    # is still accepted.
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_cache=False,  # Disable KV cache for training
    attn_implementation="eager",  # Use standard attention to save memory
)
print("Model loaded!")
# Enable gradient checkpointing (non-reentrant variant, as recommended for
# PEFT-style training).
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
# Enable input gradients for LoRA (lighter than prepare_model_for_kbit_training)
model.enable_input_require_grads()
# Clear memory before LoRA setup.
gc.collect()
torch.cuda.empty_cache()
print(f"GPU Memory: {torch.cuda.memory_allocated()/1024**3:.2f} GB allocated")
def find_all_linear_names(model):
    """Return the leaf names of every ``torch.nn.Linear`` submodule.

    Collects the final dotted-path component of each linear layer (e.g.
    ``"q_proj"`` from ``"model.layers.0.self_attn.q_proj"``), de-duplicated.
    The output head ``"lm_head"`` is excluded so LoRA adapters are never
    attached to it.
    """
    linear_leaf_names = {
        name.rsplit(".", 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }
    # Never adapt the output projection.
    linear_leaf_names.discard("lm_head")
    return list(linear_leaf_names)
# Discover LoRA target modules from the loaded model's linear layers.
target_modules = find_all_linear_names(model)
print(f" Found target modules: {target_modules}")
# LoRA configuration - using lower rank for memory efficiency
print("\nConfiguring LoRA...")
peft_config = LoraConfig(
    r=16,  # adapter rank
    lora_alpha=32,  # scaling factor (alpha/r = 2.0)
    lora_dropout=0.05,
    bias="none",  # keep all bias terms frozen
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
)
# Apply LoRA adapters; the quantized base weights stay frozen.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Format function for ShareGPT conversations
def format_sharegpt(example):
    """Convert one ShareGPT example into a single chat-templated text field.

    Args:
        example: Dataset row with a ``"conversations"`` list of turns, each
            a dict with ``"from"`` (speaker tag) and ``"value"`` (text).

    Returns:
        Dict with a single ``"text"`` key holding the rendered conversation,
        suitable for ``SFTConfig(dataset_text_field="text")``.
    """
    # Map ShareGPT speaker tags to chat-template roles; unknown tags pass
    # through unchanged. Built once, outside the per-turn loop (the original
    # rebuilt this dict on every iteration).
    role_map = {"system": "system", "human": "user", "gpt": "assistant"}
    messages = [
        {"role": role_map.get(turn["from"], turn["from"]), "content": turn["value"]}
        for turn in example["conversations"]
    ]
    # Render with the model's own chat template; no trailing generation
    # prompt, since the assistant turns are already part of the example.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    return {"text": text}
# Format datasets: replace the raw ShareGPT columns with the rendered
# "text" column expected by the trainer.
print("\nFormatting datasets...")
train_dataset = train_dataset.map(format_sharegpt, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(format_sharegpt, remove_columns=eval_dataset.column_names)
print("Datasets formatted!")
# Training configuration
print("\nConfiguring training...")
training_config = SFTConfig(
    # Hub settings - CRITICAL for saving
    output_dir=OUTPUT_MODEL.split("/")[-1],
    push_to_hub=True,
    hub_model_id=OUTPUT_MODEL,
    hub_strategy="every_save",  # push every checkpoint, not just the final model
    hub_private_repo=False,
    # Training parameters
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # Effective batch size: 16
    learning_rate=2e-4,
    max_length=1024,  # Reduced for memory
    # Memory optimization
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # Logging & checkpointing
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,  # keep only the 3 most recent checkpoints on disk
    # Evaluation
    eval_strategy="steps",
    eval_steps=100,
    # Optimization
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",  # paged 8-bit AdamW to reduce optimizer memory
    # Precision
    bf16=True,
    fp16=False,
    # Monitoring
    report_to="trackio",
    project="unblinded-mastery-finetuning",
    run_name="glm47flash-sft-lora",
    # Dataset
    dataset_text_field="text",  # column produced by format_sharegpt above
    packing=False,
)
# Initialize trainer
print("\nInitializing trainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_config,
    processing_class=tokenizer,
    peft_config=None,  # Already applied above
)
# Train
print("\n" + "=" * 60)
print("STARTING TRAINING")
print("=" * 60)
trainer.train()
# Save and push to hub (final adapter weights + tokenizer).
print("\nSaving model to Hub...")
trainer.save_model()
trainer.push_to_hub()
# Finish tracking
trackio.finish()
print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print(f"Model saved to: https://huggingface.co/{OUTPUT_MODEL}")
print(f"View metrics at: https://huggingface.co/spaces/LordNeel/trackio")
print("=" * 60)