# Repository: fivehi7s — Training scripts and DeepSpeed configs (commit 023deeb)
#!/usr/bin/env python3
"""
Training script for GoodGlinda-7B
Simplified reproduction skeleton - I ran this for 72 hours straight on my i7-12700 + RTX 4060/5070 Ti Overclocked and Undervoltaged.
At hour 14, this threw OOM errors until I fixed the 83°C thermal throttling with a paste replacement.
Advised is to use Watercooled setup.
"""
import argparse

import torch

import deepspeed
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
def main():
    """Prepare GoodGlinda-7B for LoRA fine-tuning.

    Parses CLI args, loads the base model 4-bit quantized, attaches LoRA
    adapters, and builds the HF ``TrainingArguments``.

    NOTE(review): this skeleton stops short of constructing a ``Trainer``
    and calling ``.train()`` — it only prepares model, tokenizer, and the
    training configuration (the final prints say as much).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="Qwen/Qwen2.5-7B-Instruct")
    parser.add_argument("--output_dir", type=str, default="./output")
    parser.add_argument("--deepspeed", type=str, default=None)
    args = parser.parse_args()

    # Load base model: 4-bit NF4 with double quantization to fit the 8GB 4060.
    # Fix: passing load_in_4bit / bnb_4bit_* directly to from_pretrained is
    # deprecated (and removed in recent transformers); use an explicit
    # BitsAndBytesConfig instead.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,  # match the bf16 training below
    )
    # Fix: device_map="auto" conflicts with DeepSpeed — under ZeRO, DeepSpeed
    # owns device placement, and the HF Trainer rejects the combination.
    # Only let accelerate shard across the asymmetric VRAM (8GB + 16GB) when
    # DeepSpeed is not in use.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        device_map="auto" if args.deepspeed is None else None,
    )

    # LoRA adapters for the verification heads (local at layer 7, arbitration
    # at 14, global at 28). Rank 128 OOM'd on the 4060, so rank 64 it is.
    lora_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)

    # Tokenizer setup — Qwen ships no dedicated pad token, so reuse EOS.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Training arguments. ZeRO-2 (after two wasted days on pipeline
    # parallelism); the original run was 72h over 50,000 samples distilled
    # from DeepSeek-V2.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        warmup_steps=500,
        logging_steps=10,
        save_steps=500,
        bf16=True,
        deepspeed=args.deepspeed,
        gradient_checkpointing=True,
        optim="adamw_torch",
    )

    print("Model loaded. Ready for training.")
    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
    print("Warning: This is a simplified skeleton. I trained for 72h on 50k samples.")
    print("Watch your thermals. I hit 83°C at hour 14 and had to repaste.")
if __name__ == "__main__":
main()