# Repository: fivehi7s — Training scripts and DeepSpeed configs (commit 023deeb)
#!/usr/bin/env python3
"""
Training script for GoodGlinda-7B
Simplified reproduction skeleton - I ran this for 72 hours straight on my i7-12700 + RTX 4060/5070 Ti Overclocked and Undervoltaged.
At hour 14, this threw OOM errors until I fixed the 83°C thermal throttling with a paste replacement.
Advised is to use Watercooled setup.
"""
import argparse

import torch

import deepspeed
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
def main():
    """Prepare GoodGlinda-7B for LoRA fine-tuning.

    Parses CLI args, loads the base model 4-bit quantized, attaches LoRA
    adapters, and builds the HF ``TrainingArguments``.

    NOTE(review): this skeleton stops short of constructing a ``Trainer``
    and calling ``.train()`` — it only prepares model, tokenizer, and the
    training configuration (the final prints say as much).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="Qwen/Qwen2.5-7B-Instruct")
    parser.add_argument("--output_dir", type=str, default="./output")
    parser.add_argument("--deepspeed", type=str, default=None)
    args = parser.parse_args()

    # Load base model: 4-bit NF4 with double quantization to fit the 8GB 4060.
    # Fix: passing load_in_4bit / bnb_4bit_* directly to from_pretrained is
    # deprecated (and removed in recent transformers); use an explicit
    # BitsAndBytesConfig instead.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,  # match the bf16 training below
    )
    # Fix: device_map="auto" conflicts with DeepSpeed — under ZeRO, DeepSpeed
    # owns device placement, and the HF Trainer rejects the combination.
    # Only let accelerate shard across the asymmetric VRAM (8GB + 16GB) when
    # DeepSpeed is not in use.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        device_map="auto" if args.deepspeed is None else None,
    )

    # LoRA adapters for the verification heads (local at layer 7, arbitration
    # at 14, global at 28). Rank 128 OOM'd on the 4060, so rank 64 it is.
    lora_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)

    # Tokenizer setup — Qwen ships no dedicated pad token, so reuse EOS.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Training arguments. ZeRO-2 (after two wasted days on pipeline
    # parallelism); the original run was 72h over 50,000 samples distilled
    # from DeepSeek-V2.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        warmup_steps=500,
        logging_steps=10,
        save_steps=500,
        bf16=True,
        deepspeed=args.deepspeed,
        gradient_checkpointing=True,
        optim="adamw_torch",
    )

    print("Model loaded. Ready for training.")
    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
    print("Warning: This is a simplified skeleton. I trained for 72h on 50k samples.")
    print("Watch your thermals. I hit 83°C at hour 14 and had to repaste.")
if __name__ == "__main__":
main()