#!/usr/bin/env python3
"""
Training script for GoodGlinda-7B

Simplified reproduction skeleton - I ran this for 72 hours straight on my
i7-12700 + RTX 4060/5070 Ti Overclocked and Undervoltaged. At hour 14, this
threw OOM errors until I fixed the 83°C thermal throttling with a paste
replacement. Advised is to use Watercooled setup.
"""
import argparse

import torch
import deepspeed  # NOTE(review): not referenced directly below - presumably kept for the HF Trainer integration; confirm before removing
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType


def main():
    """Load the 4-bit quantized base model, attach LoRA adapters, and build
    the HF TrainingArguments from CLI options.

    CLI options:
        --model_name  HF model id to load (default: Qwen/Qwen2.5-7B-Instruct)
        --output_dir  checkpoint/output directory (default: ./output)
        --deepspeed   path to a DeepSpeed JSON config, or None to disable

    NOTE(review): this skeleton stops short of instantiating a Trainer or
    calling .train() - it only prepares the model and config, as the final
    prints acknowledge.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="Qwen/Qwen2.5-7B-Instruct")
    parser.add_argument("--output_dir", type=str, default="./output")
    parser.add_argument("--deepspeed", type=str, default=None)
    args = parser.parse_args()

    # 4-bit NF4 with double quantization to fit the 8GB 4060.
    # FIX: the original passed load_in_4bit / bnb_4bit_* as bare kwargs to
    # from_pretrained; transformers expects them wrapped in a
    # BitsAndBytesConfig handed over as quantization_config (the bare
    # bnb_4bit_* kwargs are not recognized there).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        # FIX: compute dtype was unset; align it with the bf16 training run
        # so dequantized matmuls don't fall back to fp32.
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load base model. The 5070 Ti handles the heavier loads but sits idle
    # 30% of the time waiting for the 4060.
    # NOTE(review): the original comment claimed DeepSpeed ZeRO-2 handles the
    # asymmetric VRAM (8GB + 16GB) via device_map="auto", but per HF docs
    # device_map="auto" and DeepSpeed ZeRO are mutually exclusive - confirm
    # which mechanism is actually sharding the model before a multi-GPU run.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    # LoRA adapters for the verification heads (local at layer 7, arbitration
    # at 14, global at 28). Rank 128 OOM'd on the 4060, so rank is 64.
    lora_config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)

    # Tokenizer setup: causal LMs often ship without a pad token, so reuse EOS.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Training arguments.
    # I wasted two days on pipeline parallelism before switching to ZeRO-2.
    # This config ran for 72 hours straight with 50,000 samples distilled
    # from DeepSeek-V2.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,  # effective batch size 4 per device
        learning_rate=2e-4,
        warmup_steps=500,
        logging_steps=10,
        save_steps=500,
        bf16=True,
        deepspeed=args.deepspeed,
        gradient_checkpointing=True,
        optim="adamw_torch",
    )

    print("Model loaded. Ready for training.")
    print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
    print("Warning: This is a simplified skeleton. I trained for 72h on 50k samples.")
    print("Watch your thermals. I hit 83°C at hour 14 and had to repaste.")


if __name__ == "__main__":
    main()