# /// script
# dependencies = ["trl", "peft", "bitsandbytes", "datasets", "transformers"]
# ///
"""QLoRA supervised fine-tune of Qwen2.5-32B-Instruct on a survival-instruction dataset.

Loads the dataset, drops empty rows, renders each row into Qwen's ChatML
format, then trains 4-bit-quantized LoRA adapters and pushes to the Hub.
"""
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os

# Configuration
MODEL_ID = "Qwen/Qwen2.5-32B-Instruct"
DATASET_ID = "sunkencity/survival-instruct"
OUTPUT_MODEL_ID = "sunkencity/survival-expert-qwen-32b"


def filter_empty(example):
    """Return True when both 'instruction' and 'response' are present and non-blank."""
    instruction = example.get("instruction")
    response = example.get("response")
    return (
        instruction is not None
        and response is not None
        and bool(instruction.strip())
        and bool(response.strip())
    )


def format_row(example):
    """Render one dataset row into Qwen's ChatML training format.

    BUGFIX: the original appended ``tokenizer.eos_token`` after the closing
    ``<|im_end|>``. Qwen2.5-Instruct's EOS token *is* ``<|im_end|>``, so every
    example ended with a duplicated EOS, degrading the stop-token signal.
    The single ``<|im_end|>`` already terminates the assistant turn.
    """
    text = (
        f"<|im_start|>user\n{example['instruction']}<|im_end|>\n"
        f"<|im_start|>assistant\n{example['response']}<|im_end|>"
    )
    return {"text": text}


def main():
    """Run the full fine-tuning pipeline (dataset -> model -> train -> push)."""
    # Load and sanitize the dataset, then render to ChatML text.
    dataset = load_dataset(DATASET_ID, split="train")
    dataset = dataset.filter(filter_empty)
    dataset = dataset.map(format_row)

    # 4-bit NF4 quantization: required to fit a 32B model on a single A100
    # while leaving headroom for a decent batch size.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,  # bf16 compute path on A100
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,  # KV cache is useless during training and wastes memory
        torch_dtype=torch.bfloat16,
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # BUGFIX: only fall back to EOS when no pad token exists. Qwen2.5 already
    # ships a dedicated pad token; unconditionally overwriting it with EOS
    # makes padding indistinguishable from end-of-sequence.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # LoRA adapters on all attention + MLP projections.
    peft_config = LoraConfig(
        r=32,  # increased rank for the larger model's capacity
        lora_alpha=64,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )

    training_args = SFTConfig(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=4,  # A100 80GB affords larger batches at 4-bit
        gradient_accumulation_steps=4,
        learning_rate=1e-4,
        logging_steps=5,
        push_to_hub=True,
        hub_model_id=OUTPUT_MODEL_ID,
        fp16=False,
        bf16=True,  # BF16 mixed precision on A100
        packing=False,
        max_length=2048,  # context length for training samples
        dataset_text_field="text",
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        args=training_args,
        processing_class=tokenizer,
    )

    print("Starting training...")
    trainer.train()
    print("Pushing to hub...")
    trainer.push_to_hub()
    print("Done!")


if __name__ == "__main__":
    main()