#!/usr/bin/env python3
"""
BuildwellAI Model V2 - Fine-Tuning Script
Optimized for a RunPod 2x RTX A5000 node (2 x 24 GB = 48 GB total VRAM) with anti-overfitting measures.
Key Features:
- QLoRA 4-bit quantization for memory efficiency
- Validation loss monitoring with early stopping
- Learning rate warmup and cosine decay
- Weight decay regularization
- Gradient clipping
- Dropout in LoRA layers
- Proper train/val split
Usage:
python3 finetune.py [--config config.json]
"""
import os
import sys
import json
import torch
import argparse
from pathlib import Path
from datetime import datetime
from typing import Optional
# ============================================================================
# CONFIGURATION
# ============================================================================
DEFAULT_CONFIG = {
# Model
"base_model": "Qwen/Qwen3-14B",
"max_seq_length": 2048,
# LoRA Configuration (moderate to prevent overfitting)
"lora_r": 16, # Lower rank = less overfitting
"lora_alpha": 32,
"lora_dropout": 0.1, # Dropout for regularization
"lora_target_modules": [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"
],
# Training Configuration (anti-overfitting)
"batch_size": 4,
"gradient_accumulation_steps": 4,
"learning_rate": 1e-5, # Lower LR for fine-tuning existing model
"num_epochs": 2, # Fewer epochs to prevent overfitting
"warmup_ratio": 0.1, # 10% warmup
"weight_decay": 0.05, # L2 regularization
"max_grad_norm": 0.5, # Gradient clipping
# Early Stopping
"early_stopping_patience": 3,
"early_stopping_threshold": 0.01,
# Validation
"eval_steps": 200,
"eval_strategy": "steps",
# Logging & Saving
"logging_steps": 50,
"save_steps": 200,
"save_total_limit": 3,
# Paths
"train_data": "../datasets/train.jsonl",
"val_data": "../datasets/validation.jsonl",
"output_dir": "../output/buildwellai-qwen3-14b-v2",
# Hub
"push_to_hub": False,
"hub_model_id": "buildwellai/qwen3-14b-v2",
}
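# An illustrative override file for --config. Any subset of the keys above is
# valid, since load_config() merges it over DEFAULT_CONFIG (values shown are
# examples, not recommendations):
#
#   {
#       "learning_rate": 2e-5,
#       "num_epochs": 1,
#       "push_to_hub": false
#   }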
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def setup_environment():
"""Setup environment for training."""
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def check_gpu():
"""Check GPU availability and memory."""
print("=" * 60)
print("GPU Configuration")
print("=" * 60)
if not torch.cuda.is_available():
print("ERROR: CUDA not available!")
sys.exit(1)
num_gpus = torch.cuda.device_count()
total_memory = 0
for i in range(num_gpus):
props = torch.cuda.get_device_properties(i)
memory_gb = props.total_memory / (1024**3)
total_memory += memory_gb
print(f"GPU {i}: {props.name} ({memory_gb:.1f} GB)")
print(f"Total GPUs: {num_gpus}")
print(f"Total VRAM: {total_memory:.1f} GB")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.version.cuda}")
return num_gpus
def load_config(config_path: Optional[str] = None) -> dict:
"""Load configuration from file or use defaults."""
config = DEFAULT_CONFIG.copy()
if config_path and os.path.exists(config_path):
with open(config_path) as f:
user_config = json.load(f)
config.update(user_config)
print(f"Loaded config from: {config_path}")
return config
def format_chat_example(example: dict, tokenizer) -> str:
"""Format a training example using chat template."""
messages = example.get("messages", [])
    # Normalize to plain role/content pairs (any tool_calls fields are dropped)
formatted_messages = []
for msg in messages:
new_msg = {"role": msg["role"]}
content = msg.get("content", "")
# Handle None content
if content is None:
content = ""
new_msg["content"] = content
formatted_messages.append(new_msg)
text = tokenizer.apply_chat_template(
formatted_messages,
tokenize=False,
add_generation_prompt=False
)
    if tokenizer.eos_token and not text.endswith(tokenizer.eos_token):
        text += tokenizer.eos_token
return text
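# Expected JSONL record shape, one object per line in train.jsonl /
# validation.jsonl (roles shown are illustrative; a null "content" is
# coerced to "" above):
#
#   {"messages": [{"role": "system", "content": "..."},
#                 {"role": "user", "content": "..."},
#                 {"role": "assistant", "content": "..."}]}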
# ============================================================================
# TRAINING WITH UNSLOTH (RECOMMENDED)
# ============================================================================
def train_with_unsloth(config: dict):
"""Train using Unsloth for 2-5x speedup."""
print("\n" + "=" * 60)
print("Training with Unsloth (Optimized)")
print("=" * 60)
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
from datasets import load_dataset
from transformers import EarlyStoppingCallback
# Resolve paths
script_dir = Path(__file__).parent
train_path = script_dir / config["train_data"]
val_path = script_dir / config["val_data"]
output_dir = script_dir / config["output_dir"]
output_dir.mkdir(parents=True, exist_ok=True)
# Load model
print(f"\nLoading model: {config['base_model']}")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=config["base_model"],
max_seq_length=config["max_seq_length"],
dtype=torch.bfloat16 if is_bfloat16_supported() else torch.float16,
load_in_4bit=True,
)
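    # The target RTX A5000s are Ampere cards, so is_bfloat16_supported()
    # should normally select bfloat16 here; float16 is the fallback for
    # pre-Ampere GPUs.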
# Apply LoRA with dropout for regularization
print("Applying LoRA with dropout...")
model = FastLanguageModel.get_peft_model(
model,
r=config["lora_r"],
lora_alpha=config["lora_alpha"],
lora_dropout=config["lora_dropout"], # Anti-overfitting
target_modules=config["lora_target_modules"],
bias="none",
use_gradient_checkpointing="unsloth",
random_state=42,
)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")
# Load datasets
print(f"\nLoading training data: {train_path}")
train_dataset = load_dataset('json', data_files=str(train_path), split='train')
print(f"Training examples: {len(train_dataset):,}")
val_dataset = None
if val_path.exists():
print(f"Loading validation data: {val_path}")
val_dataset = load_dataset('json', data_files=str(val_path), split='train')
print(f"Validation examples: {len(val_dataset):,}")
# Format datasets
print("\nFormatting datasets...")
def format_fn(examples):
texts = []
for i in range(len(examples["messages"])):
example = {"messages": examples["messages"][i]}
text = format_chat_example(example, tokenizer)
texts.append(text)
return {"text": texts}
train_dataset = train_dataset.map(
format_fn,
batched=True,
remove_columns=train_dataset.column_names,
desc="Formatting train"
)
if val_dataset:
val_dataset = val_dataset.map(
format_fn,
batched=True,
remove_columns=val_dataset.column_names,
desc="Formatting validation"
)
# Training arguments with anti-overfitting settings
effective_batch = config["batch_size"] * config["gradient_accumulation_steps"] * torch.cuda.device_count()
print(f"\nEffective batch size: {effective_batch}")
training_args = UnslothTrainingArguments(
output_dir=str(output_dir),
# Training
num_train_epochs=config["num_epochs"],
per_device_train_batch_size=config["batch_size"],
per_device_eval_batch_size=config["batch_size"],
gradient_accumulation_steps=config["gradient_accumulation_steps"],
# Learning Rate (anti-overfitting)
learning_rate=config["learning_rate"],
lr_scheduler_type="cosine",
warmup_ratio=config["warmup_ratio"],
# Regularization (anti-overfitting)
weight_decay=config["weight_decay"],
max_grad_norm=config["max_grad_norm"],
# Evaluation
eval_strategy=config["eval_strategy"] if val_dataset else "no",
eval_steps=config["eval_steps"] if val_dataset else None,
load_best_model_at_end=True if val_dataset else False,
metric_for_best_model="eval_loss" if val_dataset else None,
greater_is_better=False if val_dataset else None,
# Logging & Saving
logging_steps=config["logging_steps"],
save_steps=config["save_steps"],
save_total_limit=config["save_total_limit"],
# Performance
optim="adamw_8bit",
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
seed=42,
report_to="tensorboard",
logging_dir=str(output_dir / "logs"),
)
# Callbacks
callbacks = []
if val_dataset:
callbacks.append(EarlyStoppingCallback(
early_stopping_patience=config["early_stopping_patience"],
early_stopping_threshold=config["early_stopping_threshold"]
))
# Create trainer
trainer = UnslothTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=train_dataset,
eval_dataset=val_dataset,
args=training_args,
max_seq_length=config["max_seq_length"],
dataset_text_field="text",
callbacks=callbacks,
)
# Training
print("\n" + "=" * 60)
print("STARTING TRAINING")
print("=" * 60)
print(f"Model: {config['base_model']}")
print(f"Training examples: {len(train_dataset):,}")
print(f"Validation examples: {len(val_dataset) if val_dataset else 0:,}")
print(f"Epochs: {config['num_epochs']}")
print(f"Batch size: {effective_batch}")
print(f"Learning rate: {config['learning_rate']}")
print(f"Weight decay: {config['weight_decay']} (regularization)")
print(f"LoRA dropout: {config['lora_dropout']} (regularization)")
print(f"Early stopping patience: {config['early_stopping_patience']}")
print("=" * 60 + "\n")
train_result = trainer.train()
# Save final model
print("\n" + "=" * 60)
print("SAVING MODEL")
print("=" * 60)
# Save adapter
adapter_dir = output_dir / "adapter"
model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"Adapter saved: {adapter_dir}")
# Save merged model
merged_dir = output_dir / "merged"
try:
model.save_pretrained_merged(
str(merged_dir),
tokenizer,
save_method="merged_16bit"
)
print(f"Merged model saved: {merged_dir}")
except Exception as e:
print(f"Warning: Could not save merged model: {e}")
merged_dir = None
# Save training stats
stats = {
"train_loss": train_result.training_loss,
"train_runtime": train_result.metrics.get("train_runtime"),
"train_samples_per_second": train_result.metrics.get("train_samples_per_second"),
"config": config,
"completed_at": datetime.now().isoformat(),
}
with open(output_dir / "training_stats.json", 'w') as f:
json.dump(stats, f, indent=2)
return str(adapter_dir), str(merged_dir) if merged_dir else None
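# A minimal sketch (untested here) of reloading the saved adapter for
# inference with peft; adapter_dir is the directory returned above:
#
#   import torch
#   from peft import PeftModel
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   base = AutoModelForCausalLM.from_pretrained(
#       "Qwen/Qwen3-14B", torch_dtype=torch.bfloat16, device_map="auto")
#   model = PeftModel.from_pretrained(base, adapter_dir)
#   tokenizer = AutoTokenizer.from_pretrained(adapter_dir)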
# ============================================================================
# TRAINING WITH HUGGINGFACE (FALLBACK)
# ============================================================================
def train_with_huggingface(config: dict):
"""Train using standard HuggingFace (fallback)."""
print("\n" + "=" * 60)
print("Training with HuggingFace (Standard)")
print("=" * 60)
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
BitsAndBytesConfig,
EarlyStoppingCallback,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
# Resolve paths
script_dir = Path(__file__).parent
train_path = script_dir / config["train_data"]
val_path = script_dir / config["val_data"]
output_dir = script_dir / config["output_dir"]
output_dir.mkdir(parents=True, exist_ok=True)
# Load tokenizer
print(f"\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
config["base_model"],
trust_remote_code=True
)
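    # Some checkpoints ship without a dedicated pad token; fall back to EOS
    # so the data collator can pad batches.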
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
)
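    # NF4 with nested (double) quantization is the standard QLoRA recipe:
    # weights are stored in ~4 bits while matmuls run in bfloat16.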
# Load model
print(f"Loading model: {config['base_model']}")
model = AutoModelForCausalLM.from_pretrained(
config["base_model"],
quantization_config=bnb_config,
device_map="auto",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
)
# Prepare for training
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
# Apply LoRA
print("Applying LoRA...")
lora_config = LoraConfig(
r=config["lora_r"],
lora_alpha=config["lora_alpha"],
lora_dropout=config["lora_dropout"],
target_modules=config["lora_target_modules"],
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")
# Load and process datasets
print(f"\nLoading data...")
train_dataset = load_dataset('json', data_files=str(train_path), split='train')
val_dataset = None
if val_path.exists():
val_dataset = load_dataset('json', data_files=str(val_path), split='train')
# Tokenize
def tokenize_fn(examples):
texts = []
for i in range(len(examples["messages"])):
example = {"messages": examples["messages"][i]}
text = format_chat_example(example, tokenizer)
texts.append(text)
tokenized = tokenizer(
texts,
truncation=True,
max_length=config["max_seq_length"],
padding=False,
)
return tokenized
train_dataset = train_dataset.map(
tokenize_fn,
batched=True,
remove_columns=train_dataset.column_names,
desc="Tokenizing train"
)
if val_dataset:
val_dataset = val_dataset.map(
tokenize_fn,
batched=True,
remove_columns=val_dataset.column_names,
desc="Tokenizing validation"
)
# Data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False
)
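    # mlm=False gives causal-LM labels: input_ids copied with pad positions
    # masked to -100, so tokenize_fn need not build labels itself. Note that
    # pad_token == eos_token here, so EOS positions are also excluded from
    # the loss, a known trade-off of this fallback path.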
# Training arguments
training_args = TrainingArguments(
output_dir=str(output_dir),
# Training
num_train_epochs=config["num_epochs"],
per_device_train_batch_size=config["batch_size"],
per_device_eval_batch_size=config["batch_size"],
gradient_accumulation_steps=config["gradient_accumulation_steps"],
# Learning rate
learning_rate=config["learning_rate"],
lr_scheduler_type="cosine",
warmup_ratio=config["warmup_ratio"],
# Regularization
weight_decay=config["weight_decay"],
max_grad_norm=config["max_grad_norm"],
# Evaluation
eval_strategy=config["eval_strategy"] if val_dataset else "no",
eval_steps=config["eval_steps"] if val_dataset else None,
load_best_model_at_end=True if val_dataset else False,
        metric_for_best_model="eval_loss" if val_dataset else None,
        greater_is_better=False if val_dataset else None,
# Logging & Saving
logging_steps=config["logging_steps"],
save_steps=config["save_steps"],
save_total_limit=config["save_total_limit"],
# Performance
bf16=True,
optim="adamw_8bit",
gradient_checkpointing=True,
group_by_length=True,
report_to="tensorboard",
logging_dir=str(output_dir / "logs"),
dataloader_pin_memory=False,
)
# Callbacks
callbacks = []
if val_dataset:
callbacks.append(EarlyStoppingCallback(
early_stopping_patience=config["early_stopping_patience"],
early_stopping_threshold=config["early_stopping_threshold"]
))
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=val_dataset,
data_collator=data_collator,
callbacks=callbacks,
)
# Train
print("\n" + "=" * 60)
print("STARTING TRAINING")
print("=" * 60)
train_result = trainer.train()
# Save
print("\n" + "=" * 60)
print("SAVING MODEL")
print("=" * 60)
adapter_dir = output_dir / "adapter"
model.save_pretrained(str(adapter_dir))
tokenizer.save_pretrained(str(adapter_dir))
print(f"Adapter saved: {adapter_dir}")
return str(adapter_dir), None
# ============================================================================
# MAIN
# ============================================================================
def main():
parser = argparse.ArgumentParser(description="BuildwellAI Model V2 Fine-Tuning")
parser.add_argument("--config", type=str, help="Path to config JSON file")
args = parser.parse_args()
print("=" * 60)
print("BuildwellAI Model V2 - Fine-Tuning")
print("=" * 60)
print(f"Started: {datetime.now().isoformat()}")
# Setup
setup_environment()
num_gpus = check_gpu()
# Load config
config = load_config(args.config)
# Print config
print("\n" + "=" * 60)
print("Configuration")
print("=" * 60)
for key, value in config.items():
if not key.startswith("lora_target"):
print(f" {key}: {value}")
# Check for training data
script_dir = Path(__file__).parent
train_path = script_dir / config["train_data"]
if not train_path.exists():
print(f"\nERROR: Training data not found: {train_path}")
print("Run prepare_dataset.py first!")
sys.exit(1)
# Train
try:
from unsloth import FastLanguageModel
print("\nUnsloth available - using optimized training")
adapter_dir, merged_dir = train_with_unsloth(config)
except ImportError:
print("\nUnsloth not available - using HuggingFace")
adapter_dir, merged_dir = train_with_huggingface(config)
# Done
print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)
print(f"\nModel saved to:")
print(f" Adapter: {adapter_dir}")
if merged_dir:
print(f" Merged: {merged_dir}")
print(f"\nNext steps:")
print(f" 1. Test: python3 streaming_api.py --model {merged_dir or adapter_dir}")
print(f" 2. Deploy to production")
print(f"\nCompleted: {datetime.now().isoformat()}")
if __name__ == "__main__":
main()