#!/usr/bin/env python3 """ BuildwellAI Model V2 - Fine-Tuning Script Optimized for RunPod 2x RTX A5000 (48GB VRAM) with anti-overfitting measures. Key Features: - QLoRA 4-bit quantization for memory efficiency - Validation loss monitoring with early stopping - Learning rate warmup and cosine decay - Weight decay regularization - Gradient clipping - Dropout in LoRA layers - Proper train/val split Usage: python3 finetune.py [--config config.json] """ import os import sys import json import torch import argparse from pathlib import Path from datetime import datetime from typing import Optional # ============================================================================ # CONFIGURATION # ============================================================================ DEFAULT_CONFIG = { # Model "base_model": "Qwen/Qwen3-14B", "max_seq_length": 2048, # LoRA Configuration (moderate to prevent overfitting) "lora_r": 16, # Lower rank = less overfitting "lora_alpha": 32, "lora_dropout": 0.1, # Dropout for regularization "lora_target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ], # Training Configuration (anti-overfitting) "batch_size": 4, "gradient_accumulation_steps": 4, "learning_rate": 1e-5, # Lower LR for fine-tuning existing model "num_epochs": 2, # Fewer epochs to prevent overfitting "warmup_ratio": 0.1, # 10% warmup "weight_decay": 0.05, # L2 regularization "max_grad_norm": 0.5, # Gradient clipping # Early Stopping "early_stopping_patience": 3, "early_stopping_threshold": 0.01, # Validation "eval_steps": 200, "eval_strategy": "steps", # Logging & Saving "logging_steps": 50, "save_steps": 200, "save_total_limit": 3, # Paths "train_data": "../datasets/train.jsonl", "val_data": "../datasets/validation.jsonl", "output_dir": "../output/buildwellai-qwen3-14b-v2", # Hub "push_to_hub": False, "hub_model_id": "buildwellai/qwen3-14b-v2", } # ============================================================================ # HELPER FUNCTIONS # ============================================================================ def setup_environment(): """Setup environment for training.""" os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" os.environ["TOKENIZERS_PARALLELISM"] = "false" def check_gpu(): """Check GPU availability and memory.""" print("=" * 60) print("GPU Configuration") print("=" * 60) if not torch.cuda.is_available(): print("ERROR: CUDA not available!") sys.exit(1) num_gpus = torch.cuda.device_count() total_memory = 0 for i in range(num_gpus): props = torch.cuda.get_device_properties(i) memory_gb = props.total_memory / (1024**3) total_memory += memory_gb print(f"GPU {i}: {props.name} ({memory_gb:.1f} GB)") print(f"Total GPUs: {num_gpus}") print(f"Total VRAM: {total_memory:.1f} GB") print(f"PyTorch: {torch.__version__}") print(f"CUDA: {torch.version.cuda}") return num_gpus def load_config(config_path: Optional[str] = None) -> dict: """Load configuration from file or use defaults.""" config = DEFAULT_CONFIG.copy() if config_path and os.path.exists(config_path): with open(config_path) as f: user_config = json.load(f) config.update(user_config) print(f"Loaded config from: {config_path}") return config def format_chat_example(example: dict, tokenizer) -> str: """Format a training example using chat template.""" messages = example.get("messages", []) # Handle tool_calls in messages formatted_messages = [] for msg in messages: new_msg = {"role": msg["role"]} content = msg.get("content", "") # Handle None content if content is None: content = "" new_msg["content"] = content formatted_messages.append(new_msg) text = tokenizer.apply_chat_template( formatted_messages, tokenize=False, add_generation_prompt=False ) if not text.endswith(tokenizer.eos_token): text += tokenizer.eos_token return text # ============================================================================ # TRAINING WITH UNSLOTH (RECOMMENDED) # ============================================================================ def train_with_unsloth(config: dict): """Train using Unsloth for 2-5x speedup.""" print("\n" + "=" * 60) print("Training with Unsloth (Optimized)") print("=" * 60) from unsloth import FastLanguageModel, is_bfloat16_supported from unsloth import UnslothTrainer, UnslothTrainingArguments from datasets import load_dataset from transformers import EarlyStoppingCallback # Resolve paths script_dir = Path(__file__).parent train_path = script_dir / config["train_data"] val_path = script_dir / config["val_data"] output_dir = script_dir / config["output_dir"] output_dir.mkdir(parents=True, exist_ok=True) # Load model print(f"\nLoading model: {config['base_model']}") model, tokenizer = FastLanguageModel.from_pretrained( model_name=config["base_model"], max_seq_length=config["max_seq_length"], dtype=torch.bfloat16 if is_bfloat16_supported() else torch.float16, load_in_4bit=True, ) # Apply LoRA with dropout for regularization print("Applying LoRA with dropout...") model = FastLanguageModel.get_peft_model( model, r=config["lora_r"], lora_alpha=config["lora_alpha"], lora_dropout=config["lora_dropout"], # Anti-overfitting target_modules=config["lora_target_modules"], bias="none", use_gradient_checkpointing="unsloth", random_state=42, ) trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) print(f"Trainable parameters: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)") # Load datasets print(f"\nLoading training data: {train_path}") train_dataset = load_dataset('json', data_files=str(train_path), split='train') print(f"Training examples: {len(train_dataset):,}") val_dataset = None if val_path.exists(): print(f"Loading validation data: {val_path}") val_dataset = load_dataset('json', data_files=str(val_path), split='train') print(f"Validation examples: {len(val_dataset):,}") # Format datasets print("\nFormatting datasets...") def format_fn(examples): texts = [] for i in range(len(examples["messages"])): example = {"messages": examples["messages"][i]} text = format_chat_example(example, tokenizer) texts.append(text) return {"text": texts} train_dataset = train_dataset.map( format_fn, batched=True, remove_columns=train_dataset.column_names, desc="Formatting train" ) if val_dataset: val_dataset = val_dataset.map( format_fn, batched=True, remove_columns=val_dataset.column_names, desc="Formatting validation" ) # Training arguments with anti-overfitting settings effective_batch = config["batch_size"] * config["gradient_accumulation_steps"] * torch.cuda.device_count() print(f"\nEffective batch size: {effective_batch}") training_args = UnslothTrainingArguments( output_dir=str(output_dir), # Training num_train_epochs=config["num_epochs"], per_device_train_batch_size=config["batch_size"], per_device_eval_batch_size=config["batch_size"], gradient_accumulation_steps=config["gradient_accumulation_steps"], # Learning Rate (anti-overfitting) learning_rate=config["learning_rate"], lr_scheduler_type="cosine", warmup_ratio=config["warmup_ratio"], # Regularization (anti-overfitting) weight_decay=config["weight_decay"], max_grad_norm=config["max_grad_norm"], # Evaluation eval_strategy=config["eval_strategy"] if val_dataset else "no", eval_steps=config["eval_steps"] if val_dataset else None, load_best_model_at_end=True if val_dataset else False, metric_for_best_model="eval_loss" if val_dataset else None, greater_is_better=False if val_dataset else None, # Logging & Saving logging_steps=config["logging_steps"], save_steps=config["save_steps"], save_total_limit=config["save_total_limit"], # Performance optim="adamw_8bit", fp16=not is_bfloat16_supported(), bf16=is_bfloat16_supported(), seed=42, report_to="tensorboard", logging_dir=str(output_dir / "logs"), ) # Callbacks callbacks = [] if val_dataset: callbacks.append(EarlyStoppingCallback( early_stopping_patience=config["early_stopping_patience"], early_stopping_threshold=config["early_stopping_threshold"] )) # Create trainer trainer = UnslothTrainer( model=model, tokenizer=tokenizer, train_dataset=train_dataset, eval_dataset=val_dataset, args=training_args, max_seq_length=config["max_seq_length"], dataset_text_field="text", callbacks=callbacks, ) # Training print("\n" + "=" * 60) print("STARTING TRAINING") print("=" * 60) print(f"Model: {config['base_model']}") print(f"Training examples: {len(train_dataset):,}") print(f"Validation examples: {len(val_dataset) if val_dataset else 0:,}") print(f"Epochs: {config['num_epochs']}") print(f"Batch size: {effective_batch}") print(f"Learning rate: {config['learning_rate']}") print(f"Weight decay: {config['weight_decay']} (regularization)") print(f"LoRA dropout: {config['lora_dropout']} (regularization)") print(f"Early stopping patience: {config['early_stopping_patience']}") print("=" * 60 + "\n") train_result = trainer.train() # Save final model print("\n" + "=" * 60) print("SAVING MODEL") print("=" * 60) # Save adapter adapter_dir = output_dir / "adapter" model.save_pretrained(str(adapter_dir)) tokenizer.save_pretrained(str(adapter_dir)) print(f"Adapter saved: {adapter_dir}") # Save merged model merged_dir = output_dir / "merged" try: model.save_pretrained_merged( str(merged_dir), tokenizer, save_method="merged_16bit" ) print(f"Merged model saved: {merged_dir}") except Exception as e: print(f"Warning: Could not save merged model: {e}") merged_dir = None # Save training stats stats = { "train_loss": train_result.training_loss, "train_runtime": train_result.metrics.get("train_runtime"), "train_samples_per_second": train_result.metrics.get("train_samples_per_second"), "config": config, "completed_at": datetime.now().isoformat(), } with open(output_dir / "training_stats.json", 'w') as f: json.dump(stats, f, indent=2) return str(adapter_dir), str(merged_dir) if merged_dir else None # ============================================================================ # TRAINING WITH HUGGINGFACE (FALLBACK) # ============================================================================ def train_with_huggingface(config: dict): """Train using standard HuggingFace (fallback).""" print("\n" + "=" * 60) print("Training with HuggingFace (Standard)") print("=" * 60) from transformers import ( AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig, EarlyStoppingCallback, ) from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training from datasets import load_dataset # Resolve paths script_dir = Path(__file__).parent train_path = script_dir / config["train_data"] val_path = script_dir / config["val_data"] output_dir = script_dir / config["output_dir"] output_dir.mkdir(parents=True, exist_ok=True) # Load tokenizer print(f"\nLoading tokenizer...") tokenizer = AutoTokenizer.from_pretrained( config["base_model"], trust_remote_code=True ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # 4-bit quantization config bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, ) # Load model print(f"Loading model: {config['base_model']}") model = AutoModelForCausalLM.from_pretrained( config["base_model"], quantization_config=bnb_config, device_map="auto", trust_remote_code=True, torch_dtype=torch.bfloat16, ) # Prepare for training model = prepare_model_for_kbit_training(model) model.gradient_checkpointing_enable() # Apply LoRA print("Applying LoRA...") lora_config = LoraConfig( r=config["lora_r"], lora_alpha=config["lora_alpha"], lora_dropout=config["lora_dropout"], target_modules=config["lora_target_modules"], bias="none", task_type="CAUSAL_LM", ) model = get_peft_model(model, lora_config) trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)") # Load and process datasets print(f"\nLoading data...") train_dataset = load_dataset('json', data_files=str(train_path), split='train') val_dataset = None if val_path.exists(): val_dataset = load_dataset('json', data_files=str(val_path), split='train') # Tokenize def tokenize_fn(examples): texts = [] for i in range(len(examples["messages"])): example = {"messages": examples["messages"][i]} text = format_chat_example(example, tokenizer) texts.append(text) tokenized = tokenizer( texts, truncation=True, max_length=config["max_seq_length"], padding=False, ) return tokenized train_dataset = train_dataset.map( tokenize_fn, batched=True, remove_columns=train_dataset.column_names, desc="Tokenizing train" ) if val_dataset: val_dataset = val_dataset.map( tokenize_fn, batched=True, remove_columns=val_dataset.column_names, desc="Tokenizing validation" ) # Data collator data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=False ) # Training arguments training_args = TrainingArguments( output_dir=str(output_dir), # Training num_train_epochs=config["num_epochs"], per_device_train_batch_size=config["batch_size"], per_device_eval_batch_size=config["batch_size"], gradient_accumulation_steps=config["gradient_accumulation_steps"], # Learning rate learning_rate=config["learning_rate"], lr_scheduler_type="cosine", warmup_ratio=config["warmup_ratio"], # Regularization weight_decay=config["weight_decay"], max_grad_norm=config["max_grad_norm"], # Evaluation eval_strategy=config["eval_strategy"] if val_dataset else "no", eval_steps=config["eval_steps"] if val_dataset else None, load_best_model_at_end=True if val_dataset else False, metric_for_best_model="eval_loss" if val_dataset else None, # Logging & Saving logging_steps=config["logging_steps"], save_steps=config["save_steps"], save_total_limit=config["save_total_limit"], # Performance bf16=True, optim="adamw_8bit", gradient_checkpointing=True, group_by_length=True, report_to="tensorboard", logging_dir=str(output_dir / "logs"), dataloader_pin_memory=False, ) # Callbacks callbacks = [] if val_dataset: callbacks.append(EarlyStoppingCallback( early_stopping_patience=config["early_stopping_patience"], early_stopping_threshold=config["early_stopping_threshold"] )) # Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, data_collator=data_collator, callbacks=callbacks, ) # Train print("\n" + "=" * 60) print("STARTING TRAINING") print("=" * 60) train_result = trainer.train() # Save print("\n" + "=" * 60) print("SAVING MODEL") print("=" * 60) adapter_dir = output_dir / "adapter" model.save_pretrained(str(adapter_dir)) tokenizer.save_pretrained(str(adapter_dir)) print(f"Adapter saved: {adapter_dir}") return str(adapter_dir), None # ============================================================================ # MAIN # ============================================================================ def main(): parser = argparse.ArgumentParser(description="BuildwellAI Model V2 Fine-Tuning") parser.add_argument("--config", type=str, help="Path to config JSON file") args = parser.parse_args() print("=" * 60) print("BuildwellAI Model V2 - Fine-Tuning") print("=" * 60) print(f"Started: {datetime.now().isoformat()}") # Setup setup_environment() num_gpus = check_gpu() # Load config config = load_config(args.config) # Print config print("\n" + "=" * 60) print("Configuration") print("=" * 60) for key, value in config.items(): if not key.startswith("lora_target"): print(f" {key}: {value}") # Check for training data script_dir = Path(__file__).parent train_path = script_dir / config["train_data"] if not train_path.exists(): print(f"\nERROR: Training data not found: {train_path}") print("Run prepare_dataset.py first!") sys.exit(1) # Train try: from unsloth import FastLanguageModel print("\nUnsloth available - using optimized training") adapter_dir, merged_dir = train_with_unsloth(config) except ImportError: print("\nUnsloth not available - using HuggingFace") adapter_dir, merged_dir = train_with_huggingface(config) # Done print("\n" + "=" * 60) print("TRAINING COMPLETE!") print("=" * 60) print(f"\nModel saved to:") print(f" Adapter: {adapter_dir}") if merged_dir: print(f" Merged: {merged_dir}") print(f"\nNext steps:") print(f" 1. Test: python3 streaming_api.py --model {merged_dir or adapter_dir}") print(f" 2. Deploy to production") print(f"\nCompleted: {datetime.now().isoformat()}") if __name__ == "__main__": main()