| | |
| | """ |
| | BuildwellAI Model V2 - Fine-Tuning Script |
| | |
| | Optimized for RunPod 2x RTX A5000 (48GB VRAM) with anti-overfitting measures. |
| | |
| | Key Features: |
| | - QLoRA 4-bit quantization for memory efficiency |
| | - Validation loss monitoring with early stopping |
| | - Learning rate warmup and cosine decay |
| | - Weight decay regularization |
| | - Gradient clipping |
| | - Dropout in LoRA layers |
| | - Proper train/val split |
| | |
| | Usage: |
| | python3 finetune.py [--config config.json] |
| | """ |
| |
|
| | import os |
| | import sys |
| | import json |
| | import torch |
| | import argparse |
| | from pathlib import Path |
| | from datetime import datetime |
| | from typing import Optional |
| |
|
| | |
| | |
| | |
| |
|
# Default training configuration. Any key can be overridden by passing
# --config <file.json>; see load_config(). Values are tuned for 2x RTX A5000
# (48 GB total VRAM) with anti-overfitting regularization.
DEFAULT_CONFIG = {
    # --- Base model ---
    "base_model": "Qwen/Qwen3-14B",
    "max_seq_length": 2048,

    # --- LoRA adapter (dropout acts as a regularizer) ---
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "lora_target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],

    # --- Optimization schedule ---
    # Effective batch = batch_size * gradient_accumulation_steps * num GPUs.
    "batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-5,
    "num_epochs": 2,
    "warmup_ratio": 0.1,
    "weight_decay": 0.05,       # L2-style regularization
    "max_grad_norm": 0.5,       # gradient clipping

    # --- Early stopping (only used when a validation set exists) ---
    "early_stopping_patience": 3,
    "early_stopping_threshold": 0.01,

    # --- Evaluation cadence ---
    "eval_steps": 200,
    "eval_strategy": "steps",

    # --- Logging / checkpointing ---
    "logging_steps": 50,
    "save_steps": 200,
    "save_total_limit": 3,

    # --- Paths (resolved relative to this script's directory) ---
    "train_data": "../datasets/train.jsonl",
    "val_data": "../datasets/validation.jsonl",
    "output_dir": "../output/buildwellai-qwen3-14b-v2",

    # --- Hub upload (disabled by default) ---
    "push_to_hub": False,
    "hub_model_id": "buildwellai/qwen3-14b-v2",
}
| |
|
| |
|
| | |
| | |
| | |
| |
|
def setup_environment() -> None:
    """Configure process-level environment variables for training.

    Enables expandable CUDA allocator segments (reduces fragmentation OOMs
    on long runs) and disables tokenizer thread parallelism (avoids fork
    warnings from HuggingFace tokenizers inside dataloader workers).
    """
    env_settings = {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "TOKENIZERS_PARALLELISM": "false",
    }
    os.environ.update(env_settings)
| |
|
| |
|
def check_gpu():
    """Print the GPU inventory and abort if CUDA is unavailable.

    Returns:
        int: number of visible CUDA devices.

    Exits:
        With status 1 when no CUDA device is present.
    """
    banner = "=" * 60
    print(banner)
    print("GPU Configuration")
    print(banner)

    if not torch.cuda.is_available():
        print("ERROR: CUDA not available!")
        sys.exit(1)

    gpu_count = torch.cuda.device_count()
    vram_total = 0.0
    for idx in range(gpu_count):
        device = torch.cuda.get_device_properties(idx)
        vram_gb = device.total_memory / (1024 ** 3)
        vram_total += vram_gb
        print(f"GPU {idx}: {device.name} ({vram_gb:.1f} GB)")

    print(f"Total GPUs: {gpu_count}")
    print(f"Total VRAM: {vram_total:.1f} GB")
    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA: {torch.version.cuda}")

    return gpu_count
| |
|
| |
|
def load_config(config_path: Optional[str] = None) -> dict:
    """Load the training configuration.

    Starts from DEFAULT_CONFIG and overlays keys from the JSON file at
    *config_path*, if provided. Unknown keys are accepted so the config
    file may carry extra metadata.

    Args:
        config_path: Optional path to a JSON config file.

    Returns:
        dict: merged configuration (defaults + user overrides).
    """
    config = DEFAULT_CONFIG.copy()

    if config_path:
        if os.path.exists(config_path):
            with open(config_path) as f:
                user_config = json.load(f)
            config.update(user_config)
            print(f"Loaded config from: {config_path}")
        else:
            # An explicitly-requested config file that is missing is almost
            # certainly a typo -- warn loudly instead of silently training
            # with defaults (the previous behavior).
            print(f"WARNING: Config file not found, using defaults: {config_path}")

    return config
| |
|
| |
|
def format_chat_example(example: dict, tokenizer) -> str:
    """Render one training example as chat-template text ending in EOS.

    Args:
        example: Dict with a "messages" list of {"role", "content"} dicts.
            A None content is replaced with an empty string.
        tokenizer: Tokenizer exposing apply_chat_template() and eos_token.

    Returns:
        str: the formatted conversation, guaranteed to end with EOS so the
        model learns to terminate its completions.
    """
    cleaned = []
    for msg in example.get("messages", []):
        content = msg.get("content", "")
        cleaned.append({
            "role": msg["role"],
            # Guard against null content fields in the JSONL data.
            "content": "" if content is None else content,
        })

    rendered = tokenizer.apply_chat_template(
        cleaned,
        tokenize=False,
        add_generation_prompt=False,
    )

    if rendered.endswith(tokenizer.eos_token):
        return rendered
    return rendered + tokenizer.eos_token
| |
|
| |
|
| | |
| | |
| | |
| |
|
def train_with_unsloth(config: dict):
    """Train using Unsloth for 2-5x speedup.

    Runs the full QLoRA pipeline: load the 4-bit base model, attach LoRA
    adapters, format/tokenize the JSONL datasets, train with early stopping
    (when a validation set exists), then save the adapter and (best-effort)
    a merged 16-bit model.

    Args:
        config: Merged configuration dict (see DEFAULT_CONFIG for keys).

    Returns:
        tuple[str, str | None]: (adapter directory, merged-model directory
        or None when merging failed).
    """
    print("\n" + "=" * 60)
    print("Training with Unsloth (Optimized)")
    print("=" * 60)

    # Imported lazily so the HuggingFace fallback path works on machines
    # without unsloth installed.
    from unsloth import FastLanguageModel, is_bfloat16_supported
    from unsloth import UnslothTrainer, UnslothTrainingArguments
    from datasets import load_dataset
    from transformers import EarlyStoppingCallback

    # Resolve data/output paths relative to this script's directory.
    script_dir = Path(__file__).parent
    train_path = script_dir / config["train_data"]
    val_path = script_dir / config["val_data"]
    output_dir = script_dir / config["output_dir"]

    output_dir.mkdir(parents=True, exist_ok=True)

    # Load the base model 4-bit quantized (QLoRA) to fit in VRAM.
    print(f"\nLoading model: {config['base_model']}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=config["base_model"],
        max_seq_length=config["max_seq_length"],
        dtype=torch.bfloat16 if is_bfloat16_supported() else torch.float16,
        load_in_4bit=True,
    )

    # Attach LoRA adapters; dropout here is a deliberate regularizer.
    print("Applying LoRA with dropout...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        target_modules=config["lora_target_modules"],
        bias="none",
        use_gradient_checkpointing="unsloth",  # unsloth's memory-optimized variant
        random_state=42,
    )

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

    # Load JSONL datasets; each record is expected to carry a "messages" list.
    print(f"\nLoading training data: {train_path}")
    train_dataset = load_dataset('json', data_files=str(train_path), split='train')
    print(f"Training examples: {len(train_dataset):,}")

    # Validation data is optional -- without it, eval/early-stopping are disabled.
    val_dataset = None
    if val_path.exists():
        print(f"Loading validation data: {val_path}")
        val_dataset = load_dataset('json', data_files=str(val_path), split='train')
        print(f"Validation examples: {len(val_dataset):,}")

    # Convert chat messages into plain text via the tokenizer's chat template.
    print("\nFormatting datasets...")

    def format_fn(examples):
        # Batched map: examples["messages"] is a list of conversations.
        texts = []
        for i in range(len(examples["messages"])):
            example = {"messages": examples["messages"][i]}
            text = format_chat_example(example, tokenizer)
            texts.append(text)
        return {"text": texts}

    train_dataset = train_dataset.map(
        format_fn,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Formatting train"
    )

    if val_dataset:
        val_dataset = val_dataset.map(
            format_fn,
            batched=True,
            remove_columns=val_dataset.column_names,
            desc="Formatting validation"
        )

    # Effective batch = per-device batch * grad accumulation * GPU count.
    effective_batch = config["batch_size"] * config["gradient_accumulation_steps"] * torch.cuda.device_count()
    print(f"\nEffective batch size: {effective_batch}")

    training_args = UnslothTrainingArguments(
        output_dir=str(output_dir),

        # Schedule / batching
        num_train_epochs=config["num_epochs"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],

        # Learning rate: warmup followed by cosine decay
        learning_rate=config["learning_rate"],
        lr_scheduler_type="cosine",
        warmup_ratio=config["warmup_ratio"],

        # Regularization
        weight_decay=config["weight_decay"],
        max_grad_norm=config["max_grad_norm"],

        # Evaluation is only enabled when a validation set exists.
        eval_strategy=config["eval_strategy"] if val_dataset else "no",
        eval_steps=config["eval_steps"] if val_dataset else None,
        load_best_model_at_end=True if val_dataset else False,
        metric_for_best_model="eval_loss" if val_dataset else None,
        greater_is_better=False if val_dataset else None,  # lower eval_loss wins

        # Logging / checkpointing
        logging_steps=config["logging_steps"],
        save_steps=config["save_steps"],
        save_total_limit=config["save_total_limit"],

        # Precision / optimizer: 8-bit AdamW with bf16 where supported
        optim="adamw_8bit",
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        seed=42,
        report_to="tensorboard",
        logging_dir=str(output_dir / "logs"),
    )

    # Early stopping guards against overfitting small datasets.
    callbacks = []
    if val_dataset:
        callbacks.append(EarlyStoppingCallback(
            early_stopping_patience=config["early_stopping_patience"],
            early_stopping_threshold=config["early_stopping_threshold"]
        ))

    trainer = UnslothTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        args=training_args,
        max_seq_length=config["max_seq_length"],
        dataset_text_field="text",
        callbacks=callbacks,
    )

    # Echo the run summary before the (long) training loop starts.
    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)
    print(f"Model: {config['base_model']}")
    print(f"Training examples: {len(train_dataset):,}")
    print(f"Validation examples: {len(val_dataset) if val_dataset else 0:,}")
    print(f"Epochs: {config['num_epochs']}")
    print(f"Batch size: {effective_batch}")
    print(f"Learning rate: {config['learning_rate']}")
    print(f"Weight decay: {config['weight_decay']} (regularization)")
    print(f"LoRA dropout: {config['lora_dropout']} (regularization)")
    print(f"Early stopping patience: {config['early_stopping_patience']}")
    print("=" * 60 + "\n")

    train_result = trainer.train()

    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    # Save the (small) LoRA adapter first -- always succeeds.
    adapter_dir = output_dir / "adapter"
    model.save_pretrained(str(adapter_dir))
    tokenizer.save_pretrained(str(adapter_dir))
    print(f"Adapter saved: {adapter_dir}")

    # Best-effort merged 16-bit export; merging can fail (e.g. insufficient
    # host RAM) and the adapter alone is still usable for inference.
    merged_dir = output_dir / "merged"
    try:
        model.save_pretrained_merged(
            str(merged_dir),
            tokenizer,
            save_method="merged_16bit"
        )
        print(f"Merged model saved: {merged_dir}")
    except Exception as e:
        print(f"Warning: Could not save merged model: {e}")
        merged_dir = None

    # Persist a run summary next to the checkpoints for later auditing.
    stats = {
        "train_loss": train_result.training_loss,
        "train_runtime": train_result.metrics.get("train_runtime"),
        "train_samples_per_second": train_result.metrics.get("train_samples_per_second"),
        "config": config,
        "completed_at": datetime.now().isoformat(),
    }

    with open(output_dir / "training_stats.json", 'w') as f:
        json.dump(stats, f, indent=2)

    return str(adapter_dir), str(merged_dir) if merged_dir else None
| |
|
| |
|
| | |
| | |
| | |
| |
|
def train_with_huggingface(config: dict):
    """Train using standard HuggingFace (fallback).

    Mirrors train_with_unsloth() but uses plain transformers + peft +
    bitsandbytes. Only the adapter is saved (no merged export on this path).

    Args:
        config: Merged configuration dict (see DEFAULT_CONFIG for keys).

    Returns:
        tuple[str, None]: (adapter directory, None -- no merged model).
    """
    print("\n" + "=" * 60)
    print("Training with HuggingFace (Standard)")
    print("=" * 60)

    # Imported lazily so importing this module never requires peft/bnb.
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        TrainingArguments,
        Trainer,
        DataCollatorForLanguageModeling,
        BitsAndBytesConfig,
        EarlyStoppingCallback,
    )
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from datasets import load_dataset

    # Resolve data/output paths relative to this script's directory.
    script_dir = Path(__file__).parent
    train_path = script_dir / config["train_data"]
    val_path = script_dir / config["val_data"]
    output_dir = script_dir / config["output_dir"]

    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        config["base_model"],
        trust_remote_code=True
    )
    # Some causal-LM tokenizers ship without a pad token; reuse EOS.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # 4-bit NF4 quantization with double quantization (QLoRA recipe).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    print(f"Loading model: {config['base_model']}")
    model = AutoModelForCausalLM.from_pretrained(
        config["base_model"],
        quantization_config=bnb_config,
        device_map="auto",  # shard across all visible GPUs
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )

    # Required prep for training quantized weights + checkpointing for memory.
    model = prepare_model_for_kbit_training(model)
    model.gradient_checkpointing_enable()

    print("Applying LoRA...")
    lora_config = LoraConfig(
        r=config["lora_r"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=config["lora_dropout"],
        target_modules=config["lora_target_modules"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable:,} / {total:,} ({100*trainable/total:.2f}%)")

    print(f"\nLoading data...")
    train_dataset = load_dataset('json', data_files=str(train_path), split='train')

    # Validation data is optional -- without it, eval/early-stopping are disabled.
    val_dataset = None
    if val_path.exists():
        val_dataset = load_dataset('json', data_files=str(val_path), split='train')

    def tokenize_fn(examples):
        # Batched map: format each conversation then tokenize with truncation.
        texts = []
        for i in range(len(examples["messages"])):
            example = {"messages": examples["messages"][i]}
            text = format_chat_example(example, tokenizer)
            texts.append(text)

        # Padding is deferred to the collator (dynamic per-batch padding).
        tokenized = tokenizer(
            texts,
            truncation=True,
            max_length=config["max_seq_length"],
            padding=False,
        )
        return tokenized

    train_dataset = train_dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=train_dataset.column_names,
        desc="Tokenizing train"
    )

    if val_dataset:
        val_dataset = val_dataset.map(
            tokenize_fn,
            batched=True,
            remove_columns=val_dataset.column_names,
            desc="Tokenizing validation"
        )

    # mlm=False => standard causal-LM labels (inputs shifted by one).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    training_args = TrainingArguments(
        output_dir=str(output_dir),

        # Schedule / batching
        num_train_epochs=config["num_epochs"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],

        # Learning rate: warmup followed by cosine decay
        learning_rate=config["learning_rate"],
        lr_scheduler_type="cosine",
        warmup_ratio=config["warmup_ratio"],

        # Regularization
        weight_decay=config["weight_decay"],
        max_grad_norm=config["max_grad_norm"],

        # Evaluation is only enabled when a validation set exists.
        # NOTE(review): greater_is_better is not set here (it is on the
        # unsloth path); HF defaults it to False for loss-style metrics.
        eval_strategy=config["eval_strategy"] if val_dataset else "no",
        eval_steps=config["eval_steps"] if val_dataset else None,
        load_best_model_at_end=True if val_dataset else False,
        metric_for_best_model="eval_loss" if val_dataset else None,

        # Logging / checkpointing
        logging_steps=config["logging_steps"],
        save_steps=config["save_steps"],
        save_total_limit=config["save_total_limit"],

        # Precision / optimizer / memory
        bf16=True,
        optim="adamw_8bit",
        gradient_checkpointing=True,
        group_by_length=True,  # bucket similar lengths to cut padding waste
        report_to="tensorboard",
        logging_dir=str(output_dir / "logs"),
        dataloader_pin_memory=False,
    )

    # Early stopping guards against overfitting small datasets.
    callbacks = []
    if val_dataset:
        callbacks.append(EarlyStoppingCallback(
            early_stopping_patience=config["early_stopping_patience"],
            early_stopping_threshold=config["early_stopping_threshold"]
        ))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        callbacks=callbacks,
    )

    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)

    train_result = trainer.train()

    print("\n" + "=" * 60)
    print("SAVING MODEL")
    print("=" * 60)

    # Only the LoRA adapter is saved on this path (no merged export).
    adapter_dir = output_dir / "adapter"
    model.save_pretrained(str(adapter_dir))
    tokenizer.save_pretrained(str(adapter_dir))
    print(f"Adapter saved: {adapter_dir}")

    return str(adapter_dir), None
| |
|
| |
|
| | |
| | |
| | |
| |
|
def main():
    """CLI entry point: parse args, validate environment and data, train."""
    arg_parser = argparse.ArgumentParser(description="BuildwellAI Model V2 Fine-Tuning")
    arg_parser.add_argument("--config", type=str, help="Path to config JSON file")
    cli_args = arg_parser.parse_args()

    banner = "=" * 60
    print(banner)
    print("BuildwellAI Model V2 - Fine-Tuning")
    print(banner)
    print(f"Started: {datetime.now().isoformat()}")

    # Environment + hardware checks (check_gpu exits if CUDA is missing).
    setup_environment()
    num_gpus = check_gpu()

    config = load_config(cli_args.config)

    # Echo the effective configuration (module list omitted for brevity).
    print("\n" + banner)
    print("Configuration")
    print(banner)
    for key, value in config.items():
        if not key.startswith("lora_target"):
            print(f"  {key}: {value}")

    # Fail fast when the dataset has not been prepared yet.
    train_file = Path(__file__).parent / config["train_data"]
    if not train_file.exists():
        print(f"\nERROR: Training data not found: {train_file}")
        print("Run prepare_dataset.py first!")
        sys.exit(1)

    # Prefer the unsloth-optimized trainer; fall back to plain HuggingFace.
    try:
        from unsloth import FastLanguageModel  # availability probe
        print("\nUnsloth available - using optimized training")
        adapter_dir, merged_dir = train_with_unsloth(config)
    except ImportError:
        print("\nUnsloth not available - using HuggingFace")
        adapter_dir, merged_dir = train_with_huggingface(config)

    print("\n" + banner)
    print("TRAINING COMPLETE!")
    print(banner)
    print(f"\nModel saved to:")
    print(f"  Adapter: {adapter_dir}")
    if merged_dir:
        print(f"  Merged: {merged_dir}")

    print(f"\nNext steps:")
    print(f"  1. Test: python3 streaming_api.py --model {merged_dir or adapter_dir}")
    print(f"  2. Deploy to production")

    print(f"\nCompleted: {datetime.now().isoformat()}")
| |
|
| |
|
# Script entry point -- run training only when executed directly, not on import.
if __name__ == "__main__":
    main()
| |
|