"""
Humigence Training Script with Hugging Face Accelerate

Clean DDP training with single-GPU evaluation.
"""
import os
import json

import numpy as np
import torch
from typing import List
from dataclasses import dataclass, field

from accelerate import Accelerator
from accelerate.utils import set_seed
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from rich.console import Console

# Silence tokenizer fork warnings from dataloader workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Synchronous CUDA launches make device-side errors easier to trace; this is a
# debugging aid and slows training, so remove it once the run is stable.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

console = Console()

@dataclass
class TrainingConfig:
    """Training configuration dataclass"""

    # Model
    base_model: str = "microsoft/DialoGPT-small"
    training_recipe: str = "LoRA (FP16)"

    # Optimization
    learning_rate: float = 2e-4
    num_train_epochs: int = 1
    per_device_train_batch_size: int = 2
    per_device_eval_batch_size: int = 4
    gradient_accumulation_steps: int = 4
    max_seq_length: int = 1024

    # LoRA
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05

    # Data
    dataset_path: str = ""
    train_val_test_split: List[float] = field(default_factory=lambda: [0.8, 0.1, 0.1])
    split_seed: int = 42

    # Output and logging
    output_dir: str = "runs/humigence"
    logging_steps: int = 10
    save_steps: int = 100
    eval_steps: int = 100

    # Evaluation
    eval_gpu_index: int = 0
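
# Example of the JSON config consumed by load_config() below (illustrative
# sketch; the file path, recipe string, and dataset path are assumptions, not
# part of this script). Keys missing from the file keep the dataclass defaults
# above, and unknown keys are ignored.
#
# configs/humigence.json:
# {
#   "base_model": "microsoft/DialoGPT-small",
#   "training_recipe": "QLoRA (NF4)",
#   "learning_rate": 2e-4,
#   "num_train_epochs": 1,
#   "dataset_path": "data/humigence.jsonl",
#   "output_dir": "runs/humigence"
# }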

def load_config(config_path: str) -> TrainingConfig:
    """Load configuration from JSON file"""
    with open(config_path, 'r') as f:
        config_dict = json.load(f)

    config = TrainingConfig()
    for key, value in config_dict.items():
        if hasattr(config, key):
            setattr(config, key, value)

    return config

def prepare_dataset(config: TrainingConfig, tokenizer) -> tuple[Dataset, Dataset, Dataset]:
    """Prepare dataset splits with tokenization"""
    console.print("[blue]Preparing dataset...[/blue]")

    # Load the raw JSONL file (one JSON object per line).
    with open(config.dataset_path, 'r') as f:
        data = [json.loads(line) for line in f]

    console.print(f"[blue]  Loaded {len(data)} samples[/blue]")

    # Deterministic shuffle and split.
    np.random.seed(config.split_seed)
    indices = np.random.permutation(len(data))

    train_size = int(len(data) * config.train_val_test_split[0])
    val_size = int(len(data) * config.train_val_test_split[1])

    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]

    train_data = [data[i] for i in train_indices]
    val_data = [data[i] for i in val_indices]
    test_data = [data[i] for i in test_indices]

    console.print(f"[blue]  Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}[/blue]")

    def tokenize_function(examples):
        # Plain text records.
        if "text" in examples:
            texts = examples["text"]
        # Instruction-tuning records: fold instruction/input/output into one prompt.
        elif "instruction" in examples and "output" in examples:
            texts = []
            for i in range(len(examples["instruction"])):
                instruction = examples["instruction"][i]
                input_text = examples.get("input", [""])[i] if examples.get("input") else ""
                output = examples["output"][i]

                if input_text:
                    text = f"Instruction: {instruction}\nInput: {input_text}\nOutput: {output}"
                else:
                    text = f"Instruction: {instruction}\nOutput: {output}"
                texts.append(text)
        else:
            # Fall back to the first recognizable text-like column.
            text_col = None
            for col in ["text", "instruction", "input", "output"]:
                if col in examples:
                    text_col = col
                    break

            if text_col:
                texts = examples[text_col]
            else:
                # Last resort: stringify the first column.
                texts = [str(ex) for ex in examples[list(examples.keys())[0]]]

        tokenized = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=config.max_seq_length,
            return_tensors=None
        )

        # Standard causal-LM labels: the model learns to predict its own input sequence.
        tokenized["labels"] = tokenized["input_ids"].copy()

        return tokenized

    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)
    test_dataset = Dataset.from_list(test_data)

    # Drop the raw columns so only tokenized fields remain.
    original_columns = list(train_dataset.column_names)

    train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=original_columns)
    val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=original_columns)
    test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=original_columns)

    train_dataset.set_format("torch")
    val_dataset.set_format("torch")
    test_dataset.set_format("torch")

    return train_dataset, val_dataset, test_dataset
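
# The JSONL file referenced by dataset_path is expected to contain one record
# per line in one of the shapes handled by tokenize_function above
# (illustrative examples only; the field values are made up):
#
#   {"text": "Humidity control keeps the chamber at 45% RH."}
#   {"instruction": "Summarize the reading.", "input": "RH 45%, 21C", "output": "Conditions are nominal."}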

def setup_model_and_tokenizer(config: TrainingConfig, accelerator: Accelerator):
    """Setup model and tokenizer with LoRA/QLoRA"""
    console.print(f"[blue]Loading model: {config.base_model}[/blue]")

    tokenizer = AutoTokenizer.from_pretrained(config.base_model, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if "QLoRA" in config.training_recipe:
        # 4-bit NF4 quantization with double quantization for QLoRA.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        model = AutoModelForCausalLM.from_pretrained(
            config.base_model,
            quantization_config=bnb_config,
            device_map=None,
            trust_remote_code=True
        )

        # Make the quantized model trainable (casts norms to fp32, enables input grads).
        model = prepare_model_for_kbit_training(model)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            config.base_model,
            device_map=None,
            trust_remote_code=True,
            dtype=torch.bfloat16 if "BF16" in config.training_recipe else torch.float16
        )

    # Pick LoRA target modules based on the architecture family.
    if "gpt" in config.base_model.lower() or "dialo" in config.base_model.lower():
        target_modules = ["c_attn", "c_proj"]
    elif "llama" in config.base_model.lower() or "mistral" in config.base_model.lower():
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
    else:
        # Default to LLaMA-style attention projection names.
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

    lora_config = LoraConfig(
        r=config.lora_r,
        lora_alpha=config.lora_alpha,
        target_modules=target_modules,
        lora_dropout=config.lora_dropout,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    model = get_peft_model(model, lora_config)

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    console.print(f"[blue]  Trainable parameters: {trainable_params:,} ({trainable_params / total_params * 100:.2f}%)[/blue]")

    return model, tokenizer
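
# For architectures not covered by the name heuristics above, the default
# projection names may simply not exist in the model. A quick way to inspect
# candidates (illustrative sketch, not called anywhere in this script):
#
#   import torch.nn as nn
#   linear_names = {name.split(".")[-1] for name, module in model.named_modules()
#                   if isinstance(module, nn.Linear)}
#   print(sorted(linear_names))  # pick the attention projections for target_modules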

def train_model(model, tokenizer, train_dataset, val_dataset, config: TrainingConfig, accelerator: Accelerator):
    """Train the model using Accelerate"""
    console.print("[blue]Starting training...[/blue]")

    # Causal LM collator (no masked-LM objective).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    training_args = TrainingArguments(
        output_dir=config.output_dir,
        per_device_train_batch_size=config.per_device_train_batch_size,
        per_device_eval_batch_size=config.per_device_eval_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        num_train_epochs=config.num_train_epochs,
        learning_rate=config.learning_rate,
        logging_steps=config.logging_steps,
        save_steps=config.save_steps,
        eval_steps=config.eval_steps,
        eval_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        remove_unused_columns=False,
        dataloader_pin_memory=True,
        dataloader_num_workers=4,
        report_to="none",  # explicitly disable external loggers; None falls back to "all"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Only the main process writes the final checkpoint.
    if accelerator.is_main_process:
        trainer.save_model()
        console.print("[blue]Model saved[/blue]")

    return trainer
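
# Effective global batch size is per_device_train_batch_size x
# gradient_accumulation_steps x number_of_processes. With the defaults above
# (2 x 4) and, say, two GPUs under DDP (an assumption about the launch
# configuration), that is 2 * 4 * 2 = 16 sequences per optimizer step.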

def evaluate_model_on_single_gpu(model, tokenizer, test_dataset, config: TrainingConfig):
    """Evaluate model on a single GPU to avoid device mismatches"""
    eval_device = torch.device(f"cuda:{config.eval_gpu_index}")
    console.print(f"[blue]Running evaluation on {eval_device}...[/blue]")

    model = model.to(eval_device)
    model.eval()

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    from torch.utils.data import DataLoader
    eval_dataloader = DataLoader(
        test_dataset,
        batch_size=config.per_device_eval_batch_size,
        collate_fn=data_collator,
        pin_memory=True
    )

    total_loss = 0.0
    total_tokens = 0
    correct_tokens = 0
    num_samples = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.to(eval_device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            num_samples += batch["input_ids"].size(0)

            # Next-token accuracy: logits at position t predict the token at t + 1,
            # so shift predictions and labels before comparing.
            predictions = torch.argmax(logits[:, :-1, :], dim=-1)
            labels = batch["labels"][:, 1:]

            # Ignore padding / masked positions.
            mask = labels != -100
            correct_tokens += (predictions[mask] == labels[mask]).sum().item()
            total_tokens += mask.sum().item()

    avg_loss = total_loss / len(eval_dataloader)
    accuracy = correct_tokens / max(total_tokens, 1)
    perplexity = np.exp(avg_loss)

    return {
        "loss": avg_loss,
        "accuracy": accuracy,
        "perplexity": perplexity,
        "correct_tokens": correct_tokens,
        "total_tokens": total_tokens,
        "num_samples": num_samples
    }
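
# Note on the perplexity reported above: it is simply exp of the mean per-batch
# loss, e.g. an average cross-entropy of 2.0 nats corresponds to exp(2.0) ~= 7.39.
# Averaging per-batch losses weights every batch equally; a token-weighted mean
# would differ slightly when batches carry different token counts.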

def print_training_summary(config: TrainingConfig, train_dataset, val_dataset, test_dataset, eval_results):
    """Print structured training summary"""
    console.print("\n[bold cyan]" + "=" * 80 + "[/bold cyan]")
    console.print("[bold cyan]TRAINING SUMMARY[/bold cyan]")
    console.print("[bold cyan]" + "=" * 80 + "[/bold cyan]")

    console.print("\n[bold green]Dataset Summary[/bold green]")
    console.print(f"  Train: {len(train_dataset):,} samples")
    console.print(f"  Validation: {len(val_dataset):,} samples")
    console.print(f"  Test: {len(test_dataset):,} samples")

    console.print("\n[bold blue]Model Summary[/bold blue]")
    console.print(f"  Base Model: {config.base_model}")
    console.print(f"  Training Recipe: {config.training_recipe}")
    console.print(f"  LoRA r: {config.lora_r}")
    console.print(f"  LoRA alpha: {config.lora_alpha}")

    console.print("\n[bold yellow]Training Summary[/bold yellow]")
    console.print(f"  Epochs: {config.num_train_epochs}")
    console.print(f"  Learning Rate: {config.learning_rate}")
    console.print(f"  Batch Size: {config.per_device_train_batch_size}")
    console.print(f"  Gradient Accumulation: {config.gradient_accumulation_steps}")

    console.print(f"\n[bold magenta]Evaluation Results (cuda:{config.eval_gpu_index})[/bold magenta]")
    console.print(f"  Loss: {eval_results['loss']:.4f}")
    console.print(f"  Accuracy: {eval_results['accuracy']:.4f}")
    console.print(f"  Perplexity: {eval_results['perplexity']:.2f}")
    console.print(f"  Correct Tokens: {eval_results['correct_tokens']:,}")
    console.print(f"  Total Tokens: {eval_results['total_tokens']:,}")
    console.print(f"  Samples: {eval_results['num_samples']:,}")

    console.print("\n[bold cyan]" + "=" * 80 + "[/bold cyan]")

def main():
    """Main training function"""
    import argparse
    parser = argparse.ArgumentParser(description="Humigence Training with Accelerate")
    parser.add_argument("--config_file", type=str, required=True, help="Path to config file")
    args = parser.parse_args()

    accelerator = Accelerator()
    set_seed(42)

    config = load_config(args.config_file)

    console.print("[blue]Accelerate Info:[/blue]")
    console.print(f"  Process index: {accelerator.process_index}")
    console.print(f"  Local process index: {accelerator.local_process_index}")
    console.print(f"  Device: {accelerator.device}")
    console.print(f"  Distributed: {accelerator.distributed_type}")
    console.print(f"  Mixed precision: {accelerator.mixed_precision}")

    try:
        model, tokenizer = setup_model_and_tokenizer(config, accelerator)

        train_dataset, val_dataset, test_dataset = prepare_dataset(config, tokenizer)

        trainer = train_model(model, tokenizer, train_dataset, val_dataset, config, accelerator)

        # Make sure every rank has finished training before evaluation starts.
        accelerator.wait_for_everyone()

        # Evaluation and the summary run on the main process only.
        if accelerator.is_main_process:
            eval_results = evaluate_model_on_single_gpu(model, tokenizer, test_dataset, config)
            print_training_summary(config, train_dataset, val_dataset, test_dataset, eval_results)
        else:
            eval_results = None

        accelerator.wait_for_everyone()

        return {"status": "success", "eval_results": eval_results}

    except Exception as e:
        console.print(f"[red]Training failed: {e}[/red]")
        import traceback
        traceback.print_exc()
        return {"status": "error", "message": str(e)}

if __name__ == "__main__":
    results = main()
    if results["status"] == "success":
        console.print("[green]Training completed successfully![/green]")
    else:
        console.print(f"[red]Training failed: {results['message']}[/red]")
        exit(1)
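
# Example launch (illustrative; the script filename, process count, and config
# path are assumptions, not part of this file):
#
#   accelerate launch --num_processes 2 train_accelerate.py --config_file configs/humigence.json
#
# A single-GPU run works the same way with --num_processes 1, or by invoking
# the script directly with python.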