"""
Universal trainer for Humigence - supports all dataset types.
"""
|
|
import torch
from typing import Any, Dict

from peft import LoraConfig, get_peft_model
from rich.console import Console
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from config.schema import TrainingConfig
from training.data_loader import auto_load_dataset, prepare_dataset_for_training
|
|
console = Console()
|
|
|
|
def run_universal_training(config: TrainingConfig, accelerator=None) -> Dict[str, Any]:
    """
    Run training with dataset-agnostic support.

    Args:
        config: TrainingConfig object with all training parameters
        accelerator: Optional Accelerate accelerator for multi-GPU training

    Returns:
        Dictionary with training results
    """
    try:
        console.print(f"[blue]Starting universal training with model: {config.model}[/blue]")
        console.print(f"[blue]Output directory: {config.output_dir}[/blue]")
        console.print(f"[blue]Dataset: {config.dataset}[/blue]")

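        # TF32 speeds up fp32 matmuls on Ampere and newer GPUs at negligible accuracy cost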
        torch.backends.cuda.matmul.allow_tf32 = True

        console.print("[blue]Loading tokenizer...[/blue]")
        tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
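        # Many causal LM tokenizers ship without a pad token; reuse EOS so the collator can pad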
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

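        # Load the base model in bf16; device_map="auto" lets HF place it on available devices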
        console.print("[blue]Loading model...[/blue]")
        model_obj = AutoModelForCausalLM.from_pretrained(
            config.model,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )

        console.print("[blue]Configuring LoRA...[/blue]")

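        # Choose LoRA target modules by architecture: GPT-2-style models use fused
        # c_attn/c_proj layers, while Llama/Mistral expose separate projection matrices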
| if "gpt" in config.model.lower() or "dialo" in config.model.lower(): |
| target_modules = ["c_attn", "c_proj"] |
| elif "llama" in config.model.lower() or "mistral" in config.model.lower(): |
| target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] |
| else: |
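            # Fallback: attention projections shared by most decoder-only models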
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

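        # An explicit list from the config always overrides the heuristic above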
        if config.lora_targets:
            target_modules = config.lora_targets

        lora_config = LoraConfig(
            r=config.lora_r,
            lora_alpha=config.lora_alpha,
            lora_dropout=config.lora_dropout,
            target_modules=target_modules,
            task_type="CAUSAL_LM",
            bias="none",
        )

        model_obj = get_peft_model(model_obj, lora_config)

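        # Needed when gradient checkpointing is combined with PEFT: inputs must carry
        # requires_grad so gradients can flow through the frozen base model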
        model_obj.enable_input_require_grads()

        model_obj.print_trainable_parameters()

        console.print(f"[blue]Loading dataset: {config.dataset.type}[/blue]")

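        # Normalize the config into the "type:location" spec string that auto_load_dataset expects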
        if config.dataset.type == "wikitext":
            dataset_spec = "wikitext"
        elif config.dataset.type == "jsonl":
            dataset_spec = f"jsonl:{config.dataset.path}"
        elif config.dataset.type == "hf":
            name = config.dataset.name
            dataset_spec = name if name.startswith("hf:") else f"hf:{name}"
        else:
            raise ValueError(f"Unknown dataset type: {config.dataset.type}")

        train_dataset, eval_dataset, dataset_metadata = auto_load_dataset(
            dataset_spec,
            text_field=config.dataset.text_field,
            schema=config.dataset.schema_type,
            role_markers=config.dataset.role_markers,
            user_marker=config.dataset.user_marker,
            assistant_marker=config.dataset.assistant_marker,
        )

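        # Prefer an explicit text field from the config; otherwise trust the loader's detection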
        text_field = config.dataset.text_field or dataset_metadata.get("text_field", "text")

        tokenized_train, tokenized_eval = prepare_dataset_for_training(
            train_dataset,
            eval_dataset,
            tokenizer,
            text_field=text_field,
            max_length=config.block_size,
        )

        console.print(f"[blue]Train samples: {len(tokenized_train)}, Eval samples: {len(tokenized_eval)}[/blue]")

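        # If max_steps is not pinned in the config, derive it from the epoch count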
        if config.max_steps is None:
            # Guard against tiny datasets where the integer division would yield 0 steps
            steps_per_epoch = max(1, len(tokenized_train) // (config.batch_size * config.grad_accum))
            config.max_steps = steps_per_epoch * config.epochs
            console.print(
                f"[blue]Calculated max_steps: {config.max_steps} "
                f"(steps_per_epoch: {steps_per_epoch}, epochs: {config.epochs})[/blue]"
            )

        training_args = TrainingArguments(
            output_dir=config.output_dir,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            gradient_accumulation_steps=config.grad_accum,
            max_steps=config.max_steps,
            learning_rate=config.learning_rate,
            warmup_steps=config.warmup_steps,
            logging_steps=config.logging_steps,
            save_steps=config.save_steps,
            eval_steps=config.eval_steps,
            bf16=True,
            ddp_find_unused_parameters=False,
            remove_unused_columns=False,
            gradient_checkpointing=config.gradient_checkpointing,
            save_total_limit=2,
            do_eval=True,
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            report_to="none",  # report_to=None falls back to "all" in some transformers versions
        )

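        # mlm=False gives plain causal LM labels (inputs shifted by one); passing the
        # tokenizer as processing_class follows the current Trainer API in recent transformers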
        trainer = Trainer(
            model=model_obj,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_eval,
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
            processing_class=tokenizer,
        )

        console.print("[green]Starting training...[/green]")
        training_result = trainer.train()

        console.print("[blue]Saving final model...[/blue]")
        trainer.save_model()
        tokenizer.save_pretrained(config.output_dir)

        final_metrics = {
            "train_loss": training_result.training_loss,
            "train_runtime": training_result.metrics.get("train_runtime", 0),
            "train_samples_per_second": training_result.metrics.get("train_samples_per_second", 0),
            "train_steps_per_second": training_result.metrics.get("train_steps_per_second", 0),
            "total_steps": training_result.global_step,
            "epochs": training_result.metrics.get("epoch", 0),
        }

        if tokenized_eval:
            eval_results = trainer.evaluate()
            final_metrics.update({
                "eval_loss": eval_results.get("eval_loss", 0),
                # Present only if a compute_metrics hook supplies it; defaults to 0 otherwise
                "eval_perplexity": eval_results.get("eval_perplexity", 0),
            })

        final_metrics.update({
            "dataset_type": dataset_metadata.get("dataset_type", "unknown"),
            "dataset_spec": dataset_metadata.get("dataset_spec", dataset_spec),
            "text_field": text_field,
            "schema": dataset_metadata.get("schema", "unknown"),
        })

        console.print("[green]Training completed successfully![/green]")
        console.print(f"[blue]Final metrics: {final_metrics}[/blue]")

        return {
            "status": "success",
            "metrics": final_metrics,
            "output_dir": config.output_dir,
            "model_path": config.output_dir,
            "dataset_metadata": dataset_metadata,
        }

    except Exception as e:
        console.print(f"[red]Training failed: {e}[/red]")
        return {
            "status": "error",
            "error": str(e),
            "output_dir": config.output_dir,
        }
|
|
|
|
def run_universal_training_with_accelerator(config: TrainingConfig, accelerator) -> Dict[str, Any]:
    """
    Run training with Accelerate for multi-GPU support.

    Args:
        config: TrainingConfig object with all training parameters
        accelerator: Accelerate accelerator instance

    Returns:
        Dictionary with training results
    """
    try:
        console.print(f"[blue]Starting multi-GPU training with model: {config.model}[/blue]")
        console.print(f"[blue]Output directory: {config.output_dir}[/blue]")
        console.print(f"[blue]Dataset: {config.dataset}[/blue]")
        console.print(f"[blue]Using {accelerator.num_processes} GPUs[/blue]")

        torch.backends.cuda.matmul.allow_tf32 = True

        console.print("[blue]Loading tokenizer...[/blue]")
        tokenizer = AutoTokenizer.from_pretrained(config.model, use_fast=True)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

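        # dtype follows the config; device_map="auto" shards the model across GPUs,
        # while None leaves device placement to Accelerate's distributed launcher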
        console.print("[blue]Loading model...[/blue]")
        model = AutoModelForCausalLM.from_pretrained(
            config.model,
            torch_dtype=torch.float16 if config.dtype == "fp16" else torch.bfloat16,
            device_map="auto" if config.gpu_mode == "multi" else None,
        )

        if config.lora:
            console.print("[blue]Applying LoRA configuration...[/blue]")
            lora_config = LoraConfig(
                r=config.lora_r,
                lora_alpha=config.lora_alpha,
                target_modules=[
                    "q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",
                ],
                lora_dropout=config.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            model = get_peft_model(model, lora_config)

        console.print("[blue]Loading dataset...[/blue]")
        # auto_load_dataset takes a spec string (see run_universal_training), not the config object
        dataset_spec = {
            "wikitext": "wikitext",
            "jsonl": f"jsonl:{config.dataset.path}",
            "hf": f"hf:{config.dataset.name}",
        }[config.dataset.type]
        train_dataset, eval_dataset, dataset_metadata = auto_load_dataset(
            dataset_spec, text_field=config.dataset.text_field
        )

        text_field = config.dataset.text_field or dataset_metadata.get("text_field", "text")
        train_dataset, eval_dataset = prepare_dataset_for_training(
            train_dataset, eval_dataset, tokenizer,
            text_field=text_field, max_length=config.block_size,
        )

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False,
        )

        training_args = TrainingArguments(
            output_dir=config.output_dir,
            num_train_epochs=config.epochs,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            gradient_accumulation_steps=config.grad_accum,
            learning_rate=config.learning_rate,
            max_steps=config.max_steps,
            warmup_steps=config.warmup_steps,
            logging_steps=config.logging_steps,
            save_steps=config.save_steps,
            eval_steps=config.eval_steps,
            eval_strategy="steps" if eval_dataset else "no",  # evaluation_strategy is the deprecated name
            save_strategy="steps",
            load_best_model_at_end=bool(eval_dataset),
            metric_for_best_model="eval_loss" if eval_dataset else None,
            greater_is_better=False,
            fp16=config.dtype == "fp16",
            bf16=config.dtype == "bf16",
            gradient_checkpointing=config.gradient_checkpointing,
            dataloader_num_workers=4,
            remove_unused_columns=False,
            report_to="tensorboard",
            logging_dir=f"{config.output_dir}/logs",
            save_total_limit=3,
            seed=42,
            data_seed=42,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            processing_class=tokenizer,  # the tokenizer= argument is deprecated in recent transformers
        )

        # No manual accelerator.prepare(...) here: Trainer creates its optimizer and
        # dataloaders inside train() and integrates with Accelerate itself when the
        # script is launched via `accelerate launch`, so preparing them up front
        # would only see uninitialized objects.

        console.print("[blue]Starting training...[/blue]")
        train_result = trainer.train()

        console.print("[blue]Saving model...[/blue]")
        trainer.save_model()
        tokenizer.save_pretrained(config.output_dir)

        final_metrics = {
            "train_loss": train_result.training_loss,
            "train_runtime": train_result.metrics.get("train_runtime", 0),
            "train_samples_per_second": train_result.metrics.get("train_samples_per_second", 0),
            "total_steps": train_result.global_step,
            "epochs": train_result.metrics.get("epoch", 0),
        }

        if eval_dataset:
            eval_result = trainer.evaluate()
            final_metrics.update({
                "eval_loss": eval_result.get("eval_loss", 0),
            })

        console.print("[green]Multi-GPU training completed successfully![/green]")
        console.print(f"[blue]Final metrics: {final_metrics}[/blue]")

        return {
            "status": "success",
            "metrics": final_metrics,
            "output_dir": config.output_dir,
            "model_path": config.output_dir,
            "dataset_metadata": dataset_metadata,
        }

    except Exception as e:
        console.print(f"[red]Multi-GPU training failed: {e}[/red]")
        return {
            "status": "error",
            "error": str(e),
            "output_dir": config.output_dir,
        }
|
|
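
# Minimal usage sketch (illustrative only; the TrainingConfig field names below are
# assumptions inferred from how they are accessed above, not a confirmed schema):
#
#     from config.schema import TrainingConfig
#
#     config = TrainingConfig(model="gpt2", output_dir="runs/demo", ...)
#     result = run_universal_training(config)
#     if result["status"] == "success":
#         print(result["metrics"]["train_loss"])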