Spaces:
Sleeping
Sleeping
| """ | |
| src/train.py — Training loop for Phi-3-mini + LoRA on DialogSum. | |
| Loads the model and dataset, runs 3 epochs with HuggingFace Trainer, | |
| logs hyperparameters and per-epoch metrics to MLflow, then pushes | |
| the LoRA adapter and tokenizer to HuggingFace Hub. | |
| Run: python src/train.py | |
| Requires: HF_TOKEN in .env (for Hub push), GPU (T4 or better). | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import mlflow | |
| from dotenv import load_dotenv | |
| from transformers import ( | |
| Trainer, | |
| TrainerCallback, | |
| TrainerControl, | |
| TrainerState, | |
| TrainingArguments, | |
| ) | |
| from src.data import DEFAULT_MAX_LENGTH, make_data_collator, prepare_datasets | |
| from src.model import ( | |
| HUB_REPO, | |
| LORA_ALPHA, | |
| LORA_DROPOUT, | |
| LORA_R, | |
| LORA_TARGET_MODULES, | |
| MODEL_ID, | |
| load_model_and_tokenizer, | |
| print_trainable_parameters, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Config | |
| # --------------------------------------------------------------------------- | |
| OUTPUT_DIR = "outputs/dialogsum-phi3-lora" | |
| MLFLOW_EXPERIMENT = "dialogsum-phi3-lora" | |
| HYPERPARAMS: dict = { | |
| "model_id": MODEL_ID, | |
| "hub_repo": HUB_REPO, | |
| "num_train_epochs": 3, | |
| "per_device_train_batch_size": 2, | |
| "gradient_accumulation_steps": 8, # effective batch size = 16 | |
| "learning_rate": 2e-4, | |
| "fp16": True, | |
| "max_length": DEFAULT_MAX_LENGTH, | |
| "lora_r": LORA_R, | |
| "lora_alpha": LORA_ALPHA, | |
| "lora_dropout": LORA_DROPOUT, | |
| "lora_target_modules": ",".join(LORA_TARGET_MODULES), | |
| } | |
| # --------------------------------------------------------------------------- | |
| # MLflow callback | |
| # --------------------------------------------------------------------------- | |
| class MLflowEpochCallback(TrainerCallback): | |
| """ | |
| Log train loss, eval loss, and learning rate to MLflow at the end of | |
| each epoch. The Trainer already computes these — this callback just | |
| forwards them to the active MLflow run. | |
| Uses `on_evaluate` (fires after each eval pass) rather than `on_epoch_end` | |
| because eval metrics are only available after evaluation completes. | |
| """ | |
| def on_evaluate( | |
| self, | |
| args: TrainingArguments, | |
| state: TrainerState, | |
| control: TrainerControl, | |
| metrics: dict, | |
| **kwargs, | |
| ) -> None: | |
| if not state.is_world_process_zero: | |
| return | |
| step = state.global_step | |
| epoch = int(state.epoch) if state.epoch else step | |
| log: dict[str, float] = {"epoch": float(epoch)} | |
| for key in ("eval_loss", "eval_runtime", "train_loss"): | |
| if key in metrics: | |
| log[key] = metrics[key] | |
| # learning rate is in the last log history entry | |
| for entry in reversed(state.log_history): | |
| if "learning_rate" in entry: | |
| log["learning_rate"] = entry["learning_rate"] | |
| break | |
| mlflow.log_metrics(log, step=step) | |
| def on_log( | |
| self, | |
| args: TrainingArguments, | |
| state: TrainerState, | |
| control: TrainerControl, | |
| logs: dict, | |
| **kwargs, | |
| ) -> None: | |
| """Also forward step-level train loss so the MLflow chart is smooth.""" | |
| if not state.is_world_process_zero: | |
| return | |
| step = state.global_step | |
| metrics: dict[str, float] = {} | |
| for key in ("loss", "learning_rate", "grad_norm"): | |
| if key in logs: | |
| metrics[key] = logs[key] | |
| if metrics: | |
| mlflow.log_metrics(metrics, step=step) | |
| # --------------------------------------------------------------------------- | |
| # Training entry point | |
| # --------------------------------------------------------------------------- | |
| def train() -> None: | |
| """ | |
| Full training pipeline: | |
| 1. Load model + tokenizer (4-bit quant + LoRA). | |
| 2. Tokenize DialogSum train/val splits. | |
| 3. Run Trainer for 3 epochs, logging to MLflow. | |
| 4. Push adapter + tokenizer to HuggingFace Hub. | |
| """ | |
| load_dotenv() | |
| hf_token = os.getenv("HF_TOKEN") | |
| if not hf_token: | |
| raise EnvironmentError( | |
| "HF_TOKEN not set. Copy .env.example to .env and add your token." | |
| ) | |
| # --- Model + tokenizer --- | |
| print("Loading model and tokenizer ...") | |
| model, tokenizer = load_model_and_tokenizer() | |
| print("\nTrainable parameters:") | |
| print_trainable_parameters(model) | |
| # --- Data --- | |
| print("\nPreparing datasets ...") | |
| train_ds, val_ds, _ = prepare_datasets(tokenizer) | |
| collator = make_data_collator(tokenizer) | |
| print(f" train: {len(train_ds):,} | val: {len(val_ds):,}") | |
| # --- TrainingArguments --- | |
| training_args = TrainingArguments( | |
| output_dir=OUTPUT_DIR, | |
| num_train_epochs=HYPERPARAMS["num_train_epochs"], | |
| per_device_train_batch_size=HYPERPARAMS["per_device_train_batch_size"], | |
| gradient_accumulation_steps=HYPERPARAMS["gradient_accumulation_steps"], | |
| learning_rate=HYPERPARAMS["learning_rate"], | |
| fp16=HYPERPARAMS["fp16"], | |
| evaluation_strategy="epoch", | |
| save_strategy="epoch", | |
| load_best_model_at_end=True, | |
| metric_for_best_model="eval_loss", | |
| greater_is_better=False, | |
| logging_steps=50, | |
| save_total_limit=2, | |
| report_to="none", # MLflow handled manually via callback | |
| dataloader_pin_memory=False, # avoids issues with quantized models on some setups | |
| ) | |
| # --- MLflow run --- | |
| # Use a local file store explicitly — avoids path-encoding issues on | |
| # Windows when the username contains non-ASCII characters. | |
| mlflow.set_tracking_uri("mlflow_runs") | |
| mlflow.set_experiment(MLFLOW_EXPERIMENT) | |
| with mlflow.start_run(run_name="phi3-lora-dialogsum"): | |
| mlflow.log_params(HYPERPARAMS) | |
| trainer = Trainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_ds, | |
| eval_dataset=val_ds, | |
| data_collator=collator, | |
| callbacks=[MLflowEpochCallback()], | |
| ) | |
| print("\nStarting training ...") | |
| trainer.train() | |
| # Log final eval loss explicitly | |
| final_metrics = trainer.evaluate() | |
| mlflow.log_metrics( | |
| {"final_eval_loss": final_metrics["eval_loss"]}, | |
| step=trainer.state.global_step, | |
| ) | |
| print(f"\nFinal eval loss: {final_metrics['eval_loss']:.4f}") | |
| # --- Push to Hub --- | |
| print(f"\nPushing adapter to Hub: {HUB_REPO} ...") | |
| model.push_to_hub(HUB_REPO, token=hf_token) | |
| print(f"Pushing tokenizer to Hub: {HUB_REPO} ...") | |
| tokenizer.push_to_hub(HUB_REPO, token=hf_token) | |
| print(f"\nDone. Model published at: https://huggingface.co/{HUB_REPO}") | |
| # --------------------------------------------------------------------------- | |
| # CLI entry point | |
| # --------------------------------------------------------------------------- | |
| if __name__ == "__main__": | |
| train() | |