Javad Taghia committed on
Commit
2abe5d0
·
1 Parent(s): 949500b

Add W&B summary metrics (dataset/model config, precision mode, token estimate) and training-duration logging to train_tulu.py

Browse files
Files changed (2) hide show
  1. README.md +1 -0
  2. train_tulu.py +35 -0
README.md CHANGED
@@ -60,6 +60,7 @@ Key flags:
60
  - `train_tulu.py` loads `.env`, logs into W&B, and reports through `Trainer(report_to=["wandb"])`.
61
  - Ensure `WANDB_API_KEY`, `WANDB_PROJECT`, and (optionally) `WANDB_ENTITY` are set in `.env`.
62
  - Each run captures hyperparameters and metrics; check the W&B UI for live loss curves and checkpoints.
 
63
 
64
  ## Model cache location
65
  - Base model weights download to the Hugging Face cache. You can point downloads to an external directory by setting `BASE_MODEL_CACHE` in `.env` (e.g., `/Volumes/JTQ-s/______GITLAB____/downloaded_base_models`); the script maps this to `HF_HOME`/`TRANSFORMERS_CACHE` before loading models.
 
60
  - `train_tulu.py` loads `.env`, logs into W&B, and reports through `Trainer(report_to=["wandb"])`.
61
  - Ensure `WANDB_API_KEY`, `WANDB_PROJECT`, and (optionally) `WANDB_ENTITY` are set in `.env`.
62
  - Each run captures hyperparameters and metrics; check the W&B UI for live loss curves and checkpoints.
63
+ - Additional summaries are logged: `train_duration_seconds`, `train_examples`, `estimated_tokens`, `precision_mode` (bf16/fp16/fp32), `use_4bit`, `model_name`, `dataset_name`, `per_device_batch_size`, `gradient_accumulation_steps`, and `max_seq_length`.
64
 
65
  ## Model cache location
66
  - Base model weights download to the Hugging Face cache. You can point downloads to an external directory by setting `BASE_MODEL_CACHE` in `.env` (e.g., `/Volumes/JTQ-s/______GITLAB____/downloaded_base_models`); the script maps this to `HF_HOME`/`TRANSFORMERS_CACHE` before loading models.
train_tulu.py CHANGED
@@ -9,6 +9,7 @@ from __future__ import annotations
9
 
10
  import argparse
11
  import os
 
12
  from dataclasses import dataclass
13
  from typing import Dict, List
14
 
@@ -138,22 +139,47 @@ def configure_cache_from_env():
138
 
139
  def main():
140
  load_dotenv()
 
141
  configure_cache_from_env()
 
142
  cfg = parse_args()
 
143
 
144
  init_wandb(cfg)
 
145
  model, tokenizer = load_model_and_tokenizer(cfg)
 
146
 
147
  use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
148
  use_fp16 = torch.cuda.is_available() and not use_bf16
 
 
149
 
150
  raw_dataset = load_dataset(cfg.dataset_name)
 
151
  tokenized = raw_dataset["train"].map(
152
  lambda ex: tokenize_example(ex, tokenizer, cfg.max_seq_length),
153
  remove_columns=raw_dataset["train"].column_names,
154
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
 
157
 
158
  training_args = TrainingArguments(
159
  output_dir=cfg.output_dir,
@@ -169,6 +195,7 @@ def main():
169
  report_to=["wandb"],
170
  optim="paged_adamw_32bit",
171
  )
 
172
 
173
  trainer = Trainer(
174
  model=model,
@@ -177,11 +204,19 @@ def main():
177
  tokenizer=tokenizer,
178
  data_collator=data_collator,
179
  )
 
180
 
 
181
  trainer.train()
 
 
 
 
182
  trainer.save_model(cfg.output_dir)
183
  tokenizer.save_pretrained(cfg.output_dir)
 
184
  wandb.finish()
 
185
 
186
 
187
  if __name__ == "__main__":
 
9
 
10
  import argparse
11
  import os
12
+ import time
13
  from dataclasses import dataclass
14
  from typing import Dict, List
15
 
 
139
 
140
  def main():
141
  load_dotenv()
142
+ # Load env vars (WANDB keys, optional cache path).
143
  configure_cache_from_env()
144
+ # Redirect HF cache if BASE_MODEL_CACHE is set.
145
  cfg = parse_args()
146
+ # Read CLI hyperparameters/model settings.
147
 
148
  init_wandb(cfg)
149
+ # Start a W&B run with config and login.
150
  model, tokenizer = load_model_and_tokenizer(cfg)
151
+ # Load base model + tokenizer with LoRA (and 4-bit if enabled).
152
 
153
  use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
154
  use_fp16 = torch.cuda.is_available() and not use_bf16
155
+ # Choose best available mixed precision (bf16 > fp16 > fp32).
156
+ precision_mode = "bf16" if use_bf16 else "fp16" if use_fp16 else "fp32"
157
 
158
  raw_dataset = load_dataset(cfg.dataset_name)
159
+ # Download/load the instruction dataset.
160
  tokenized = raw_dataset["train"].map(
161
  lambda ex: tokenize_example(ex, tokenizer, cfg.max_seq_length),
162
  remove_columns=raw_dataset["train"].column_names,
163
  )
164
+ # Format/tokenize dataset to fixed length with labels.
165
+ train_examples = len(tokenized)
166
+ total_tokens = train_examples * cfg.max_seq_length
167
+ wandb.summary.update(
168
+ {
169
+ "train_examples": train_examples,
170
+ "estimated_tokens": total_tokens,
171
+ "precision_mode": precision_mode,
172
+ "use_4bit": cfg.use_4bit,
173
+ "model_name": cfg.model_name,
174
+ "dataset_name": cfg.dataset_name,
175
+ "per_device_batch_size": cfg.per_device_batch_size,
176
+ "gradient_accumulation_steps": cfg.gradient_accumulation_steps,
177
+ "max_seq_length": cfg.max_seq_length,
178
+ }
179
+ )
180
 
181
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
182
+ # Pad/batch causal LM examples.
183
 
184
  training_args = TrainingArguments(
185
  output_dir=cfg.output_dir,
 
195
  report_to=["wandb"],
196
  optim="paged_adamw_32bit",
197
  )
198
+ # Trainer configuration (logging, saving, optimizer, precision).
199
 
200
  trainer = Trainer(
201
  model=model,
 
204
  tokenizer=tokenizer,
205
  data_collator=data_collator,
206
  )
207
+ # Wire model, data, and config into HF Trainer.
208
 
209
+ train_start = time.time()
210
  trainer.train()
211
+ # Run supervised finetuning (cross-entropy).
212
+ train_duration = time.time() - train_start
213
+ wandb.log({"train_duration_seconds": train_duration})
214
+ # Record wall-clock training time to W&B.
215
  trainer.save_model(cfg.output_dir)
216
  tokenizer.save_pretrained(cfg.output_dir)
217
+ # Save adapters/tokenizer to output_dir.
218
  wandb.finish()
219
+ # Close W&B run.
220
 
221
 
222
  if __name__ == "__main__":