Upload fingpt/trainer.py
fingpt/trainer.py  ADDED  +239 -0
@@ -0,0 +1,239 @@
"""Fine-tuning training loop for SFT and LoRA.

Supports:
- Full supervised fine-tuning (SFTConfig)
- LoRA fine-tuning (LoRAConfig)
- bfloat16 AMP
- Gradient accumulation
- Cosine LR schedule with warmup
- Periodic eval + checkpoint
- Weights & Biases logging
"""

import math
import time
from pathlib import Path
from typing import Optional

import torch
from torch.amp import autocast
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, get_cosine_schedule_with_warmup

from .config import LoRAConfig, SFTConfig
from .data import load_datasets, make_collate_fn
from .lora import inject_lora, lora_state_dict

try:
    import wandb
except ImportError:
    wandb = None

# ── Helpers ────────────────────────────────────────────────────────────────────

def _param_summary(model: torch.nn.Module) -> str:
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    pct = 100 * trainable / max(1, total)
    return f"total={total:,} trainable={trainable:,} ({pct:.2f}%)"


def _evaluate(model, loader, device, cfg, max_batches: int = 20) -> float:
    model.eval()
    total_loss = total_tok = 0
    with torch.no_grad():
        for i, batch in enumerate(loader):
            if i >= max_batches:
                break
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            with autocast(device_type=device.type, dtype=torch.bfloat16, enabled=cfg.bf16):
                out = model(input_ids=input_ids, labels=labels)
            n = (labels != -100).sum().item()
            total_loss += out.loss.item() * n
            total_tok += n
    return total_loss / max(1, total_tok)

def _save(cfg, model, step: int, use_lora: bool, final: bool = False) -> None:
    out = Path(cfg.output_dir)
    out.mkdir(parents=True, exist_ok=True)

    if use_lora:
        # Save only the adapter weights (~50 MB); the base model stays on the HuggingFace Hub
        state = lora_state_dict(model)
        tag = "adapter_final.pt" if final else f"adapter_step_{step:07d}.pt"
        meta = {"step": step, "mode": "lora", "model_name": cfg.model_name,
                "lora_r": cfg.lora_r, "lora_alpha": cfg.lora_alpha,
                "lora_target_modules": cfg.lora_target_modules}
    else:
        # Full SFT: save the complete model state dict
        raw = model.module if hasattr(model, "module") else model
        state = raw.state_dict()
        tag = "model_final.pt" if final else f"model_step_{step:07d}.pt"
        meta = {"step": step, "mode": "sft", "model_name": cfg.model_name}

    path = out / tag
    torch.save({"meta": meta, "state_dict": state}, path)
    kind = "adapter" if use_lora else "model"
    print(f"[fingpt] Saved {kind} → {path} ({path.stat().st_size / 1e6:.0f} MB)")

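
# Illustrative sketch, not called anywhere in this module: how a LoRA checkpoint
# written by _save() could be restored for inference. It assumes inject_lora()
# accepts the same keyword arguments used in train() below and that
# lora_state_dict() preserves the parameter names of the injected modules, so the
# adapter weights can be loaded back non-strictly.
def _load_lora_checkpoint(path: str, device: str = "cpu"):
    ckpt = torch.load(path, map_location=device)
    meta = ckpt["meta"]
    model = AutoModelForCausalLM.from_pretrained(meta["model_name"], trust_remote_code=True)
    model = inject_lora(
        model,
        target_modules=meta["lora_target_modules"],
        r=meta["lora_r"],
        alpha=meta["lora_alpha"],
        dropout=0.0,
    )
    # Only adapter tensors are in the checkpoint, so ignore missing base-model keys.
    result = model.load_state_dict(ckpt["state_dict"], strict=False)
    if result.unexpected_keys:
        raise KeyError(f"unexpected keys in adapter checkpoint: {result.unexpected_keys}")
    return model.to(device).eval()
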
# ── Main training function ──────────────────────────────────────────────────────

def train(cfg: SFTConfig) -> None:
    torch.manual_seed(cfg.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(cfg.seed)

    use_lora = isinstance(cfg, LoRAConfig)

    # ── Load tokenizer + datasets ──────────────────────────────────────────────
    train_ds, val_ds, tokenizer = load_datasets(cfg)

    # ── Load model ─────────────────────────────────────────────────────────────
    t0 = time.time()
    print(f"[fingpt] Loading {cfg.model_name} ...")
    cuda_ok = torch.cuda.is_available()
    try:
        import accelerate  # noqa: F401
        load_kwargs = {"device_map": "auto"} if cuda_ok else {}
    except ImportError:
        load_kwargs = {}
    model = AutoModelForCausalLM.from_pretrained(
        cfg.model_name,
        torch_dtype=torch.bfloat16 if cfg.bf16 else torch.float32,
        trust_remote_code=True,
        **load_kwargs,
    )
    # Determine the device the model actually lives on
    if load_kwargs:
        # device_map="auto" → infer from the first parameter
        device = next(model.parameters()).device
    else:
        device = torch.device("cuda" if cuda_ok else "cpu")
        model = model.to(device)
    print(f"[fingpt] Model on {device} | loaded in {time.time()-t0:.1f}s | {_param_summary(model)}")

    # ── Inject LoRA adapters (LoRA mode only) ──────────────────────────────────
    if use_lora:
        model = inject_lora(
            model,
            target_modules=cfg.lora_target_modules,
            r=cfg.lora_r,
            alpha=cfg.lora_alpha,
            dropout=cfg.lora_dropout,
        )
    else:
        # Full SFT: all parameters are trainable
        for p in model.parameters():
            p.requires_grad_(True)

    print(f"[fingpt] Trainable params | {_param_summary(model)}")

    # ── DataLoaders ─────────────────────────────────────────────────────────────
    pad_id = tokenizer.pad_token_id or 0
    collate = make_collate_fn(pad_id)
    nw = getattr(cfg, "dataloader_workers", 4)
    train_loader = DataLoader(
        train_ds, batch_size=cfg.batch_size, shuffle=True,
        collate_fn=collate, num_workers=nw, pin_memory=cuda_ok,
    )
    val_loader = DataLoader(
        val_ds, batch_size=cfg.batch_size, shuffle=False,
        collate_fn=collate, num_workers=min(nw, 2),
    )

    # ── Optimizer + LR schedule ─────────────────────────────────────────────────
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=cfg.lr, weight_decay=cfg.weight_decay)

    total_steps = cfg.max_steps or (
        len(train_loader) * cfg.num_epochs // cfg.grad_accum_steps
    )
    total_steps = max(1, total_steps)  # guard: avoid 0 with tiny datasets
    warmup_steps = max(1, int(total_steps * cfg.warmup_ratio))
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    print(
        f"[fingpt] Training | steps={total_steps:,} warmup={warmup_steps} "
        f"lr={cfg.lr:.1e} bs={cfg.batch_size}×{cfg.grad_accum_steps} "
        f"{'LoRA r=' + str(cfg.lora_r) if use_lora else 'Full SFT'}"
    )

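    # Worked example of the step accounting above (illustrative numbers, not from
    # any config shipped with this repo): with 10,000 micro-batches per epoch,
    # num_epochs=3 and grad_accum_steps=8, the scheduler is sized for
    # 10,000 * 3 // 8 = 3,750 optimizer steps, and warmup_ratio=0.03 gives
    # int(3750 * 0.03) = 112 warmup steps. Each optimizer step consumes
    # batch_size * grad_accum_steps sequences (the effective batch size).
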
    # ── Weights & Biases ───────────────────────────────────────────────────────
    run = None
    if cfg.use_wandb and wandb is not None:
        run = wandb.init(
            project=cfg.wandb_project,
            name=cfg.wandb_run_name,
            config=cfg.__dict__,
        )

    # ── Training loop ───────────────────────────────────────────────────────────
    model.train()
    step = 0            # counts micro-batches; an optimizer step runs every grad_accum_steps of them
    t_step = time.perf_counter()
    accum_loss = 0.0    # sum of grad-accum-scaled losses since the last log line
    accum_count = 0     # micro-batches since the last log line (for averaging)

    for epoch in range(cfg.num_epochs):
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            with autocast(device_type=device.type, dtype=torch.bfloat16, enabled=cfg.bf16):
                out = model(input_ids=input_ids, labels=labels)
            loss = out.loss / cfg.grad_accum_steps

            loss.backward()
            accum_loss += loss.item()
            accum_count += 1

            # ── Optimizer step every grad_accum_steps micro-batches ────────────
            if (step + 1) % cfg.grad_accum_steps == 0:
                torch.nn.utils.clip_grad_norm_(trainable_params, cfg.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad(set_to_none=True)

            # ── Logging ─────────────────────────────────────────────────────────
            if step % cfg.log_steps == 0:
                elapsed = time.perf_counter() - t_step
                t_step = time.perf_counter()
                # Undo the grad-accum scaling and average over the micro-batches
                # seen since the last log line.
                real_loss = accum_loss * cfg.grad_accum_steps / max(1, accum_count)
                accum_loss = 0.0
                accum_count = 0
                lr = optimizer.param_groups[0]["lr"]
                ppl = math.exp(min(20, real_loss))
                print(
                    f"step {step:6d} | loss {real_loss:.4f} | ppl {ppl:.1f} "
                    f"| lr {lr:.2e} | {elapsed:.1f}s"
                )
                if run:
                    run.log({"train/loss": real_loss, "train/ppl": ppl,
                             "train/lr": lr, "step": step})

            # ── Evaluation ──────────────────────────────────────────────────────
            if cfg.eval_steps and step % cfg.eval_steps == 0 and step > 0:
                val_loss = _evaluate(model, val_loader, device, cfg)
                val_ppl = math.exp(min(20, val_loss))
                print(f"  eval | loss {val_loss:.4f} | ppl {val_ppl:.1f}")
                if run:
                    run.log({"eval/loss": val_loss, "eval/ppl": val_ppl, "step": step})
                model.train()

            # ── Checkpoint ──────────────────────────────────────────────────────
            if cfg.save_steps and step % cfg.save_steps == 0 and step > 0:
                _save(cfg, model, step, use_lora)

            step += 1
            if cfg.max_steps and step >= cfg.max_steps:
                break
        if cfg.max_steps and step >= cfg.max_steps:
            break

    # ── Final save ──────────────────────────────────────────────────────────────
    _save(cfg, model, step, use_lora, final=True)
    print(f"[fingpt] Training complete | total steps={step:,}")

    if run:
        run.finish()
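

# ── Usage sketch ────────────────────────────────────────────────────────────────
# Minimal, illustrative invocation of train(). The field names below mirror the
# attributes this module reads from its config, but LoRAConfig's real constructor
# (and any dataset-related fields consumed by load_datasets()) live in
# fingpt/config.py, so treat the values as hypothetical placeholders rather than
# recommended settings; fields not listed are assumed to have defaults there.
# Run as `python -m fingpt.trainer` so the package-relative imports resolve.
if __name__ == "__main__":
    demo_cfg = LoRAConfig(
        model_name="meta-llama/Llama-2-7b-hf",   # hypothetical base model
        output_dir="checkpoints/lora-demo",
        lora_r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        lora_target_modules=("q_proj", "v_proj"),
        batch_size=4,
        grad_accum_steps=8,
        lr=2e-4,
        num_epochs=1,
        bf16=True,
        use_wandb=False,
    )
    train(demo_cfg)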