# File size: 10,112 Bytes

# 98b7474

"""
Training logger: wandb + CSV + stdout + run summary JSON.

All logging is gated on ``enabled`` (typically ``is_main_process()``).
Wandb is optional -- if ``wandb`` is not installed or fails to init,
logging falls back to CSV + stdout silently.

CSV columns (one row per logged event):
  run_id, step, samples_seen, wall_time_sec, event_type,
  train_loss, loss_fine, loss_coarse, loss_ratio,
  grad_norm, lr_connector, lr_dino, lr_llm,
  throughput_samples_sec, gpu_mem_gb,
  val_loss, val_loss_fine, val_loss_coarse, val_loss_ratio,
  attention_entropy
"""

import csv
import json
import os
import subprocess
import time
from datetime import datetime
from pathlib import Path
from typing import Optional


def _get_git_hash() -> str:
    """Return the abbreviated hash of the current git ``HEAD`` commit.

    Runs ``git rev-parse --short HEAD`` in the current working directory with
    a 5-second timeout.

    Returns
    -------
    str
        The stripped stdout of the command when it exits with status 0;
        the literal ``"unknown"`` when it exits non-zero or when running it
        raises any ``Exception`` (e.g. ``git`` missing, timeout).
    """
    fallback = "unknown"
    command = ["git", "rev-parse", "--short", "HEAD"]
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
        )
        if proc.returncode != 0:
            return fallback
        return proc.stdout.strip()
    except Exception:
        # Best effort only: the hash is informational, never fatal.
        return fallback


def _gpu_memory_gb() -> float:
    """Return ``torch.cuda.memory_allocated()`` converted to GiB.

    Returns ``0.0`` when ``torch.cuda.is_available()`` is false, or when
    importing/querying torch raises any ``Exception``.
    """
    bytes_per_gb = 1024 ** 3
    try:
        import torch  # imported lazily so the module loads without torch

        if not torch.cuda.is_available():
            return 0.0
        return torch.cuda.memory_allocated() / bytes_per_gb
    except Exception:
        return 0.0


# Ordered field names for the metrics CSV (used as the DictWriter header).
CSV_COLUMNS = [
    # row identity / progress
    "run_id",
    "step",
    "samples_seen",
    "wall_time_sec",
    "event_type",
    # training losses
    "train_loss",
    "loss_fine",
    "loss_coarse",
    "loss_ratio",
    # optimisation state
    "grad_norm",
    "lr_connector",
    "lr_dino",
    "lr_llm",
    # system metrics
    "throughput_samples_sec",
    "gpu_mem_gb",
    # validation metrics
    "val_loss",
    "val_loss_fine",
    "val_loss_coarse",
    "val_loss_ratio",
    "attention_entropy",
]


class TrainingLogger:
    """
    Unified logger that writes to wandb, structured CSV, and stdout.

    When ``enabled`` is true, construction creates the log directory, opens
    ``metrics_<run_id>.csv`` there and tries to start a wandb run; any
    exception from wandb is swallowed and logging continues with CSV +
    stdout only. Call :meth:`finish` to close the backends; log calls made
    after ``finish`` still print to stdout but no longer touch CSV or wandb.

    Parameters
    ----------
    project : str
        wandb project name.
    config : dict
        Training config. Passed to wandb as the run config; also read for
        ``config["wandb"]["run_name"]`` (default ``"run"``),
        ``config["checkpoint"]["save_dir"]`` and ``config["_config_path"]``.
        A section that is present but ``None`` is treated as empty.
    enabled : bool
        If False, all log calls are no-ops (use for non-rank-0 processes).
    log_dir : str
        Directory for the CSV log file and the run-summary JSON. When None,
        falls back to ``config["checkpoint"]["save_dir"]`` and then to
        ``"/workspace/logs"``.
    """

    def __init__(
        self,
        project: str = "foveated-vlm",
        config: Optional[dict] = None,
        enabled: bool = True,
        log_dir: Optional[str] = None,
    ):
        self.enabled = enabled
        self._wandb_run = None
        self._csv_path = None
        self._csv_writer = None
        self._csv_file = None
        # Always defined so disabled loggers never hit AttributeError.
        self._log_dir: Optional[Path] = None
        self._start_time = time.time()
        self._config = config or {}
        self._run_id = ""
        self._best_val_loss = float("inf")
        self._best_step = 0
        self._last_step = 0
        self._last_samples = 0
        self._git_hash = _get_git_hash()

        if not enabled:
            return

        # ---- Run ID ----
        # ``or {}`` tolerates a section that exists but is None
        # (e.g. an empty ``wandb:`` key in a YAML config).
        wandb_cfg = self._config.get("wandb") or {}
        run_name = wandb_cfg.get("run_name", "run")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self._run_id = f"{run_name}_{timestamp}"

        # ---- wandb ----
        # Optional backend: a missing package or failed init must not stop
        # training, so fall back to CSV + stdout.
        try:
            import wandb
            self._wandb_run = wandb.init(
                project=project,
                name=run_name,
                config=self._config,
                resume="allow",
            )
        except Exception:
            self._wandb_run = None

        # ---- CSV ----
        if log_dir is None:
            checkpoint_cfg = self._config.get("checkpoint") or {}
            log_dir = checkpoint_cfg.get("save_dir", "/workspace/logs")
        self._log_dir = Path(log_dir)
        self._log_dir.mkdir(parents=True, exist_ok=True)

        self._csv_path = self._log_dir / f"metrics_{self._run_id}.csv"
        # newline="" is what the csv module requires for files it writes.
        self._csv_file = open(
            self._csv_path, "w", newline="", encoding="utf-8",
        )
        self._csv_writer = csv.DictWriter(
            self._csv_file, fieldnames=CSV_COLUMNS, extrasaction="ignore",
        )
        self._csv_writer.writeheader()
        self._csv_file.flush()

    @staticmethod
    def _finite_or_none(value):
        """Return *value*, or None when it is a non-finite float (NaN/inf).

        ``json.dump`` would otherwise emit ``NaN`` / ``Infinity`` tokens,
        which are not valid JSON for strict parsers.
        """
        import math

        if isinstance(value, float) and not math.isfinite(value):
            return None
        return value

    def _write_csv_row(self, row: dict):
        """Append one row to the CSV and flush it to disk.

        ``run_id`` and ``wall_time_sec`` are filled in unless *row* already
        provides them. Does nothing when the CSV backend is not open
        (disabled logger, or after :meth:`finish`).
        """
        if self._csv_writer is None or self._csv_file is None:
            return
        # Build a fresh dict so the caller's mapping is not mutated;
        # values already present in ``row`` take precedence.
        record = {
            "run_id": self._run_id,
            "wall_time_sec": f"{time.time() - self._start_time:.1f}",
            **row,
        }
        self._csv_writer.writerow(record)
        # Flush per row so a crashed run still leaves a usable log.
        self._csv_file.flush()

    def log_step(
        self,
        step: int,
        loss: float,
        fine_loss: float = 0.0,
        coarse_loss: float = 0.0,
        lr: float = 0.0,
        grad_norm: float = 0.0,
        samples_seen: int = 0,
        samples_per_sec: float = 0.0,
        lr_groups: Optional[dict] = None,
    ):
        """Log a training step to stdout, wandb (if active) and CSV.

        Parameters
        ----------
        step : int
            Training step; also used as the wandb step.
        loss, fine_loss, coarse_loss : float
            Loss values. ``loss_ratio`` is ``fine_loss / coarse_loss`` and is
            reported as 0.0 when ``coarse_loss`` is not positive.
        lr : float
            Learning rate shown on stdout and used as the default for every
            per-group learning rate.
        grad_norm : float
            Gradient norm to record.
        samples_seen : int
            Cumulative sample count; remembered for later eval rows.
        samples_per_sec : float
            Throughput to record.
        lr_groups : dict, optional
            Per-group learning rates under the keys ``"connector"``,
            ``"dino"`` and ``"llm"``; missing keys fall back to ``lr``.
        """
        if not self.enabled:
            return

        self._last_step = step
        self._last_samples = samples_seen

        loss_ratio = fine_loss / max(coarse_loss, 1e-8) if coarse_loss > 0 else 0.0
        gpu_mem = _gpu_memory_gb()

        # Parse per-group LRs
        lr_connector = lr
        lr_dino = lr
        lr_llm = lr
        if lr_groups:
            lr_connector = lr_groups.get("connector", lr)
            lr_dino = lr_groups.get("dino", lr)
            lr_llm = lr_groups.get("llm", lr)

        # stdout
        print(
            f"  step {step:6d} | loss {loss:.4f} | "
            f"fine {fine_loss:.4f} | ratio {loss_ratio:.3f} | "
            f"lr {lr:.2e} | gnorm {grad_norm:.2f} | "
            f"{samples_per_sec:.0f} samp/s | {gpu_mem:.1f}GB",
            flush=True,
        )

        # wandb (best effort: a logging failure must not interrupt training)
        if self._wandb_run is not None:
            try:
                import wandb
                log_dict = {
                    "train/loss": loss,
                    "train/fine_loss": fine_loss,
                    "train/coarse_loss": coarse_loss,
                    "train/loss_ratio": loss_ratio,
                    "train/lr": lr,
                    "train/lr_connector": lr_connector,
                    "train/lr_dino": lr_dino,
                    "train/lr_llm": lr_llm,
                    "train/grad_norm": grad_norm,
                    "train/samples_seen": samples_seen,
                    "train/throughput": samples_per_sec,
                    "train/gpu_mem_gb": gpu_mem,
                }
                wandb.log(log_dict, step=step)
            except Exception:
                pass

        # CSV
        self._write_csv_row({
            "step": step,
            "samples_seen": samples_seen,
            "event_type": "train",
            "train_loss": f"{loss:.6f}",
            "loss_fine": f"{fine_loss:.6f}",
            "loss_coarse": f"{coarse_loss:.6f}",
            "loss_ratio": f"{loss_ratio:.4f}",
            "grad_norm": f"{grad_norm:.4f}",
            "lr_connector": f"{lr_connector:.2e}",
            "lr_dino": f"{lr_dino:.2e}",
            "lr_llm": f"{lr_llm:.2e}",
            "throughput_samples_sec": f"{samples_per_sec:.1f}",
            "gpu_mem_gb": f"{gpu_mem:.2f}",
        })

    def log_eval(
        self,
        step: int,
        val_loss: float,
        val_fine_loss: float = 0.0,
        val_coarse_loss: float = 0.0,
        attention_entropy: float = 0.0,
    ):
        """Log a validation result to stdout, wandb (if active) and CSV.

        Also tracks the lowest ``val_loss`` seen so far and the step at
        which it occurred. The CSV row's ``samples_seen`` is the value from
        the most recent :meth:`log_step` call. The ratio is
        ``val_fine_loss / val_coarse_loss``, reported as 0.0 when
        ``val_coarse_loss`` is not positive.
        """
        if not self.enabled:
            return

        val_ratio = val_fine_loss / max(val_coarse_loss, 1e-8) if val_coarse_loss > 0 else 0.0

        if val_loss < self._best_val_loss:
            self._best_val_loss = val_loss
            self._best_step = step

        print(
            f"  [eval] step {step:6d} | val_loss {val_loss:.4f} | "
            f"fine {val_fine_loss:.4f} | ratio {val_ratio:.3f} | "
            f"entropy {attention_entropy:.4f} | "
            f"best {self._best_val_loss:.4f}@{self._best_step}",
            flush=True,
        )

        if self._wandb_run is not None:
            try:
                import wandb
                wandb.log({
                    "eval/val_loss": val_loss,
                    "eval/val_fine_loss": val_fine_loss,
                    "eval/val_coarse_loss": val_coarse_loss,
                    "eval/val_loss_ratio": val_ratio,
                    "eval/attention_entropy": attention_entropy,
                    "eval/best_val_loss": self._best_val_loss,
                }, step=step)
            except Exception:
                pass

        self._write_csv_row({
            "step": step,
            "samples_seen": self._last_samples,
            "event_type": "eval",
            "val_loss": f"{val_loss:.6f}",
            "val_loss_fine": f"{val_fine_loss:.6f}",
            "val_loss_coarse": f"{val_coarse_loss:.6f}",
            "val_loss_ratio": f"{val_ratio:.4f}",
            "attention_entropy": f"{attention_entropy:.6f}",
        })

    def save_run_summary(self, final_loss: float = 0.0, total_samples: int = 0):
        """Write ``run_summary_<run_id>.json`` into the log directory.

        Non-finite floats (``best_val_loss`` when no eval was ever logged,
        or a NaN/inf ``final_loss``) are written as ``null`` so the file
        stays valid JSON. No-op when the logger is disabled.
        """
        if not self.enabled or self._log_dir is None:
            return

        elapsed = time.time() - self._start_time
        summary = {
            "run_id": self._run_id,
            "git_hash": self._git_hash,
            "config_file": self._config.get("_config_path", ""),
            "final_train_loss": self._finite_or_none(final_loss),
            "best_val_loss": self._finite_or_none(self._best_val_loss),
            "best_val_step": self._best_step,
            "total_steps": self._last_step,
            "total_samples": total_samples,
            "wall_time_sec": elapsed,
            "wall_time_hours": elapsed / 3600,
            "csv_path": str(self._csv_path) if self._csv_path else "",
            "timestamp": datetime.now().isoformat(),
        }

        summary_path = self._log_dir / f"run_summary_{self._run_id}.json"
        with open(summary_path, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)
        print(f"  Run summary saved to {summary_path}", flush=True)

    def finish(self):
        """Flush and close all logging backends.

        Safe to call more than once. Afterwards the CSV and wandb backends
        are detached, so later log calls only print to stdout.
        """
        if not self.enabled:
            return

        if self._wandb_run is not None:
            try:
                import wandb
                wandb.finish()
            except Exception:
                pass
            # Detach so later log calls do not talk to a finished run.
            self._wandb_run = None

        if self._csv_file is not None:
            self._csv_file.close()
        # Drop the writer together with the file: a writer bound to a closed
        # file raises ValueError on the next writerow().
        self._csv_file = None
        self._csv_writer = None