ChristophSchuhmann committed on
Commit 2cd248d · verified · Parent: db42eb1

Upload train+.py with huggingface_hub

Files changed (1): train+.py +1061 -0
train+.py ADDED
@@ -0,0 +1,1061 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Local single-process trainer (no torchrun/DDP).
- Uses all visible GPUs via torch.nn.DataParallel (if >1 GPU), else single GPU.
- Trains Whisper encoder (unfrozen) + small head; decoder is frozen (unused).
- Supports data in .tar/.tar.gz (audio+json pairs inside) OR loose files:
      <any>/<name>.wav|.mp3 + <same-dir>/<name>.json
- NEW: also supports GeminiProAudioSegments-style loose files:
      <any>/<name>.audio.mp3 + <same-dir>/<name>.audio.json
  with filtering on segment_duration + overlapping.

- Adaptive batch probe (optional), BF16 preferred when supported (auto), FP16 fallback.
- Periodic HTML evals (x-axis = seconds), ETA, checkpoint saving (weights + optimizer/scheduler states; see NOTE below on resume).
- HTML eval embeds <audio> player with base64 audio + plots per sample.

NOTE: "Resume" now means **weights-only resume** for a new phase on a (possibly) new dataset:
      - We load model weights from trainer_state.pt / trainer_state_best.pt,
        but reset optimizer, scheduler, and all counters for this run.
"""

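# Example invocation (illustrative only; the values shown are just the defaults
# from the CONFIG block below, and every knob is read from environment variables):
#
#   DATA_DIR=./audiodata-full GEMINI_DIR=/home/user/segdata-full \
#   BATCH_SIZE=16 EPOCHS=2 RESUME_MODE=latest python train+.py
#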
from __future__ import annotations
import os, io, json, time, random, tarfile, base64, traceback, math
from pathlib import Path
from typing import List, Tuple, Dict, Any, Optional

# =========================
# ========= CONFIG ========
# =========================
DATA_DIR     = Path(os.getenv("DATA_DIR", "./audiodata-full"))
RESUME_DIR   = Path(os.getenv("RESUME_DIR", "./resume"))
OUT_DIR      = Path(os.getenv("OUT_DIR", "./outs"))
EPOCHS       = int(os.getenv("EPOCHS", "2"))
BATCH_SIZE   = int(os.getenv("BATCH_SIZE", "16"))   # global batch (DataParallel will split)
ADAPTIVE_BSZ = int(os.getenv("ADAPTIVE_BSZ", "1"))  # 1=probe; 0=use BATCH_SIZE as-is
MAX_BSZ_CAP  = int(os.getenv("MAX_BSZ", "0")) or None
NUM_WORKERS  = int(os.getenv("NUM_WORKERS", "4"))
VAL_POOL     = int(os.getenv("EVAL_POOL", "1000"))  # kept for backward compat; not used in new mix
EVAL_FIRST   = int(os.getenv("EVAL_FIRST_SEEN", "2000"))
EVAL_EVERY   = int(os.getenv("EVAL_EVERY_SEEN", "10000"))
SEED         = int(os.getenv("SEED", "1337"))
HF_MODEL_ID  = os.getenv("HF_MODEL_ID", "openai/whisper-small")

# --- NEW: Gemini-specific config (hard-coded but override-able via env) ---
GEMINI_DIR = Path(os.getenv("GEMINI_DIR", "/home/user/segdata-full/"))
USE_GEMINI = int(os.getenv("USE_GEMINI", "1"))  # 1=use Gemini bucket, 0=ignore
GEMINI_SEGMENT_DURATION = os.getenv("GEMINI_SEGMENT_DURATION", "medium")  # filter on this
GEMINI_INCLUDE_OVERLAP_TRUE  = bool(int(os.getenv("GEMINI_INCLUDE_OVERLAP_TRUE", "1")))
GEMINI_INCLUDE_OVERLAP_FALSE = bool(int(os.getenv("GEMINI_INCLUDE_OVERLAP_FALSE", "1")))
GEMINI_OTHER_RATIO = float(os.getenv("GEMINI_OTHER_RATIO", "0.50"))  # other bucket size = round(ratio * N_gem)
VAL_FIXED_N = int(os.getenv("VAL_FIXED_N", "500"))  # fixed eval size from mixed pool

# Optional offline model snapshot
USE_LOCAL_MODELS    = bool(int(os.getenv("USE_LOCAL_MODELS", "0")))
MODELS_SNAPSHOT_DIR = Path(os.getenv("MODELS_SNAPSHOT_DIR", "")) if USE_LOCAL_MODELS else None
HF_HOME             = Path(os.getenv("HF_HOME", (OUT_DIR / ".hf")))
TRANSFORMERS_CACHE  = Path(os.getenv("TRANSFORMERS_CACHE", (OUT_DIR / ".hf" / "hub")))

# Mixed precision: "auto" -> bf16 if supported else fp16; or "bf16"/"fp16"/"fp32"
MIXED_PRECISION = os.getenv("MIXED_PRECISION", "auto").lower()

# Optim/schedule
LR             = 2e-4  # slightly higher LR for the new phase
WEIGHT_DECAY   = 1e-3
WARMUP_RATIO   = 0.05
SCHEDULER      = os.getenv("SCHEDULER", "cosine")  # cosine|linear
FREEZE_ENCODER = False
PIN_MEMORY     = True
GRAD_CLIP_NORM = 1.0
INCLUDE_BG_IN_ACC = False

# Resume / init behaviour
# RESUME_MODE: "latest" (default), "best", or "none"
# Now used only to choose which checkpoint to load **weights** from.
RESUME_MODE = os.getenv("RESUME_MODE", "latest").lower()
INIT_WEIGHTS_STR = os.getenv("INIT_WEIGHTS", "").strip()
INIT_WEIGHTS = Path(INIT_WEIGHTS_STR) if INIT_WEIGHTS_STR else None

# Data/model constants
SAMPLE_RATE  = 16000
CLIP_SECONDS = 30.0
NUM_FRAMES   = 1500
NUM_TRACKS   = 2
MAX_SEGMENTS = 20

LOG_EVERY  = 50
HTML_TOP_N = 12

# =========================
# ========= IMPORTS =======
# =========================
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn import DataParallel

# Headless plotting
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from transformers import (
    WhisperFeatureExtractor,
    WhisperModel,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)

# =========================
# ======= UTILITIES =======
# =========================
def setup_dirs():
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    RESUME_DIR.mkdir(parents=True, exist_ok=True)
    (OUT_DIR / ".mplconfig").mkdir(parents=True, exist_ok=True)
    os.environ.setdefault("MPLCONFIGDIR", str((OUT_DIR / ".mplconfig").resolve()))
    HF_HOME.mkdir(parents=True, exist_ok=True)
    os.environ.setdefault("HF_HOME", str(HF_HOME.resolve()))
    os.environ.setdefault("TRANSFORMERS_CACHE", str(TRANSFORMERS_CACHE.resolve()))
    # allocator (PyTorch >=2.x)
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:128")

def set_seed(s: int):
    random.seed(s); np.random.seed(s)
    torch.manual_seed(s); torch.cuda.manual_seed_all(s)

def preferred_dtype():
    if MIXED_PRECISION == "bf16": return torch.bfloat16
    if MIXED_PRECISION == "fp16": return torch.float16
    if MIXED_PRECISION == "fp32": return torch.float32
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    return torch.float16 if torch.cuda.is_available() else torch.float32

def _model_resolved_name(model_id: str) -> Tuple[str, bool]:
    if USE_LOCAL_MODELS and MODELS_SNAPSHOT_DIR and MODELS_SNAPSHOT_DIR.is_dir():
        local_dirname = model_id.replace("/", "__")
        cand = MODELS_SNAPSHOT_DIR / local_dirname
        if cand.is_dir():
            return str(cand), True
    return model_id, False

# =========================
# ========= DATA ==========
# =========================
ACCEPT_EXT = {".mp3", ".wav"}

def index_tar_pairs_streaming(tar_path: Path) -> List[Tuple[str, str]]:
    """
    Returns list of (audio_member_name, json_member_name) inside a tar(ball).
    """
    pairs, mapping = [], {}
    try:
        with tarfile.open(tar_path, mode="r|*", ignore_zeros=True) as tf:
            for m in tf:
                if not m.isreg():
                    continue
                base, ext = os.path.splitext(m.name)
                ext = ext.lower()
                if ext in ACCEPT_EXT:
                    mapping.setdefault(base, {})["audio"] = m.name
                elif ext == ".json":
                    mapping.setdefault(base, {})["json"] = m.name
    except Exception:
        return []
    for base, d in mapping.items():
        if "audio" in d and "json" in d:
            pairs.append((d["audio"], d["json"]))
    return pairs

def index_loose_pairs(root: Path) -> List[Tuple[Path, Path]]:
    """
    Returns list of (audio_path, json_path) under root for loose files.
    Pattern: <name>.(wav|mp3) + <name>.json
    """
    results = []
    for audio in root.rglob("*"):
        if not audio.is_file():
            continue
        if audio.suffix.lower() in ACCEPT_EXT:
            j = audio.with_suffix(".json")
            if j.exists():
                results.append((audio, j))
    return results

# --- NEW: Gemini loose-file indexer ---
def index_gemini_pairs(root: Path) -> List[Tuple[Path, Path]]:
    """
    Returns list of (audio_path, json_path) for Gemini-style pairs under root:
        <anything>.audio.mp3 + <same>.audio.json
    """
    results: List[Tuple[Path, Path]] = []
    if not root.is_dir():
        return results
    for audio in root.rglob("*.audio.mp3"):
        if not audio.is_file():
            continue
        j = audio.with_suffix(".json")  # sample_0.audio.mp3 -> sample_0.audio.json
        if j.exists():
            results.append((audio, j))
    return results

def _safe_extract_bytes(tar_path: Path, member_name: str) -> Optional[bytes]:
    try:
        with tarfile.open(tar_path, mode="r:*", ignore_zeros=True) as tf:
            m = tf.getmember(member_name)
            f = tf.extractfile(m)
            return f.read() if f else None
    except Exception:
        pass
    try:
        with tarfile.open(tar_path, mode="r|*", ignore_zeros=True) as tf:
            for m in tf:
                if m.isreg() and m.name == member_name:
                    f = tf.extractfile(m)
                    return f.read() if f else None
    except Exception:
        pass
    return None

def read_json_bytes(b: Optional[bytes]) -> Dict[str, Any]:
    if not b:
        return {}
    try:
        return json.loads(b.decode("utf-8", errors="replace"))
    except Exception:
        return {}

def read_member_json(tar_path: Path, member_name: str) -> Dict[str, Any]:
    return read_json_bytes(_safe_extract_bytes(tar_path, member_name))

def read_member_audio_30s(tar_path: Path, member_name: str) -> np.ndarray:
    b = _safe_extract_bytes(tar_path, member_name)
    return decode_audio_30s_bytes(b)

def read_file_json(p: Path) -> Dict[str, Any]:
    try:
        return json.loads(p.read_text(encoding="utf-8"))
    except Exception:
        return {}

def read_file_audio_30s(p: Path) -> np.ndarray:
    try:
        with open(p, "rb") as f:
            b = f.read()
    except Exception:
        b = None
    return decode_audio_30s_bytes(b)

def decode_audio_30s_bytes(b: Optional[bytes]) -> np.ndarray:
    if not b:
        return np.zeros(int(CLIP_SECONDS * SAMPLE_RATE), dtype=np.float32)
    import soundfile as sf
    import librosa
    try:
        with io.BytesIO(b) as bio:
            wav, sr = sf.read(bio, dtype="float32", always_2d=False)
        if wav.ndim == 2:
            wav = wav.mean(axis=1)
        if sr != SAMPLE_RATE:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=SAMPLE_RATE)
        clip_samples = int(CLIP_SECONDS * SAMPLE_RATE)
        if len(wav) < clip_samples:
            wav = np.pad(wav, (0, clip_samples - len(wav)))
        else:
            wav = wav[:clip_samples]
        return wav.astype(np.float32, copy=False)
    except Exception:
        return np.zeros(int(CLIP_SECONDS * SAMPLE_RATE), dtype=np.float32)

def time_to_frame(t: float) -> int:
    if t <= 0:
        return 0
    if t >= CLIP_SECONDS:
        return NUM_FRAMES - 1
    return max(0, min(NUM_FRAMES - 1, int(t / CLIP_SECONDS * NUM_FRAMES)))

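# Worked example of the mapping above: with CLIP_SECONDS=30.0 and
# NUM_FRAMES=1500 each frame spans 0.02 s, so time_to_frame(1.0) == 50 and
# time_to_frame(29.99) == 1499.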
def parse_events(obj: Dict[str, Any]) -> List[Tuple[float, float]]:
    seg = obj.get("segmentation", {})
    cand = seg.get("events") if isinstance(seg, dict) else None
    if not isinstance(cand, list):
        cand = obj.get("events", [])
    out = []
    for e in cand or []:
        st, et = e.get("start_time"), e.get("end_time")
        if isinstance(st, (int, float)) and isinstance(et, (int, float)) and et > st:
            s = max(0.0, float(st))
            e_ = min(CLIP_SECONDS, float(et))
            if e_ > s:
                out.append((s, e_))
    return out

def build_labels_parity(events_sec: List[Tuple[float, float]]) -> torch.LongTensor:
    ev = sorted(events_sec, key=lambda x: (x[0], x[1]))[:MAX_SEGMENTS]
    frames = [(time_to_frame(s), time_to_frame(e)) for (s, e) in ev]
    labels = torch.zeros((NUM_TRACKS, NUM_FRAMES), dtype=torch.long)
    for i, (s, e) in enumerate(frames, start=1):
        track = 0 if (i % 2 == 1) else 1
        sl = labels[track, s:e+1]
        bg = sl == 0
        if bg.any():
            sl[bg] = i
    return labels

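# Illustrative example (hypothetical events): given
# [(0.0, 2.0), (1.0, 3.0), (2.5, 4.0)], segments 1 and 3 land on track 0 and
# segment 2 on track 1 (odd/even parity), so directly adjacent, possibly
# overlapping segments never share a row; where two same-track segments do
# overlap in frames, the earlier one keeps its ID because only background
# (zero) frames are overwritten.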
class TarOrFileDataset(Dataset):
    """
    Each item is either:
      {"kind":"tar", "tar": Path, "a": "member.wav", "j": "member.json"}  or
      {"kind":"file","a_path": Path, "j_path": Path}
    """
    def __init__(self, items: List[Dict[str, Any]], fe):
        self.items = items
        self.fe = fe

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        it = self.items[idx]
        if it["kind"] == "tar":
            obj = read_member_json(it["tar"], it["j"])
            wav = read_member_audio_30s(it["tar"], it["a"])
            a_name = it["a"]
        else:
            obj = read_file_json(it["j_path"])
            wav = read_file_audio_30s(it["a_path"])
            a_name = str(it["a_path"].name)
        ev = parse_events(obj)
        labels = build_labels_parity(ev)
        feat = self.fe(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt")
        input_features = feat.input_features[0]
        return {"x": input_features, "y": labels,
                "meta": {"a": a_name, "ev": len(ev)}}

def collate_fn(batch):
    x = torch.stack([b["x"] for b in batch], dim=0)
    y = torch.stack([b["y"] for b in batch], dim=0)
    meta = {k: [b["meta"][k] for b in batch] for k in batch[0]["meta"]}
    return {"x": x, "y": y, "meta": meta}

# =========================
# ========= MODEL =========
# =========================
class WhisperOddEven(nn.Module):
    def __init__(self, base_id: str, freeze_encoder: bool):
        super().__init__()
        resolved, is_local = _model_resolved_name(base_id)
        self.whisper = WhisperModel.from_pretrained(resolved, local_files_only=is_local)

        # Freeze decoder (unused)
        for p in self.whisper.decoder.parameters():
            p.requires_grad = False

        # Train encoder
        for p in self.whisper.encoder.parameters():
            p.requires_grad = not freeze_encoder

        d_model = self.whisper.config.d_model
        hidden = max(256, d_model // 2)
        self.head = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Linear(hidden, NUM_TRACKS * (MAX_SEGMENTS + 1)),
        )

    def forward(self, input_features: torch.FloatTensor):
        enc = self.whisper.encoder(input_features=input_features).last_hidden_state  # [B,1500,D]
        B, T, D = enc.shape
        logits = self.head(enc)  # [B,T,NUM_TRACKS*(C)]
        C = MAX_SEGMENTS + 1
        logits = logits.view(B, T, NUM_TRACKS, C).permute(0, 2, 1, 3).contiguous()
        return logits  # [B,2,1500,C]

def compute_loss(logits, labels):
    B, TR, T, C = logits.shape
    return F.cross_entropy(
        logits.view(B * TR * T, C),
        labels.view(B * TR * T),
        reduction="mean",
    )

@torch.no_grad()
def frame_accuracy(logits, labels, include_bg=False):
    pred = logits.argmax(dim=-1)
    if include_bg:
        correct = (pred == labels).sum().item()
        total = labels.numel()
    else:
        mask = labels != 0
        correct = (pred[mask] == labels[mask]).sum().item()
        total = mask.sum().item() if mask.any() else 1
    return correct / max(1, total)

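# Shape sketch (assuming the default openai/whisper-small, d_model=768):
# input_features [B, 80, 3000] -> encoder hidden states [B, 1500, 768]
# -> head -> logits [B, 2, 1500, 21], i.e. one (MAX_SEGMENTS + 1)-way
# classification per track per encoder frame.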
# =========================
# ======= REPORTING =======
# =========================
def _plot_tracks_seconds(pred_ids: torch.Tensor, title: str) -> bytes:
    secs = np.linspace(0.0, CLIP_SECONDS, NUM_FRAMES)
    fig = plt.figure(figsize=(10, 2.8))
    ax = plt.gca()
    im = ax.imshow(
        pred_ids.numpy(),
        aspect="auto",
        interpolation="nearest",
        origin="upper",
        extent=[secs[0], secs[-1], -0.5, 1.5],
    )
    ax.set_title(title)
    ax.set_xlabel("Time (s)")
    ax.set_yticks([0, 1])
    ax.set_yticklabels(["odd", "even"])
    cb = plt.colorbar(im, fraction=0.046, pad=0.04)
    cb.set_label("Segment ID")
    buf = io.BytesIO()
    fig.savefig(buf, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    buf.seek(0)
    return buf.read()

def _mime_for_ext(fn: str) -> str:
    ext = Path(fn).suffix.lower()
    if ext == ".mp3":
        return "audio/mpeg"
    if ext == ".wav":
        return "audio/wav"
    # Fallback for unknown extensions; browsers may still manage to play it
    return "audio/wav"

def write_eval_html(out_dir: Path, eval_id: str, rows: List[Dict[str, Any]]):
    html = [f"""<!doctype html><html><head><meta charset="utf-8">
<style>
body{{font-family:system-ui,Segoe UI,Roboto,Arial,sans-serif;margin:20px}}
.card{{border:1px solid #ddd;border-radius:10px;padding:16px;margin:16px 0;box-shadow:0 2px 6px rgba(0,0,0,.05)}}
.grid{{display:grid;grid-template-columns:1fr 1fr;gap:12px}}
figure{{margin:0}}
figcaption{{font-size:13px;color:#555;margin-top:6px}}
audio{{width:100%;margin-top:8px}}
</style>
<title>Odd/Even Segmentation — Eval {eval_id}</title></head><body>"""]
    for r in rows:
        audio_html = ""
        if r.get("audio_b64") and r.get("audio_mime"):
            audio_html = (
                '<audio controls preload="none">'
                f'<source src="data:{r["audio_mime"]};base64,{r["audio_b64"]}" type="{r["audio_mime"]}">'
                'Your browser does not support the audio element.'
                '</audio>'
                '<small style="color:#555">Listen: original eval audio</small>'
            )
        html.append(f"""
<section class="card">
  <h3>{r['a_name']}</h3>
  {audio_html}
  <div class="grid">
    <figure>
      <img src="data:image/png;base64,{r['png_raw']}" alt="raw">
      <figcaption>RAW (avg acc vs GT: {r['acc_raw']:.3f})</figcaption>
    </figure>
    <figure>
      <img src="data:image/png;base64,{r['png_sm']}" alt="smoothed">
      <figcaption>SMOOTHED (avg acc vs GT: {r['acc_sm']:.3f})</figcaption>
    </figure>
  </div>
</section>""")
    html.append("</body></html>")
    p = out_dir / f"eval_{eval_id}.html"
    try:
        p.write_text("\n".join(html), encoding="utf-8")
    except Exception as e:
        print(f"[eval-html] failed to write {p}: {e}", flush=True)
    return p

# =========================
# ========= TRAIN =========
# =========================
def unwrap(model: nn.Module) -> nn.Module:
    return model.module if isinstance(model, DataParallel) else model

def main():
    setup_dirs()
    set_seed(SEED)

    # logging
    log_path = OUT_DIR / "train.log"
    log_f = open(log_path, "a", buffering=1)

    def log(*a):
        s = " ".join(str(x) for x in a)
        print(s, flush=True)
        print(s, file=log_f, flush=True)

    # index "other" data (original DATA_DIR)
    tar_files = sorted(
        set(
            [p for p in DATA_DIR.rglob("*.tar") if p.is_file()]
            + [p for p in DATA_DIR.rglob("*.tar.gz") if p.is_file()]
        )
    )
    loose_pairs = index_loose_pairs(DATA_DIR)
    log(f"==> Found {len(tar_files)} tarballs and {len(loose_pairs)} loose audio+json pairs in {DATA_DIR}")

    other_items: List[Dict[str, Any]] = []
    for tp in tar_files:
        pairs = index_tar_pairs_streaming(tp)
        log(f"[index] {tp.name}: {len(pairs)} pairs")
        for a_m, j_m in pairs:
            other_items.append({"kind": "tar", "tar": tp, "a": a_m, "j": j_m})
    for a_p, j_p in loose_pairs:
        other_items.append({"kind": "file", "a_path": a_p, "j_path": j_p})

    log(f"[other] Total base items from DATA_DIR: {len(other_items)}")

    # index Gemini data
    gem_items: List[Dict[str, Any]] = []
    if USE_GEMINI and GEMINI_DIR.is_dir():
        raw_pairs = index_gemini_pairs(GEMINI_DIR)
        log(f"[gemini] Scanning {GEMINI_DIR} -> {len(raw_pairs)} *.audio.mp3+json pairs (candidates)")
        n_med = 0
        n_ov_true = 0
        n_ov_false = 0
        n_bad_no_seg = 0

        allowed_overlaps = []
        if GEMINI_INCLUDE_OVERLAP_TRUE:
            allowed_overlaps.append(True)
        if GEMINI_INCLUDE_OVERLAP_FALSE:
            allowed_overlaps.append(False)

        for a_p, j_p in raw_pairs:
            obj = read_file_json(j_p)
            seg_dur = obj.get("segment_duration")
            overlapping = obj.get("overlapping")

            # Keep only the configured segment_duration (default: "medium")
            if seg_dur != GEMINI_SEGMENT_DURATION:
                continue
            n_med += 1

            if overlapping is True:
                n_ov_true += 1
                if not GEMINI_INCLUDE_OVERLAP_TRUE:
                    continue
            elif overlapping is False:
                n_ov_false += 1
                if not GEMINI_INCLUDE_OVERLAP_FALSE:
                    continue
            else:
                # overlapping field missing or malformed -> skip
                continue

            # make sure segmentation/events exist
            seg_block = obj.get("segmentation", {})
            if not isinstance(seg_block, dict) or not isinstance(seg_block.get("events"), list):
                n_bad_no_seg += 1
                continue

            gem_items.append({"kind": "file", "a_path": a_p, "j_path": j_p})

        log(
            f"[gemini] medium-candidates={n_med} "
            f"(overlap true={n_ov_true}, overlap false={n_ov_false}, "
            f"discarded missing/invalid seg={n_bad_no_seg})"
        )
        log(
            f"[gemini] after filters "
            f"(segment_duration='{GEMINI_SEGMENT_DURATION}', overlapping in {allowed_overlaps}) "
            f"-> {len(gem_items)} items"
        )
    else:
        if not USE_GEMINI:
            log("[gemini] USE_GEMINI=0 -> Gemini bucket disabled")
        else:
            log(f"[gemini] Directory not found: {GEMINI_DIR} -> Gemini bucket disabled")

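    # The filter above assumes each *.audio.json looks roughly like this sketch
    # (the field names are the ones read above; the values are hypothetical):
    #   {"segment_duration": "medium", "overlapping": false,
    #    "segmentation": {"events": [{"start_time": 0.4, "end_time": 3.2}]}}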
    # build combined item list according to mixing rule
    if gem_items:
        N_gem = len(gem_items)
        target_other = int(math.floor(N_gem * GEMINI_OTHER_RATIO + 0.5))
        if target_other > len(other_items):
            target_other = len(other_items)
        random.shuffle(other_items)
        sampled_other = other_items[:target_other]
        combined_items = gem_items + sampled_other
        random.shuffle(combined_items)

        log(f"[mix] Gemini items: {N_gem}")
        log(f"[mix] Sampling {target_other} items from other-data (ratio={GEMINI_OTHER_RATIO})")
        log(f"[mix] Combined pool size (before train/val split): {len(combined_items)}")
    else:
        # Fallback: original behaviour (whole dataset from DATA_DIR)
        combined_items = other_items
        random.shuffle(combined_items)
        log("[mix] WARNING: no Gemini items found -> training only on DATA_DIR mix")
        log(f"[mix] Combined pool size: {len(combined_items)}")

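    # Example of the mixing rule (hypothetical counts): 10,000 Gemini items
    # with GEMINI_OTHER_RATIO=0.50 pull round(0.50 * 10,000) = 5,000 sampled
    # DATA_DIR items, i.e. a combined pool of 15,000 before the train/val split.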
    if not combined_items:
        log("[ERR] No usable audio+json pairs in final mix. Aborting.")
        log_f.close()
        return

    # fixed-size validation set (up to VAL_FIXED_N)
    val_n = min(VAL_FIXED_N, len(combined_items))
    val_items = combined_items[:val_n]
    train_items = combined_items[val_n:]

    log(
        f"[split] Final split -> Train={len(train_items)} | Val={len(val_items)} "
        f"(VAL_FIXED_N={VAL_FIXED_N})"
    )

    # features
    resolved, is_local = _model_resolved_name(HF_MODEL_ID)
    fe = WhisperFeatureExtractor.from_pretrained(resolved, local_files_only=is_local)

    # datasets & loaders
    train_ds = TarOrFileDataset(train_items, fe)
    val_ds = TarOrFileDataset(val_items, fe)

    # provisional loader for batch probe
    train_loader = DataLoader(
        train_ds,
        batch_size=1,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        collate_fn=collate_fn,
        persistent_workers=NUM_WORKERS > 0,
        prefetch_factor=2 if NUM_WORKERS > 0 else None,  # prefetch_factor must be None when num_workers=0
    )

    # model
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    base_model = WhisperOddEven(HF_MODEL_ID, freeze_encoder=FREEZE_ENCODER).to(device)

    # DataParallel if multi-GPU
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        log(f"[gpu] Using DataParallel across {n_gpu} GPUs")
        model = DataParallel(base_model, device_ids=list(range(n_gpu)))
    else:
        model = base_model

    # optim, amp
    optim = torch.optim.AdamW(
        (p for p in model.parameters() if p.requires_grad),
        lr=LR,
        weight_decay=WEIGHT_DECAY,
    )
    use_dtype = preferred_dtype()
    amp_enabled = use_dtype in (torch.float16, torch.bfloat16)
    scaler = torch.cuda.amp.GradScaler(enabled=(amp_enabled and use_dtype == torch.float16))

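    # Note: the GradScaler is only enabled on the fp16 path; bf16 shares
    # fp32's exponent range, so loss scaling is unnecessary there and the
    # bf16/fp32 paths use plain backward()/step().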
    # ---- weights-only resume / init ----
    state_path = RESUME_DIR / "trainer_state.pt"
    best_state_path = RESUME_DIR / "trainer_state_best.pt"
    start_epoch = 1
    global_step = 0
    seen_samples = 0
    state_loaded = False

    if RESUME_MODE != "none":
        state_to_load: Optional[Path] = None
        if RESUME_MODE == "best" and best_state_path.exists():
            state_to_load = best_state_path
        elif state_path.exists():
            state_to_load = state_path

        if state_to_load is not None:
            try:
                state = torch.load(state_to_load, map_location="cpu")
                unwrap(model).load_state_dict(state["model"])
                state_loaded = True
                log(
                    f"[resume-weights] loaded model weights from {state_to_load}; "
                    f"optimizer/scheduler/counters RESET for new dataset"
                )
            except Exception as e:
                log(f"[resume] failed to load {state_to_load}: {e}")

    # optional weights-only init from separate file (only if no trainer_state used)
    if (not state_loaded) and INIT_WEIGHTS is not None and INIT_WEIGHTS.is_file():
        try:
            ckpt = torch.load(INIT_WEIGHTS, map_location="cpu")
            unwrap(model).load_state_dict(ckpt)
            log(f"[init] loaded weights from {INIT_WEIGHTS}")
        except Exception as e:
            log(f"[init] failed to load INIT_WEIGHTS {INIT_WEIGHTS}: {e}")

    # batch probe (single-process; DP handles scattering)
    def try_batch_size(bsz_try: int) -> bool:
        try:
            it = iter(train_loader)
            batch = next(it)
            x = batch["x"].to(device, non_blocking=True).repeat(bsz_try, 1, 1)
            y = batch["y"].to(device, non_blocking=True).repeat(bsz_try, 1, 1)
            with torch.autocast(
                device_type="cuda" if torch.cuda.is_available() else "cpu",
                enabled=amp_enabled,
                dtype=use_dtype,
            ):
                logits = model(x)
                loss = compute_loss(logits, y)
            if scaler.is_enabled():
                scaler.scale(loss).backward()
            else:
                loss.backward()
            optim.zero_grad(set_to_none=True)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            return True
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                return False
            raise
        finally:
            optim.zero_grad(set_to_none=True)

    bsz = max(1, BATCH_SIZE)
    if ADAPTIVE_BSZ:
        log(f"[bsz] probing starting at {bsz} (cap={MAX_BSZ_CAP})")
        if try_batch_size(bsz):
            step = max(4, bsz // 2)
            while True:
                nxt = bsz + step
                if MAX_BSZ_CAP and nxt > MAX_BSZ_CAP:
                    break
                ok = try_batch_size(nxt)
                if not ok:
                    break
                bsz = nxt
                step = max(4, step)
                log(f"[bsz] increased to {bsz}")
        else:
            while bsz > 1 and not try_batch_size(bsz):
                bsz = max(1, bsz // 2)
            if bsz == 1:
                log("[bsz] fell back to 1")
    log(f"[bsz] final batch size = {bsz}")

    # rebuild loader with final batch size (shuffles each epoch)
    train_loader = DataLoader(
        train_ds,
        batch_size=bsz,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        collate_fn=collate_fn,
        persistent_workers=NUM_WORKERS > 0,
        prefetch_factor=2 if NUM_WORKERS > 0 else None,
    )

    # scheduler, now with correct steps_per_epoch for this loader
    steps_per_epoch = max(1, len(train_loader))
    total_steps = max(1, EPOCHS * steps_per_epoch)
    warmup = max(1, int(WARMUP_RATIO * total_steps))
    sched = (
        get_cosine_schedule_with_warmup(optim, warmup, total_steps)
        if SCHEDULER == "cosine"
        else get_linear_schedule_with_warmup(optim, warmup, total_steps)
    )

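    # Example (hypothetical sizes): 20,000 train items at batch size 16 give
    # 1,250 steps/epoch; with EPOCHS=2 that is 2,500 total steps, of which
    # WARMUP_RATIO=0.05 -> 125 are warmup.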
    # ETA helpers
    ema_rate = None
    total_samples = len(train_ds) * EPOCHS

    def format_eta(secs: float) -> str:
        secs = max(0.0, secs)
        h = int(secs // 3600)
        m = int((secs % 3600) // 60)
        s = int(secs % 60)
        return f"{h:02d}:{m:02d}:{s:02d}"

    # helper to fetch original audio bytes for HTML embedding
    def _audio_bytes_for_eval_item(ds: TarOrFileDataset, idx: int) -> Tuple[Optional[bytes], str, str]:
        it = ds.items[idx]
        if it["kind"] == "tar":
            a_name = it["a"]
            b = _safe_extract_bytes(it["tar"], a_name)
            mime = _mime_for_ext(a_name)
            disp = f"{Path(it['tar']).name} :: {a_name}"
            return b, disp, mime
        else:
            p = it["a_path"]
            try:
                b = p.read_bytes()
            except Exception:
                b = None
            mime = _mime_for_ext(str(p))
            return b, p.name, mime

    @torch.no_grad()
    def evaluate(tag: str, n=400):
        nonlocal ema_rate
        unwrap(model).eval()
        idx = list(range(len(val_ds)))
        random.shuffle(idx)
        sub = idx[: min(n, len(val_ds))]
        tot_loss = 0.0
        tot_acc = 0.0
        rows: List[Dict[str, Any]] = []

        for i in sub:
            item = val_ds[i]
            x_cpu = item["x"].unsqueeze(0)
            y_cpu = item["y"].unsqueeze(0)
            x = x_cpu.to(device, non_blocking=True)
            y = y_cpu.to(device, non_blocking=True)
            with torch.autocast(
                device_type="cuda" if torch.cuda.is_available() else "cpu",
                enabled=amp_enabled,
                dtype=use_dtype,
            ):
                logits = unwrap(model)(x)
                loss = compute_loss(logits, y).item()
            acc = frame_accuracy(logits, y, include_bg=INCLUDE_BG_IN_ACC)
            tot_loss += loss
            tot_acc += acc

            if len(rows) < HTML_TOP_N:
                raw_ids = logits.argmax(dim=-1).squeeze(0).cpu()
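                # The pooling below averages each class's logits over a
                # 9-frame window (stride 1, padding 4 preserves the 1500-frame
                # length); at 50 frames/s that is a ~0.18 s moving average
                # applied before the argmax, suppressing single-frame flicker.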
                sm_logits = F.avg_pool1d(
                    logits.permute(0, 1, 3, 2)
                    .contiguous()
                    .view(1, NUM_TRACKS * (MAX_SEGMENTS + 1), NUM_FRAMES),
                    kernel_size=9,
                    stride=1,
                    padding=4,
                ).view(1, NUM_TRACKS, MAX_SEGMENTS + 1, NUM_FRAMES)
                sm_logits = sm_logits.permute(0, 1, 3, 2).contiguous()
                sm_ids = sm_logits.argmax(dim=-1).squeeze(0).cpu()

                def acc_ignore_bg(pcpu: torch.Tensor, gtcpu: torch.Tensor) -> float:
                    m = gtcpu != 0
                    tot_ = int(m.sum().item())
                    if tot_ == 0:
                        return 0.0
                    return float((pcpu[m] == gtcpu[m]).sum().item()) / tot_

                acc_raw = (
                    acc_ignore_bg(raw_ids[0], y_cpu[0, 0])
                    + acc_ignore_bg(raw_ids[1], y_cpu[0, 1])
                ) / 2.0
                acc_sm = (
                    acc_ignore_bg(sm_ids[0], y_cpu[0, 0])
                    + acc_ignore_bg(sm_ids[1], y_cpu[0, 1])
                ) / 2.0

                png_raw = base64.b64encode(
                    _plot_tracks_seconds(raw_ids, f"RAW — {i}")
                ).decode("ascii")
                png_sm = base64.b64encode(
                    _plot_tracks_seconds(sm_ids, f"SMOOTHED — {i}")
                ).decode("ascii")

                a_bytes, disp_name, mime = _audio_bytes_for_eval_item(val_ds, i)
                audio_b64 = base64.b64encode(a_bytes).decode("ascii") if a_bytes else None

                rows.append(
                    {
                        "a_name": disp_name,
                        "png_raw": png_raw,
                        "png_sm": png_sm,
                        "acc_raw": acc_raw,
                        "acc_sm": acc_sm,
                        "audio_b64": audio_b64,
                        "audio_mime": mime,
                    }
                )

        loss_avg = tot_loss / max(1, len(sub))
        acc_avg = tot_acc / max(1, len(sub))

        remaining = max(0.0, (total_samples - seen_samples) / max(1e-6, (ema_rate or 1.0)))
        eta_str = format_eta(remaining)
        html_path = write_eval_html(
            OUT_DIR, f"{tag}_eta{eta_str.replace(':', '-')}", rows
        )
        log(f"[eval:{tag}] loss {loss_avg:.4f} acc {acc_avg:.4f} on {len(sub)} samples -> {html_path}")
        unwrap(model).train()
        return loss_avg

    best_val = float("inf")
    first_eval_threshold = EVAL_FIRST
    if seen_samples >= first_eval_threshold:
        first_eval_threshold = -1
    periodic_eval_every = EVAL_EVERY if EVAL_EVERY > 0 else 0

    unwrap(model).train()
    for ep in range(start_epoch, EPOCHS + 1):
        ep_t0 = time.time()
        for step, batch in enumerate(train_loader, start=1):
            t0 = time.time()
            x = batch["x"].to(device, non_blocking=True)
            y = batch["y"].to(device, non_blocking=True)

            with torch.autocast(
                device_type="cuda" if torch.cuda.is_available() else "cpu",
                enabled=amp_enabled,
                dtype=use_dtype,
            ):
                logits = model(x)
                loss = compute_loss(logits, y)

            if scaler.is_enabled():
                scaler.scale(loss).backward()
                scaler.unscale_(optim)
            else:
                loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP_NORM)
            if scaler.is_enabled():
                scaler.step(optim)
                scaler.update()
            else:
                optim.step()
            optim.zero_grad(set_to_none=True)
            sched.step()

            global_step += 1
            seen_samples += x.size(0)

            dt = max(1e-6, time.time() - t0)
            rate = x.size(0) / dt
            ema_rate = rate if (ema_rate is None) else (0.05 * rate + 0.95 * ema_rate)
            remaining = max(
                0.0,
                (len(train_ds) * EPOCHS - seen_samples) / max(1e-6, ema_rate or 1.0),
            )

            if global_step % LOG_EVERY == 0:
                acc_now = frame_accuracy(logits, y, include_bg=INCLUDE_BG_IN_ACC)
                log(
                    f"[train] ep {ep} step {global_step} loss {loss.item():.4f} acc {acc_now:.4f} "
                    f"lr {sched.get_last_lr()[0]:.2e} seen {seen_samples}/{len(train_ds)*EPOCHS} "
                    f"rate {ema_rate:.1f} samp/s ETA {format_eta(remaining)}"
                )

            # eval schedule
            do_eval = False
            prev_seen = seen_samples - x.size(0)
            if first_eval_threshold != -1 and first_eval_threshold > 0:
                if seen_samples >= first_eval_threshold and prev_seen < first_eval_threshold:
                    do_eval = True
                    first_eval_threshold = -1
            elif periodic_eval_every > 0:
                prev_bucket = prev_seen // periodic_eval_every
                now_bucket = seen_samples // periodic_eval_every
                if now_bucket != prev_bucket and now_bucket > 0:
                    do_eval = True

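            # Example: with EVAL_EVERY_SEEN=10000, a step that moves
            # seen_samples from 9,984 to 10,016 crosses a bucket boundary
            # (0 -> 1) and triggers an eval.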
            if do_eval:
                val_loss = evaluate(tag=f"gstep{global_step}_seen{seen_samples}")
                # save "latest" trainer state
                try:
                    torch.save(
                        {
                            "epoch": ep,
                            "global_step": global_step,
                            "seen_samples": seen_samples,
                            "model": unwrap(model).state_dict(),
                            "optim": optim.state_dict(),
                            "sched": sched.state_dict(),
                            "scaler": scaler.state_dict() if scaler.is_enabled() else {},
                        },
                        state_path,
                    )
                    log(f"[save] trainer state -> {state_path}")
                except Exception as e:
                    log(f"[save] failed to write trainer state {state_path}: {e}")
                # save best
                if val_loss is not None and val_loss < best_val:
                    best_val = val_loss
                    try:
                        torch.save(unwrap(model).state_dict(), OUT_DIR / "model_best.pt")
                        torch.save(
                            unwrap(model).whisper.encoder.state_dict(),
                            OUT_DIR / "encoder_best.bin",
                        )
                        torch.save(
                            {
                                "epoch": ep,
                                "global_step": global_step,
                                "seen_samples": seen_samples,
                                "model": unwrap(model).state_dict(),
                                "optim": optim.state_dict(),
                                "sched": sched.state_dict(),
                                "scaler": scaler.state_dict()
                                if scaler.is_enabled()
                                else {},
                            },
                            best_state_path,
                        )
                        log(
                            f"[save] new BEST (eval) -> {OUT_DIR/'model_best.pt'} "
                            f"(state: {best_state_path})"
                        )
                    except Exception as e:
                        log(f"[save] failed to write BEST checkpoint: {e}")

        # end epoch
        val_loss = evaluate(tag=f"epoch{ep}_end")
        try:
            torch.save(
                {
                    "epoch": ep + 1,
                    "global_step": global_step,
                    "seen_samples": seen_samples,
                    "model": unwrap(model).state_dict(),
                    "optim": optim.state_dict(),
                    "sched": sched.state_dict(),
                    "scaler": scaler.state_dict() if scaler.is_enabled() else {},
                },
                state_path,
            )
            log(f"[save] trainer state (epoch end) -> {state_path}")
        except Exception as e:
            log(f"[save] failed to write trainer state (epoch end) {state_path}: {e}")

        if val_loss is not None and val_loss < best_val:
            best_val = val_loss
            try:
                torch.save(unwrap(model).state_dict(), OUT_DIR / "model_best.pt")
                torch.save(
                    unwrap(model).whisper.encoder.state_dict(),
                    OUT_DIR / "encoder_best.bin",
                )
                torch.save(
                    {
                        "epoch": ep + 1,
                        "global_step": global_step,
                        "seen_samples": seen_samples,
                        "model": unwrap(model).state_dict(),
                        "optim": optim.state_dict(),
                        "sched": sched.state_dict(),
                        "scaler": scaler.state_dict() if scaler.is_enabled() else {},
                    },
                    best_state_path,
                )
                log(
                    f"[save] new BEST (epoch) -> {OUT_DIR/'model_best.pt'} "
                    f"(state: {best_state_path})"
                )
            except Exception as e:
                log(f"[save] failed to write BEST checkpoint (epoch end): {e}")

        log(f"[epoch] {ep}/{EPOCHS} done in {time.time() - ep_t0:.1f}s")

    log("\n[done] Training complete.")
    log_f.close()

if __name__ == "__main__":
    try:
        main()
    except Exception:
        print("[FATAL]\n", traceback.format_exc(), flush=True)
        raise