laion
/

captioning-whisper-proof_of_concept

Model card Files Files and versions

xet

Community

ChristophSchuhmann commited on Nov 15, 2025

Commit

bcceffa

verified ·

1 Parent(s): 9376900

Upload segmentation_infer_html.py with huggingface_hub

Browse files

Files changed (1) hide show

segmentation_infer_html.py +835 -0

segmentation_infer_html.py ADDED Viewed

	@@ -0,0 +1,835 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
segmentation_infer_html.py
- Loads WhisperOddEven checkpoint
    /home/user/outs/segmentation_gemini_medium_no_overlap_4epochs_model_best.pt
  (override via CKPT env var).
+- For each audio file in AUDIO_INPUT_DIR:
+    * load, resample to 16 kHz mono
+    * split into 30 s chunks
+    * run segmentation
+    * SMOOTH each track so that no segment (incl. background 0) is shorter than
+      MIN_SEGMENT_SEC seconds
+    * extract per-track segments (odd/even) and cut audio snippets
+    * build a MERGED timeline that starts/ends segments whenever either track
+      changes label, then smooth that merged timeline so that each merged
+      segment is also at least MIN_SEGMENT_SEC long, merging short segments
+      with neighbors using the rules described below.
+- Writes a single HTML report with:
+    * smoothed per-track heatmap
+    * merged-timeline heatmap
+    * tables of per-track segments (with audio players)
+    * tables of merged segments (with audio players)
+Merging rule for short merged segments:
+- If a merged segment is shorter than MIN_SEGMENT_SEC, merge it with one of its
+  immediate neighbors.
+- Prefer the neighbor whose (odd_label, even_label) matches this segment best
+  (majority vote over the two labels).
+- If similarity is equal (or one neighbor is missing), merge with the neighbor
+  that has the shorter duration. If still equal, merge with the left neighbor.
+"""
+from __future__ import annotations
+import os
+import io
+import sys
+import time
+import math
+import base64
+import shutil
+from pathlib import Path
+from typing import List, Dict, Any, Tuple
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# plotting
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+# audio
+import soundfile as sf
+import librosa
+from pydub import AudioSegment  # requires ffmpeg
+from transformers import WhisperFeatureExtractor, WhisperModel
+# =========================
+# ========== CONFIG =======
+# =========================
# --- Locations (each one can be overridden through an environment variable) ---
AUDIO_INPUT_DIR = Path(os.environ.get("AUDIO_INPUT_DIR", "./infer-audio"))
OUT_DIR = Path(os.environ.get("OUT_DIR", "./outs_infer"))
CKPT_PATH = Path(
    os.environ.get(
        "CKPT",
        "/home/user/outs/segmentation_gemini_medium_no_overlap_4epochs_model_best.pt",
    )
)
HF_MODEL_ID = os.environ.get("HF_MODEL_ID", "openai/whisper-small")

# --- Optional local model snapshots ---
# USE_LOCAL_MODELS must be an integer string such as "0" or "1"; int() raises
# ValueError for anything else.
USE_LOCAL_MODELS = bool(int(os.environ.get("USE_LOCAL_MODELS", "0")))
MODELS_SNAPSHOT_DIR = None
if USE_LOCAL_MODELS:
    MODELS_SNAPSHOT_DIR = Path(os.environ.get("MODELS_SNAPSHOT_DIR", ""))

# --- Hugging Face cache locations (default to folders below OUT_DIR) ---
HF_HOME = Path(os.environ.get("HF_HOME", OUT_DIR / ".hf"))
TRANSFORMERS_CACHE = Path(os.environ.get("TRANSFORMERS_CACHE", OUT_DIR / ".hf" / "hub"))

# "bf16", "fp16", "fp32", or anything else (e.g. "auto") to pick automatically.
MIXED_PRECISION = os.environ.get("MIXED_PRECISION", "auto").lower()

# --- Constants (must match training) ---
SAMPLE_RATE = 16000     # Hz
CLIP_SECONDS = 30.0     # length of one chunk fed to the model
NUM_FRAMES = 1500       # frames predicted per chunk
NUM_TRACKS = 2          # "odd" and "even" tracks
MAX_SEGMENTS = 20       # labels 1..MAX_SEGMENTS, plus background label 0

# --- Minimum segment length for both per-track and merged segments ---
MIN_SEGMENT_SEC = float(os.environ.get("MIN_SEGMENT_SEC", "1.0"))
# Same limit expressed in frames; never below one frame.
MIN_SEGMENT_FRAMES = max(int(round(MIN_SEGMENT_SEC * NUM_FRAMES / CLIP_SECONDS)), 1)

# --- MP3 export support ---
FFMPEG_AVAILABLE = shutil.which("ffmpeg") is not None
# Flipped to True after the first "falling back to WAV" message so it prints once.
WARNED_NO_FFMPEG = False
+# =========================
+# ====== BASIC SETUP ======
+# =========================
def setup_dirs():
    """Create the output/cache folders and publish default environment settings.

    Existing environment values are never overwritten (``setdefault``).
    """
    # NOTE(review): matplotlib, torch and transformers are imported at module
    # import time, i.e. before this function runs -- confirm these defaults
    # are still honoured by those libraries.
    mpl_config_dir = OUT_DIR / ".mplconfig"
    for folder in (OUT_DIR, mpl_config_dir):
        folder.mkdir(parents=True, exist_ok=True)
    os.environ.setdefault("MPLCONFIGDIR", str(mpl_config_dir.resolve()))

    HF_HOME.mkdir(parents=True, exist_ok=True)
    env_defaults = {
        "HF_HOME": str(HF_HOME.resolve()),
        "TRANSFORMERS_CACHE": str(TRANSFORMERS_CACHE.resolve()),
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:128",
    }
    for key, value in env_defaults.items():
        os.environ.setdefault(key, value)
def preferred_dtype():
    """Return the torch dtype to use for the autocast forward pass.

    An explicit MIXED_PRECISION of "bf16", "fp16" or "fp32" wins. Any other
    value selects automatically: bfloat16 on CUDA devices that support it,
    float16 on other CUDA devices, float32 without CUDA.
    """
    explicit = {
        "bf16": torch.bfloat16,
        "fp16": torch.float16,
        "fp32": torch.float32,
    }.get(MIXED_PRECISION)
    if explicit is not None:
        return explicit
    if not torch.cuda.is_available():
        return torch.float32
    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
def _model_resolved_name(model_id: str) -> Tuple[str, bool]:
    """Map a model id to ``(name_or_path, is_local)``.

    When local snapshots are enabled and ``MODELS_SNAPSHOT_DIR/<id with "/"
    replaced by "__">`` is a directory, that directory is returned with
    ``True``. Otherwise the id is returned unchanged with ``False``.
    """
    snapshots_usable = bool(
        USE_LOCAL_MODELS and MODELS_SNAPSHOT_DIR and MODELS_SNAPSHOT_DIR.is_dir()
    )
    if snapshots_usable:
        candidate = MODELS_SNAPSHOT_DIR / model_id.replace("/", "__")
        if candidate.is_dir():
            return str(candidate), True
    return model_id, False
+# =========================
+# ========= MODEL =========
+# =========================
class WhisperOddEven(nn.Module):
    """Whisper encoder with a per-frame classification head for two tracks.

    For every encoder frame and each of the NUM_TRACKS tracks the head emits
    ``MAX_SEGMENTS + 1`` logits (label 0 is used as background elsewhere in
    this file).
    """

    def __init__(self, base_id: str, freeze_encoder: bool = False):
        """Load the Whisper backbone for ``base_id`` and attach the head.

        Args:
            base_id: model id, resolved through ``_model_resolved_name``.
            freeze_encoder: when True the encoder parameters do not require
                gradients; decoder parameters never do.
        """
        super().__init__()
        source, local_only = _model_resolved_name(base_id)
        self.whisper = WhisperModel.from_pretrained(source, local_files_only=local_only)
        # The decoder is never used by forward(); keep it frozen.
        self.whisper.decoder.requires_grad_(False)
        self.whisper.encoder.requires_grad_(not freeze_encoder)

        width = self.whisper.config.d_model
        hidden = max(256, width // 2)
        num_classes = MAX_SEGMENTS + 1
        self.head = nn.Sequential(
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, NUM_TRACKS * num_classes),
        )

    def forward(self, input_features: torch.FloatTensor):
        """Return logits shaped [B, NUM_TRACKS, T, MAX_SEGMENTS + 1]."""
        encoded = self.whisper.encoder(input_features=input_features).last_hidden_state  # [B,T,D]
        batch, frames, _ = encoded.shape
        num_classes = MAX_SEGMENTS + 1
        flat_logits = self.head(encoded)  # [B,T,NUM_TRACKS*C]
        per_track = flat_logits.view(batch, frames, NUM_TRACKS, num_classes)
        # Move the track axis in front of the time axis.
        return per_track.transpose(1, 2).contiguous()
+# =========================
+# ====== AUDIO UTILS ======
+# =========================
def load_audio_mono_16k(path: Path) -> np.ndarray:
    """Load ``path`` with librosa at SAMPLE_RATE, mono, and return float32 samples."""
    samples, _rate = librosa.load(str(path), sr=SAMPLE_RATE, mono=True)
    # Defensive: average the first axis should a multi-dimensional array come back.
    mono = samples if samples.ndim <= 1 else samples.mean(axis=0)
    return mono.astype(np.float32, copy=False)
def split_into_chunks(wav: np.ndarray, sr: int, clip_seconds: float):
    """Split ``wav`` into consecutive fixed-size chunks.

    Args:
        wav: 1-D array of samples.
        sr: sample rate in Hz.
        clip_seconds: chunk length in seconds.

    Returns:
        A list of ``(chunk_index, start_sample, samples)`` tuples. Every
        ``samples`` array is float32 and exactly ``int(clip_seconds * sr)``
        long; the last chunk is zero-padded at the end. Empty input yields
        an empty list.

    Raises:
        ValueError: if the input is non-empty and the chunk size is not
            positive (previously a ZeroDivisionError for 0, or a silently
            empty result for negative sizes).
    """
    chunk_size = int(clip_seconds * sr)
    total = len(wav)
    if total == 0:
        return []
    if chunk_size <= 0:
        raise ValueError(
            f"chunk size must be positive, got {chunk_size} "
            f"(clip_seconds={clip_seconds}, sr={sr})"
        )
    chunks = []
    for index, start in enumerate(range(0, total, chunk_size)):
        piece = wav[start:start + chunk_size]
        shortfall = chunk_size - len(piece)
        if shortfall:
            # Only the final chunk can be short; pad it with trailing zeros.
            piece = np.pad(piece, (0, shortfall), mode="constant")
        chunks.append((index, start, piece.astype(np.float32, copy=False)))
    return chunks
def wav_chunk_to_audio_bytes(wav: np.ndarray, sr: int):
    """Encode samples for embedding: MP3 when possible, WAV otherwise.

    The samples are always written to an in-memory WAV first. If ffmpeg was
    found, pydub converts that WAV to a 128k MP3; any failure falls back to
    the WAV bytes. The fallback message is printed once per process.

    Returns:
        ``(audio_bytes, mime_type)`` where mime_type is "audio/mpeg" or
        "audio/wav".
    """
    global WARNED_NO_FFMPEG
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, wav, sr, format="WAV")
    wav_fallback = (wav_buffer.getvalue(), "audio/wav")

    if not FFMPEG_AVAILABLE:
        if not WARNED_NO_FFMPEG:
            print("[audio] ffmpeg not found; embedding WAV instead of MP3.", flush=True)
            WARNED_NO_FFMPEG = True
        return wav_fallback

    try:
        wav_buffer.seek(0)
        mp3_buffer = io.BytesIO()
        AudioSegment.from_file(wav_buffer, format="wav").export(
            mp3_buffer, format="mp3", bitrate="128k"
        )
        return mp3_buffer.getvalue(), "audio/mpeg"
    except Exception as e:
        # Best effort: a failed MP3 encode must not lose the snippet.
        if not WARNED_NO_FFMPEG:
            print(f"[audio] Failed to encode MP3, falling back to WAV: {e}", flush=True)
            WARNED_NO_FFMPEG = True
        return wav_fallback
+# =========================
+# ====== SEGMENT OPS ======
+# =========================
def smooth_min_duration(ids: np.ndarray, min_frames: int, max_iter: int = 10) -> np.ndarray:
    """Enforce a minimum run length (in frames) on a 1-D label sequence.

    The sequence is run-length encoded and scanned once from left to right.
    A run shorter than ``min_frames`` is absorbed by the longer of its two
    current neighbours (ties go to the left neighbour), and runs that end up
    adjacent with the same label are fused. Because neighbours are always
    looked up in the live run list, the pass cannot oscillate: on return
    every run is at least ``min_frames`` long, or a single run remains.

    The earlier implementation decided every run of a pass from a stale run
    list, so consecutive short runs could swap labels back and forth and
    still be too short when ``max_iter`` passes were used up.

    Args:
        ids: 1-D array of labels; it is not modified.
        min_frames: minimum allowed run length in frames.
        max_iter: kept for backward compatibility. A value <= 0 disables
            smoothing (as before); any positive value runs the single
            converging pass.

    Returns:
        A new array with the same length and dtype as ``ids``.
    """
    out = ids.copy()
    total = len(out)
    if total == 0 or max_iter <= 0:
        return out

    # Run-length encode as mutable [label, length] pairs.
    change_points = np.flatnonzero(out[1:] != out[:-1]) + 1
    starts = [0] + change_points.tolist()
    ends = change_points.tolist() + [total]
    runs = [[out[s], e - s] for s, e in zip(starts, ends)]

    i = 0
    # Invariant: every run before index i is already >= min_frames.
    while len(runs) > 1 and i < len(runs):
        length = runs[i][1]
        if length >= min_frames:
            i += 1
            continue
        left_len = runs[i - 1][1] if i > 0 else -1
        right_len = runs[i + 1][1] if i + 1 < len(runs) else -1
        if left_len >= right_len:
            # Absorb into the left neighbour; index i then holds the next run.
            runs[i - 1][1] += length
            del runs[i]
            if i < len(runs) and runs[i][0] == runs[i - 1][0]:
                runs[i - 1][1] += runs[i][1]
                del runs[i]
        else:
            # Absorb into the right neighbour, which now sits at index i and
            # is re-examined unless it fuses with an equal-labelled left run.
            runs[i + 1][1] += length
            del runs[i]
            if i > 0 and runs[i][0] == runs[i - 1][0]:
                runs[i - 1][1] += runs[i][1]
                del runs[i]

    labels = np.array([label for label, _ in runs], dtype=out.dtype)
    lengths = np.array([length for _, length in runs], dtype=np.int64)
    return np.repeat(labels, lengths)
def extract_segments(ids: np.ndarray, include_bg: bool = False):
    """Run-length encode ``ids`` into ``(label, frame_start, frame_end)`` tuples.

    ``frame_end`` is exclusive. Runs labelled 0 (background) are dropped
    unless ``include_bg`` is True. An empty sequence gives an empty list.
    """
    total = len(ids)
    if total == 0:
        return []
    # Positions where the label differs from the previous frame start a new run.
    cuts = [0] + [k for k in range(1, total) if ids[k] != ids[k - 1]] + [total]
    runs = [(ids[lo], lo, hi) for lo, hi in zip(cuts[:-1], cuts[1:])]
    if include_bg:
        return runs
    return [run for run in runs if run[0] != 0]
def frames_to_times(s: int, e: int):
    """Convert a frame range to ``(start_seconds, end_seconds)`` within a chunk."""
    def to_seconds(frame):
        # NUM_FRAMES frames span CLIP_SECONDS seconds.
        return frame / NUM_FRAMES * CLIP_SECONDS

    return to_seconds(s), to_seconds(e)
def cut_wav(seg_wav: np.ndarray, start_t: float, end_t: float) -> np.ndarray:
    """Return the samples of ``seg_wav`` between ``start_t`` and ``end_t`` seconds.

    Both bounds are rounded to samples at SAMPLE_RATE and clamped to the
    array; the end index is forced to be at least one past the start.
    """
    available = len(seg_wav)
    first = min(max(int(round(start_t * SAMPLE_RATE)), 0), available)
    last = min(int(round(end_t * SAMPLE_RATE)), available)
    # Ask for at least one sample; slicing past the end simply yields fewer.
    last = max(first + 1, last)
    return seg_wav[first:last]
+# =========================
+# ==== MERGED TIMELINE ====
+# =========================
def smooth_merged_segments(merged: List[Tuple[int,int,int,int]], min_frames: int) -> List[Tuple[int,int,int,int]]:
    """Enforce a minimum length on merged segments.

    Args:
        merged: list of ``(frame_start, frame_end, odd_label, even_label)``
            in timeline order. The list is not modified.
        min_frames: minimum allowed segment length in frames.

    A segment shorter than ``min_frames`` is merged into a neighbour, and the
    result keeps the neighbour's labels:
      - with two neighbours, the one sharing more labels (0..2) is chosen;
      - on a similarity tie the shorter neighbour is chosen, and if the
        lengths are equal too, the left one;
      - with a single neighbour, that neighbour is chosen.

    Segments are visited left to right. After a merge into the right
    neighbour the enlarged segment is examined again, so the outcome matches
    rescanning from the start after every merge without the quadratic
    rescans.

    Returns:
        A new list, also when the input has zero or one element. Every
        segment in it is at least ``min_frames`` long unless only one
        segment remains.
    """
    segs = list(merged)

    def seg_len(seg):
        return seg[1] - seg[0]

    def sim(a, b):
        # Number of matching labels between two (fs, fe, odd, even) tuples.
        return int(a[2] == b[2]) + int(a[3] == b[3])

    i = 0
    # Invariant: every segment before index i is already >= min_frames.
    while len(segs) > 1 and i < len(segs):
        seg = segs[i]
        if seg_len(seg) >= min_frames:
            i += 1
            continue
        left = segs[i - 1] if i > 0 else None
        right = segs[i + 1] if i + 1 < len(segs) else None

        if left is None:
            merge_left = False
        elif right is None:
            merge_left = True
        else:
            s_left, s_right = sim(seg, left), sim(seg, right)
            if s_left != s_right:
                merge_left = s_left > s_right
            else:
                # Similarity tie -> shorter neighbour; full tie -> left.
                merge_left = seg_len(left) <= seg_len(right)

        if merge_left:
            segs[i - 1] = (left[0], seg[1], left[2], left[3])
        else:
            segs[i + 1] = (seg[0], right[1], right[2], right[3])
        # Index i now holds either the next unvisited segment or the
        # enlarged right neighbour, both of which still need checking.
        del segs[i]
    return segs
def build_merged_segments(ids_odd: np.ndarray, ids_even: np.ndarray, min_frames: int):
    """Combine the two tracks into one timeline and smooth it.

    Cut points are 0, NUM_FRAMES and every frame at which either track
    changes label. Each span between two cut points becomes one raw segment
    ``(frame_start, frame_end, odd_label, even_label)``; the raw list is then
    passed to ``smooth_merged_segments`` to enforce ``min_frames``.

    Both inputs must contain exactly NUM_FRAMES frames.
    """
    assert len(ids_odd) == len(ids_even) == NUM_FRAMES
    total = NUM_FRAMES
    cut_points = {0, total}
    for track in (ids_odd, ids_even):
        cut_points.update(k for k in range(1, total) if track[k] != track[k - 1])
    edges = sorted(cut_points)
    # Neither track changes between two consecutive cut points, so the
    # majority label of a span is simply the label of its first frame.
    raw_segments = [
        (lo, hi, int(ids_odd[lo]), int(ids_even[lo]))
        for lo, hi in zip(edges, edges[1:])
    ]
    return smooth_merged_segments(raw_segments, min_frames)
+# =========================
+# ======= PLOTTING ========
+# =========================
def _plot_tracks_seconds(pred_ids: torch.Tensor, title: str) -> bytes:
    """Render the two label tracks as a heatmap and return PNG bytes.

    pred_ids: [2, NUM_FRAMES] LongTensor; row 0 is drawn as "odd", row 1 as
    "even". The x axis is labelled in seconds from 0 to CLIP_SECONDS.
    """
    time_axis = np.linspace(0.0, CLIP_SECONDS, NUM_FRAMES)
    fig, ax = plt.subplots(figsize=(10, 2.8))
    image = ax.imshow(
        pred_ids.numpy(),
        aspect="auto",
        interpolation="nearest",
        origin="upper",
        extent=[time_axis[0], time_axis[-1], -0.5, 1.5],
    )
    ax.set_title(title)
    ax.set_xlabel("Time (s)")
    ax.set_yticks([0, 1])
    ax.set_yticklabels(["odd", "even"])
    colorbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    colorbar.set_label("Segment ID")

    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    return png_buffer.getvalue()
def _plot_merged_segments(seg_ids: np.ndarray, title: str) -> bytes:
    """Render the merged timeline as a one-row heatmap and return PNG bytes.

    seg_ids: [NUM_FRAMES] array where each frame holds a merged-segment index.
    """
    time_axis = np.linspace(0.0, CLIP_SECONDS, NUM_FRAMES)
    fig, ax = plt.subplots(figsize=(10, 2.8))
    image = ax.imshow(
        seg_ids[None, :],  # imshow needs a 2-D image: one row
        aspect="auto",
        interpolation="nearest",
        origin="upper",
        extent=[time_axis[0], time_axis[-1], -0.5, 0.5],
    )
    ax.set_title(title)
    ax.set_xlabel("Time (s)")
    ax.set_yticks([0])
    ax.set_yticklabels(["merged"])
    colorbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    colorbar.set_label("Merged seg ID")

    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    return png_buffer.getvalue()
+# =========================
+# ========= HTML ==========
+# =========================
def _audio_cell(seg: Dict[str, Any]) -> str:
    """Return the <audio> markup for one segment row, or "" without audio."""
    if seg["audio_b64"] and seg["audio_mime"]:
        return (
            '<audio controls preload="none">'
            f'<source src="data:{seg["audio_mime"]};base64,{seg["audio_b64"]}" '
            f'type="{seg["audio_mime"]}"></audio>'
        )
    return ""
def write_html_report(out_dir: Path, chunks: List[Dict[str, Any]]) -> Path:
    """Write the self-contained HTML report and return its path.

    Args:
        out_dir: directory that receives ``seg_infer_smooth_<timestamp>.html``.
        chunks: one dict per audio chunk with the keys ``file_name``,
            ``chunk_idx``, ``chunk_offset``, ``png_tracks``, ``png_merged``
            (base64 PNGs), ``track_segments`` and ``merged_segments``.

    The file name is HTML-escaped before it is embedded, so names containing
    ``<``, ``&`` or quotes cannot break the page.
    """
    from html import escape  # local import; the stdlib module is not imported at file level

    ts = time.strftime("%Y%m%d_%H%M%S")
    parts = [f"""<!doctype html><html><head><meta charset="utf-8">
<style>
 body{{font-family:system-ui,Segoe UI,Roboto,Arial,sans-serif;margin:20px}}
 .card{{border:1px solid #ddd;border-radius:10px;padding:16px;margin:16px 0;
        box-shadow:0 2px 6px rgba(0,0,0,.05)}}
 .grid{{display:grid;grid-template-columns:1fr 1fr;gap:12px}}
 figure{{margin:0}}
 figcaption{{font-size:13px;color:#555;margin-top:6px}}
 audio{{width:100%;min-width:200px;margin-top:4px}}
 .meta{{font-size:13px;color:#666;margin-bottom:4px}}
 table{{border-collapse:collapse;width:100%;margin-top:8px;font-size:13px;table-layout:fixed}}
 th,td{{border:1px solid #ddd;padding:4px 6px;text-align:left;vertical-align:top;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}}
 th{{background:#f5f5f5}}
</style>
<title>Odd/Even Segmentation - Inference {ts}</title></head><body>
<h1>Odd/Even Segmentation - Inference</h1>
<p>
This report shows <b>smoothed</b> segmentations for each 30-second chunk of your audio files.
The model predicts two parallel time tracks ("odd" and "even") that can hold overlapping events.
We first smooth each track so that <b>no segment (including background 0) is shorter than {MIN_SEGMENT_SEC:.2f} seconds</b>.
Then:
</p>
<ul>
<li><b>Per-track segments</b>: segments for each track (odd/even) with duration &gt;= {MIN_SEGMENT_SEC:.2f}s, each with its own audio player.</li>
<li><b>Merged timeline</b>: a single segmentation where a new segment starts or ends whenever either track changes, and each merged segment is also at least {MIN_SEGMENT_SEC:.2f}s long by merging very short segments into their most similar neighbor.</li>
</ul>
"""]
    for ch in chunks:
        # File names come from the file system and may contain markup characters.
        safe_name = escape(str(ch['file_name']))
        parts.append(f"""
<section class="card">
  <h2>{safe_name} - chunk {ch['chunk_idx']}</h2>
  <div class="meta">
    Chunk offset in file: {ch['chunk_offset']:.2f} - {ch['chunk_offset'] + CLIP_SECONDS:.2f} s
  </div>
  <div class="grid">
    <figure>
      <img src="data:image/png;base64,{ch['png_tracks']}" alt="smoothed tracks">
      <figcaption>Smoothed per-track predictions (odd/even).</figcaption>
    </figure>
    <figure>
      <img src="data:image/png;base64,{ch['png_merged']}" alt="merged timeline">
      <figcaption>Merged timeline: segment borders whenever odd or even track changes label, then smoothed to enforce a minimum duration.</figcaption>
    </figure>
  </div>
  <h3>Per-track segments (min {MIN_SEGMENT_SEC:.2f} s)</h3>
  <p>Each row is one predicted event on the odd or even track. Times are relative to the start of this 30-second chunk.</p>
  <table class="seg seg-track">
    <colgroup>
      <col style="width:5%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:45%">
    </colgroup>
    <tr><th>#</th><th>Track</th><th>Label ID</th><th>Start (s)</th><th>End (s)</th>
        <th>Duration (s)</th><th>Audio</th></tr>
""")
        # per-track table
        for i, seg in enumerate(ch["track_segments"], start=1):
            parts.append(
                f"<tr><td>{i}</td>"
                f"<td>{seg['track']}</td>"
                f"<td>{seg['label']}</td>"
                f"<td>{seg['start']:.2f}</td>"
                f"<td>{seg['end']:.2f}</td>"
                f"<td>{seg['dur']:.2f}</td>"
                f"<td>{_audio_cell(seg)}</td></tr>"
            )
        parts.append("</table>")
        # merged timeline table
        parts.append(f"""
  <h3>Merged timeline segments</h3>
  <p>
The merged timeline splits the 30-second chunk wherever either the odd or even track changes label.
Very short merged segments (shorter than {MIN_SEGMENT_SEC:.2f}s) are merged into their most similar neighbor
based on odd/even labels; if both neighbors are equally similar, they are merged into the shorter neighbor.
This yields a single sequence of non-overlapping segments that cover the entire chunk.
Each row shows the majority label on the odd and even tracks within that merged segment.
  </p>
  <table class="seg seg-merged">
    <colgroup>
      <col style="width:5%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:10%">
      <col style="width:45%">
    </colgroup>
    <tr><th>#</th><th>Start (s)</th><th>End (s)</th><th>Duration (s)</th>
        <th>Odd label</th><th>Even label</th><th>Audio</th></tr>
""")
        for i, seg in enumerate(ch["merged_segments"], start=1):
            parts.append(
                f"<tr><td>{i}</td>"
                f"<td>{seg['start']:.2f}</td>"
                f"<td>{seg['end']:.2f}</td>"
                f"<td>{seg['dur']:.2f}</td>"
                f"<td>{seg['odd_label']}</td>"
                f"<td>{seg['even_label']}</td>"
                f"<td>{_audio_cell(seg)}</td></tr>"
            )
        parts.append("</table></section>")
    parts.append("</body></html>")
    out_path = out_dir / f"seg_infer_smooth_{ts}.html"
    out_path.write_text("\n".join(parts), encoding="utf-8")
    return out_path
+# =========================
+# ========= MAIN ==========
+# =========================
def _encode_snippet(chunk_wav: np.ndarray, frame_start: int, frame_end: int,
                    kind: str, fpath: Path, chunk_idx: int):
    """Cut one frame range out of a chunk and encode it for the report.

    Args:
        chunk_wav: samples of the current chunk.
        frame_start, frame_end: frame range (end exclusive) inside the chunk.
        kind: "per-track" or "merged"; only used in the failure message.
        fpath, chunk_idx: identify the chunk in the failure message.

    Returns:
        ``None`` when the range is empty (the caller skips the row),
        otherwise ``(start_s, end_s, dur_s, audio_b64, audio_mime)``. The two
        audio fields are ``None`` when encoding failed.
    """
    start_t, end_t = frames_to_times(frame_start, frame_end)
    dur = end_t - start_t
    if dur <= 0:
        return None
    sub_wav = cut_wav(chunk_wav, start_t, end_t)
    if sub_wav.size == 0:
        return None
    try:
        audio_bytes, audio_mime = wav_chunk_to_audio_bytes(sub_wav, SAMPLE_RATE)
        audio_b64 = base64.b64encode(audio_bytes).decode("ascii")
    except Exception as e:
        # Best effort: keep the table row, just without an audio player.
        print(f"[audio] Failed {kind} snippet for {fpath} chunk {chunk_idx}: {e}")
        audio_b64 = None
        audio_mime = None
    return float(start_t), float(end_t), float(dur), audio_b64, audio_mime
def main():
    """Run segmentation on every audio file and write the HTML report.

    The input directory is AUDIO_INPUT_DIR, or the first command-line
    argument when one is given. Exits with status 1 when the input
    directory or checkpoint is missing, when no audio files are found, or
    when no chunk produced a result.
    """
    global AUDIO_INPUT_DIR
    setup_dirs()
    if len(sys.argv) > 1:
        AUDIO_INPUT_DIR = Path(sys.argv[1])
    if not AUDIO_INPUT_DIR.is_dir():
        print(f"[ERR] AUDIO_INPUT_DIR not found or not a dir: {AUDIO_INPUT_DIR}", file=sys.stderr)
        sys.exit(1)
    if not CKPT_PATH.is_file():
        print(f"[ERR] Checkpoint not found: {CKPT_PATH}", file=sys.stderr)
        sys.exit(1)
    print(f"[cfg] AUDIO_INPUT_DIR  = {AUDIO_INPUT_DIR}")
    print(f"[cfg] OUT_DIR          = {OUT_DIR}")
    print(f"[cfg] CKPT_PATH       = {CKPT_PATH}")
    print(f"[cfg] HF_MODEL_ID     = {HF_MODEL_ID}")
    print(f"[cfg] ffmpeg available: {FFMPEG_AVAILABLE}")
    print(f"[cfg] MIN_SEGMENT_SEC  = {MIN_SEGMENT_SEC:.2f} (frames >= {MIN_SEGMENT_FRAMES})")
    # find audio files (recursively, by extension)
    exts = {".wav", ".mp3", ".m4a", ".flac", ".ogg"}
    audio_files: List[Path] = sorted(
        p for p in AUDIO_INPUT_DIR.rglob("*") if p.is_file() and p.suffix.lower() in exts
    )
    if not audio_files:
        print("[ERR] No audio files found.", file=sys.stderr)
        sys.exit(1)
    print(f"[scan] Found {len(audio_files)} audio files.")
    # feature extractor
    resolved, is_local = _model_resolved_name(HF_MODEL_ID)
    feature_extractor = WhisperFeatureExtractor.from_pretrained(resolved, local_files_only=is_local)
    # model + checkpoint
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = WhisperOddEven(HF_MODEL_ID, freeze_encoder=False).to(device)
    # SECURITY NOTE: torch.load unpickles the file, which can execute
    # arbitrary code. Only load checkpoints from a trusted source (consider
    # weights_only=True if the checkpoint holds plain tensors).
    state = torch.load(CKPT_PATH, map_location="cpu")
    # accept full trainer_state dict or plain state_dict
    if isinstance(state, dict) and "model" in state and any(
        k.startswith("whisper.") for k in state["model"].keys()
    ):
        state = state["model"]
    missing, unexpected = model.load_state_dict(state, strict=False)
    print(f"[ckpt] Loaded checkpoint from {CKPT_PATH}")
    if missing:
        print(f"[ckpt] Missing keys: {missing}")
    if unexpected:
        print(f"[ckpt] Unexpected keys: {unexpected}")
    model.eval()
    use_dtype = preferred_dtype()
    amp_enabled = use_dtype in (torch.float16, torch.bfloat16)
    autocast_device = "cuda" if torch.cuda.is_available() else "cpu"
    chunk_results: List[Dict[str, Any]] = []
    with torch.no_grad():
        for fpath in audio_files:
            print(f"[file] {fpath}")
            try:
                wav = load_audio_mono_16k(fpath)
            except Exception as e:
                # A single unreadable file must not abort the whole run.
                print(f"[file] Failed to load {fpath}: {e}")
                continue
            chunks = split_into_chunks(wav, SAMPLE_RATE, CLIP_SECONDS)
            if not chunks:
                print(f"[file] No audio samples in {fpath}")
                continue
            for chunk_idx, start_sample, chunk_wav in chunks:
                chunk_offset_sec = start_sample / SAMPLE_RATE
                # features
                feat = feature_extractor(chunk_wav, sampling_rate=SAMPLE_RATE, return_tensors="pt")
                x = feat.input_features.to(device)
                # forward
                with torch.autocast(
                    device_type=autocast_device,
                    enabled=amp_enabled,
                    dtype=use_dtype,
                ):
                    logits = model(x)
                # raw argmax
                raw_ids = logits.argmax(dim=-1).squeeze(0).cpu().numpy()  # [2,1500]
                # aggressive smoothing with min duration per track
                sm_ids = np.zeros_like(raw_ids)
                for tr in range(NUM_TRACKS):
                    sm_ids[tr] = smooth_min_duration(raw_ids[tr], MIN_SEGMENT_FRAMES)
                png_tracks = base64.b64encode(
                    _plot_tracks_seconds(
                        torch.from_numpy(sm_ids),
                        f"Smoothed tracks - {fpath.name} - chunk {chunk_idx}",
                    )
                ).decode("ascii")
                # merged timeline with its own min-duration smoothing
                merged = build_merged_segments(sm_ids[0], sm_ids[1], MIN_SEGMENT_FRAMES)
                # Per-frame index of the merged segment (1-based) for the heatmap.
                merged_index = np.zeros(NUM_FRAMES, dtype=np.int64)
                for idx, (frame_start, frame_end, _ol, _el) in enumerate(merged, start=1):
                    merged_index[frame_start:frame_end] = idx
                png_merged = base64.b64encode(
                    _plot_merged_segments(
                        merged_index,
                        f"Merged segments - {fpath.name} - chunk {chunk_idx}",
                    )
                ).decode("ascii")
                # per-track segments -> audio snippets
                track_segments: List[Dict[str, Any]] = []
                for tr, track_name in enumerate(("odd", "even")):
                    seg_runs = extract_segments(sm_ids[tr], include_bg=False)
                    for (lab, frame_start, frame_end) in seg_runs:
                        snippet = _encode_snippet(
                            chunk_wav, frame_start, frame_end, "per-track", fpath, chunk_idx
                        )
                        if snippet is None:
                            continue
                        start_t, end_t, dur, audio_b64, audio_mime = snippet
                        track_segments.append(
                            {
                                "track": track_name,
                                "label": int(lab),
                                "start": start_t,
                                "end": end_t,
                                "dur": dur,
                                "audio_b64": audio_b64,
                                "audio_mime": audio_mime,
                            }
                        )
                # merged segments -> audio snippets
                merged_segments: List[Dict[str, Any]] = []
                for idx, (frame_start, frame_end, odd_label, even_label) in enumerate(merged, start=1):
                    snippet = _encode_snippet(
                        chunk_wav, frame_start, frame_end, "merged", fpath, chunk_idx
                    )
                    if snippet is None:
                        continue
                    start_t, end_t, dur, audio_b64, audio_mime = snippet
                    merged_segments.append(
                        {
                            "idx": idx,
                            "start": start_t,
                            "end": end_t,
                            "dur": dur,
                            "odd_label": int(odd_label),
                            "even_label": int(even_label),
                            "audio_b64": audio_b64,
                            "audio_mime": audio_mime,
                        }
                    )
                chunk_results.append(
                    {
                        "file_name": fpath.name,
                        "chunk_idx": int(chunk_idx),
                        "chunk_offset": float(chunk_offset_sec),
                        "png_tracks": png_tracks,
                        "png_merged": png_merged,
                        "track_segments": track_segments,
                        "merged_segments": merged_segments,
                    }
                )
    if not chunk_results:
        print("[ERR] No chunk results; nothing to write.", file=sys.stderr)
        sys.exit(1)
    out_html = write_html_report(OUT_DIR, chunk_results)
    print(f"[done] Wrote HTML report: {out_html}")
if __name__ == "__main__":
    main()