thanrl
/

GigaAM-v3

@@ -5,6 +5,8 @@ import os
 import sys
 import warnings
 from abc import ABC, abstractmethod
 from pathlib import Path
 from subprocess import CalledProcessError, run
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -35,6 +37,144 @@ _PIPELINE = None
 ### preprocess ###
 def load_audio(audio_path: str, sample_rate: int = SAMPLE_RATE) -> Tensor:
     """
     Load an audio file and resample it to the specified sample rate.
@@ -89,6 +229,13 @@ class FeatureExtractor(nn.Module):
         self.win_length = kwargs.get("win_length", sample_rate // 40)
         self.n_fft = kwargs.get("n_fft", sample_rate // 40)
         self.center = kwargs.get("center", True)
         self.featurizer = nn.Sequential(
             torchaudio.transforms.MelSpectrogram(
                 sample_rate=sample_rate,
@@ -97,10 +244,27 @@ class FeatureExtractor(nn.Module):
                 hop_length=self.hop_length,
                 n_fft=self.n_fft,
                 center=self.center,
             ),
             SpecScaler(),
         )
     def out_len(self, input_lengths: Tensor) -> Tensor:
         """
         Calculates the output length after the feature extraction process.
@@ -1107,6 +1271,54 @@ class CTCGreedyDecoding:
         return pred_texts
 class RNNTGreedyDecoding:
     def __init__(
         self,
@@ -1121,29 +1333,88 @@ class RNNTGreedyDecoding:
         self.blank_id = len(self.tokenizer)
         self.max_symbols = max_symbols_per_step
-    def _greedy_decode(self, head: RNNTHead, x: Tensor, seqlen: Tensor) -> str:
-        """
-        Internal helper function for performing greedy decoding on a single sequence.
-        """
         hyp: List[int] = []
         dec_state: Optional[Tensor] = None
         last_label: Optional[Tensor] = None
         for t in range(seqlen):
             f = x[t, :, :].unsqueeze(1)
             not_blank = True
             new_symbols = 0
             while not_blank and new_symbols < self.max_symbols:
                 g, hidden = head.decoder.predict(last_label, dec_state)
-                k = head.joint.joint(f, g)[0, 0, 0, :].argmax(0).item()
                 if k == self.blank_id:
                     not_blank = False
                 else:
                     hyp.append(int(k))
                     dec_state = hidden
-                    last_label = torch.tensor([[hyp[-1]]]).to(x.device)
                     new_symbols += 1
-        return self.tokenizer.decode(hyp)
     @torch.inference_mode()
     def decode(self, head: RNNTHead, encoded: Tensor, enc_len: Tensor) -> List[str]:
@@ -1159,6 +1430,23 @@ class RNNTGreedyDecoding:
         return pred_texts
 ### models ###
@@ -1180,9 +1468,20 @@ class GigaAM(nn.Module):
         Perform forward pass through the preprocessor and encoder.
         """
         features, feature_lengths = self.preprocessor(features, feature_lengths)
         if self._device.type == "cpu":
             return self.encoder(features, feature_lengths)
-        with torch.autocast(device_type=self._device.type, dtype=torch.float16):
             return self.encoder(features, feature_lengths)
     @property
@@ -1197,8 +1496,30 @@ class GigaAM(nn.Module):
         """
         Prepare an audio file for processing by loading it onto
         the correct device and converting its format.
         """
         wav = load_audio(wav_file)
         wav = wav.to(self._device).to(self._dtype).unsqueeze(0)
         length = torch.full([1], wav.shape[-1], device=self._device)
         return wav, length
@@ -1252,6 +1573,100 @@ class GigaAMASR(GigaAM):
         encoded, encoded_len = self.forward(wav, length)
         return self.decoding.decode(self.head, encoded, encoded_len)[0]
     def forward_for_export(self, features: Tensor, feature_lengths: Tensor) -> Tensor:
         """
         Encoder-decoder forward to save model entirely in onnx format.

 import sys
 import warnings
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from contextlib import contextmanager, nullcontext
 from pathlib import Path
 from subprocess import CalledProcessError, run
 from typing import Any, Dict, List, Optional, Tuple, Union
 ### preprocess ###
+# --- Debug/robustness toggles (env vars, no config changes required) ---
+# Set GIGAAM_DEBUG=1 to enable warnings and per-utterance stats printing
+# Set GIGAAM_FORCE_FP32=1 to disable autocast and run encoder in fp32
+# Set GIGAAM_PAD_START_MS / GIGAAM_PAD_END_MS to pad waveform with silence (milliseconds)
+# Set GIGAAM_MELS_PAD_MODE to override torchaudio MelSpectrogram pad_mode (e.g. "constant" or "reflect")
+# Set GIGAAM_MELS_CENTER to override center (0/1) for MelSpectrogram
def _env_flag(name: str, default: bool = False) -> bool:
    """Interpret environment variable ``name`` as an on/off switch.

    Returns ``default`` only when the variable is not set at all. A variable
    that is set (even to an empty string) is truthy exactly when its
    stripped, lower-cased value is one of ``1``, ``true``, ``yes``, ``y``
    or ``on``.
    """
    raw = os.environ.get(name)
    if raw is not None:
        return raw.strip().lower() in ("1", "true", "yes", "y", "on")
    return default
def _env_int(name: str, default: int = 0) -> int:
    """Read an integer setting from environment variable ``name``.

    The value is parsed through ``float`` first, so inputs such as ``"250.0"``
    or ``"1e3"`` are accepted and truncated toward zero.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, empty, or does
            not describe a finite number.

    Returns:
        The parsed integer, or ``default``.
    """
    raw = os.environ.get(name)
    if not raw:
        return default
    try:
        return int(float(raw))
    except (ValueError, OverflowError):
        # ValueError: not numeric (or NaN); OverflowError: +/-infinity.
        return default
def _env_str(name: str, default: str = "") -> str:
    """Return environment variable ``name``, or ``default`` if unset or empty."""
    value = os.environ.get(name)
    return str(value) if value else default
def _env_opt_bool(name: str):
    """Tri-state boolean read from environment variable ``name``.

    Returns ``None`` when the variable is unset or empty (meaning "no
    override"); otherwise ``True`` if the stripped, lower-cased value is one
    of ``1``, ``true``, ``yes``, ``y`` or ``on``, else ``False``.
    """
    raw = os.environ.get(name)
    if not raw:
        return None
    normalized = raw.strip().lower()
    return normalized in ("1", "true", "yes", "y", "on")
def _print_once(msg: str):
    """Print ``msg`` the first time it is seen; later identical messages are dropped.

    Already-printed messages are remembered in a module-level set stored
    under the global name ``_GIGAAM_PRINTED`` (created on first use), which
    keeps batched runs from repeating the same line.
    """
    seen = globals().setdefault("_GIGAAM_PRINTED", set())
    if msg in seen:
        return
    print(msg)
    seen.add(msg)
def audio_stats(wav: Tensor, sr: int = SAMPLE_RATE) -> Dict[str, Any]:
    """Summarise a waveform for debugging.

    The tensor is flattened and converted to float32 before measuring.
    Amplitude thresholds (clipping at 0.999, silence at -45 dBFS) presume
    samples normalised to [-1, 1]; that range is not enforced here.

    Returns a JSON-friendly dict. For an empty tensor only ``samples`` and
    ``seconds`` are present. Otherwise it also holds ``dtype``, ``mean``,
    ``rms`` (computed after removing the mean), ``peak``, ``clip_frac``,
    ``lead_silence_s``, ``trail_silence_s``, ``nan`` and ``inf``.
    """
    if wav.numel() == 0:
        return {"samples": 0, "seconds": 0.0}

    flat = wav.detach().float().view(-1)
    n_samples = flat.numel()
    magnitude = torch.abs(flat)

    dc_offset = flat.mean().item()
    centered = flat - dc_offset
    rms = torch.sqrt(torch.mean(centered * centered)).item()
    peak = torch.max(magnitude).item()
    # Fraction of samples at (or essentially at) full scale.
    clip_frac = (magnitude >= 0.999).float().mean().item()

    # Rough silence detector: anything quieter than -45 dBFS (~0.0056).
    silence_thr = 10 ** (-45 / 20)
    loud_idx = (magnitude > silence_thr).nonzero(as_tuple=False).view(-1)
    if loud_idx.numel() > 0:
        lead_s = loud_idx[0].item() / sr
        trail_s = (n_samples - 1 - loud_idx[-1].item()) / sr
    else:
        # Nothing crosses the threshold: the whole clip counts as silence.
        lead_s = n_samples / sr
        trail_s = n_samples / sr

    return {
        "samples": int(n_samples),
        "seconds": float(n_samples / sr),
        "dtype": str(wav.dtype),
        "mean": float(dc_offset),
        "rms": float(rms),
        "peak": float(peak),
        "clip_frac": float(clip_frac),
        "lead_silence_s": float(lead_s),
        "trail_silence_s": float(trail_s),
        "nan": bool(torch.isnan(flat).any().item()),
        "inf": bool(torch.isinf(flat).any().item()),
    }
def pad_wav(wav: Tensor, sr: int, pad_start_ms: int = 0, pad_end_ms: int = 0) -> Tensor:
    """Pad a waveform with silence (zeros) at the start and/or end.

    Args:
        wav: Waveform tensor; it is flattened to 1-D when padding is applied.
        sr: Sample rate used to convert milliseconds to samples.
        pad_start_ms: Milliseconds of silence to prepend. Non-positive
            values mean "no padding on this side".
        pad_end_ms: Milliseconds of silence to append. Non-positive values
            mean "no padding on this side".

    Returns:
        ``wav`` itself (same object) when neither side requests padding,
        otherwise a new 1-D tensor with the same dtype and device.
    """
    if pad_start_ms <= 0 and pad_end_ms <= 0:
        return wav
    # Clamp each side independently so that one negative value cannot
    # cancel a valid pad requested for the other side.
    n_start = max(0, int(sr * pad_start_ms / 1000.0))
    n_end = max(0, int(sr * pad_end_ms / 1000.0))
    lead = torch.zeros(n_start, dtype=wav.dtype, device=wav.device)
    tail = torch.zeros(n_end, dtype=wav.dtype, device=wav.device)
    # reshape (unlike view) also accepts non-contiguous input.
    return torch.cat([lead, wav.reshape(-1), tail], dim=0)
def print_env_versions():
    """Print the torch / torchaudio / transformers versions once per process.

    ``transformers`` is optional: if importing it fails for any reason its
    version is reported as ``unknown``.
    """
    tfv = "unknown"
    try:
        import transformers as _tf

        tfv = getattr(_tf, "__version__", "unknown")
    except Exception:
        # Best-effort diagnostics only; a broken optional import must not
        # break transcription.
        pass
    _print_once(f"[GigaAM debug] torch={torch.__version__} torchaudio={torchaudio.__version__} transformers={tfv}")
@contextmanager
def temp_environ(**updates: Optional[str]):
    """Temporarily override ``os.environ`` entries for the duration of a context.

    Args:
        **updates: Mapping of variable name to new value. A value of ``None``
            removes the variable inside the context; any other value is
            stored as ``str(value)``.

    On exit (including when the body raises) every touched variable is put
    back to its previous value, or removed if it did not exist before.
    """
    previous = {}
    try:
        for key, value in updates.items():
            # Snapshot before modifying so the finally-block can undo even a
            # partially applied update.
            previous[key] = os.environ.get(key)
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = str(value)
        yield
    finally:
        for key, old_value in previous.items():
            if old_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = old_value
@contextmanager
def temporary_module_dtype(module: nn.Module, dtype: torch.dtype):
    """Temporarily cast a module to ``dtype`` and restore the original dtypes on exit.

    The original dtype is recorded per parameter and per buffer, so a module
    that mixes precisions (for example an fp16 sub-module next to fp32 ones)
    gets each tensor back in its own dtype instead of everything being cast
    to the dtype of the first parameter.

    Args:
        module: Module to cast in place.
        dtype: Target dtype while the context is active.

    If every floating-point/complex tensor already has ``dtype`` the module
    is left untouched.
    """

    def _castable():
        # Module.to(dtype) only converts floating-point and complex tensors,
        # so only those need to be tracked.
        named = list(module.named_parameters()) + list(module.named_buffers())
        return [(n, t) for n, t in named if t.is_floating_point() or t.is_complex()]

    original = {tensor_name: tensor.dtype for tensor_name, tensor in _castable()}
    if all(saved == dtype for saved in original.values()):
        yield
        return

    module.to(dtype)
    try:
        yield
    finally:
        # Re-query the tensors: Module.to may have replaced buffer objects.
        for tensor_name, tensor in _castable():
            saved = original.get(tensor_name)
            if saved is not None and tensor.dtype != saved:
                tensor.data = tensor.data.to(saved)
@dataclass
class DecodeDebug:
    """Decoded text for one utterance together with its decoding diagnostics."""

    # Transcription produced by the decoder.
    text: str
    # Free-form diagnostics; which keys are present depends on the decoder
    # that produced it.
    stats: Dict[str, Any]
 def load_audio(audio_path: str, sample_rate: int = SAMPLE_RATE) -> Tensor:
     """
     Load an audio file and resample it to the specified sample rate.
         self.win_length = kwargs.get("win_length", sample_rate // 40)
         self.n_fft = kwargs.get("n_fft", sample_rate // 40)
         self.center = kwargs.get("center", True)
+        env_center = _env_opt_bool("GIGAAM_MELS_CENTER")
+        if env_center is not None:
+            self.center = bool(env_center)
+        self.pad_mode = kwargs.get("pad_mode", "reflect")
+        env_pad_mode = _env_str("GIGAAM_MELS_PAD_MODE", "")
+        if env_pad_mode:
+            self.pad_mode = env_pad_mode
         self.featurizer = nn.Sequential(
             torchaudio.transforms.MelSpectrogram(
                 sample_rate=sample_rate,
                 hop_length=self.hop_length,
                 n_fft=self.n_fft,
                 center=self.center,
+                pad_mode=self.pad_mode,
             ),
             SpecScaler(),
         )
+    def set_mels_padding(self, *, center: Optional[bool] = None, pad_mode: Optional[str] = None) -> None:
+        """Hot-swap MelSpectrogram padding behavior for debugging."""
+        if center is not None:
+            self.center = bool(center)
+            # try to update the underlying transform if possible
+            m = self.featurizer[0]
+            if hasattr(m, "center"):
+                m.center = self.center  # type: ignore[attr-defined]
+        if pad_mode is not None:
+            self.pad_mode = str(pad_mode)
+            m = self.featurizer[0]
+            if hasattr(m, "pad_mode"):
+                m.pad_mode = self.pad_mode  # type: ignore[attr-defined]
+            elif hasattr(m, "spectrogram") and hasattr(m.spectrogram, "pad_mode"):
+                m.spectrogram.pad_mode = self.pad_mode  # type: ignore[attr-defined]
     def out_len(self, input_lengths: Tensor) -> Tensor:
         """
         Calculates the output length after the feature extraction process.
         return pred_texts
+    @torch.inference_mode()
+    def decode_with_debug(
+        self, head: CTCHead, encoded: Tensor, lengths: Tensor, topk: int = 5
+    ) -> Tuple[List[str], List[DecodeDebug]]:
+        """Like decode(), but also returns per-utterance blank/argmax diagnostics."""
+        log_probs = head(encoder_output=encoded)
+        labels = log_probs.argmax(dim=-1, keepdim=False)
+        b, t, c = log_probs.shape
+        pred_texts = self.decode(head, encoded, lengths)
+        debugs: List[DecodeDebug] = []
+        for i in range(b):
+            L = int(lengths[i].item())
+            L = max(0, min(L, t))
+            if L == 0:
+                debugs.append(DecodeDebug(text=pred_texts[i], stats={"enc_len": 0}))
+                continue
+            lab = labels[i, :L]
+            blank = (lab == self.blank_id)
+            blank_ratio = float(blank.float().mean().item())
+            # first frame where argmax != blank
+            nonblank_idx = (~blank).nonzero(as_tuple=False).view(-1)
+            first_nonblank = int(nonblank_idx[0].item()) if nonblank_idx.numel() else None
+            # top-k distribution at a few frames (start/mid/end) for quick inspection
+            probe_frames = sorted(set([0, L // 2, max(0, L - 1)]))
+            probes: Dict[str, Any] = {}
+            for pf in probe_frames:
+                vals, idxs = torch.topk(log_probs[i, pf, :], k=min(topk, c), dim=-1)
+                probes[str(pf)] = {
+                    "topk_ids": idxs.detach().cpu().tolist(),
+                    "topk_logp": [float(v) for v in vals.detach().cpu().tolist()],
+                    "blank_logp": float(log_probs[i, pf, self.blank_id].item()),
+                }
+            debugs.append(
+                DecodeDebug(
+                    text=pred_texts[i],
+                    stats={
+                        "enc_len": L,
+                        "blank_ratio_argmax": blank_ratio,
+                        "first_nonblank_frame": first_nonblank,
+                        "probe_frames": probes,
+                    },
+                )
+            )
+        return pred_texts, debugs
 class RNNTGreedyDecoding:
     def __init__(
         self,
         self.blank_id = len(self.tokenizer)
         self.max_symbols = max_symbols_per_step
+    def _greedy_decode_impl(
+        self,
+        head: RNNTHead,
+        x: Tensor,
+        seqlen: Tensor,
+        collect_stats: bool = False,
+        topk: int = 5,
+    ) -> DecodeDebug:
+        """Greedy RNNT decode for a single sequence, with optional blank diagnostics."""
         hyp: List[int] = []
         dec_state: Optional[Tensor] = None
         last_label: Optional[Tensor] = None
+        # Diagnostics (kept lightweight unless collect_stats=True)
+        total_joint_steps = 0
+        blank_steps = 0
+        emitted_steps = 0
+        first_emit_frame: Optional[int] = None
+        blank_margins: List[float] = []
+        probe_frames: Dict[str, Any] = {}
         for t in range(seqlen):
             f = x[t, :, :].unsqueeze(1)
             not_blank = True
             new_symbols = 0
             while not_blank and new_symbols < self.max_symbols:
                 g, hidden = head.decoder.predict(last_label, dec_state)
+                logp = head.joint.joint(f, g)[0, 0, 0, :]  # log-probs over vocab+blank
+                total_joint_steps += 1
+                k = int(logp.argmax(0).item())
+                if collect_stats:
+                    # how strongly blank beats the best non-blank
+                    blank_lp = float(logp[self.blank_id].item())
+                    best_nonblank_lp = float(logp[: self.blank_id].max().item())
+                    blank_margins.append(blank_lp - best_nonblank_lp)
+                    if t in (0, int(seqlen) // 2, max(0, int(seqlen) - 1)) and str(t) not in probe_frames:
+                        vals, idxs = torch.topk(logp, k=min(topk, logp.numel()))
+                        probe_frames[str(int(t))] = {
+                            "topk_ids": idxs.detach().cpu().tolist(),
+                            "topk_logp": [float(v) for v in vals.detach().cpu().tolist()],
+                            "blank_logp": blank_lp,
+                        }
                 if k == self.blank_id:
+                    blank_steps += 1
                     not_blank = False
                 else:
+                    emitted_steps += 1
+                    if first_emit_frame is None:
+                        first_emit_frame = int(t)
                     hyp.append(int(k))
                     dec_state = hidden
+                    last_label = torch.tensor([[hyp[-1]]], device=x.device)
                     new_symbols += 1
+        text = self.tokenizer.decode(hyp)
+        stats: Dict[str, Any] = {}
+        if collect_stats:
+            # Summaries only (avoid huge blobs)
+            if blank_margins:
+                bm = torch.tensor(blank_margins)
+                stats["blank_margin_mean"] = float(bm.mean().item())
+                stats["blank_margin_p50"] = float(bm.median().item())
+                stats["blank_margin_p90"] = float(torch.quantile(bm, 0.9).item())
+            stats.update(
+                {
+                    "enc_len": int(seqlen),
+                    "total_joint_steps": int(total_joint_steps),
+                    "blank_steps": int(blank_steps),
+                    "emitted_steps": int(emitted_steps),
+                    "blank_step_frac": float(blank_steps / max(1, total_joint_steps)),
+                    "first_emit_frame": first_emit_frame,
+                    "probe_frames": probe_frames,
+                }
+            )
+        return DecodeDebug(text=text, stats=stats)
+    def _greedy_decode(self, head: RNNTHead, x: Tensor, seqlen: Tensor) -> str:
+        """Backward-compatible greedy decode (no stats)."""
+        return self._greedy_decode_impl(head, x, seqlen, collect_stats=False).text
     @torch.inference_mode()
     def decode(self, head: RNNTHead, encoded: Tensor, enc_len: Tensor) -> List[str]:
         return pred_texts
+    @torch.inference_mode()
+    def decode_with_debug(
+        self, head: RNNTHead, encoded: Tensor, enc_len: Tensor, topk: int = 5
+    ) -> Tuple[List[str], List[DecodeDebug]]:
+        """Like decode(), but also returns per-utterance blank diagnostics."""
+        b = encoded.shape[0]
+        encoded_t = encoded.transpose(1, 2)
+        texts: List[str] = []
+        debugs: List[DecodeDebug] = []
+        for i in range(b):
+            inseq = encoded_t[i, :, :].unsqueeze(1)
+            dbg = self._greedy_decode_impl(head, inseq, enc_len[i], collect_stats=True, topk=topk)
+            texts.append(dbg.text)
+            debugs.append(dbg)
+        return texts, debugs
 ### models ###
         Perform forward pass through the preprocessor and encoder.
         """
         features, feature_lengths = self.preprocessor(features, feature_lengths)
+        if _env_flag("GIGAAM_DEBUG", False):
+            print_env_versions()
+        # CPU: no autocast
         if self._device.type == "cpu":
             return self.encoder(features, feature_lengths)
+        # GPU: optionally disable autocast to debug fp16-boundary failures
+        force_fp32 = _env_flag("GIGAAM_FORCE_FP32", False)
+        if force_fp32:
+            features = features.float()
+        with torch.autocast(device_type=self._device.type, dtype=torch.float16, enabled=not force_fp32):
             return self.encoder(features, feature_lengths)
     @property
         """
         Prepare an audio file for processing by loading it onto
         the correct device and converting its format.
+        Debug/robustness (env vars):
+          - GIGAAM_DEBUG=1 prints waveform stats
+          - GIGAAM_PAD_START_MS / GIGAAM_PAD_END_MS pad silence (milliseconds)
         """
         wav = load_audio(wav_file)
+        # Optional padding to reduce edge effects from STFT centering/padding
+        pad_start_ms = _env_int("GIGAAM_PAD_START_MS", 0)
+        pad_end_ms = _env_int("GIGAAM_PAD_END_MS", 0)
+        if pad_start_ms or pad_end_ms:
+            wav = pad_wav(wav, SAMPLE_RATE, pad_start_ms=pad_start_ms, pad_end_ms=pad_end_ms)
+        if _env_flag("GIGAAM_DEBUG", False):
+            st = audio_stats(wav, SAMPLE_RATE)
+            # Very rough "this might be off-distribution" checks
+            if abs(st.get("mean", 0.0)) > 1e-3:
+                print(f"[GigaAM debug] WARNING: DC-ish mean={st['mean']:.4g} for {wav_file}")
+            if st.get("clip_frac", 0.0) > 0.001:
+                print(f"[GigaAM debug] WARNING: possible clipping frac={st['clip_frac']:.4g} for {wav_file}")
+            if st.get("nan") or st.get("inf"):
+                print(f"[GigaAM debug] ERROR: NaN/Inf in waveform for {wav_file}")
+            print(f"[GigaAM debug] wav stats for {wav_file}: {json.dumps(st, ensure_ascii=False)}")
         wav = wav.to(self._device).to(self._dtype).unsqueeze(0)
         length = torch.full([1], wav.shape[-1], device=self._device)
         return wav, length
         encoded, encoded_len = self.forward(wav, length)
         return self.decoding.decode(self.head, encoded, encoded_len)[0]
+    @torch.inference_mode()
+    def transcribe_debug(
+        self,
+        wav_file: str,
+        *,
+        topk: int = 5,
+        try_fixes: bool = True,
+        pad_ms: int = 500,
+    ) -> Dict[str, Any]:
+        """Run transcription plus diagnostics. If empty, optionally try common fixes.
+        Returns a JSON-serializable dict with:
+          - attempts: list of {strategy, text, decode_stats}
+        """
+        report: Dict[str, Any] = {
+            "wav_file": wav_file,
+            "torch": torch.__version__,
+            "torchaudio": torchaudio.__version__,
+            "attempts": [],
+        }
+        pre = self.preprocessor
+        orig_center = getattr(pre, "center", None)
+        orig_pad_mode = getattr(pre, "pad_mode", None)
+        def _run(strategy: str, *, force_fp32: bool = False, pad_start_ms: int = 0, pad_end_ms: int = 0, mels_pad_mode: Optional[str] = None):
+            # Apply per-attempt toggles via env (forward()/prepare_wav() read these)
+            env = {
+                "GIGAAM_DEBUG": "1",
+                "GIGAAM_FORCE_FP32": "1" if force_fp32 else None,
+                "GIGAAM_PAD_START_MS": str(pad_start_ms) if pad_start_ms else None,
+                "GIGAAM_PAD_END_MS": str(pad_end_ms) if pad_end_ms else None,
+            }
+            with temp_environ(**env):
+                # Hot-swap mel padding mode if requested
+                if mels_pad_mode is not None and hasattr(pre, "set_mels_padding"):
+                    pre.set_mels_padding(pad_mode=mels_pad_mode)
+                dtype_ctx = temporary_module_dtype(self, torch.float32) if force_fp32 else nullcontext()
+                with dtype_ctx:
+                    wav, length = self.prepare_wav(wav_file)
+                    if length.item() > LONGFORM_THRESHOLD:
+                        raise ValueError("Too long wav file, use 'transcribe_longform' method.")
+                    encoded, encoded_len = self.forward(wav, length)
+                    if hasattr(self.decoding, "decode_with_debug"):
+                        texts, debugs = self.decoding.decode_with_debug(self.head, encoded, encoded_len, topk=topk)  # type: ignore[attr-defined]
+                        text = texts[0]
+                        dec_stats = debugs[0].stats
+                    else:
+                        text = self.decoding.decode(self.head, encoded, encoded_len)[0]
+                        dec_stats = {}
+                # Restore mel settings after attempt
+                if hasattr(pre, "set_mels_padding"):
+                    pre.set_mels_padding(center=orig_center if isinstance(orig_center, bool) else None, pad_mode=orig_pad_mode if isinstance(orig_pad_mode, str) else None)
+            report["attempts"].append(
+                {"strategy": strategy, "text": text, "decode_stats": dec_stats}
+            )
+            return text
+        # Attempt 0: baseline
+        txt = _run("baseline")
+        if txt != "" or not try_fixes:
+            report["final_text"] = txt
+            return report
+        # Fix 1: rerun with fp32 (disable autocast)
+        txt = _run("force_fp32", force_fp32=True)
+        if txt != "":
+            report["final_text"] = txt
+            return report
+        # Fix 2: pad both ends (helps with STFT centering + reflect padding edge artifacts)
+        txt = _run("pad_silence_both_ends", pad_start_ms=pad_ms, pad_end_ms=pad_ms)
+        if txt != "":
+            report["final_text"] = txt
+            return report
+        # Fix 3: stop reflect padding in the spectrogram (pad_mode=constant) + pad both ends
+        txt = _run("mels_pad_mode_constant_plus_pad", pad_start_ms=pad_ms, pad_end_ms=pad_ms, mels_pad_mode="constant")
+        report["final_text"] = txt
+        return report
+    @torch.inference_mode()
+    def transcribe_resilient(self, wav_file: str, **kwargs) -> str:
+        """Convenience wrapper: return non-empty transcription if any fix works."""
+        rep = self.transcribe_debug(wav_file, **kwargs)
+        for att in rep.get("attempts", []):
+            if att.get("text", "") != "":
+                return att["text"]
+        return rep.get("final_text", "")
     def forward_for_export(self, features: Tensor, feature_lengths: Tensor) -> Tensor:
         """
         Encoder-decoder forward to save model entirely in onnx format.