Spaces:

woranit
/

Thai-Speech-to-Text-ct2

Sleeping

App Files Community

woranit committed on Sep 5

Commit

c31871d

verified ·

1 Parent(s): 647f8e1

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -436

app.py CHANGED Viewed

@@ -1,485 +1,97 @@
-# app.py — Thai ASR on faster-whisper (quiet-speech safe + gap-fill + rescue windows + optional WhisperX alignment)
-# Works on HF Spaces (CPU) and will auto-use GPU if available.
 import os
-import tempfile
-import subprocess
-from typing import List, Tuple, Optional
 import gradio as gr
 from faster_whisper import WhisperModel
-# =========================
-# Config / environment
-# =========================
-MODEL_ID = os.getenv("MODEL_ID", "Thaweewat/whisper-th-medium-ct2")
-# Try GPU if torch is present; else CPU
-try:
-    import torch  # optional; only used to detect GPU
-    HAS_CUDA = torch.cuda.is_available()
-except Exception:
-    HAS_CUDA = False
-DEVICE = "cuda" if HAS_CUDA else "cpu"
-COMPUTE_TYPE = os.getenv("COMPUTE_TYPE", "int8_float16" if DEVICE == "cuda" else "int8")
-CPU_THREADS = int(os.getenv("CPU_THREADS", os.cpu_count() or 4))
-NUM_WORKERS = int(os.getenv("NUM_WORKERS", 1))
-# Optional domain bias (proper nouns help): set in Space → Variables
-# e.g. "อนุทิน ชาญวีรกูล พรรคภูมิใจไทย พรรคประชาชน นายกรัฐมนตรี สภาผู้แทนราษฎร ลงมติ"
-BIAS_PROMPT = (os.getenv("INITIAL_PROMPT_TH") or "").strip()
-# Loudness normalization toggle (set LOUDNORM=1 in Space Variables to enable)
-LOUDNORM = os.getenv("LOUDNORM", "0") == "1"
-# ---- Gap-fill knobs ----
-GAP_MIN_SECONDS = float(os.getenv("GAP_MIN_SECONDS", "0.8"))   # consider holes >= 0.8s
-GAP_MAX_SECONDS = float(os.getenv("GAP_MAX_SECONDS", "40.0"))  # retry longer holes
-GAP_MAX_COUNT   = int(os.getenv("GAP_MAX_COUNT", "20"))        # allow many gap retries
-GAP_PAD         = float(os.getenv("GAP_PAD", "2.0"))           # more context around gaps
-# Join/merge behavior after recovery
-JOIN_GAP = float(os.getenv("JOIN_GAP", "0.35"))                # join segments separated by <= this
-# =========================
-# Optional WhisperX import (alignment)
-# =========================
-HAS_WHISPERX = False
-try:
-    import whisperx  # type: ignore
-    HAS_WHISPERX = True
-except Exception as _e:
-    HAS_WHISPERX = False
-# =========================
-# Load model (one-time)
-# =========================
-model = WhisperModel(
-    MODEL_ID,
-    device=DEVICE,
-    compute_type=COMPUTE_TYPE,
-    cpu_threads=CPU_THREADS,
-    num_workers=NUM_WORKERS,
-)
-# =========================
-# Helpers
-# =========================
-def _fmt_srt_time(t: Optional[float]) -> str:
     if t is None:
         t = 0.0
-    ms = int(round(float(t) * 1000))
     h, ms = divmod(ms, 3600000)
     m, ms = divmod(ms, 60000)
     s, ms = divmod(ms, 1000)
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
 def _segments_to_srt(segments: List[Tuple[int, float, float, str]]) -> str:
     lines = []
     for i, start, end, text in segments:
         lines.append(str(i))
         lines.append(f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}")
         lines.append((text or "").strip())
-        lines.append("")
     return "\n".join(lines).strip() + "\n"
-def _ensure_mono16k(src_path: str) -> str:
     """
-    Convert any audio to mono/16k WAV for stable timestamps.
-    If LOUDNORM=1, apply EBU R128 loudness normalization to reduce 'no-speech' drops on quiet spans.
     """
-    out = tempfile.NamedTemporaryFile(prefix="norm_", suffix=".wav", delete=False)
-    out_path = out.name
-    out.close()
-    cmd = [
-        "ffmpeg", "-nostdin", "-loglevel", "error", "-y",
-        "-i", src_path,
-    ]
-    if LOUDNORM:
-        cmd += ["-af", "loudnorm=I=-16:LRA=11:TP=-1.5"]
-    cmd += ["-ac", "1", "-ar", "16000", out_path]
-    subprocess.run(cmd, check=True)
-    return out_path
-def _ffmpeg_trim(src_path: str, start: float, end: float) -> str:
-    """Create a temp WAV of [start, end]."""
-    start = max(0.0, float(start))
-    end = max(start, float(end))
-    out = tempfile.NamedTemporaryFile(prefix="clip_", suffix=".wav", delete=False)
-    out_path = out.name
-    out.close()
-    cmd = [
-        "ffmpeg", "-nostdin", "-loglevel", "error", "-y",
-        "-ss", f"{start:.3f}", "-to", f"{end:.3f}",
-        "-i", src_path, "-ac", "1", "-ar", "16000", out_path,
-    ]
-    subprocess.run(cmd, check=True)
-    return out_path
-def _run_asr(audio_path: str, use_vad: bool, vad_opts: dict, decode_opts: dict):
-    # Build kwargs so we can omit None-only fields safely
-    kwargs = dict(
-        vad_filter=use_vad,
-        vad_parameters=vad_opts if use_vad else None,
-        initial_prompt=BIAS_PROMPT if BIAS_PROMPT else None,
         **decode_opts,
     )
-    # Remove keys with value None (compat for older faster-whisper)
-    for k in ["log_prob_threshold", "compression_ratio_threshold", "patience"]:
-        if k in kwargs and kwargs[k] is None:
-            kwargs.pop(k)
-    segments_iter, info = model.transcribe(audio_path, **kwargs)
-    segs: List[Tuple[int, float, float, str]] = []
-    texts: List[str] = []
-    last_end = 0.0
     for idx, seg in enumerate(segments_iter, start=1):
-        start = float(seg.start) if seg.start is not None else last_end
         end = float(seg.end) if seg.end is not None else start
         text = (seg.text or "").strip()
         segs.append((idx, start, end, text))
         texts.append(text)
-        last_end = max(last_end, end)
-    transcript = "\n".join(texts).strip()
-    return segs, transcript, info, last_end
-def _find_gaps(segs: List[Tuple[int, float, float, str]], total_dur: float):
-    """Return list of (gap_start, gap_end, left_idx, right_idx)."""
-    gaps = []
-    if not segs:
-        return gaps
-    # Gap before first
-    if segs[0][1] >= GAP_MIN_SECONDS:
-        gaps.append((0.0, min(segs[0][1], GAP_MAX_SECONDS), None, 0))
-    # Gaps between
-    for i in range(len(segs) - 1):
-        cur_end = segs[i][2]
-        nxt_start = segs[i + 1][1]
-        gap = nxt_start - cur_end
-        if gap >= GAP_MIN_SECONDS:
-            gap_end = min(nxt_start, cur_end + GAP_MAX_SECONDS)
-            gaps.append((cur_end, gap_end, i, i + 1))
-    # Gap after last
-    tail_gap = total_dur - segs[-1][2]
-    if tail_gap >= GAP_MIN_SECONDS:
-        gap_end = min(total_dur, segs[-1][2] + GAP_MAX_SECONDS)
-        gaps.append((segs[-1][2], gap_end, len(segs) - 1, None))
-    return gaps[:GAP_MAX_COUNT]
-def _gap_fill(audio_path: str, segs: List[Tuple[int, float, float, str]], total_dur: float, decode_opts_base: dict):
-    """Re-decode suspicious gaps without VAD. Returns a new merged list."""
-    if total_dur <= 0 or not segs:
-        return segs
-    gaps = _find_gaps(segs, total_dur)
-    if not gaps:
-        return segs
-    print(f"[ASR] gap-fill: found {len(gaps)} gap(s) to retry")
-    # Slightly stronger decode for recovery
-    decode_opts_fb = dict(decode_opts_base)
-    decode_opts_fb.update({
-        "beam_size": max(2, decode_opts_base.get("beam_size", 1)),
-        "best_of": 1,
-        "temperature": 0.0,
-        "no_speech_threshold": 0.02,
-        "condition_on_previous_text": False,
-        # keep log_prob_threshold/compression disabled
-        "log_prob_threshold": None,
-        "compression_ratio_threshold": None,
-        # (no patience key)
-    })
-    recovered: List[Tuple[int, float, float, str]] = []
-    for (gs, ge, _left_idx, _right_idx) in gaps:
-        clip_start = max(0.0, gs - GAP_PAD)
-        clip_end   = min(total_dur, ge + GAP_PAD)
-        if clip_end - clip_start <= 0.12:
-            continue
-        try:
-            clip_path = _ffmpeg_trim(audio_path, clip_start, clip_end)
-            segs_c, _, _, _ = _run_asr(clip_path, False, {}, decode_opts_fb)
-            os.unlink(clip_path)
-        except Exception as e:
-            print(f"[ASR] gap-fill error on {clip_start:.2f}-{clip_end:.2f}: {e}")
-            continue
-        # Re-map clip-local times back to absolute, keep only inside the gap (+/- 0.2s tolerance)
-        for _, s, e, t in segs_c:
-            text = (t or "").strip()
-            if not text:
-                continue
-            abs_s = clip_start + max(0.0, s)
-            abs_e = clip_start + max(0.0, e)
-            if abs_e <= gs - 0.20 or abs_s >= ge + 0.20:
-                continue
-            recovered.append((0, abs_s, abs_e, text))
-    if not recovered:
-        return segs
-    # Merge + sort + reindex; also join tiny holes between neighbors
-    merged = segs + recovered
-    merged.sort(key=lambda x: x[1])  # by start
-    deduped: List[Tuple[int, float, float, str]] = []
-    for tup in merged:
-        if deduped:
-            prev = deduped[-1]
-            gap = tup[1] - prev[2]
-            if 0.0 <= gap <= JOIN_GAP:
-                deduped[-1] = (prev[0], prev[1], tup[2], (prev[3] + " " + tup[3]).strip())
-                continue
-            if gap < 0.15:
-                new_text = prev[3] if len(prev[3]) >= len(tup[3]) else tup[3]
-                deduped[-1] = (prev[0], min(prev[1], tup[1]), max(prev[2], tup[2]), new_text)
-                continue
-        deduped.append(tup)
-    reindexed = [(i + 1, s, e, t) for i, (_, s, e, t) in enumerate(deduped)]
-    print(f"[ASR] gap-fill: inserted {len(recovered)} piece(s); total segs={len(reindexed)}")
-    return reindexed
-# ---------- surgical rescue for specified windows ----------
-def _parse_windows(text: str):
-    """
-    Parse "20-38,60-75" -> [(20.0, 38.0), (60.0, 75.0)]
-    """
-    windows = []
-    if not text:
-        return windows
-    for chunk in text.split(","):
-        chunk = chunk.strip()
-        if "-" in chunk:
-            a, b = chunk.split("-", 1)
-            try:
-                a = float(a.strip()); b = float(b.strip())
-                if b > a:
-                    windows.append((a, b))
-            except:
-                continue
-    return windows
-def _rescue_windows(audio_path: str, windows: List[Tuple[float,float]], base_opts: dict):
-    rescued = []
-    if not windows:
-        return rescued
-    for (a, b) in windows:
-        try:
-            # small context around window
-            clip = _ffmpeg_trim(audio_path, max(0.0, a - 1.0), b + 1.0)
-            opts = dict(base_opts)
-            opts.update({
-                "beam_size": 2,
-                "best_of": 1,
-                "temperature": 0.0,
-                "no_speech_threshold": 0.02,
-                "condition_on_previous_text": False,
-                "log_prob_threshold": None,
-                "compression_ratio_threshold": None,
-            })
-            segs_c, _, _, _ = _run_asr(clip, False, {}, opts)
-            os.unlink(clip)
-        except Exception as e:
-            print("rescue error", a, b, e);
-            continue
-        for _, s, e, t in segs_c:
-            t = (t or "").strip()
-            if not t:
-                continue
-            abs_s = max(0.0, (a - 1.0) + max(0.0, s or 0.0))
-            abs_e = max(abs_s, (a - 1.0) + max(0.0, e or 0.0))
-            # keep only inside the requested window (+/- 0.2s tolerance)
-            if abs_e < a - 0.20 or abs_s > b + 0.20:
-                continue
-            rescued.append((0, abs_s, abs_e, t))
-    return rescued
-def _merge_with_join(segs: List[Tuple[int,float,float,str]]):
-    if not segs:
-        return segs
-    segs.sort(key=lambda x: x[1])
-    out: List[Tuple[int,float,float,str]] = []
-    for tup in segs:
-        if out:
-            prev = out[-1]
-            gap = tup[1] - prev[2]
-            if 0.0 <= gap <= JOIN_GAP:
-                out[-1] = (prev[0], prev[1], tup[2], (prev[3] + " " + tup[3]).strip())
-                continue
-            if gap < 0.15:
-                new_text = prev[3] if len(prev[3]) >= len(tup[3]) else tup[3]
-                out[-1] = (prev[0], min(prev[1], tup[1]), max(prev[2], tup[2]), new_text)
-                continue
-        out.append(tup)
-    return [(i+1, s, e, t) for i, (_, s, e, t) in enumerate(out)]
-def _squash_tail_repeats(text: str) -> str:
-    # Common outro repeats in Thai; keep a single one
-    import re
-    text = text.strip()
-    text = re.sub(r"(สวัสดีครับ|สวัสดีค่ะ)(\s*\1){1,}$", r"\1", text)
-    return text
-# ---------- WhisperX alignment ----------
-def _align_with_whisperx(audio_path: str, segments: List[Tuple[int,float,float,str]], lang_code: str = "th"):
-    """
-    segments: [(idx, start, end, text), ...]
-    returns:  [(idx, start, end, text)] with refined start/end from word-level alignment.
-    """
-    if not segments or not HAS_WHISPERX:
-        return segments
-    try:
-        device = "cuda" if HAS_CUDA else "cpu"
-        align_model, metadata = whisperx.load_align_model(language_code=lang_code, device=device)
-        # Convert to list[dict] for whisperx
-        seg_dicts = [{"start": s, "end": e, "text": t} for (_i, s, e, t) in segments]
-        aligned = whisperx.align(
-            seg_dicts, align_model, metadata, audio_path, device,
-            return_char_alignments=False
-        )
-        out = []
-        for i, seg in enumerate(aligned.get("segments", []), start=1):
-            s = float(seg.get("start", seg_dicts[i-1]["start"]))
-            e = float(seg.get("end",   seg_dicts[i-1]["end"]))
-            t = seg.get("text", seg_dicts[i-1]["text"])
-            out.append((i, s, e, t))
-        return out if out else segments
-    except Exception as e:
-        print("[Align] WhisperX alignment failed:", e)
-        return segments
-# =========================
-# Transcribe main
-# =========================
-def transcribe(audio_path: Optional[str], vad_mode: str, enable_gapfill: bool, rescue_text: str, use_alignment: bool):
-    if not audio_path:
-        return "", None, []
-    # Normalize audio to mono/16k for consistent timestamps
-    try:
-        wav_path = _ensure_mono16k(audio_path)
-    except Exception as e:
-        return f"แปลงไฟล์เสียงด้วย ffmpeg ไม่สำเร็จ: {e}", None, []
-    # ---- Quiet-speech safe decode options ----
-    decode_opts = dict(
-        language="th",
-        task="transcribe",
-        beam_size=2,                   # small recall bump
-        best_of=1,
-        temperature=0.0,
-        # patience omitted for compatibility
-        condition_on_previous_text=False,
-        # ↓↓↓ Make Whisper reluctant to drop quiet spans ↓↓↓
-        no_speech_threshold=0.05,
-        log_prob_threshold=None,                 # disable hard drop by avg logprob
-        compression_ratio_threshold=None,        # disable CR gate (music/noise)
-        chunk_length=20,              # shorter chunks reduce all-or-nothing drops
-    )
-    # Gentler VAD (only used if we choose VAD path)
-    vad_opts = dict(
-        threshold=0.08,
-        min_silence_duration_ms=420,
-        min_speech_duration_ms=80,
-        speech_pad_ms=1200,
-    )
-    # Choose whether to start with VAD or NO-VAD
-    use_vad_first = (vad_mode == "AUTO (VAD on)")
-    # Pass 1
-    segs1, _text1, info1, last_end1 = _run_asr(
-        wav_path, use_vad_first, vad_opts if use_vad_first else {}, decode_opts
-    )
-    dur = float(getattr(info1, "duration", 0.0) or 0.0)
-    cov1 = (last_end1 / dur * 100.0) if dur > 0 else 0.0
-    print(f"[ASR] pass1 ({'VAD' if use_vad_first else 'NO-VAD'}) coverage: "
-          f"{last_end1:.2f}/{dur:.2f}s ({cov1:.1f}%) | segs={len(segs1)}")
-    chosen_segs = segs1
-    # Fallback: try the other mode if we obviously ended early
-    if dur > 0 and (last_end1 < 0.98 * dur or len(segs1) == 0):
-        decode_opts_fb = dict(decode_opts, no_speech_threshold=0.02)
-        segs2, _text2, info2, last_end2 = _run_asr(
-            wav_path, not use_vad_first, {} if use_vad_first else vad_opts, decode_opts_fb
-        )
-        cov2 = (last_end2 / dur * 100.0) if dur > 0 else 0.0
-        print(f"[ASR] pass2 ({'NO-VAD' if use_vad_first else 'VAD'}) coverage: "
-              f"{last_end2:.2f}/{dur:.2f}s ({cov2:.1f}%) | segs={len(segs2)}")
-        if last_end2 > last_end1 + 0.5:
-            chosen_segs = segs2
-    # Gap-fill to rescue missed mid-sentences (optional)
-    if enable_gapfill:
-        chosen_segs = _gap_fill(wav_path, chosen_segs, dur, decode_opts)
-    # Surgical rescue for user-provided windows (e.g., "20-38,60-75")
-    windows = _parse_windows(rescue_text)
-    if windows:
-        rescued = _rescue_windows(wav_path, windows, decode_opts)
-        if rescued:
-            chosen_segs = _merge_with_join(chosen_segs + rescued)
-    # Optional WhisperX alignment (refine timings & recover boundary words)
-    if use_alignment and HAS_WHISPERX:
-        chosen_segs = _align_with_whisperx(wav_path, chosen_segs, lang_code="th")
     # Build outputs
-    transcript = "\n".join([t for (_, _, _, t) in chosen_segs]).strip()
-    transcript = _squash_tail_repeats(transcript)
-    # SRT file
-    srt_str = _segments_to_srt(chosen_segs)
     srt_path = "/tmp/output.srt"
     with open(srt_path, "w", encoding="utf-8") as f:
         f.write(srt_str)
     seg_dicts = [
-        {"index": i, "start": s, "end": e, "text": t}
-        for (i, s, e, t) in chosen_segs
     ]
-    # Clean temp wav
-    try:
-        os.unlink(wav_path)
-    except Exception:
-        pass
     return transcript, srt_path, seg_dicts
-# =========================
-# UI
-# =========================
-with gr.Blocks(title="Thai ASR — faster-whisper (quiet-speech safe)") as demo:
-    gr.Markdown("## 🇹🇭 Thai ASR — faster-whisper (`Thaweewat/whisper-th-medium-ct2`)\n"
-                "หลีกเลี่ยงคำหายช่วงเสียงเบา: ปิด gate เคร่ง, chunk 20s, Gap-fill + Rescue Windows\n"
-                "มีตัวเลือกปรับปรุงเวลา/ขอบคำด้วย WhisperX (แนะนำ GPU)")
-    audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="อัปโหลดไฟล์เสียงหรืออัดเสียง")
     with gr.Row():
-        vad_mode = gr.Radio(choices=["AUTO (VAD on)", "NO-VAD"], value="NO-VAD", label="VAD โหมด")
-        gapfill  = gr.Checkbox(value=True, label="กู้หลุมกลางประโยค (Gap-fill)")
-        align_cb = gr.Checkbox(value=False, label="WhisperX word alignment (GPU แนะนำ)")
-    rescue = gr.Textbox(value="", lines=1, label="Rescue windows (วินาที) เช่น 20-38,60-75 — ว่างไว้ถ้าไม่ใช้")
-    btn   = gr.Button("ถอดเสียง", variant="primary")
-    out_text = gr.Textbox(label="ผลถอดเสียง", lines=14)
-    out_srt  = gr.File(label="ดาวน์โหลดไฟล์ .srt")
-    out_json = gr.JSON(label="Segments (index/start/end/text)")
-    btn.click(fn=transcribe, inputs=[audio, vad_mode, gapfill, rescue, align_cb], outputs=[out_text, out_srt, out_json])
 if __name__ == "__main__":
-    demo.queue(max_size=8).launch()

+# app.py — Thai ASR on faster-whisper using Thaweewat/whisper-th-medium-ct2
 import os
+from pathlib import Path
+from typing import List, Tuple
+import torch
 import gradio as gr
 from faster_whisper import WhisperModel
+MODEL_ID = "Thaweewat/whisper-th-medium-ct2"
+# Pick device/compute type
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+COMPUTE_TYPE = "int8_float16" if DEVICE == "cuda" else "int8"
+# Load once at startup (first cold start will download the model)
+model = WhisperModel(MODEL_ID, device=DEVICE, compute_type=COMPUTE_TYPE)
+def _fmt_srt_time(t: float) -> str:
+    """Format seconds -> SRT timestamp."""
     if t is None:
         t = 0.0
+    ms = int(round(t * 1000))
     h, ms = divmod(ms, 3600000)
     m, ms = divmod(ms, 60000)
     s, ms = divmod(ms, 1000)
     return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
 def _segments_to_srt(segments: List[Tuple[int, float, float, str]]) -> str:
+    """[(idx, start, end, text)] -> SRT string."""
     lines = []
     for i, start, end, text in segments:
         lines.append(str(i))
         lines.append(f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}")
         lines.append((text or "").strip())
+        lines.append("")  # blank line between cues
     return "\n".join(lines).strip() + "\n"
+def transcribe(audio_path: str):
     """
+    audio_path: Gradio supplies a file path.
+    Returns: transcript text, SRT file path, and list of segment dicts
     """
+    # Thai-only decoding, with VAD to skip silence
+    decode_opts = dict(language="th", task="transcribe", beam_size=5, best_of=5, temperature=[0.0, 0.2, 0.4])
+    vad_opts = dict(min_silence_duration_ms=500)
+    segments_iter, info = model.transcribe(
+        audio_path,
+        vad_filter=True,
+        vad_parameters=vad_opts,
         **decode_opts,
     )
+    segs = []
+    texts = []
     for idx, seg in enumerate(segments_iter, start=1):
+        start = float(seg.start) if seg.start is not None else 0.0
         end = float(seg.end) if seg.end is not None else start
         text = (seg.text or "").strip()
         segs.append((idx, start, end, text))
         texts.append(text)
     # Build outputs
+    transcript = "\n".join(texts).strip()
+    # Write SRT to a temp file (Gradio will serve it)
+    srt_str = _segments_to_srt(segs)
     srt_path = "/tmp/output.srt"
     with open(srt_path, "w", encoding="utf-8") as f:
         f.write(srt_str)
+    # JSON-friendly segments
     seg_dicts = [
+        {"index": i, "start": start, "end": end, "text": text}
+        for (i, start, end, text) in segs
     ]
     return transcript, srt_path, seg_dicts
+with gr.Blocks() as demo:
+    gr.Markdown("## 🇹🇭 Thai ASR — faster-whisper (`Thaweewat/whisper-th-medium-ct2`)")
     with gr.Row():
+        audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio")
+    with gr.Row():
+        btn = gr.Button("Transcribe", variant="primary")
+    with gr.Row():
+        out_text = gr.Textbox(label="Transcript", lines=12)
+    with gr.Row():
+        out_srt = gr.File(label="Download SRT")
+    with gr.Row():
+        out_json = gr.JSON(label="Segments (start/end/text)")
+    btn.click(fn=transcribe, inputs=audio, outputs=[out_text, out_srt, out_json])
 if __name__ == "__main__":
+    demo.launch()