Spaces:

STLooo
/

live-caption-mvp

Sleeping

App Files Files Community

STLooo committed on Feb 22

Commit

4436daf

verified ·

1 Parent(s): 3028fe4

Update app.py

Browse files

Files changed (1) hide show

app.py +315 -121

app.py CHANGED Viewed

@@ -1,208 +1,402 @@
-import hashlib
 import time
-from dataclasses import dataclass, asdict
-from typing import Dict, List, Optional
 import gradio as gr
 from faster_whisper import WhisperModel
-# ====== Config ======
-CHUNK_SECONDS = 4.0  # MVP建议 3~6 秒；CPU 上先用 4 秒更稳
-MODEL_NAME = "small"  # CPU先 small；不够快再降到 base
-COMPUTE_TYPE = "int8"
 DEVICE = "cpu"
-# ====== Model ======
-model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
-# ====== In-memory state (MVP) ======
 @dataclass
 class Chunk:
     chunk_id: int
     start_s: float
     end_s: float
-    raw_text: str
-    edited_text: str = ""
-    status: str = "raw"  # raw / published
-    lang: str = "auto"
     rev: int = 0
-STATE: Dict[str, List[Chunk]] = {}  # key: session_id -> chunks
-def _session_id():
-    # simple per-browser session id
-    return str(int(time.time()*1000))
-def _hash_text(s: str) -> str:
-    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:10]
-def transcribe_audio_to_chunks(audio_path: str, session_id: str) -> str:
-    """
-    Split-like behavior by asking whisper for timestamps; we treat each segment as a "chunk".
-    This is more robust than naive slicing when we don't control ffmpeg.
-    """
-    segments, info = model.transcribe(audio_path, vad_filter=True)
-    lang = getattr(info, "language", None) or "auto"
     chunks: List[Chunk] = []
     cid = 0
     for seg in segments:
-        text = (seg.text or "").strip()
-        if not text:
             continue
         chunks.append(Chunk(
             chunk_id=cid,
             start_s=float(seg.start),
             end_s=float(seg.end),
-            raw_text=text,
-            edited_text=text,
             status="raw",
-            lang=lang,
-            rev=0
         ))
         cid += 1
     STATE[session_id] = chunks
-    return f"OK: {len(chunks)} chunks · detected_lang={lang}"
-def get_editor_table(session_id: str):
-    chunks = STATE.get(session_id, [])
     rows = []
-    for c in chunks:
         rows.append([
             c.chunk_id,
             f"{c.start_s:.2f}-{c.end_s:.2f}",
-            c.lang,
             c.status,
-            c.raw_text,
-            c.edited_text,
             c.rev
         ])
     return rows
-def publish_one(session_id: str, chunk_id: int, edited_text: str):
     chunks = STATE.get(session_id, [])
     if chunk_id < 0 or chunk_id >= len(chunks):
-        return "Chunk ID out of range", get_editor_table(session_id)
     c = chunks[chunk_id]
-    c.edited_text = edited_text.strip() if edited_text else c.edited_text
     c.status = "published"
     c.rev += 1
-    return f"Published chunk #{chunk_id} rev={c.rev}", get_editor_table(session_id)
 def publish_all(session_id: str):
     chunks = STATE.get(session_id, [])
     for c in chunks:
         if c.status != "published":
             c.status = "published"
             c.rev += 1
-    return f"Published ALL ({len(chunks)} chunks)", get_editor_table(session_id)
-def audience_view(session_id: str, lang_choice: str):
     """
-    MVP: 观众端只展示 published 的 edited_text。
-    这里先不做翻译（你要英文/中文，下一版加翻译模块）。
     """
     chunks = STATE.get(session_id, [])
     published = [c for c in chunks if c.status == "published"]
-    # 展示成简单 HTML，带一个免费TTS按钮（浏览器 speechSynthesis）
-    lines = []
-    for c in published[-50:]:
-        text = c.edited_text or c.raw_text
-        # 观众端语言选择先只影响TTS朗读语言，不做翻译（下一版做）
-        lines.append(f"<div style='padding:10px 12px;border:1px solid #ddd;border-radius:10px;margin:10px 0;'>"
-                     f"<div style='font-size:12px;color:#666'>#{c.chunk_id} · {c.start_s:.2f}-{c.end_s:.2f} · {c.lang}</div>"
-                     f"<div class='txt'>{text}</div>"
-                     f"</div>")
-    html = f"""
-    <div>
-      <div style="display:flex;gap:8px;align-items:center;flex-wrap:wrap;">
-        <b>Audience (Published)</b>
-        <button onclick="toggleTTS()" style="padding:8px 10px;">TTS: <span id='ttsState'>Off</span></button>
-      </div>
-      <div id="wrap">{''.join(lines) if lines else '<i>No published captions yet.</i>'}</div>
-    </div>
-    <script>
-      let ttsOn = false;
-      function toggleTTS(){{
-        ttsOn = !ttsOn;
-        document.getElementById('ttsState').innerText = ttsOn ? 'On' : 'Off';
-        if(!ttsOn) window.speechSynthesis.cancel();
-      }}
-      function speakAll(){{
-        if(!ttsOn) return;
-        if(!('speechSynthesis' in window)) return;
-        window.speechSynthesis.cancel();
-        const nodes = document.querySelectorAll('.txt');
-        const lang = "{'zh-CN' if lang_choice=='zh' else 'en-US'}";
-        nodes.forEach(n => {{
-          const u = new SpeechSynthesisUtterance(n.innerText);
-          u.lang = lang;
-          window.speechSynthesis.speak(u);
-        }});
-      }}
-      // 点击任何字幕块就朗读该句（更实用）
-      document.querySelectorAll('.txt').forEach(n => {{
-        n.style.cursor = 'pointer';
-        n.onclick = () => {{
-          if(!ttsOn) return;
-          window.speechSynthesis.cancel();
-          const u = new SpeechSynthesisUtterance(n.innerText);
-          u.lang = "{'zh-CN' if lang_choice=='zh' else 'en-US'}";
-          window.speechSynthesis.speak(u);
-        }};
-      }});
-    </script>
-    """
-    return html
 with gr.Blocks(title="Live Caption MVP (HF)") as demo:
-    gr.Markdown("# Live Caption MVP (HF)\n上傳音檔 → Whisper 轉寫 → 校對 → 發佈 → 觀眾端字幕（可選免費TTS）")
-    session_id = gr.State(_session_id())
     with gr.Tab("1) Ingest"):
-        audio = gr.Audio(type="filepath", label="Upload audio (iPhone m4a/wav/mp3)")
         btn_run = gr.Button("Transcribe & Build Chunks")
         ingest_status = gr.Textbox(label="Status", interactive=False)
     with gr.Tab("2) Editor"):
-        gr.Markdown("校對台：選 chunk_id，修改 edited_text，Publish。")
         table = gr.Dataframe(
-            headers=["chunk_id", "time", "lang", "status", "raw_text", "edited_text", "rev"],
-            datatype=["number","str","str","str","str","str","number"],
-            row_count=10,
-            col_count=(7, "fixed"),
             interactive=False
         )
         chunk_id_in = gr.Number(label="chunk_id", value=0, precision=0)
-        edited_in = gr.Textbox(label="edited_text (paste here)", lines=3)
-        btn_pub = gr.Button("Publish One")
-        btn_pub_all = gr.Button("Publish All")
         editor_status = gr.Textbox(label="Editor Status", interactive=False)
     with gr.Tab("3) Audience"):
-        lang_choice = gr.Radio(choices=["zh","en"], value="zh", label="TTS language (MVP only affects reading voice)")
         btn_refresh = gr.Button("Refresh Audience View")
-        audience_html = gr.HTML()
-    def _do_ingest(audio_path, sid):
         if not audio_path:
             return "Please upload an audio file first.", []
-        msg = transcribe_audio_to_chunks(audio_path, sid)
-        return msg, get_editor_table(sid)
-    btn_run.click(_do_ingest, inputs=[audio, session_id], outputs=[ingest_status, table])
-    def _pub_one(sid, cid, text):
-        msg, rows = publish_one(sid, int(cid), text)
-        return msg, rows
-    btn_pub.click(_pub_one, inputs=[session_id, chunk_id_in, edited_in], outputs=[editor_status, table])
-    btn_pub_all.click(lambda sid: publish_all(sid), inputs=[session_id], outputs=[editor_status, table])
-    btn_refresh.click(lambda sid, lc: audience_view(sid, lc), inputs=[session_id, lang_choice], outputs=[audience_html])
 demo.launch()

+import os
 import time
+import base64
+import hashlib
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
 import gradio as gr
 from faster_whisper import WhisperModel
+# Tencent Cloud SDK
+from tencentcloud.common import credential
+from tencentcloud.common.profile.client_profile import ClientProfile
+from tencentcloud.common.profile.http_profile import HttpProfile
+# Tencent TMT (Translate)
+from tencentcloud.tmt.v20180321 import tmt_client, models as tmt_models
+# Tencent TTS (Text-to-Speech)
+from tencentcloud.tts.v20190823 import tts_client, models as tts_models
+# ======================
+# Config
+# ======================
+MODEL_NAME = os.getenv("WHISPER_MODEL", "small")   # CPU: small; if slow -> base
 DEVICE = "cpu"
+COMPUTE_TYPE = "int8"
+# Tencent region
+TENCENT_REGION = os.getenv("TENCENT_REGION", "ap-shanghai").strip()
+# Tencent TTS voice types
+# Default voice types:
+# - ZH default: 0 (often "云小宁" default timbre)
+# - EN: 101001 is commonly used in docs as an example timbre ID; if it fails, set your own in Secrets.
+VOICE_EN = int(os.getenv("TENCENT_TTS_VOICE_EN", "101001"))
+VOICE_ZH = int(os.getenv("TENCENT_TTS_VOICE_ZH", "0"))
+# Generate TTS only for latest published line (to avoid load)
+TTS_GENERATE_MODE = "latest_only"  # keep MVP stable
+# ======================
+# Helpers
+# ======================
+def _now_ms() -> int:
+    """Return the current wall-clock time as whole milliseconds since the epoch."""
+    return int(time.time() * 1000)
+def _session_id() -> str:
+    return str(_now_ms())
+def _hash(s: str) -> str:
+    """Return the first 12 hex characters of the SHA-256 digest of *s* (UTF-8 encoded)."""
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:12]
+def _require_env(name: str) -> str:
+    """Return the stripped value of environment variable *name*.
+
+    Raises:
+        RuntimeError: if the variable is unset or blank after stripping.
+    """
+    v = os.getenv(name, "").strip()
+    if not v:
+        raise RuntimeError(f"Missing env: {name}. Set it in HF Space Settings → Secrets.")
+    return v
+# ======================
+# Tencent Clients
+# ======================
+_TMT_CLIENT: Optional[tmt_client.TmtClient] = None
+_TTS_CLIENT: Optional[tts_client.TtsClient] = None
+def _make_client(endpoint: str):
+    """Build the credential and client profile shared by the Tencent clients.
+
+    Despite the name, this does not construct a client: it returns a
+    ``(credential, client_profile)`` pair whose HTTP profile points at
+    *endpoint*. Callers pass the pair to the concrete client constructor.
+
+    Raises:
+        RuntimeError: propagated from _require_env when TENCENT_SECRET_ID or
+            TENCENT_SECRET_KEY is missing.
+    """
+    secret_id = _require_env("TENCENT_SECRET_ID")
+    secret_key = _require_env("TENCENT_SECRET_KEY")
+    cred = credential.Credential(secret_id, secret_key)
+    httpProfile = HttpProfile()
+    httpProfile.endpoint = endpoint
+    clientProfile = ClientProfile()
+    clientProfile.httpProfile = httpProfile
+    return cred, clientProfile
+def get_tmt_client() -> tmt_client.TmtClient:
+    """Return the module-wide translation (TMT) client, creating it on first use.
+
+    The client is stored in the _TMT_CLIENT global, so credentials are only
+    read from the environment the first time this is called.
+    """
+    global _TMT_CLIENT
+    if _TMT_CLIENT is not None:
+        return _TMT_CLIENT
+    cred, clientProfile = _make_client("tmt.tencentcloudapi.com")
+    _TMT_CLIENT = tmt_client.TmtClient(cred, TENCENT_REGION, clientProfile)
+    return _TMT_CLIENT
+def get_tts_client() -> tts_client.TtsClient:
+    """Return the module-wide text-to-speech (TTS) client, creating it on first use.
+
+    The client is stored in the _TTS_CLIENT global, so credentials are only
+    read from the environment the first time this is called.
+    """
+    global _TTS_CLIENT
+    if _TTS_CLIENT is not None:
+        return _TTS_CLIENT
+    cred, clientProfile = _make_client("tts.tencentcloudapi.com")
+    _TTS_CLIENT = tts_client.TtsClient(cred, TENCENT_REGION, clientProfile)
+    return _TTS_CLIENT
+# ======================
+# Whisper Model
+# ======================
+whisper = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
+# ======================
+# In-memory State (MVP)
+# ======================
 @dataclass
 class Chunk:
+    """One transcribed segment plus its editing, translation and audio state."""
+    # Sequential index assigned at transcription time; also used as the list
+    # index when a chunk is looked up for publishing.
     chunk_id: int
+    # Segment start / end offsets as reported by the transcriber.
     start_s: float
     end_s: float
+    # Transcribed text as produced by the model.
+    raw_text_en: str
+    # Editor-corrected text; initialised to the raw text.
+    edited_text_en: str
+    status: str = "raw"     # raw / published
+    # Incremented each time the chunk is published.
     rev: int = 0
+    zh_text: str = ""       # translation (on publish)
+    tts_en_path: str = ""   # cached mp3 filepath
+    tts_zh_path: str = ""   # cached mp3 filepath
+STATE: Dict[str, List[Chunk]] = {}     # session_id -> chunks
+# caches across sessions (MVP)
+TRANS_CACHE: Dict[str, str] = {}       # key -> zh text
+TTS_CACHE: Dict[str, str] = {}         # key -> mp3 path
+# ======================
+# Translation (EN -> ZH) with caching
+# ======================
+def translate_en_to_zh(text_en: str) -> str:
+    """Translate English text to Chinese via the TMT client, with in-memory caching.
+
+    Blank or None input returns "" without calling the service. Results are
+    cached in TRANS_CACHE keyed by a hash of the stripped input; note that an
+    empty result from the service is cached as well.
+
+    Exceptions raised by client creation or the TextTranslate call propagate
+    to the caller.
+    """
+    text_en = (text_en or "").strip()
+    if not text_en:
+        return ""
+    key = f"tmt:en->zh:{_hash(text_en)}"
+    if key in TRANS_CACHE:
+        return TRANS_CACHE[key]
+    client = get_tmt_client()
+    req = tmt_models.TextTranslateRequest()
+    req.SourceText = text_en
+    req.Source = "en"
+    req.Target = "zh"
+    req.ProjectId = 0
+    resp = client.TextTranslate(req)
+    # Tolerate a response without TargetText by falling back to "".
+    out = getattr(resp, "TargetText", "") or ""
+    TRANS_CACHE[key] = out
+    return out
+# ======================
+# TTS (Text -> mp3) with caching
+# ======================
+def tts_to_mp3(text: str, voice_type: int) -> str:
+    text = (text or "").strip()
+    if not text:
+        return ""
+    key = f"tts:{voice_type}:{_hash(text)}"
+    if key in TTS_CACHE:
+        return TTS_CACHE[key]
+    client = get_tts_client()
+    req = tts_models.TextToVoiceRequest()
+    req.Text = text
+    req.SessionId = key
+    req.ModelType = 1
+    req.VoiceType = voice_type
+    req.Volume = 5
+    req.Speed = 0
+    req.SampleRate = 16000
+    req.Codec = "mp3"
+    resp = client.TextToVoice(req)
+    audio_b64 = getattr(resp, "Audio", "") or ""
+    if not audio_b64:
+        return ""
+    audio_bytes = base64.b64decode(audio_b64)
+    out_dir = "outputs"
+    os.makedirs(out_dir, exist_ok=True)
+    path = os.path.join(out_dir, f"{key}.mp3")
+    with open(path, "wb") as f:
+        f.write(audio_bytes)
+    TTS_CACHE[key] = path
+    return path
+# ======================
+# Core pipeline
+# ======================
+def transcribe_to_chunks(audio_path: str, session_id: str) -> str:
+    """Transcribe an audio file and store one Chunk per non-empty segment.
+
+    Replaces any chunks previously stored for *session_id* in STATE and
+    returns a one-line status message. The detected language is only
+    reported in that message; segment text is stored in the *_en fields
+    regardless of what was detected.
+    """
+    segments, info = whisper.transcribe(audio_path, vad_filter=True)
+    detected = getattr(info, "language", None) or "auto"
     chunks: List[Chunk] = []
     cid = 0
     for seg in segments:
+        txt = (seg.text or "").strip()
+        # Skip segments with no text so chunk ids stay contiguous.
+        if not txt:
             continue
         chunks.append(Chunk(
             chunk_id=cid,
             start_s=float(seg.start),
             end_s=float(seg.end),
+            raw_text_en=txt,
+            edited_text_en=txt,
             status="raw",
+            rev=0,
+            zh_text="",
+            tts_en_path="",
+            tts_zh_path=""
         ))
         cid += 1
     STATE[session_id] = chunks
+    return f"OK: {len(chunks)} chunks · detected_lang={detected} · model={MODEL_NAME}/{COMPUTE_TYPE}"
+def editor_table(session_id: str):
+    """Return the session's chunks as rows for the editor table.
+
+    Each row is [chunk_id, "start-end", status, raw_en, edited_en, zh, rev],
+    matching the column order of the editor Dataframe headers. An unknown
+    session yields an empty list.
+    """
     rows = []
+    for c in STATE.get(session_id, []):
         rows.append([
             c.chunk_id,
             f"{c.start_s:.2f}-{c.end_s:.2f}",
             c.status,
+            c.raw_text_en,
+            c.edited_text_en,
+            c.zh_text,
             c.rev
         ])
     return rows
+def publish_one(session_id: str, chunk_id: int, edited_text_en: str):
+    """Publish one chunk, optionally replacing its English text, then translate it.
+
+    Returns a (status message, editor table rows) pair. A translation failure
+    does not undo the publish: the chunk stays published with an empty
+    zh_text and the error is reported in the status message.
+    """
     chunks = STATE.get(session_id, [])
+    # chunk_id is used directly as an index into the session's chunk list.
     if chunk_id < 0 or chunk_id >= len(chunks):
+        return "Chunk ID out of range", editor_table(session_id)
     c = chunks[chunk_id]
+    # Blank or whitespace-only input keeps the existing edited text.
+    if edited_text_en and edited_text_en.strip():
+        c.edited_text_en = edited_text_en.strip()
     c.status = "published"
     c.rev += 1
+    # Translate after publish (cost control + higher quality)
+    try:
+        c.zh_text = translate_en_to_zh(c.edited_text_en)
+        msg = f"Published #{chunk_id} rev={c.rev} · translated"
+    except Exception as e:
+        c.zh_text = ""
+        msg = f"Published #{chunk_id} rev={c.rev} · translation failed: {str(e)}"
+    # Clear the chunk's audio paths unconditionally (whether or not the text
+    # changed) so audio is looked up again for the current text.
+    c.tts_en_path = ""
+    c.tts_zh_path = ""
+    return msg, editor_table(session_id)
 def publish_all(session_id: str):
     chunks = STATE.get(session_id, [])
+    ok, fail = 0, 0
     for c in chunks:
         if c.status != "published":
             c.status = "published"
             c.rev += 1
+        if not c.zh_text and c.edited_text_en:
+            try:
+                c.zh_text = translate_en_to_zh(c.edited_text_en)
+                ok += 1
+            except:
+                fail += 1
+        c.tts_en_path = ""
+        c.tts_zh_path = ""
+    return f"Published ALL · translated_ok={ok} fail={fail}", editor_table(session_id)
+# ======================
+# Audience rendering + TTS generation (stable MVP)
+# ======================
+def render_audience_html(chunks: List[Chunk], view_lang: str) -> str:
+    # show last 50 published
+    published = [c for c in chunks if c.status == "published"][-50:]
+    def one(c: Chunk) -> str:
+        en = (c.edited_text_en or c.raw_text_en).strip()
+        zh = (c.zh_text or "").strip()
+        text = zh if view_lang == "zh" else en
+        return (
+            "<div style='padding:10px 12px;border:1px solid #ddd;border-radius:10px;margin:10px 0;'>"
+            f"<div style='font-size:12px;color:#666'>#{c.chunk_id} · {c.start_s:.2f}-{c.end_s:.2f}</div>"
+            f"<div style='font-size:16px;line-height:1.45'>{text}</div>"
+            "</div>"
+        )
+    if not published:
+        return "<i>No published captions yet.</i>"
+    return "".join(one(c) for c in published)
+def ensure_latest_tts(session_id: str, view_lang: str) -> Tuple[str, Optional[str]]:
     """
+    Returns (status_msg, audio_filepath_or_None) for the latest published chunk in selected language.
+    This avoids heavy load and avoids relying on browser speechSynthesis.
+
+    "Latest" is the last chunk in list order whose status is "published".
+    Translation and TTS errors are caught and reported in status_msg with a
+    None audio path.
     """
     chunks = STATE.get(session_id, [])
     published = [c for c in chunks if c.status == "published"]
+    if not published:
+        return "No published captions yet.", None
+    latest = published[-1]
+    # Ensure translation exists if user wants ZH
+    if view_lang == "zh" and not latest.zh_text:
+        try:
+            latest.zh_text = translate_en_to_zh(latest.edited_text_en)
+        except Exception as e:
+            return f"ZH translation failed: {str(e)}", None
+    try:
+        # Any view_lang other than "en" takes the ZH branch below.
+        # NOTE(review): tts_to_mp3 can return "" (blank text or empty audio);
+        # the status still reads "TTS ready" while the path becomes None.
+        if view_lang == "en":
+            if not latest.tts_en_path:
+                latest.tts_en_path = tts_to_mp3(latest.edited_text_en, VOICE_EN)
+            return f"TTS ready (EN) for chunk #{latest.chunk_id}", latest.tts_en_path or None
+        else:
+            if not latest.tts_zh_path:
+                latest.tts_zh_path = tts_to_mp3(latest.zh_text, VOICE_ZH)
+            return f"TTS ready (ZH) for chunk #{latest.chunk_id}", latest.tts_zh_path or None
+    except Exception as e:
+        return f"TTS failed: {str(e)}", None
+def refresh_audience(session_id: str, view_lang: str):
+    """Build everything the Audience tab shows for one refresh.
+
+    Returns (captions_html, tts_status_message, audio_path_or_None), in the
+    order of the outputs wired to the refresh button.
+    """
+    chunks = STATE.get(session_id, [])
+    html = render_audience_html(chunks, view_lang)
+    tts_msg, audio_path = ensure_latest_tts(session_id, view_lang)
+    return html, tts_msg, audio_path
+# ======================
+# Gradio UI
+# ======================
 with gr.Blocks(title="Live Caption MVP (HF)") as demo:
+    gr.Markdown(
+        "# Live Caption MVP (HF)\n"
+        "全英文轉寫 → 校對（EN）→ 自動翻譯（ZH）→ 發佈 → 觀眾端 EN/ZH 字幕 + 後端 TTS 生成 mp3 播放（不依賴手機瀏覽器 TTS）"
+    )
+    sid = gr.State(_session_id())
     with gr.Tab("1) Ingest"):
+        gr.Markdown("上傳 iPhone 錄音檔（m4a/wav/mp3）→ 轉寫切段（Whisper segments）")
+        audio = gr.Audio(type="filepath", label="Upload audio")
         btn_run = gr.Button("Transcribe & Build Chunks")
         ingest_status = gr.Textbox(label="Status", interactive=False)
     with gr.Tab("2) Editor"):
+        gr.Markdown("校對台：修改英文後 Publish，系統自動翻譯成中文（只對 Publish 後內容翻譯，省錢且更準）。")
         table = gr.Dataframe(
+            headers=["chunk_id", "time", "status", "raw_en", "edited_en", "zh", "rev"],
+            datatype=["number", "str", "str", "str", "str", "str", "number"],
             interactive=False
         )
         chunk_id_in = gr.Number(label="chunk_id", value=0, precision=0)
+        edited_in = gr.Textbox(label="edited_en (paste here)", lines=3)
+        btn_pub_one = gr.Button("Publish One (translate)")
+        btn_pub_all = gr.Button("Publish All (translate missing)")
         editor_status = gr.Textbox(label="Editor Status", interactive=False)
     with gr.Tab("3) Audience"):
+        gr.Markdown(
+            "觀眾端：顯示已發佈字幕。按 Refresh 會同時產生「最新一句」的音檔（EN 或 ZH 取決於選擇），用播放器播放。"
+        )
+        view_lang = gr.Radio(choices=["en", "zh"], value="zh", label="View language")
         btn_refresh = gr.Button("Refresh Audience View")
+        aud_html = gr.HTML(label="Captions")
+        tts_status = gr.Textbox(label="TTS Status", interactive=False)
+        aud_audio = gr.Audio(label="Play latest line", type="filepath")
+    # ---- Actions ----
+    def _do_ingest(audio_path, session_id):
         if not audio_path:
             return "Please upload an audio file first.", []
+        msg = transcribe_to_chunks(audio_path, session_id)
+        return msg, editor_table(session_id)
+    btn_run.click(_do_ingest, inputs=[audio, sid], outputs=[ingest_status, table])
+    def _pub_one(session_id, cid, txt):
+        return publish_one(session_id, int(cid), txt)
+    btn_pub_one.click(_pub_one, inputs=[sid, chunk_id_in, edited_in], outputs=[editor_status, table])
+    btn_pub_all.click(lambda session_id: publish_all(session_id), inputs=[sid], outputs=[editor_status, table])
+    btn_refresh.click(refresh_audience, inputs=[sid, view_lang], outputs=[aud_html, tts_status, aud_audio])
 demo.launch()