# app.py — Live Caption MVP (Hugging Face Space)
import os
import time
import base64
import hashlib
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import gradio as gr
from faster_whisper import WhisperModel
# Tencent Cloud SDK
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
# Tencent TMT (Translate)
from tencentcloud.tmt.v20180321 import tmt_client, models as tmt_models
# Tencent TTS (Text-to-Speech)
from tencentcloud.tts.v20190823 import tts_client, models as tts_models
# ======================
# Config
# ======================
# Whisper model size; "small" is the CPU sweet spot — drop to "base" if too slow.
MODEL_NAME = os.getenv("WHISPER_MODEL", "small") # CPU: small; if slow -> base
DEVICE = "cpu"
COMPUTE_TYPE = "int8"  # int8 quantization keeps CPU memory/latency low
# Tencent Cloud region for TMT/TTS API calls (override via env).
TENCENT_REGION = os.getenv("TENCENT_REGION", "ap-shanghai").strip()
# Tencent TTS voice types
# Default voice types:
# - ZH default: 0 (often "Yun Xiaoning" default timbre)
# - EN: 101001 is commonly used in docs as an example timbre ID; if it fails, set your own in Secrets.
VOICE_EN = int(os.getenv("TENCENT_TTS_VOICE_EN", "101001"))
VOICE_ZH = int(os.getenv("TENCENT_TTS_VOICE_ZH", "0"))
# Generate TTS only for latest published line (to avoid load)
TTS_GENERATE_MODE = "latest_only" # keep MVP stable
# ======================
# Helpers
# ======================
def _now_ms() -> int:
return int(time.time() * 1000)
def _session_id() -> str:
return str(_now_ms())
def _hash(s: str) -> str:
return hashlib.sha256(s.encode("utf-8")).hexdigest()[:12]
def _require_env(name: str) -> str:
v = os.getenv(name, "").strip()
if not v:
raise RuntimeError(f"Missing env: {name}. Set it in HF Space Settings → Secrets.")
return v
# ======================
# Tencent Clients
# ======================
# Lazily-created singleton SDK clients — built on first use so that missing
# secrets only fail when a Tencent API is actually called, not at import time.
_TMT_CLIENT: Optional[tmt_client.TmtClient] = None
_TTS_CLIENT: Optional[tts_client.TtsClient] = None
def _make_client(endpoint: str):
    """Build the (credential, client profile) pair for a Tencent Cloud endpoint.

    Reads TENCENT_SECRET_ID / TENCENT_SECRET_KEY from the environment and
    raises RuntimeError (via _require_env) when either is missing.
    """
    cred = credential.Credential(
        _require_env("TENCENT_SECRET_ID"),
        _require_env("TENCENT_SECRET_KEY"),
    )
    http_profile = HttpProfile()
    http_profile.endpoint = endpoint
    client_profile = ClientProfile()
    client_profile.httpProfile = http_profile
    return cred, client_profile
def get_tmt_client() -> tmt_client.TmtClient:
    """Return a process-wide cached TMT (translation) client, creating it on first use."""
    global _TMT_CLIENT
    if _TMT_CLIENT is None:
        cred, profile = _make_client("tmt.tencentcloudapi.com")
        _TMT_CLIENT = tmt_client.TmtClient(cred, TENCENT_REGION, profile)
    return _TMT_CLIENT
def get_tts_client() -> tts_client.TtsClient:
    """Return a process-wide cached TTS client, creating it on first use."""
    global _TTS_CLIENT
    if _TTS_CLIENT is None:
        cred, profile = _make_client("tts.tencentcloudapi.com")
        _TTS_CLIENT = tts_client.TtsClient(cred, TENCENT_REGION, profile)
    return _TTS_CLIENT
# ======================
# Whisper Model
# ======================
# Loaded once at import time; the first run downloads the model into the HF cache.
whisper = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
# ======================
# In-memory State (MVP)
# ======================
@dataclass
class Chunk:
    """One transcript segment flowing through raw -> edited -> published."""
    chunk_id: int          # sequential index within the session
    start_s: float         # segment start time (seconds)
    end_s: float           # segment end time (seconds)
    raw_text_en: str       # Whisper output, untouched
    edited_text_en: str    # human-corrected English (starts equal to raw)
    status: str = "raw" # raw / published
    rev: int = 0  # bumped on every publish so stale TTS/translations can be detected
    zh_text: str = "" # translation (on publish)
    tts_en_path: str = "" # cached mp3 filepath
    tts_zh_path: str = "" # cached mp3 filepath
# In-memory only: all state below is lost on Space restart (acceptable for the MVP).
STATE: Dict[str, List[Chunk]] = {} # session_id -> chunks
# caches shared across sessions (MVP) — keyed on content hashes, see _hash()
TRANS_CACHE: Dict[str, str] = {} # key -> zh text
TTS_CACHE: Dict[str, str] = {} # key -> mp3 path
# ======================
# Translation (EN -> ZH) with caching
# ======================
def translate_en_to_zh(text_en: str) -> str:
    """Translate English text to Chinese via Tencent TMT, with an in-memory cache.

    Empty/whitespace input short-circuits to "" without an API call.
    """
    text_en = (text_en or "").strip()
    if not text_en:
        return ""
    key = f"tmt:en->zh:{_hash(text_en)}"
    cached = TRANS_CACHE.get(key)
    if cached is not None:
        return cached
    # Cache miss: issue one TextTranslate request and memoize the result.
    req = tmt_models.TextTranslateRequest()
    req.SourceText = text_en
    req.Source = "en"
    req.Target = "zh"
    req.ProjectId = 0
    resp = get_tmt_client().TextTranslate(req)
    zh = getattr(resp, "TargetText", "") or ""
    TRANS_CACHE[key] = zh
    return zh
# ======================
# TTS (Text -> mp3) with caching
# ======================
def tts_to_mp3(text: str, voice_type: int) -> str:
    """Synthesize *text* into an mp3 file via Tencent TTS.

    Returns the mp3 filepath, or "" for empty input or an empty API response.
    Results are memoized in TTS_CACHE keyed on voice + text hash.
    """
    text = (text or "").strip()
    if not text:
        return ""
    key = f"tts:{voice_type}:{_hash(text)}"
    cached = TTS_CACHE.get(key)
    if cached is not None:
        return cached
    # Build the synthesis request; the cache key doubles as the SessionId.
    req = tts_models.TextToVoiceRequest()
    req.Text = text
    req.SessionId = key
    req.ModelType = 1
    req.VoiceType = voice_type
    req.Volume = 5
    req.Speed = 0
    req.SampleRate = 16000
    req.Codec = "mp3"
    resp = get_tts_client().TextToVoice(req)
    audio_b64 = getattr(resp, "Audio", "") or ""
    if not audio_b64:
        return ""
    # Persist the decoded audio so the Gradio player can serve it by path.
    out_dir = "outputs"
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f"{key}.mp3")
    with open(path, "wb") as fh:
        fh.write(base64.b64decode(audio_b64))
    TTS_CACHE[key] = path
    return path
# ======================
# Core pipeline
# ======================
def transcribe_to_chunks(audio_path: str, session_id: str) -> str:
    """Run Whisper on *audio_path* and store the segment chunks under *session_id*.

    Returns a human-readable status summary (chunk count, detected language, model).
    """
    segments, info = whisper.transcribe(audio_path, vad_filter=True)
    detected = getattr(info, "language", None) or "auto"
    chunks: List[Chunk] = []
    for seg in segments:
        txt = (seg.text or "").strip()
        if not txt:
            # Skip silent/empty segments entirely.
            continue
        # chunk_id is simply the next position; remaining fields use the
        # dataclass defaults (status="raw", rev=0, empty translation/TTS).
        chunks.append(Chunk(
            chunk_id=len(chunks),
            start_s=float(seg.start),
            end_s=float(seg.end),
            raw_text_en=txt,
            edited_text_en=txt,
        ))
    STATE[session_id] = chunks
    return f"OK: {len(chunks)} chunks · detected_lang={detected} · model={MODEL_NAME}/{COMPUTE_TYPE}"
def editor_table(session_id: str):
    """Build the editor Dataframe rows for a session (one row per chunk)."""
    return [
        [
            c.chunk_id,
            f"{c.start_s:.2f}-{c.end_s:.2f}",
            c.status,
            c.raw_text_en,
            c.edited_text_en,
            c.zh_text,
            c.rev,
        ]
        for c in STATE.get(session_id, [])
    ]
def publish_one(session_id: str, chunk_id: int, edited_text_en: str):
    """Publish one chunk: accept the edit, translate it, bump its revision.

    Returns:
        (status message, refreshed editor table rows).
    """
    chunks = STATE.get(session_id, [])
    if not (0 <= chunk_id < len(chunks)):
        return "Chunk ID out of range", editor_table(session_id)
    c = chunks[chunk_id]
    edit = (edited_text_en or "").strip()
    if edit:
        c.edited_text_en = edit
    c.status = "published"
    c.rev += 1
    # Translation happens only on publish (cost control + cleaner source text).
    try:
        c.zh_text = translate_en_to_zh(c.edited_text_en)
        msg = f"Published #{chunk_id} rev={c.rev} · translated"
    except Exception as e:
        c.zh_text = ""
        msg = f"Published #{chunk_id} rev={c.rev} · translation failed: {str(e)}"
    # The text may have changed, so any cached TTS audio for this chunk is stale.
    c.tts_en_path = ""
    c.tts_zh_path = ""
    return msg, editor_table(session_id)
def publish_all(session_id: str):
    """Publish every unpublished chunk, translating any that lack Chinese text.

    Returns:
        (status message with ok/fail translation counts, refreshed editor table rows).
    """
    chunks = STATE.get(session_id, [])
    ok, fail = 0, 0
    for c in chunks:
        if c.status == "published":
            continue
        c.status = "published"
        c.rev += 1
        if not c.zh_text and c.edited_text_en:
            try:
                c.zh_text = translate_en_to_zh(c.edited_text_en)
                ok += 1
            # BUG FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; catch Exception only.
            except Exception:
                fail += 1
        # Newly published text invalidates any previously generated TTS audio.
        c.tts_en_path = ""
        c.tts_zh_path = ""
    return f"Published ALL · translated_ok={ok} fail={fail}", editor_table(session_id)
# ======================
# Audience rendering + TTS generation (stable MVP)
# ======================
def render_audience_html(chunks: List["Chunk"], view_lang: str) -> str:
    """Render the most recent published chunks as an HTML caption list.

    Args:
        chunks: all chunks for the session; only status == "published" are shown,
            capped to the last 50 to keep the page light.
        view_lang: "zh" shows the Chinese translation; anything else shows English.

    Returns:
        An HTML string, or a placeholder when nothing is published yet.
    """
    import html as _html  # local import: escape user-derived transcript text

    published = [c for c in chunks if c.status == "published"][-50:]
    if not published:
        return "<i>No published captions yet.</i>"

    def one(c: "Chunk") -> str:
        en = (c.edited_text_en or c.raw_text_en).strip()
        zh = (c.zh_text or "").strip()
        text = zh if view_lang == "zh" else en
        # BUG FIX: transcript/translation text was interpolated unescaped —
        # any <, > or & in the text would break (or inject into) the markup.
        safe = _html.escape(text)
        return (
            "<div style='padding:10px 12px;border:1px solid #ddd;border-radius:10px;margin:10px 0;'>"
            f"<div style='font-size:12px;color:#666'>#{c.chunk_id} · {c.start_s:.2f}-{c.end_s:.2f}</div>"
            f"<div style='font-size:16px;line-height:1.45'>{safe}</div>"
            "</div>"
        )

    return "".join(one(c) for c in published)
def ensure_latest_tts(session_id: str, view_lang: str) -> Tuple[str, Optional[str]]:
    """Generate (or reuse) TTS audio for the most recently published chunk.

    Only the latest line is synthesized, which keeps API load low and avoids
    relying on the browser's speechSynthesis.

    Returns:
        (status message, mp3 filepath or None) in the selected language.
    """
    published = [c for c in STATE.get(session_id, []) if c.status == "published"]
    if not published:
        return "No published captions yet.", None
    latest = published[-1]
    # Lazily translate when the viewer wants Chinese and none exists yet.
    if view_lang == "zh" and not latest.zh_text:
        try:
            latest.zh_text = translate_en_to_zh(latest.edited_text_en)
        except Exception as e:
            return f"ZH translation failed: {str(e)}", None
    try:
        if view_lang == "en":
            if not latest.tts_en_path:
                latest.tts_en_path = tts_to_mp3(latest.edited_text_en, VOICE_EN)
            return f"TTS ready (EN) for chunk #{latest.chunk_id}", latest.tts_en_path or None
        # Any non-"en" selection falls through to the Chinese audio path.
        if not latest.tts_zh_path:
            latest.tts_zh_path = tts_to_mp3(latest.zh_text, VOICE_ZH)
        return f"TTS ready (ZH) for chunk #{latest.chunk_id}", latest.tts_zh_path or None
    except Exception as e:
        return f"TTS failed: {str(e)}", None
def refresh_audience(session_id: str, view_lang: str):
    """Audience refresh: captions HTML plus latest-line TTS status and audio path."""
    captions = render_audience_html(STATE.get(session_id, []), view_lang)
    status, audio_path = ensure_latest_tts(session_id, view_lang)
    return captions, status, audio_path
# ======================
# Gradio UI
# ======================
# Gradio UI: three tabs (ingest -> edit/publish -> audience) wired to the
# pipeline functions above. Component creation order defines the layout.
with gr.Blocks(title="Live Caption MVP (HF)") as demo:
    gr.Markdown(
        "# Live Caption MVP (HF)\n"
        "全英文轉寫 → 校對(EN)→ 自動翻譯(ZH)→ 發佈 → 觀眾端 EN/ZH 字幕 + 後端 TTS 生成 mp3 播放(不依賴手機瀏覽器 TTS)"
    )
    # One session id per page load; every callback keys its STATE entry on it.
    sid = gr.State(_session_id())
    # Tab 1: upload a recording and transcribe it into editable chunks.
    with gr.Tab("1) Ingest"):
        gr.Markdown("上傳 iPhone 錄音檔(m4a/wav/mp3)→ 轉寫切段(Whisper segments)")
        audio = gr.Audio(type="filepath", label="Upload audio")
        btn_run = gr.Button("Transcribe & Build Chunks")
        ingest_status = gr.Textbox(label="Status", interactive=False)
    # Tab 2: proofread the English text; publishing triggers EN->ZH translation.
    with gr.Tab("2) Editor"):
        gr.Markdown("校對台:修改英文後 Publish,系統自動翻譯成中文(只對 Publish 後內容翻譯,省錢且更準)。")
        table = gr.Dataframe(
            headers=["chunk_id", "time", "status", "raw_en", "edited_en", "zh", "rev"],
            datatype=["number", "str", "str", "str", "str", "str", "number"],
            interactive=False
        )
        chunk_id_in = gr.Number(label="chunk_id", value=0, precision=0)
        edited_in = gr.Textbox(label="edited_en (paste here)", lines=3)
        btn_pub_one = gr.Button("Publish One (translate)")
        btn_pub_all = gr.Button("Publish All (translate missing)")
        editor_status = gr.Textbox(label="Editor Status", interactive=False)
    # Tab 3: audience view — published captions plus server-side TTS playback.
    with gr.Tab("3) Audience"):
        gr.Markdown(
            "觀眾端:顯示已發佈字幕。按 Refresh 會同時產生「最新一句」的音檔(EN 或 ZH 取決於選擇),用播放器播放。"
        )
        view_lang = gr.Radio(choices=["en", "zh"], value="zh", label="View language")
        btn_refresh = gr.Button("Refresh Audience View")
        aud_html = gr.HTML(label="Captions")
        tts_status = gr.Textbox(label="TTS Status", interactive=False)
        aud_audio = gr.Audio(label="Play latest line", type="filepath")
    # ---- Actions ----
    def _do_ingest(audio_path, session_id):
        """Transcribe the uploaded file and populate the editor table."""
        if not audio_path:
            return "Please upload an audio file first.", []
        msg = transcribe_to_chunks(audio_path, session_id)
        return msg, editor_table(session_id)
    btn_run.click(_do_ingest, inputs=[audio, sid], outputs=[ingest_status, table])
    def _pub_one(session_id, cid, txt):
        """Coerce the Number widget's float value to int before publishing one chunk."""
        return publish_one(session_id, int(cid), txt)
    btn_pub_one.click(_pub_one, inputs=[sid, chunk_id_in, edited_in], outputs=[editor_status, table])
    btn_pub_all.click(lambda session_id: publish_all(session_id), inputs=[sid], outputs=[editor_status, table])
    btn_refresh.click(refresh_audience, inputs=[sid, view_lang], outputs=[aud_html, tts_status, aud_audio])
# Launched at import time — the conventional entry point for a HF Space app.py.
demo.launch()