Upload app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
PlotWeaver — Live Commentary Translation Platform (Single File)
|
| 3 |
================================================================
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os, io, re, time, base64, struct, shutil, subprocess, tempfile, logging
|
|
@@ -35,89 +35,69 @@ QWEN_VOICES = [
|
|
| 35 |
# }
|
| 36 |
|
| 37 |
LANGUAGES = {
|
| 38 |
-
# ----
|
| 39 |
-
"
|
| 40 |
-
"nllb": "
|
| 41 |
-
"yourvoic_voices": ["Peter"], "tts_engine": "qwen",
|
| 42 |
-
"qwen_code": "ar", "qwen_name": "Modern Standard Arabic (العربية الفصحى)",
|
| 43 |
-
},
|
| 44 |
-
"Spanish": {
|
| 45 |
-
"nllb": "spa_Latn", "yourvoic_lang": "es-ES",
|
| 46 |
-
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 47 |
-
"qwen_code": "es", "qwen_name": "Spanish",
|
| 48 |
-
},
|
| 49 |
-
"French": {
|
| 50 |
-
"nllb": "fra_Latn", "yourvoic_lang": "fr-FR",
|
| 51 |
-
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 52 |
-
"qwen_code": "fr", "qwen_name": "French",
|
| 53 |
-
},
|
| 54 |
-
"German": {
|
| 55 |
-
"nllb": "deu_Latn", "yourvoic_lang": "de-DE",
|
| 56 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 57 |
-
"qwen_code": "
|
| 58 |
},
|
| 59 |
-
"Mandarin": {
|
| 60 |
"nllb": "zho_Hans", "yourvoic_lang": "zh-CN",
|
| 61 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 62 |
"qwen_code": "zh", "qwen_name": "Mandarin Chinese",
|
| 63 |
},
|
| 64 |
-
"Italian": {
|
| 65 |
-
"nllb": "ita_Latn", "yourvoic_lang": "it-IT",
|
| 66 |
-
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 67 |
-
"qwen_code": "it", "qwen_name": "Italian",
|
| 68 |
-
},
|
| 69 |
"Japanese": {
|
| 70 |
"nllb": "jpn_Jpan", "yourvoic_lang": "ja-JP",
|
| 71 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 72 |
"qwen_code": "ja", "qwen_name": "Japanese",
|
| 73 |
},
|
| 74 |
-
"Portuguese": {
|
| 75 |
-
"nllb": "por_Latn", "yourvoic_lang": "pt-BR",
|
| 76 |
-
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 77 |
-
"qwen_code": "pt", "qwen_name": "Portuguese",
|
| 78 |
-
},
|
| 79 |
-
"Hindi": {
|
| 80 |
-
"nllb": "hin_Deva", "yourvoic_lang": "hi-IN",
|
| 81 |
-
"yourvoic_voices": ["Rahul", "Deepika", "Aditya"], "tts_engine": "qwen",
|
| 82 |
-
"qwen_code": "hi", "qwen_name": "Hindi",
|
| 83 |
-
},
|
| 84 |
"Korean": {
|
| 85 |
"nllb": "kor_Hang", "yourvoic_lang": "ko-KR",
|
| 86 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 87 |
"qwen_code": "ko", "qwen_name": "Korean",
|
| 88 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
"Russian": {
|
| 90 |
"nllb": "rus_Cyrl", "yourvoic_lang": "ru-RU",
|
| 91 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 92 |
"qwen_code": "ru", "qwen_name": "Russian",
|
| 93 |
},
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
"
|
| 98 |
-
"yourvoic_voices": [], "tts_engine": "local",
|
| 99 |
-
"qwen_code": None, "qwen_name": None,
|
| 100 |
},
|
| 101 |
-
"
|
| 102 |
-
"nllb": "
|
| 103 |
-
"yourvoic_voices": [], "tts_engine": "
|
| 104 |
-
"qwen_code":
|
| 105 |
},
|
| 106 |
-
"
|
| 107 |
-
"nllb": "
|
| 108 |
-
"yourvoic_voices": [], "tts_engine": "
|
| 109 |
-
"qwen_code":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
},
|
|
|
|
|
|
|
| 111 |
"Swahili": {
|
| 112 |
"nllb": "swh_Latn", "yourvoic_lang": "sw-KE",
|
| 113 |
"yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
|
| 114 |
"qwen_code": None, "qwen_name": None,
|
| 115 |
},
|
| 116 |
-
"Zulu": {
|
| 117 |
-
"nllb": "zul_Latn", "yourvoic_lang": None,
|
| 118 |
-
"yourvoic_voices": [], "tts_engine": "local",
|
| 119 |
-
"qwen_code": None, "qwen_name": None,
|
| 120 |
-
},
|
| 121 |
"Amharic": {
|
| 122 |
"nllb": "amh_Ethi", "yourvoic_lang": "am-ET",
|
| 123 |
"yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
|
|
@@ -130,6 +110,11 @@ LANGUAGES = {
|
|
| 130 |
},
|
| 131 |
|
| 132 |
# ---- South Asian (YourVoic TTS + NLLB MT) ----
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
"Bengali": {
|
| 134 |
"nllb": "ben_Beng", "yourvoic_lang": "bn-IN",
|
| 135 |
"yourvoic_voices": ["Sneha", "Aryan"], "tts_engine": "yourvoic",
|
|
@@ -281,8 +266,8 @@ LANGUAGE_GROUPS = {
|
|
| 281 |
# All language display names (for dropdowns)
|
| 282 |
ALL_LANGUAGE_NAMES = sorted(LANGUAGES.keys())
|
| 283 |
|
| 284 |
-
# Languages that use
|
| 285 |
-
|
| 286 |
|
| 287 |
# Languages that use YourVoic API
|
| 288 |
YOURVOIC_LANGUAGES = [k for k, v in LANGUAGES.items() if v["tts_engine"] == "yourvoic"]
|
|
@@ -299,12 +284,12 @@ TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
|
|
| 299 |
asr_pipe = None
|
| 300 |
mt_tokenizer = None
|
| 301 |
mt_model = None
|
| 302 |
-
|
| 303 |
|
| 304 |
|
| 305 |
def load_models():
|
| 306 |
"""Load all models at startup."""
|
| 307 |
-
global asr_pipe, mt_tokenizer, mt_model
|
| 308 |
from transformers import (
|
| 309 |
pipeline as hf_pipeline,
|
| 310 |
AutoTokenizer,
|
|
@@ -335,17 +320,6 @@ def load_models():
|
|
| 335 |
mt_tokenizer.src_lang = "eng_Latn"
|
| 336 |
print(" MT loaded")
|
| 337 |
|
| 338 |
-
# Local TTS (Yoruba)
|
| 339 |
-
TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new"
|
| 340 |
-
print(f" Loading local TTS: {TTS_MODEL_ID}")
|
| 341 |
-
tts_pipe_local = hf_pipeline(
|
| 342 |
-
"text-to-speech",
|
| 343 |
-
model=TTS_MODEL_ID,
|
| 344 |
-
device=DEVICE,
|
| 345 |
-
torch_dtype=TORCH_DTYPE,
|
| 346 |
-
)
|
| 347 |
-
print(" Local TTS loaded")
|
| 348 |
-
|
| 349 |
# Diagnostics
|
| 350 |
print(f"\n=== Device diagnostics ===")
|
| 351 |
print(f"CUDA available: {torch.cuda.is_available()}")
|
|
@@ -353,8 +327,8 @@ def load_models():
|
|
| 353 |
print(f"CUDA device: {torch.cuda.get_device_name(0)}")
|
| 354 |
print(f"ASR on: {next(asr_pipe.model.parameters()).device}")
|
| 355 |
print(f"MT on: {next(mt_model.parameters()).device}")
|
| 356 |
-
print(f"TTS on: {next(tts_pipe_local.model.parameters()).device}")
|
| 357 |
print(f"YourVoic API key: {'set' if os.environ.get('YOURVOIC_API_KEY') else 'NOT SET'}")
|
|
|
|
| 358 |
print(f"==========================\n")
|
| 359 |
print("All models loaded!")
|
| 360 |
|
|
@@ -538,7 +512,7 @@ def mux_video_audio(video_path, audio_path, output_path, extend_video=False, tar
|
|
| 538 |
|
| 539 |
|
| 540 |
# =============================================================================
|
| 541 |
-
# TTS ENGINE: YourVoic API
|
| 542 |
# =============================================================================
|
| 543 |
|
| 544 |
YOURVOIC_API_KEY = os.environ.get("YOURVOIC_API_KEY", "")
|
|
@@ -546,80 +520,110 @@ YOURVOIC_STREAM_URL = "https://yourvoic.com/api/v1/tts/stream"
|
|
| 546 |
|
| 547 |
|
| 548 |
def synthesize_yourvoic(text, language_code, voice="Peter", speed=1.0):
|
| 549 |
-
"""
|
| 550 |
-
Synthesize text using YourVoic API.
|
| 551 |
-
Returns (audio_array, sample_rate) or raises on failure.
|
| 552 |
-
"""
|
| 553 |
if not YOURVOIC_API_KEY:
|
| 554 |
-
raise RuntimeError(
|
| 555 |
-
"YOURVOIC_API_KEY not set. Add it as a Space secret."
|
| 556 |
-
)
|
| 557 |
|
| 558 |
-
headers = {
|
| 559 |
-
|
| 560 |
-
"Content-Type": "application/json",
|
| 561 |
-
}
|
| 562 |
-
payload = {
|
| 563 |
-
"text": text,
|
| 564 |
-
"voice": voice,
|
| 565 |
-
"language": language_code,
|
| 566 |
-
"model": "aura-prime",
|
| 567 |
-
"speed": speed,
|
| 568 |
-
}
|
| 569 |
|
| 570 |
t0 = time.time()
|
| 571 |
-
response = requests.post(
|
| 572 |
-
YOURVOIC_STREAM_URL,
|
| 573 |
-
headers=headers,
|
| 574 |
-
json=payload,
|
| 575 |
-
stream=True,
|
| 576 |
-
timeout=60,
|
| 577 |
-
)
|
| 578 |
|
| 579 |
if response.status_code != 200:
|
| 580 |
-
raise RuntimeError(
|
| 581 |
-
f"YourVoic API error {response.status_code}: {response.text[:200]}"
|
| 582 |
-
)
|
| 583 |
|
| 584 |
-
#
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
| 587 |
for chunk in response.iter_content(chunk_size=8192):
|
| 588 |
-
|
| 589 |
-
tmp_raw.close()
|
| 590 |
|
| 591 |
elapsed = time.time() - t0
|
| 592 |
-
logger.info(f"YourVoic TTS: {len(text)} chars, {elapsed:.2f}s")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
# Try reading directly with soundfile
|
| 595 |
try:
|
| 596 |
-
audio_array, sample_rate = sf.read(
|
| 597 |
-
os.unlink(
|
| 598 |
return audio_array, sample_rate
|
| 599 |
except Exception as e:
|
| 600 |
-
logger.warning(f"soundfile can't read
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
-
# Fallback: convert with ffmpeg
|
| 603 |
try:
|
| 604 |
-
|
| 605 |
-
tmp_wav = tmp_raw.name + ".wav"
|
| 606 |
result = subprocess.run(
|
| 607 |
-
["ffmpeg", "-y", "-i",
|
| 608 |
-
"-acodec", "pcm_s16le", "-ar", "24000", "-ac", "1", tmp_wav],
|
| 609 |
capture_output=True, text=True,
|
| 610 |
)
|
| 611 |
-
os.unlink(
|
| 612 |
if result.returncode != 0:
|
| 613 |
-
raise RuntimeError(f"ffmpeg
|
| 614 |
-
audio_array, sample_rate = sf.read(
|
| 615 |
-
os.unlink(
|
| 616 |
return audio_array, sample_rate
|
| 617 |
except Exception as e2:
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
os.unlink(f)
|
| 622 |
-
raise RuntimeError(f"Failed to decode YourVoic audio: {e2}")
|
| 623 |
|
| 624 |
|
| 625 |
def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter", speed=1.0):
|
|
@@ -629,42 +633,26 @@ def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter",
|
|
| 629 |
return output_path, sr
|
| 630 |
|
| 631 |
|
| 632 |
-
def synthesize_local(text, tts_pipe):
|
| 633 |
-
"""
|
| 634 |
-
Synthesize text using local HuggingFace TTS pipeline (MMS-TTS).
|
| 635 |
-
Returns (audio_array, sample_rate).
|
| 636 |
-
"""
|
| 637 |
-
t0 = time.time()
|
| 638 |
-
result = tts_pipe(text)
|
| 639 |
-
audio = np.array(result["audio"]).squeeze()
|
| 640 |
-
sr = result["sampling_rate"]
|
| 641 |
-
elapsed = time.time() - t0
|
| 642 |
-
logger.info(f"Local TTS: {len(text)} chars, {elapsed:.2f}s, {len(audio)/sr:.1f}s audio")
|
| 643 |
-
return audio, sr
|
| 644 |
|
| 645 |
|
| 646 |
-
def synthesize_chunked(text, language_config,
|
| 647 |
"""
|
| 648 |
-
Synthesize long text by chunking into sentence groups.
|
| 649 |
-
Routes to either YourVoic or local TTS based on language config.
|
| 650 |
|
| 651 |
Args:
|
| 652 |
text: Full text to synthesize
|
| 653 |
-
language_config: Dict from LANGUAGES (has
|
| 654 |
-
tts_pipe: Local HuggingFace TTS pipeline (needed for local engine)
|
| 655 |
sentences_per_chunk: How many sentences to synthesize per API call
|
| 656 |
|
| 657 |
Returns:
|
| 658 |
(audio_array, sample_rate)
|
| 659 |
"""
|
| 660 |
-
import re
|
| 661 |
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 662 |
sentences = [s.strip() for s in sentences if s.strip()]
|
| 663 |
|
| 664 |
if not sentences:
|
| 665 |
-
return np.
|
| 666 |
|
| 667 |
-
engine = language_config["tts_engine"]
|
| 668 |
audio_segments = []
|
| 669 |
output_sr = None
|
| 670 |
|
|
@@ -674,20 +662,14 @@ def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk
|
|
| 674 |
continue
|
| 675 |
|
| 676 |
try:
|
| 677 |
-
if
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
audio_seg, seg_sr = synthesize_yourvoic(chunk_text, lang_code, voice)
|
| 681 |
-
else:
|
| 682 |
-
if tts_pipe is None:
|
| 683 |
-
raise RuntimeError("Local TTS pipeline not loaded")
|
| 684 |
-
audio_seg, seg_sr = synthesize_local(chunk_text, tts_pipe)
|
| 685 |
|
| 686 |
if output_sr is None:
|
| 687 |
output_sr = seg_sr
|
| 688 |
if len(audio_seg) > 0:
|
| 689 |
audio_segments.append(audio_seg)
|
| 690 |
-
# Small silence between chunks
|
| 691 |
silence = np.zeros(int(0.15 * seg_sr), dtype=np.float32)
|
| 692 |
audio_segments.append(silence)
|
| 693 |
|
|
@@ -696,11 +678,9 @@ def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk
|
|
| 696 |
continue
|
| 697 |
|
| 698 |
if not audio_segments:
|
| 699 |
-
# Return a short silence instead of empty array to prevent Gradio crash
|
| 700 |
fallback_sr = output_sr or 16000
|
| 701 |
-
silence = np.zeros(int(0.5 * fallback_sr), dtype=np.float32)
|
| 702 |
logger.warning("All TTS chunks failed — returning silence")
|
| 703 |
-
return
|
| 704 |
|
| 705 |
return np.concatenate(audio_segments), output_sr
|
| 706 |
|
|
@@ -1008,7 +988,7 @@ def get_voices_for_language(lang_name):
|
|
| 1008 |
elif engine == "yourvoic" and config.get("yourvoic_voices"):
|
| 1009 |
return config["yourvoic_voices"]
|
| 1010 |
elif engine == "local":
|
| 1011 |
-
return ["
|
| 1012 |
return ["Peter"]
|
| 1013 |
|
| 1014 |
|
|
@@ -1053,7 +1033,7 @@ def full_pipeline_audio(audio_input, target_language):
|
|
| 1053 |
# TTS
|
| 1054 |
t0 = time.time()
|
| 1055 |
audio_out, sr_out = synthesize_chunked(
|
| 1056 |
-
translated, lang_config
|
| 1057 |
)
|
| 1058 |
log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
|
| 1059 |
|
|
@@ -1088,7 +1068,7 @@ def full_pipeline_text(english_text, target_language, voice_name):
|
|
| 1088 |
# TTS
|
| 1089 |
t0 = time.time()
|
| 1090 |
audio_out, sr_out = synthesize_chunked(
|
| 1091 |
-
translated, lang_config
|
| 1092 |
)
|
| 1093 |
log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
|
| 1094 |
|
|
@@ -1101,7 +1081,7 @@ def full_pipeline_text(english_text, target_language, voice_name):
|
|
| 1101 |
def dub_video(video_path, target_languages, dub_voice, chunk_seconds, progress=gr.Progress()):
|
| 1102 |
"""
|
| 1103 |
Dub a video into one or more target languages.
|
| 1104 |
-
Routes to Qwen Omni for global languages,
|
| 1105 |
"""
|
| 1106 |
if video_path is None:
|
| 1107 |
return None, "Please upload a video."
|
|
@@ -1173,7 +1153,7 @@ def dub_video(video_path, target_languages, dub_voice, chunk_seconds, progress=g
|
|
| 1173 |
progress(0.65, desc=f"{lang_name}: synthesizing...")
|
| 1174 |
t0 = time.time()
|
| 1175 |
tgt_audio, tgt_sr = synthesize_chunked(
|
| 1176 |
-
translated, lang_config
|
| 1177 |
)
|
| 1178 |
sf.write(tgt_audio_raw, tgt_audio, tgt_sr)
|
| 1179 |
tgt_duration = len(tgt_audio) / tgt_sr
|
|
@@ -1247,7 +1227,7 @@ with gr.Blocks(
|
|
| 1247 |
<div class="main-header">
|
| 1248 |
<h1>PlotWeaver</h1>
|
| 1249 |
<p>Live commentary translation platform — English to 40+ languages</p>
|
| 1250 |
-
<p style="font-size:0.8rem; color:#999">
|
| 1251 |
</div>
|
| 1252 |
""")
|
| 1253 |
|
|
@@ -1373,7 +1353,7 @@ with gr.Blocks(
|
|
| 1373 |
gr.Markdown(
|
| 1374 |
"Upload a video with English commentary and get back a dubbed version. "
|
| 1375 |
"**Global languages** (Arabic, French, Spanish, etc.) use Qwen Omni for best quality. "
|
| 1376 |
-
"**African languages**
|
| 1377 |
)
|
| 1378 |
|
| 1379 |
with gr.Row():
|
|
@@ -1496,7 +1476,7 @@ with gr.Blocks(
|
|
| 1496 |
info += f"**YourVoic language:** `{config.get('yourvoic_lang', 'N/A')}`\n\n"
|
| 1497 |
info += f"**Available voices:** {', '.join(voices) if voices else 'Peter (default)'}"
|
| 1498 |
else:
|
| 1499 |
-
info += f"**Engine:**
|
| 1500 |
info += f"**NLLB code:** `{config.get('nllb', 'N/A')}`\n\n"
|
| 1501 |
info += "Uses locally fine-tuned models on GPU. Voice selection not available."
|
| 1502 |
|
|
@@ -1510,8 +1490,8 @@ with gr.Blocks(
|
|
| 1510 |
**PlotWeaver** by PlotweaverAI | Models:
|
| 1511 |
[ASR](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
|
| 1512 |
[MT](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
|
| 1513 |
-
[TTS](https://
|
| 1514 |
-
[
|
| 1515 |
""")
|
| 1516 |
|
| 1517 |
|
|
|
|
| 1 |
"""
|
| 2 |
PlotWeaver — Live Commentary Translation Platform (Single File)
|
| 3 |
================================================================
|
| 4 |
+
Two engines: Qwen Omni | YourVoic API (with NLLB MT)
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os, io, re, time, base64, struct, shutil, subprocess, tempfile, logging
|
|
|
|
| 35 |
# }
|
| 36 |
|
| 37 |
LANGUAGES = {
|
| 38 |
+
# ---- Qwen Omni Languages (end-to-end speech-to-speech, 11 languages) ----
|
| 39 |
+
"English": {
|
| 40 |
+
"nllb": "eng_Latn", "yourvoic_lang": "en-US",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 42 |
+
"qwen_code": "en", "qwen_name": "English",
|
| 43 |
},
|
| 44 |
+
"Chinese (Mandarin)": {
|
| 45 |
"nllb": "zho_Hans", "yourvoic_lang": "zh-CN",
|
| 46 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 47 |
"qwen_code": "zh", "qwen_name": "Mandarin Chinese",
|
| 48 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
"Japanese": {
|
| 50 |
"nllb": "jpn_Jpan", "yourvoic_lang": "ja-JP",
|
| 51 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 52 |
"qwen_code": "ja", "qwen_name": "Japanese",
|
| 53 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
"Korean": {
|
| 55 |
"nllb": "kor_Hang", "yourvoic_lang": "ko-KR",
|
| 56 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 57 |
"qwen_code": "ko", "qwen_name": "Korean",
|
| 58 |
},
|
| 59 |
+
"German": {
|
| 60 |
+
"nllb": "deu_Latn", "yourvoic_lang": "de-DE",
|
| 61 |
+
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 62 |
+
"qwen_code": "de", "qwen_name": "German",
|
| 63 |
+
},
|
| 64 |
+
"French": {
|
| 65 |
+
"nllb": "fra_Latn", "yourvoic_lang": "fr-FR",
|
| 66 |
+
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 67 |
+
"qwen_code": "fr", "qwen_name": "French",
|
| 68 |
+
},
|
| 69 |
"Russian": {
|
| 70 |
"nllb": "rus_Cyrl", "yourvoic_lang": "ru-RU",
|
| 71 |
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 72 |
"qwen_code": "ru", "qwen_name": "Russian",
|
| 73 |
},
|
| 74 |
+
"Portuguese": {
|
| 75 |
+
"nllb": "por_Latn", "yourvoic_lang": "pt-BR",
|
| 76 |
+
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 77 |
+
"qwen_code": "pt", "qwen_name": "Portuguese",
|
|
|
|
|
|
|
| 78 |
},
|
| 79 |
+
"Spanish": {
|
| 80 |
+
"nllb": "spa_Latn", "yourvoic_lang": "es-ES",
|
| 81 |
+
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 82 |
+
"qwen_code": "es", "qwen_name": "Spanish",
|
| 83 |
},
|
| 84 |
+
"Italian": {
|
| 85 |
+
"nllb": "ita_Latn", "yourvoic_lang": "it-IT",
|
| 86 |
+
"yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
|
| 87 |
+
"qwen_code": "it", "qwen_name": "Italian",
|
| 88 |
+
},
|
| 89 |
+
"Arabic": {
|
| 90 |
+
"nllb": "arb_Arab", "yourvoic_lang": "ar-SA",
|
| 91 |
+
"yourvoic_voices": ["Peter"], "tts_engine": "qwen",
|
| 92 |
+
"qwen_code": "ar", "qwen_name": "Modern Standard Arabic",
|
| 93 |
},
|
| 94 |
+
|
| 95 |
+
# ---- African Languages (YourVoic API) ----
|
| 96 |
"Swahili": {
|
| 97 |
"nllb": "swh_Latn", "yourvoic_lang": "sw-KE",
|
| 98 |
"yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
|
| 99 |
"qwen_code": None, "qwen_name": None,
|
| 100 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
"Amharic": {
|
| 102 |
"nllb": "amh_Ethi", "yourvoic_lang": "am-ET",
|
| 103 |
"yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
|
|
|
|
| 110 |
},
|
| 111 |
|
| 112 |
# ---- South Asian (YourVoic TTS + NLLB MT) ----
|
| 113 |
+
"Hindi": {
|
| 114 |
+
"nllb": "hin_Deva", "yourvoic_lang": "hi-IN",
|
| 115 |
+
"yourvoic_voices": ["Rahul", "Deepika", "Aditya"], "tts_engine": "yourvoic",
|
| 116 |
+
"qwen_code": None, "qwen_name": None,
|
| 117 |
+
},
|
| 118 |
"Bengali": {
|
| 119 |
"nllb": "ben_Beng", "yourvoic_lang": "bn-IN",
|
| 120 |
"yourvoic_voices": ["Sneha", "Aryan"], "tts_engine": "yourvoic",
|
|
|
|
| 266 |
# All language display names (for dropdowns), in alphabetical order.
ALL_LANGUAGE_NAMES = sorted(LANGUAGES)

# Languages that use YourVoic API (entries whose "tts_engine" is "yourvoic").
# Defined exactly once: an identical second assignment directly below the
# first only re-ran the same comprehension and invited the two copies to drift.
YOURVOIC_LANGUAGES = [k for k, v in LANGUAGES.items() if v["tts_engine"] == "yourvoic"]
|
|
|
|
| 284 |
asr_pipe = None
|
| 285 |
mt_tokenizer = None
|
| 286 |
mt_model = None
|
| 287 |
+
|
| 288 |
|
| 289 |
|
| 290 |
def load_models():
|
| 291 |
"""Load all models at startup."""
|
| 292 |
+
global asr_pipe, mt_tokenizer, mt_model
|
| 293 |
from transformers import (
|
| 294 |
pipeline as hf_pipeline,
|
| 295 |
AutoTokenizer,
|
|
|
|
| 320 |
mt_tokenizer.src_lang = "eng_Latn"
|
| 321 |
print(" MT loaded")
|
| 322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
# Diagnostics
|
| 324 |
print(f"\n=== Device diagnostics ===")
|
| 325 |
print(f"CUDA available: {torch.cuda.is_available()}")
|
|
|
|
| 327 |
print(f"CUDA device: {torch.cuda.get_device_name(0)}")
|
| 328 |
print(f"ASR on: {next(asr_pipe.model.parameters()).device}")
|
| 329 |
print(f"MT on: {next(mt_model.parameters()).device}")
|
|
|
|
| 330 |
print(f"YourVoic API key: {'set' if os.environ.get('YOURVOIC_API_KEY') else 'NOT SET'}")
|
| 331 |
+
print(f"Dashscope key: {'set' if os.environ.get('DASHSCOPE_API_KEY') else 'NOT SET'}")
|
| 332 |
print(f"==========================\n")
|
| 333 |
print("All models loaded!")
|
| 334 |
|
|
|
|
| 512 |
|
| 513 |
|
| 514 |
# =============================================================================
|
| 515 |
+
# TTS ENGINE: YourVoic API
|
| 516 |
# =============================================================================
|
| 517 |
|
| 518 |
YOURVOIC_API_KEY = os.environ.get("YOURVOIC_API_KEY", "")
|
|
|
|
| 520 |
|
| 521 |
|
| 522 |
def _detect_audio_ext(audio_data, content_type):
    """Guess a file extension for an audio payload.

    Leading magic bytes win over the HTTP content-type because they describe
    the bytes actually received. Signatures are only accepted at offset 0, so
    a headerless PCM stream that merely contains e.g. ``FF FB`` near its start
    is not mistaken for MP3.

    Args:
        audio_data: Raw response body (bytes).
        content_type: Lower-cased ``Content-Type`` header value ("" if absent).

    Returns:
        One of ".wav", ".mp3", ".ogg", ".flac", ".raw", or None when neither
        the magic bytes nor the content-type identify the format.
    """
    if audio_data.startswith(b"RIFF"):
        return ".wav"
    if audio_data.startswith((b"\xff\xfb", b"\xff\xf3", b"ID3")):
        return ".mp3"
    if audio_data.startswith(b"OggS"):
        return ".ogg"
    if audio_data.startswith(b"fLaC"):
        return ".flac"

    # No recognizable signature: fall back to what the server declared.
    if "mp3" in content_type or "mpeg" in content_type:
        return ".mp3"
    if "ogg" in content_type:
        return ".ogg"
    if "wav" in content_type:
        return ".wav"
    if "flac" in content_type:
        return ".flac"
    if "linear16" in content_type or "pcm" in content_type or "l16" in content_type:
        return ".raw"
    return None


def _wrap_pcm16_as_wav(raw_data, sample_rate=24000, channels=1):
    """Return ``raw_data`` (16-bit little-endian PCM) prefixed with a WAV header.

    A trailing partial sample frame is dropped so the declared data size is
    always a whole number of frames.
    """
    block_align = channels * 2  # bytes per frame at 16 bits per sample
    usable = len(raw_data) - (len(raw_data) % block_align)
    raw_data = raw_data[:usable]
    header = b"".join((
        b"RIFF",
        struct.pack("<I", 36 + len(raw_data)),
        b"WAVE",
        b"fmt ",
        # fmt chunk: size=16, format=1 (PCM), channels, rate, byte rate,
        # block align, bits per sample.
        struct.pack("<IHHIIHH", 16, 1, channels, sample_rate,
                    sample_rate * block_align, block_align, 16),
        b"data",
        struct.pack("<I", len(raw_data)),
    ))
    return header + raw_data


def synthesize_yourvoic(text, language_code, voice="Peter", speed=1.0):
    """Synthesize text using YourVoic API.

    Posts the request to ``YOURVOIC_STREAM_URL``, downloads the audio, works
    out its container format, and decodes it to a float32 array. Decoding is
    tried with soundfile first (raw PCM is wrapped in a WAV header
    beforehand); if that fails the payload is converted with ffmpeg.

    Args:
        text: Text to speak.
        language_code: Language code sent as the ``language`` field.
        voice: Voice name sent as the ``voice`` field.
        speed: Speed value sent as the ``speed`` field.

    Returns:
        (audio_array, sample_rate) as returned by ``sf.read(..., dtype="float32")``.

    Raises:
        RuntimeError: If the API key is missing, the API answers with a
            non-200 status, the response body is empty, or the audio cannot
            be decoded.
    """
    if not YOURVOIC_API_KEY:
        raise RuntimeError("YOURVOIC_API_KEY not set.")

    headers = {"X-API-Key": YOURVOIC_API_KEY, "Content-Type": "application/json"}
    payload = {"text": text, "voice": voice, "language": language_code, "model": "aura-prime", "speed": speed}

    t0 = time.time()
    response = requests.post(YOURVOIC_STREAM_URL, headers=headers, json=payload, stream=True, timeout=60)
    try:
        if response.status_code != 200:
            raise RuntimeError(f"YourVoic error {response.status_code}: {response.text[:200]}")

        ct = response.headers.get("content-type", "").lower()
        logger.info("YourVoic content-type: %s", ct)

        # Join once instead of `+=` per chunk, which re-copies the buffer.
        audio_data = b"".join(response.iter_content(chunk_size=8192))
    finally:
        # stream=True keeps the connection checked out until explicitly closed.
        response.close()

    elapsed = time.time() - t0
    logger.info("YourVoic TTS: %d chars, %.2fs, %d bytes", len(text), elapsed, len(audio_data))
    logger.info("YourVoic first bytes: %s", audio_data[:8])

    if not audio_data:
        raise RuntimeError("YourVoic decode failed: empty audio response")

    ext = _detect_audio_ext(audio_data, ct)
    if ext is None:
        ext = ".mp3"  # Most common API default
        logger.warning("Unknown YourVoic format (ct=%s), guessing mp3", ct)

    # Save with the detected extension so soundfile/ffmpeg can pick the format.
    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
        tmp.write(audio_data)
        tmp_path = tmp.name
    wav_path = tmp_path + ".wav"

    try:
        if ext == ".raw":
            # Headerless PCM cannot be read as-is: wrap it in a WAV header.
            # NOTE(review): assumes 24 kHz mono 16-bit little-endian — confirm
            # against the YourVoic API documentation.
            try:
                with open(wav_path, "wb") as f:
                    f.write(_wrap_pcm16_as_wav(audio_data, sample_rate=24000))
                return sf.read(wav_path, dtype="float32")
            except Exception as e:
                logger.warning("Raw PCM wrap failed: %s", e)
        else:
            try:
                return sf.read(tmp_path, dtype="float32")
            except Exception as e:
                logger.warning("soundfile can't read %s: %s", ext, e)

        # Fallback: convert with ffmpeg
        cmd = ["ffmpeg", "-y"]
        if ext == ".raw":
            # ffmpeg cannot probe headerless PCM; describe the input explicitly.
            cmd += ["-f", "s16le", "-ar", "24000", "-ac", "1"]
        cmd += ["-i", tmp_path, "-acodec", "pcm_s16le", "-ar", "24000", "-ac", "1", wav_path]
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode != 0:
                raise RuntimeError(f"ffmpeg failed: {result.stderr[-300:]}")
            return sf.read(wav_path, dtype="float32")
        except Exception as e2:
            raise RuntimeError(f"YourVoic decode failed: {e2}") from e2
    finally:
        # Single cleanup point for every success and failure path.
        for path in (tmp_path, wav_path):
            try:
                os.unlink(path)
            except OSError:
                pass  # already removed or never created
|
|
|
|
|
|
|
| 627 |
|
| 628 |
|
| 629 |
def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter", speed=1.0):
|
|
|
|
| 633 |
return output_path, sr
|
| 634 |
|
| 635 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
|
| 638 |
+
def synthesize_chunked(text, language_config, sentences_per_chunk=2):
|
| 639 |
"""
|
| 640 |
+
Synthesize long text by chunking into sentence groups via YourVoic API.
|
|
|
|
| 641 |
|
| 642 |
Args:
|
| 643 |
text: Full text to synthesize
|
| 644 |
+
language_config: Dict from LANGUAGES (has yourvoic_lang, yourvoic_voices, etc.)
|
|
|
|
| 645 |
sentences_per_chunk: How many sentences to synthesize per API call
|
| 646 |
|
| 647 |
Returns:
|
| 648 |
(audio_array, sample_rate)
|
| 649 |
"""
|
|
|
|
| 650 |
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 651 |
sentences = [s.strip() for s in sentences if s.strip()]
|
| 652 |
|
| 653 |
if not sentences:
|
| 654 |
+
return np.zeros(int(0.5 * 16000), dtype=np.float32), 16000
|
| 655 |
|
|
|
|
| 656 |
audio_segments = []
|
| 657 |
output_sr = None
|
| 658 |
|
|
|
|
| 662 |
continue
|
| 663 |
|
| 664 |
try:
|
| 665 |
+
voice = language_config["yourvoic_voices"][0] if language_config.get("yourvoic_voices") else "Peter"
|
| 666 |
+
lang_code = language_config["yourvoic_lang"]
|
| 667 |
+
audio_seg, seg_sr = synthesize_yourvoic(chunk_text, lang_code, voice)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
if output_sr is None:
|
| 670 |
output_sr = seg_sr
|
| 671 |
if len(audio_seg) > 0:
|
| 672 |
audio_segments.append(audio_seg)
|
|
|
|
| 673 |
silence = np.zeros(int(0.15 * seg_sr), dtype=np.float32)
|
| 674 |
audio_segments.append(silence)
|
| 675 |
|
|
|
|
| 678 |
continue
|
| 679 |
|
| 680 |
if not audio_segments:
|
|
|
|
| 681 |
fallback_sr = output_sr or 16000
|
|
|
|
| 682 |
logger.warning("All TTS chunks failed — returning silence")
|
| 683 |
+
return np.zeros(int(0.5 * fallback_sr), dtype=np.float32), fallback_sr
|
| 684 |
|
| 685 |
return np.concatenate(audio_segments), output_sr
|
| 686 |
|
|
|
|
| 988 |
elif engine == "yourvoic" and config.get("yourvoic_voices"):
|
| 989 |
return config["yourvoic_voices"]
|
| 990 |
elif engine == "local":
|
| 991 |
+
return ["Peter"]
|
| 992 |
return ["Peter"]
|
| 993 |
|
| 994 |
|
|
|
|
| 1033 |
# TTS
|
| 1034 |
t0 = time.time()
|
| 1035 |
audio_out, sr_out = synthesize_chunked(
|
| 1036 |
+
translated, lang_config
|
| 1037 |
)
|
| 1038 |
log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
|
| 1039 |
|
|
|
|
| 1068 |
# TTS
|
| 1069 |
t0 = time.time()
|
| 1070 |
audio_out, sr_out = synthesize_chunked(
|
| 1071 |
+
translated, lang_config
|
| 1072 |
)
|
| 1073 |
log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
|
| 1074 |
|
|
|
|
| 1081 |
def dub_video(video_path, target_languages, dub_voice, chunk_seconds, progress=gr.Progress()):
|
| 1082 |
"""
|
| 1083 |
Dub a video into one or more target languages.
|
| 1084 |
+
Routes to Qwen Omni for global languages, YourVoic for others.
|
| 1085 |
"""
|
| 1086 |
if video_path is None:
|
| 1087 |
return None, "Please upload a video."
|
|
|
|
| 1153 |
progress(0.65, desc=f"{lang_name}: synthesizing...")
|
| 1154 |
t0 = time.time()
|
| 1155 |
tgt_audio, tgt_sr = synthesize_chunked(
|
| 1156 |
+
translated, lang_config
|
| 1157 |
)
|
| 1158 |
sf.write(tgt_audio_raw, tgt_audio, tgt_sr)
|
| 1159 |
tgt_duration = len(tgt_audio) / tgt_sr
|
|
|
|
| 1227 |
<div class="main-header">
|
| 1228 |
<h1>PlotWeaver</h1>
|
| 1229 |
<p>Live commentary translation platform — English to 40+ languages</p>
|
| 1230 |
+
<p style="font-size:0.8rem; color:#999">Qwen Omni (11 languages) + YourVoic API + NLLB-200 (27 languages)</p>
|
| 1231 |
</div>
|
| 1232 |
""")
|
| 1233 |
|
|
|
|
| 1353 |
gr.Markdown(
|
| 1354 |
"Upload a video with English commentary and get back a dubbed version. "
|
| 1355 |
"**Global languages** (Arabic, French, Spanish, etc.) use Qwen Omni for best quality. "
|
| 1356 |
+
"**African/regional languages** use YourVoic API with NLLB translation."
|
| 1357 |
)
|
| 1358 |
|
| 1359 |
with gr.Row():
|
|
|
|
| 1476 |
info += f"**YourVoic language:** `{config.get('yourvoic_lang', 'N/A')}`\n\n"
|
| 1477 |
info += f"**Available voices:** {', '.join(voices) if voices else 'Peter (default)'}"
|
| 1478 |
else:
|
| 1479 |
+
info += f"**Engine:** Not available\n\n"
|
| 1480 |
info += f"**NLLB code:** `{config.get('nllb', 'N/A')}`\n\n"
|
| 1481 |
info += "Uses locally fine-tuned models on GPU. Voice selection not available."
|
| 1482 |
|
|
|
|
| 1490 |
**PlotWeaver** by PlotweaverAI | Models:
|
| 1491 |
[ASR](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
|
| 1492 |
[MT](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
|
| 1493 |
+
[TTS](https://yourvoic.com) |
|
| 1494 |
+
[Qwen Omni](https://www.alibabacloud.com/help/en/model-studio/qwen-omni)
|
| 1495 |
""")
|
| 1496 |
|
| 1497 |
|