PlotweaverModel committed on
Commit
771ab52
·
verified ·
1 Parent(s): a197f00

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +145 -165
app.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
  PlotWeaver — Live Commentary Translation Platform (Single File)
3
  ================================================================
4
- Three engines: Qwen Omni | YourVoic API | Local (Whisper+NLLB+MMS-TTS)
5
  """
6
 
7
  import os, io, re, time, base64, struct, shutil, subprocess, tempfile, logging
@@ -35,89 +35,69 @@ QWEN_VOICES = [
35
  # }
36
 
37
  LANGUAGES = {
38
- # ---- Global Languages (Qwen Omni best quality) ----
39
- "Arabic": {
40
- "nllb": "arb_Arab", "yourvoic_lang": "ar-SA",
41
- "yourvoic_voices": ["Peter"], "tts_engine": "qwen",
42
- "qwen_code": "ar", "qwen_name": "Modern Standard Arabic (العربية الفصحى)",
43
- },
44
- "Spanish": {
45
- "nllb": "spa_Latn", "yourvoic_lang": "es-ES",
46
- "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
47
- "qwen_code": "es", "qwen_name": "Spanish",
48
- },
49
- "French": {
50
- "nllb": "fra_Latn", "yourvoic_lang": "fr-FR",
51
- "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
52
- "qwen_code": "fr", "qwen_name": "French",
53
- },
54
- "German": {
55
- "nllb": "deu_Latn", "yourvoic_lang": "de-DE",
56
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
57
- "qwen_code": "de", "qwen_name": "German",
58
  },
59
- "Mandarin": {
60
  "nllb": "zho_Hans", "yourvoic_lang": "zh-CN",
61
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
62
  "qwen_code": "zh", "qwen_name": "Mandarin Chinese",
63
  },
64
- "Italian": {
65
- "nllb": "ita_Latn", "yourvoic_lang": "it-IT",
66
- "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
67
- "qwen_code": "it", "qwen_name": "Italian",
68
- },
69
  "Japanese": {
70
  "nllb": "jpn_Jpan", "yourvoic_lang": "ja-JP",
71
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
72
  "qwen_code": "ja", "qwen_name": "Japanese",
73
  },
74
- "Portuguese": {
75
- "nllb": "por_Latn", "yourvoic_lang": "pt-BR",
76
- "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
77
- "qwen_code": "pt", "qwen_name": "Portuguese",
78
- },
79
- "Hindi": {
80
- "nllb": "hin_Deva", "yourvoic_lang": "hi-IN",
81
- "yourvoic_voices": ["Rahul", "Deepika", "Aditya"], "tts_engine": "qwen",
82
- "qwen_code": "hi", "qwen_name": "Hindi",
83
- },
84
  "Korean": {
85
  "nllb": "kor_Hang", "yourvoic_lang": "ko-KR",
86
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
87
  "qwen_code": "ko", "qwen_name": "Korean",
88
  },
 
 
 
 
 
 
 
 
 
 
89
  "Russian": {
90
  "nllb": "rus_Cyrl", "yourvoic_lang": "ru-RU",
91
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
92
  "qwen_code": "ru", "qwen_name": "Russian",
93
  },
94
-
95
- # ---- African Languages (Local pipeline: Whisper → NLLB → MMS-TTS) ----
96
- "Yoruba": {
97
- "nllb": "yor_Latn", "yourvoic_lang": None,
98
- "yourvoic_voices": [], "tts_engine": "local",
99
- "qwen_code": None, "qwen_name": None,
100
  },
101
- "Hausa": {
102
- "nllb": "hau_Latn", "yourvoic_lang": None,
103
- "yourvoic_voices": [], "tts_engine": "local",
104
- "qwen_code": None, "qwen_name": None,
105
  },
106
- "Igbo": {
107
- "nllb": "ibo_Latn", "yourvoic_lang": None,
108
- "yourvoic_voices": [], "tts_engine": "local",
109
- "qwen_code": None, "qwen_name": None,
 
 
 
 
 
110
  },
 
 
111
  "Swahili": {
112
  "nllb": "swh_Latn", "yourvoic_lang": "sw-KE",
113
  "yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
114
  "qwen_code": None, "qwen_name": None,
115
  },
116
- "Zulu": {
117
- "nllb": "zul_Latn", "yourvoic_lang": None,
118
- "yourvoic_voices": [], "tts_engine": "local",
119
- "qwen_code": None, "qwen_name": None,
120
- },
121
  "Amharic": {
122
  "nllb": "amh_Ethi", "yourvoic_lang": "am-ET",
123
  "yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
@@ -130,6 +110,11 @@ LANGUAGES = {
130
  },
131
 
132
  # ---- South Asian (YourVoic TTS + NLLB MT) ----
 
 
 
 
 
133
  "Bengali": {
134
  "nllb": "ben_Beng", "yourvoic_lang": "bn-IN",
135
  "yourvoic_voices": ["Sneha", "Aryan"], "tts_engine": "yourvoic",
@@ -281,8 +266,8 @@ LANGUAGE_GROUPS = {
281
  # All language display names (for dropdowns)
282
  ALL_LANGUAGE_NAMES = sorted(LANGUAGES.keys())
283
 
284
- # Languages that use local TTS (your fine-tuned models)
285
- LOCAL_TTS_LANGUAGES = [k for k, v in LANGUAGES.items() if v["tts_engine"] == "local"]
286
 
287
  # Languages that use YourVoic API
288
  YOURVOIC_LANGUAGES = [k for k, v in LANGUAGES.items() if v["tts_engine"] == "yourvoic"]
@@ -299,12 +284,12 @@ TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
299
  asr_pipe = None
300
  mt_tokenizer = None
301
  mt_model = None
302
- tts_pipe_local = None # Local TTS for Yoruba/Hausa/Igbo/Zulu
303
 
304
 
305
  def load_models():
306
  """Load all models at startup."""
307
- global asr_pipe, mt_tokenizer, mt_model, tts_pipe_local
308
  from transformers import (
309
  pipeline as hf_pipeline,
310
  AutoTokenizer,
@@ -335,17 +320,6 @@ def load_models():
335
  mt_tokenizer.src_lang = "eng_Latn"
336
  print(" MT loaded")
337
 
338
- # Local TTS (Yoruba)
339
- TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new"
340
- print(f" Loading local TTS: {TTS_MODEL_ID}")
341
- tts_pipe_local = hf_pipeline(
342
- "text-to-speech",
343
- model=TTS_MODEL_ID,
344
- device=DEVICE,
345
- torch_dtype=TORCH_DTYPE,
346
- )
347
- print(" Local TTS loaded")
348
-
349
  # Diagnostics
350
  print(f"\n=== Device diagnostics ===")
351
  print(f"CUDA available: {torch.cuda.is_available()}")
@@ -353,8 +327,8 @@ def load_models():
353
  print(f"CUDA device: {torch.cuda.get_device_name(0)}")
354
  print(f"ASR on: {next(asr_pipe.model.parameters()).device}")
355
  print(f"MT on: {next(mt_model.parameters()).device}")
356
- print(f"TTS on: {next(tts_pipe_local.model.parameters()).device}")
357
  print(f"YourVoic API key: {'set' if os.environ.get('YOURVOIC_API_KEY') else 'NOT SET'}")
 
358
  print(f"==========================\n")
359
  print("All models loaded!")
360
 
@@ -538,7 +512,7 @@ def mux_video_audio(video_path, audio_path, output_path, extend_video=False, tar
538
 
539
 
540
  # =============================================================================
541
- # TTS ENGINE: YourVoic API + Local MMS-TTS
542
  # =============================================================================
543
 
544
  YOURVOIC_API_KEY = os.environ.get("YOURVOIC_API_KEY", "")
@@ -546,80 +520,110 @@ YOURVOIC_STREAM_URL = "https://yourvoic.com/api/v1/tts/stream"
546
 
547
 
548
  def synthesize_yourvoic(text, language_code, voice="Peter", speed=1.0):
549
- """
550
- Synthesize text using YourVoic API.
551
- Returns (audio_array, sample_rate) or raises on failure.
552
- """
553
  if not YOURVOIC_API_KEY:
554
- raise RuntimeError(
555
- "YOURVOIC_API_KEY not set. Add it as a Space secret."
556
- )
557
 
558
- headers = {
559
- "X-API-Key": YOURVOIC_API_KEY,
560
- "Content-Type": "application/json",
561
- }
562
- payload = {
563
- "text": text,
564
- "voice": voice,
565
- "language": language_code,
566
- "model": "aura-prime",
567
- "speed": speed,
568
- }
569
 
570
  t0 = time.time()
571
- response = requests.post(
572
- YOURVOIC_STREAM_URL,
573
- headers=headers,
574
- json=payload,
575
- stream=True,
576
- timeout=60,
577
- )
578
 
579
  if response.status_code != 200:
580
- raise RuntimeError(
581
- f"YourVoic API error {response.status_code}: {response.text[:200]}"
582
- )
583
 
584
- # Collect streamed audio bytes into a temp file
585
- import tempfile
586
- tmp_raw = tempfile.NamedTemporaryFile(suffix=".audio", delete=False)
 
 
 
587
  for chunk in response.iter_content(chunk_size=8192):
588
- tmp_raw.write(chunk)
589
- tmp_raw.close()
590
 
591
  elapsed = time.time() - t0
592
- logger.info(f"YourVoic TTS: {len(text)} chars, {elapsed:.2f}s")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
 
594
  # Try reading directly with soundfile
595
  try:
596
- audio_array, sample_rate = sf.read(tmp_raw.name, dtype="float32")
597
- os.unlink(tmp_raw.name)
598
  return audio_array, sample_rate
599
  except Exception as e:
600
- logger.warning(f"soundfile can't read YourVoic output directly: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
 
602
- # Fallback: convert with ffmpeg to WAV
603
  try:
604
- import subprocess
605
- tmp_wav = tmp_raw.name + ".wav"
606
  result = subprocess.run(
607
- ["ffmpeg", "-y", "-i", tmp_raw.name,
608
- "-acodec", "pcm_s16le", "-ar", "24000", "-ac", "1", tmp_wav],
609
  capture_output=True, text=True,
610
  )
611
- os.unlink(tmp_raw.name)
612
  if result.returncode != 0:
613
- raise RuntimeError(f"ffmpeg conversion failed: {result.stderr[:200]}")
614
- audio_array, sample_rate = sf.read(tmp_wav, dtype="float32")
615
- os.unlink(tmp_wav)
616
  return audio_array, sample_rate
617
  except Exception as e2:
618
- # Clean up
619
- for f in [tmp_raw.name, tmp_raw.name + ".wav"]:
620
- if os.path.exists(f):
621
- os.unlink(f)
622
- raise RuntimeError(f"Failed to decode YourVoic audio: {e2}")
623
 
624
 
625
  def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter", speed=1.0):
@@ -629,42 +633,26 @@ def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter",
629
  return output_path, sr
630
 
631
 
632
- def synthesize_local(text, tts_pipe):
633
- """
634
- Synthesize text using local HuggingFace TTS pipeline (MMS-TTS).
635
- Returns (audio_array, sample_rate).
636
- """
637
- t0 = time.time()
638
- result = tts_pipe(text)
639
- audio = np.array(result["audio"]).squeeze()
640
- sr = result["sampling_rate"]
641
- elapsed = time.time() - t0
642
- logger.info(f"Local TTS: {len(text)} chars, {elapsed:.2f}s, {len(audio)/sr:.1f}s audio")
643
- return audio, sr
644
 
645
 
646
- def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk=2):
647
  """
648
- Synthesize long text by chunking into sentence groups.
649
- Routes to either YourVoic or local TTS based on language config.
650
 
651
  Args:
652
  text: Full text to synthesize
653
- language_config: Dict from LANGUAGES (has tts_engine, yourvoic_lang, etc.)
654
- tts_pipe: Local HuggingFace TTS pipeline (needed for local engine)
655
  sentences_per_chunk: How many sentences to synthesize per API call
656
 
657
  Returns:
658
  (audio_array, sample_rate)
659
  """
660
- import re
661
  sentences = re.split(r'(?<=[.!?])\s+', text)
662
  sentences = [s.strip() for s in sentences if s.strip()]
663
 
664
  if not sentences:
665
- return np.array([], dtype=np.float32), 16000
666
 
667
- engine = language_config["tts_engine"]
668
  audio_segments = []
669
  output_sr = None
670
 
@@ -674,20 +662,14 @@ def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk
674
  continue
675
 
676
  try:
677
- if engine == "yourvoic":
678
- voice = language_config["yourvoic_voices"][0] if language_config["yourvoic_voices"] else "Peter"
679
- lang_code = language_config["yourvoic_lang"]
680
- audio_seg, seg_sr = synthesize_yourvoic(chunk_text, lang_code, voice)
681
- else:
682
- if tts_pipe is None:
683
- raise RuntimeError("Local TTS pipeline not loaded")
684
- audio_seg, seg_sr = synthesize_local(chunk_text, tts_pipe)
685
 
686
  if output_sr is None:
687
  output_sr = seg_sr
688
  if len(audio_seg) > 0:
689
  audio_segments.append(audio_seg)
690
- # Small silence between chunks
691
  silence = np.zeros(int(0.15 * seg_sr), dtype=np.float32)
692
  audio_segments.append(silence)
693
 
@@ -696,11 +678,9 @@ def synthesize_chunked(text, language_config, tts_pipe=None, sentences_per_chunk
696
  continue
697
 
698
  if not audio_segments:
699
- # Return a short silence instead of empty array to prevent Gradio crash
700
  fallback_sr = output_sr or 16000
701
- silence = np.zeros(int(0.5 * fallback_sr), dtype=np.float32)
702
  logger.warning("All TTS chunks failed — returning silence")
703
- return silence, fallback_sr
704
 
705
  return np.concatenate(audio_segments), output_sr
706
 
@@ -1008,7 +988,7 @@ def get_voices_for_language(lang_name):
1008
  elif engine == "yourvoic" and config.get("yourvoic_voices"):
1009
  return config["yourvoic_voices"]
1010
  elif engine == "local":
1011
- return ["Default (local model)"]
1012
  return ["Peter"]
1013
 
1014
 
@@ -1053,7 +1033,7 @@ def full_pipeline_audio(audio_input, target_language):
1053
  # TTS
1054
  t0 = time.time()
1055
  audio_out, sr_out = synthesize_chunked(
1056
- translated, lang_config, tts_pipe=tts_pipe_local
1057
  )
1058
  log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
1059
 
@@ -1088,7 +1068,7 @@ def full_pipeline_text(english_text, target_language, voice_name):
1088
  # TTS
1089
  t0 = time.time()
1090
  audio_out, sr_out = synthesize_chunked(
1091
- translated, lang_config, tts_pipe=tts_pipe_local
1092
  )
1093
  log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
1094
 
@@ -1101,7 +1081,7 @@ def full_pipeline_text(english_text, target_language, voice_name):
1101
  def dub_video(video_path, target_languages, dub_voice, chunk_seconds, progress=gr.Progress()):
1102
  """
1103
  Dub a video into one or more target languages.
1104
- Routes to Qwen Omni for global languages, local pipeline for African languages.
1105
  """
1106
  if video_path is None:
1107
  return None, "Please upload a video."
@@ -1173,7 +1153,7 @@ def dub_video(video_path, target_languages, dub_voice, chunk_seconds, progress=g
1173
  progress(0.65, desc=f"{lang_name}: synthesizing...")
1174
  t0 = time.time()
1175
  tgt_audio, tgt_sr = synthesize_chunked(
1176
- translated, lang_config, tts_pipe=tts_pipe_local
1177
  )
1178
  sf.write(tgt_audio_raw, tgt_audio, tgt_sr)
1179
  tgt_duration = len(tgt_audio) / tgt_sr
@@ -1247,7 +1227,7 @@ with gr.Blocks(
1247
  <div class="main-header">
1248
  <h1>PlotWeaver</h1>
1249
  <p>Live commentary translation platform &mdash; English to 40+ languages</p>
1250
- <p style="font-size:0.8rem; color:#999">ASR (Whisper) &rarr; MT (NLLB-200) &rarr; TTS (YourVoic + local models)</p>
1251
  </div>
1252
  """)
1253
 
@@ -1373,7 +1353,7 @@ with gr.Blocks(
1373
  gr.Markdown(
1374
  "Upload a video with English commentary and get back a dubbed version. "
1375
  "**Global languages** (Arabic, French, Spanish, etc.) use Qwen Omni for best quality. "
1376
- "**African languages** (Yoruba, Hausa, etc.) use the local Whisper NLLB → MMS-TTS pipeline."
1377
  )
1378
 
1379
  with gr.Row():
@@ -1496,7 +1476,7 @@ with gr.Blocks(
1496
  info += f"**YourVoic language:** `{config.get('yourvoic_lang', 'N/A')}`\n\n"
1497
  info += f"**Available voices:** {', '.join(voices) if voices else 'Peter (default)'}"
1498
  else:
1499
- info += f"**Engine:** Local pipeline (Whisper ASR + NLLB MT + MMS-TTS)\n\n"
1500
  info += f"**NLLB code:** `{config.get('nllb', 'N/A')}`\n\n"
1501
  info += "Uses locally fine-tuned models on GPU. Voice selection not available."
1502
 
@@ -1510,8 +1490,8 @@ with gr.Blocks(
1510
  **PlotWeaver** by PlotweaverAI | Models:
1511
  [ASR](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
1512
  [MT](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
1513
- [TTS](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new) |
1514
- [YourVoic API](https://yourvoic.com)
1515
  """)
1516
 
1517
 
 
1
  """
2
  PlotWeaver — Live Commentary Translation Platform (Single File)
3
  ================================================================
4
+ Two engines: Qwen Omni | YourVoic API (with NLLB MT)
5
  """
6
 
7
  import os, io, re, time, base64, struct, shutil, subprocess, tempfile, logging
 
35
  # }
36
 
37
  LANGUAGES = {
38
+ # ---- Qwen Omni Languages (end-to-end speech-to-speech, 11 languages) ----
39
+ "English": {
40
+ "nllb": "eng_Latn", "yourvoic_lang": "en-US",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
42
+ "qwen_code": "en", "qwen_name": "English",
43
  },
44
+ "Chinese (Mandarin)": {
45
  "nllb": "zho_Hans", "yourvoic_lang": "zh-CN",
46
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
47
  "qwen_code": "zh", "qwen_name": "Mandarin Chinese",
48
  },
 
 
 
 
 
49
  "Japanese": {
50
  "nllb": "jpn_Jpan", "yourvoic_lang": "ja-JP",
51
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
52
  "qwen_code": "ja", "qwen_name": "Japanese",
53
  },
 
 
 
 
 
 
 
 
 
 
54
  "Korean": {
55
  "nllb": "kor_Hang", "yourvoic_lang": "ko-KR",
56
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
57
  "qwen_code": "ko", "qwen_name": "Korean",
58
  },
59
+ "German": {
60
+ "nllb": "deu_Latn", "yourvoic_lang": "de-DE",
61
+ "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
62
+ "qwen_code": "de", "qwen_name": "German",
63
+ },
64
+ "French": {
65
+ "nllb": "fra_Latn", "yourvoic_lang": "fr-FR",
66
+ "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
67
+ "qwen_code": "fr", "qwen_name": "French",
68
+ },
69
  "Russian": {
70
  "nllb": "rus_Cyrl", "yourvoic_lang": "ru-RU",
71
  "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
72
  "qwen_code": "ru", "qwen_name": "Russian",
73
  },
74
+ "Portuguese": {
75
+ "nllb": "por_Latn", "yourvoic_lang": "pt-BR",
76
+ "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
77
+ "qwen_code": "pt", "qwen_name": "Portuguese",
 
 
78
  },
79
+ "Spanish": {
80
+ "nllb": "spa_Latn", "yourvoic_lang": "es-ES",
81
+ "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
82
+ "qwen_code": "es", "qwen_name": "Spanish",
83
  },
84
+ "Italian": {
85
+ "nllb": "ita_Latn", "yourvoic_lang": "it-IT",
86
+ "yourvoic_voices": ["Peter", "Kylie"], "tts_engine": "qwen",
87
+ "qwen_code": "it", "qwen_name": "Italian",
88
+ },
89
+ "Arabic": {
90
+ "nllb": "arb_Arab", "yourvoic_lang": "ar-SA",
91
+ "yourvoic_voices": ["Peter"], "tts_engine": "qwen",
92
+ "qwen_code": "ar", "qwen_name": "Modern Standard Arabic",
93
  },
94
+
95
+ # ---- African Languages (YourVoic API) ----
96
  "Swahili": {
97
  "nllb": "swh_Latn", "yourvoic_lang": "sw-KE",
98
  "yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
99
  "qwen_code": None, "qwen_name": None,
100
  },
 
 
 
 
 
101
  "Amharic": {
102
  "nllb": "amh_Ethi", "yourvoic_lang": "am-ET",
103
  "yourvoic_voices": ["Peter"], "tts_engine": "yourvoic",
 
110
  },
111
 
112
  # ---- South Asian (YourVoic TTS + NLLB MT) ----
113
+ "Hindi": {
114
+ "nllb": "hin_Deva", "yourvoic_lang": "hi-IN",
115
+ "yourvoic_voices": ["Rahul", "Deepika", "Aditya"], "tts_engine": "yourvoic",
116
+ "qwen_code": None, "qwen_name": None,
117
+ },
118
  "Bengali": {
119
  "nllb": "ben_Beng", "yourvoic_lang": "bn-IN",
120
  "yourvoic_voices": ["Sneha", "Aryan"], "tts_engine": "yourvoic",
 
266
  # All language display names (for dropdowns)
267
  ALL_LANGUAGE_NAMES = sorted(LANGUAGES.keys())
268
 
269
+ # Languages that use YourVoic API
270
+ YOURVOIC_LANGUAGES = [k for k, v in LANGUAGES.items() if v["tts_engine"] == "yourvoic"]
271
 
272
  # Languages that use YourVoic API
273
  YOURVOIC_LANGUAGES = [k for k, v in LANGUAGES.items() if v["tts_engine"] == "yourvoic"]
 
284
  asr_pipe = None
285
  mt_tokenizer = None
286
  mt_model = None
287
+
288
 
289
 
290
  def load_models():
291
  """Load all models at startup."""
292
+ global asr_pipe, mt_tokenizer, mt_model
293
  from transformers import (
294
  pipeline as hf_pipeline,
295
  AutoTokenizer,
 
320
  mt_tokenizer.src_lang = "eng_Latn"
321
  print(" MT loaded")
322
 
 
 
 
 
 
 
 
 
 
 
 
323
  # Diagnostics
324
  print(f"\n=== Device diagnostics ===")
325
  print(f"CUDA available: {torch.cuda.is_available()}")
 
327
  print(f"CUDA device: {torch.cuda.get_device_name(0)}")
328
  print(f"ASR on: {next(asr_pipe.model.parameters()).device}")
329
  print(f"MT on: {next(mt_model.parameters()).device}")
 
330
  print(f"YourVoic API key: {'set' if os.environ.get('YOURVOIC_API_KEY') else 'NOT SET'}")
331
+ print(f"Dashscope key: {'set' if os.environ.get('DASHSCOPE_API_KEY') else 'NOT SET'}")
332
  print(f"==========================\n")
333
  print("All models loaded!")
334
 
 
512
 
513
 
514
  # =============================================================================
515
+ # TTS ENGINE: YourVoic API
516
  # =============================================================================
517
 
518
  YOURVOIC_API_KEY = os.environ.get("YOURVOIC_API_KEY", "")
 
520
 
521
 
522
  def synthesize_yourvoic(text, language_code, voice="Peter", speed=1.0):
523
+ """Synthesize text using YourVoic API."""
 
 
 
524
  if not YOURVOIC_API_KEY:
525
+ raise RuntimeError("YOURVOIC_API_KEY not set.")
 
 
526
 
527
+ headers = {"X-API-Key": YOURVOIC_API_KEY, "Content-Type": "application/json"}
528
+ payload = {"text": text, "voice": voice, "language": language_code, "model": "aura-prime", "speed": speed}
 
 
 
 
 
 
 
 
 
529
 
530
  t0 = time.time()
531
+ response = requests.post(YOURVOIC_STREAM_URL, headers=headers, json=payload, stream=True, timeout=60)
 
 
 
 
 
 
532
 
533
  if response.status_code != 200:
534
+ raise RuntimeError(f"YourVoic error {response.status_code}: {response.text[:200]}")
 
 
535
 
536
+ # Detect format from content-type header
537
+ ct = response.headers.get("content-type", "").lower()
538
+ logger.info(f"YourVoic content-type: {ct}")
539
+
540
+ # Collect audio bytes
541
+ audio_data = b""
542
  for chunk in response.iter_content(chunk_size=8192):
543
+ audio_data += chunk
 
544
 
545
  elapsed = time.time() - t0
546
+ logger.info(f"YourVoic TTS: {len(text)} chars, {elapsed:.2f}s, {len(audio_data)} bytes")
547
+
548
+ # Log first bytes for format detection
549
+ magic = audio_data[:16] if len(audio_data) > 16 else audio_data
550
+ logger.info(f"YourVoic first bytes: {magic[:8]}")
551
+
552
+ # Determine file extension from content-type or magic bytes
553
+ if b"RIFF" in audio_data[:4]:
554
+ ext = ".wav"
555
+ elif b"\xff\xfb" in audio_data[:3] or b"\xff\xf3" in audio_data[:3] or b"ID3" in audio_data[:3]:
556
+ ext = ".mp3"
557
+ elif b"OggS" in audio_data[:4]:
558
+ ext = ".ogg"
559
+ elif b"fLaC" in audio_data[:4]:
560
+ ext = ".flac"
561
+ elif "mp3" in ct or "mpeg" in ct:
562
+ ext = ".mp3"
563
+ elif "ogg" in ct:
564
+ ext = ".ogg"
565
+ elif "wav" in ct:
566
+ ext = ".wav"
567
+ elif "flac" in ct:
568
+ ext = ".flac"
569
+ elif "linear16" in ct or "pcm" in ct or "l16" in ct:
570
+ ext = ".raw"
571
+ else:
572
+ ext = ".mp3" # Most common API default
573
+ logger.warning(f"Unknown YourVoic format (ct={ct}), guessing mp3")
574
+
575
+ # Save with correct extension
576
+ tmp_path = tempfile.NamedTemporaryFile(suffix=ext, delete=False).name
577
+ with open(tmp_path, "wb") as f:
578
+ f.write(audio_data)
579
 
580
  # Try reading directly with soundfile
581
  try:
582
+ audio_array, sample_rate = sf.read(tmp_path, dtype="float32")
583
+ os.unlink(tmp_path)
584
  return audio_array, sample_rate
585
  except Exception as e:
586
+ logger.warning(f"soundfile can't read {ext}: {e}")
587
+
588
+ # Handle raw PCM (linear16): wrap in WAV header
589
+ if ext == ".raw":
590
+ try:
591
+ sr = 24000
592
+ raw_data = audio_data
593
+ wav_path = tmp_path + ".wav"
594
+ with open(wav_path, "wb") as f:
595
+ f.write(b"RIFF")
596
+ f.write(struct.pack("<I", 36 + len(raw_data)))
597
+ f.write(b"WAVE")
598
+ f.write(b"fmt ")
599
+ f.write(struct.pack("<IHHIIHH", 16, 1, 1, sr, sr * 2, 2, 16))
600
+ f.write(b"data")
601
+ f.write(struct.pack("<I", len(raw_data)))
602
+ f.write(raw_data)
603
+ audio_array, sample_rate = sf.read(wav_path, dtype="float32")
604
+ os.unlink(tmp_path)
605
+ os.unlink(wav_path)
606
+ return audio_array, sample_rate
607
+ except Exception as e:
608
+ logger.warning(f"Raw PCM wrap failed: {e}")
609
 
610
+ # Fallback: convert with ffmpeg
611
  try:
612
+ wav_path = tmp_path + ".wav"
 
613
  result = subprocess.run(
614
+ ["ffmpeg", "-y", "-i", tmp_path, "-acodec", "pcm_s16le", "-ar", "24000", "-ac", "1", wav_path],
 
615
  capture_output=True, text=True,
616
  )
617
+ os.unlink(tmp_path)
618
  if result.returncode != 0:
619
+ raise RuntimeError(f"ffmpeg failed: {result.stderr[-300:]}")
620
+ audio_array, sample_rate = sf.read(wav_path, dtype="float32")
621
+ os.unlink(wav_path)
622
  return audio_array, sample_rate
623
  except Exception as e2:
624
+ for f in [tmp_path, tmp_path + ".wav"]:
625
+ if os.path.exists(f): os.unlink(f)
626
+ raise RuntimeError(f"YourVoic decode failed: {e2}")
 
 
627
 
628
 
629
  def synthesize_yourvoic_to_file(text, language_code, output_path, voice="Peter", speed=1.0):
 
633
  return output_path, sr
634
 
635
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
 
638
+ def synthesize_chunked(text, language_config, sentences_per_chunk=2):
639
  """
640
+ Synthesize long text by chunking into sentence groups via YourVoic API.
 
641
 
642
  Args:
643
  text: Full text to synthesize
644
+ language_config: Dict from LANGUAGES (has yourvoic_lang, yourvoic_voices, etc.)
 
645
  sentences_per_chunk: How many sentences to synthesize per API call
646
 
647
  Returns:
648
  (audio_array, sample_rate)
649
  """
 
650
  sentences = re.split(r'(?<=[.!?])\s+', text)
651
  sentences = [s.strip() for s in sentences if s.strip()]
652
 
653
  if not sentences:
654
+ return np.zeros(int(0.5 * 16000), dtype=np.float32), 16000
655
 
 
656
  audio_segments = []
657
  output_sr = None
658
 
 
662
  continue
663
 
664
  try:
665
+ voice = language_config["yourvoic_voices"][0] if language_config.get("yourvoic_voices") else "Peter"
666
+ lang_code = language_config["yourvoic_lang"]
667
+ audio_seg, seg_sr = synthesize_yourvoic(chunk_text, lang_code, voice)
 
 
 
 
 
668
 
669
  if output_sr is None:
670
  output_sr = seg_sr
671
  if len(audio_seg) > 0:
672
  audio_segments.append(audio_seg)
 
673
  silence = np.zeros(int(0.15 * seg_sr), dtype=np.float32)
674
  audio_segments.append(silence)
675
 
 
678
  continue
679
 
680
  if not audio_segments:
 
681
  fallback_sr = output_sr or 16000
 
682
  logger.warning("All TTS chunks failed — returning silence")
683
+ return np.zeros(int(0.5 * fallback_sr), dtype=np.float32), fallback_sr
684
 
685
  return np.concatenate(audio_segments), output_sr
686
 
 
988
  elif engine == "yourvoic" and config.get("yourvoic_voices"):
989
  return config["yourvoic_voices"]
990
  elif engine == "local":
991
+ return ["Peter"]
992
  return ["Peter"]
993
 
994
 
 
1033
  # TTS
1034
  t0 = time.time()
1035
  audio_out, sr_out = synthesize_chunked(
1036
+ translated, lang_config
1037
  )
1038
  log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
1039
 
 
1068
  # TTS
1069
  t0 = time.time()
1070
  audio_out, sr_out = synthesize_chunked(
1071
+ translated, lang_config
1072
  )
1073
  log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")
1074
 
 
1081
  def dub_video(video_path, target_languages, dub_voice, chunk_seconds, progress=gr.Progress()):
1082
  """
1083
  Dub a video into one or more target languages.
1084
+ Routes to Qwen Omni for global languages, YourVoic for others.
1085
  """
1086
  if video_path is None:
1087
  return None, "Please upload a video."
 
1153
  progress(0.65, desc=f"{lang_name}: synthesizing...")
1154
  t0 = time.time()
1155
  tgt_audio, tgt_sr = synthesize_chunked(
1156
+ translated, lang_config
1157
  )
1158
  sf.write(tgt_audio_raw, tgt_audio, tgt_sr)
1159
  tgt_duration = len(tgt_audio) / tgt_sr
 
1227
  <div class="main-header">
1228
  <h1>PlotWeaver</h1>
1229
  <p>Live commentary translation platform &mdash; English to 40+ languages</p>
1230
+ <p style="font-size:0.8rem; color:#999">Qwen Omni (11 languages) + YourVoic API + NLLB-200 (27 languages)</p>
1231
  </div>
1232
  """)
1233
 
 
1353
  gr.Markdown(
1354
  "Upload a video with English commentary and get back a dubbed version. "
1355
  "**Global languages** (Arabic, French, Spanish, etc.) use Qwen Omni for best quality. "
1356
+ "**African/regional languages** use YourVoic API with NLLB translation."
1357
  )
1358
 
1359
  with gr.Row():
 
1476
  info += f"**YourVoic language:** `{config.get('yourvoic_lang', 'N/A')}`\n\n"
1477
  info += f"**Available voices:** {', '.join(voices) if voices else 'Peter (default)'}"
1478
  else:
1479
+ info += f"**Engine:** Not available\n\n"
1480
  info += f"**NLLB code:** `{config.get('nllb', 'N/A')}`\n\n"
1481
  info += "Uses locally fine-tuned models on GPU. Voice selection not available."
1482
 
 
1490
  **PlotWeaver** by PlotweaverAI | Models:
1491
  [ASR](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
1492
  [MT](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
1493
+ [TTS](https://yourvoic.com) |
1494
+ [Qwen Omni](https://www.alibabacloud.com/help/en/model-studio/qwen-omni)
1495
  """)
1496
 
1497