PlotweaverModel committed on
Commit
a7ec219
·
verified ·
1 Parent(s): 00d1940

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -555
app.py DELETED
@@ -1,555 +0,0 @@
1
- """
2
- PlotWeaver — Live Commentary Translation Platform
3
- ===================================================
4
- Event management, multi-language dubbing, live streaming.
5
- """
6
-
7
- import os
8
- import time
9
- import tempfile
10
- import numpy as np
11
- import re
12
- import soundfile as sf
13
- import gradio as gr
14
- import logging
15
-
16
- logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
17
- logger = logging.getLogger(__name__)
18
-
19
- from languages import LANGUAGES, LANGUAGE_GROUPS, ALL_LANGUAGE_NAMES, QWEN_VOICES
20
- from tts_engine import synthesize_chunked
21
- from qwen_engine import dub_video_qwen, translate_chunk_qwen
22
- from pipeline import (
23
- load_models, transcribe, translate_text, translate_sentence,
24
- split_into_sentences, extract_audio_from_video, get_media_duration,
25
- stretch_audio_to_duration, mux_video_audio, tts_pipe_local,
26
- )
27
- import pipeline
28
-
29
- # Load all models at startup
30
- load_models()
31
-
32
-
33
- # =============================================================================
34
- # Helper functions
35
- # =============================================================================
36
-
37
def get_voices_for_language(lang_name):
    """Return the voice names that can be offered for ``lang_name``.

    The choice is driven by the language's ``"tts_engine"`` entry in
    ``LANGUAGES`` (``"local"`` when the language or the key is missing):

    * ``"qwen"``      -> the shared ``QWEN_VOICES`` list
    * ``"yourvoic"``  -> the language's own ``"yourvoic_voices"`` list,
      when it is present and non-empty
    * ``"local"``     -> a single placeholder entry
    * anything else   -> ``["Peter"]``
    """
    lang_entry = LANGUAGES.get(lang_name, {})
    tts_engine = lang_entry.get("tts_engine", "local")

    if tts_engine == "qwen":
        return QWEN_VOICES
    if tts_engine == "yourvoic" and lang_entry.get("yourvoic_voices"):
        return lang_entry["yourvoic_voices"]
    if tts_engine == "local":
        return ["Default (local model)"]

    # Unknown engine, or a YourVoic language without a configured voice list.
    return ["Peter"]
48
-
49
-
50
def full_pipeline_audio(audio_input, target_language):
    """Run the full pipeline: English audio -> target-language audio.

    Stages, in order: ``transcribe`` (ASR), ``translate_text`` using the
    language's ``"nllb"`` code (MT), then ``synthesize_chunked`` (TTS).

    Args:
        audio_input: ``(sample_rate, samples)`` tuple as delivered by the
            ``gr.Audio(type="numpy")`` component wired to this function,
            or ``None`` when nothing was uploaded/recorded.
        target_language: Key into ``LANGUAGES``.

    Returns:
        ``((sample_rate, samples), log_markdown)`` on success, otherwise
        ``(None, message)`` describing which input or stage was empty.
    """
    if audio_input is None:
        return None, "Please upload or record audio."

    sample_rate, audio_array = audio_input
    audio_array = np.asarray(audio_array, dtype=np.float32)
    # A zero-length recording would make the peak computation below raise
    # ValueError, so treat it the same as "no audio supplied".
    if audio_array.size == 0:
        return None, "Please upload or record audio."

    lang_config = LANGUAGES.get(target_language)
    if not lang_config:
        return None, f"Language '{target_language}' not configured."

    # Down-mix multi-channel input by averaging across axis 1.
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)
    # Rescale to [-1, 1] only when the samples fall outside that range
    # (e.g. integer PCM input); in-range audio is left untouched.
    peak = float(np.abs(audio_array).max())
    if peak > 1.0:
        audio_array = audio_array / peak

    log = []
    total_start = time.time()

    # ASR
    t0 = time.time()
    english = transcribe(audio_array, sample_rate)
    log.append(f"**ASR** ({time.time()-t0:.2f}s)\n{english}")
    if not english:
        return None, "ASR returned empty text."

    # MT
    t0 = time.time()
    nllb_code = lang_config["nllb"]
    translated, en_sents, tgt_sents = translate_text(english, nllb_code, fast=False)
    log.append(f"\n**Translation** ({time.time()-t0:.2f}s)")
    for e, t in zip(en_sents, tgt_sents):
        log.append(f" EN: {e}\n {target_language.upper()}: {t}")
    if not translated:
        return None, "Translation returned empty."

    # TTS
    t0 = time.time()
    audio_out, sr_out = synthesize_chunked(
        translated, lang_config, tts_pipe=pipeline.tts_pipe_local
    )
    log.append(f"\n**TTS** ({time.time()-t0:.2f}s) = {len(audio_out)/sr_out:.1f}s audio")

    total = time.time() - total_start
    log.append(f"\n**Total: {total:.2f}s**")

    return (sr_out, audio_out), "\n".join(log)
99
-
100
-
101
def full_pipeline_text(english_text, target_language, voice_name):
    """Translate English text and synthesize it in the target language.

    Args:
        english_text: Source text; ``None``, empty or whitespace-only input
            is rejected with a message.
        target_language: Key into ``LANGUAGES``.
        voice_name: Received from the UI wiring but not read by this
            function.

    Returns:
        ``((sample_rate, samples), log_markdown)`` on success, otherwise
        ``(None, message)``.
    """
    source_text = english_text.strip() if english_text else ""
    if not source_text:
        return None, "Please enter English text."

    lang_config = LANGUAGES.get(target_language)
    if not lang_config:
        return None, f"Language '{target_language}' not configured."

    report = []
    run_started = time.time()

    # Machine translation
    stage_started = time.time()
    translated, en_sents, tgt_sents = translate_text(
        source_text, lang_config["nllb"], fast=False
    )
    report.append(f"**Translation** ({time.time()-stage_started:.2f}s)")
    report.extend(
        f" EN: {src}\n {target_language.upper()}: {dst}"
        for src, dst in zip(en_sents, tgt_sents)
    )
    if not translated:
        return None, "Translation returned empty."

    # Speech synthesis
    stage_started = time.time()
    audio_out, sr_out = synthesize_chunked(
        translated, lang_config, tts_pipe=pipeline.tts_pipe_local
    )
    report.append(
        f"\n**TTS** ({time.time()-stage_started:.2f}s) = {len(audio_out)/sr_out:.1f}s audio"
    )

    elapsed = time.time() - run_started
    report.append(f"\n**Total: {elapsed:.2f}s**")

    return (sr_out, audio_out), "\n".join(report)
134
-
135
-
136
def _dub_video_local(video_path, lang_name, lang_config, progress, results_log):
    """Dub one language with the local ASR -> MT -> TTS pipeline.

    Appends per-stage timing lines to ``results_log``. Returns the path of
    the muxed output video, or ``None`` when the video duration could not be
    determined or a stage (ASR, MT) produced nothing.
    """
    import shutil  # only this code path needs it

    # If the synthesized audio is more than 20% longer than the video, the
    # video is extended instead of squeezing the audio to fit.
    max_stretch = 1.2
    # Length differences of 2% or less are left alone.
    sync_tolerance = 0.02

    # Language names end up in filesystem paths; replace anything that is
    # not a word character, dot or dash so separators/spaces cannot break
    # the temp-dir prefix or the output filename.
    safe_name = re.sub(r"[^\w.-]+", "_", lang_name)
    work_dir = tempfile.mkdtemp(prefix=f"dub_{safe_name}_")
    extracted_audio = os.path.join(work_dir, "audio.wav")
    tgt_audio_raw = os.path.join(work_dir, "tgt_raw.wav")
    tgt_audio_aligned = os.path.join(work_dir, "tgt_aligned.wav")
    output_video = os.path.join(work_dir, f"dubbed_{safe_name}.mp4")

    progress(0.05, desc=f"{lang_name}: extracting audio...")
    extract_audio_from_video(video_path, extracted_audio)
    video_duration = get_media_duration(video_path)
    # Guard the division below: a missing or zero duration would otherwise
    # surface as an opaque "division by zero" error.
    if not video_duration or video_duration <= 0:
        results_log.append("Could not determine video duration — skipped")
        return None
    results_log.append(f"Video: {video_duration:.1f}s")

    audio_array, sr = sf.read(extracted_audio, dtype="float32")
    if audio_array.ndim > 1:
        audio_array = audio_array.mean(axis=1)

    progress(0.15, desc=f"{lang_name}: transcribing...")
    t0 = time.time()
    english = transcribe(audio_array, sr)
    results_log.append(f"ASR: {time.time()-t0:.1f}s")
    if not english:
        results_log.append("ASR empty — skipped")
        return None

    progress(0.4, desc=f"{lang_name}: translating...")
    t0 = time.time()
    translated, _, _ = translate_text(english, lang_config["nllb"], fast=True)
    results_log.append(f"MT: {time.time()-t0:.1f}s")
    if not translated:
        results_log.append("Translation empty — skipped")
        return None

    progress(0.65, desc=f"{lang_name}: synthesizing...")
    t0 = time.time()
    tgt_audio, tgt_sr = synthesize_chunked(
        translated, lang_config, tts_pipe=pipeline.tts_pipe_local
    )
    sf.write(tgt_audio_raw, tgt_audio, tgt_sr)
    tgt_duration = len(tgt_audio) / tgt_sr
    results_log.append(f"TTS: {time.time()-t0:.1f}s ({tgt_duration:.1f}s audio)")

    progress(0.85, desc=f"{lang_name}: aligning...")
    stretch_ratio = tgt_duration / video_duration
    extend_video = stretch_ratio > max_stretch
    if not extend_video and abs(stretch_ratio - 1.0) > sync_tolerance:
        stretch_audio_to_duration(tgt_audio_raw, tgt_audio_aligned, video_duration)
    else:
        # Either already close enough, or too long to stretch: use as-is.
        shutil.copy(tgt_audio_raw, tgt_audio_aligned)

    if extend_video:
        final_duration = tgt_duration
        results_log.append(f"Audio longer ({stretch_ratio:.1f}x) — extending video")
    else:
        final_duration = video_duration

    progress(0.95, desc=f"{lang_name}: combining...")
    mux_video_audio(
        video_path, tgt_audio_aligned, output_video,
        extend_video=extend_video, target_duration=final_duration
    )
    return output_video


def dub_video(video_path, target_languages, dub_voice, chunk_seconds, progress=gr.Progress()):
    """Dub a video into one or more target languages.

    Languages whose ``"tts_engine"`` is ``"qwen"`` are handed to
    ``dub_video_qwen``; every other engine goes through the local
    ASR -> MT -> TTS pipeline. A failure in one language is logged and
    reported in the returned log without aborting the remaining languages.

    Args:
        video_path: Path of the uploaded video, or ``None``.
        target_languages: Iterable of keys into ``LANGUAGES``.
        dub_voice: Requested Qwen voice; ``"Ethan"`` is used when it is not
            in ``QWEN_VOICES``.
        chunk_seconds: Chunk length forwarded to ``dub_video_qwen``.
        progress: Gradio progress callback.

    Returns:
        ``(video_path_or_None, log_markdown)``. Only the first successfully
        dubbed video is returned; when more were produced the log says so.
    """
    if video_path is None:
        return None, "Please upload a video."

    if not target_languages:
        return None, "Please select at least one target language."

    results_log = []
    produced = []  # (language name, output path) in processing order
    banner = "=" * 50

    for lang_name in target_languages:
        lang_config = LANGUAGES.get(lang_name)
        if not lang_config:
            results_log.append(f"**{lang_name}**: not configured, skipped")
            continue

        engine = lang_config.get("tts_engine", "local")
        results_log.append(f"\n{banner}")
        results_log.append(f"**Dubbing: {lang_name}** (engine: {engine})")
        results_log.append(f"{banner}")

        try:
            if engine == "qwen":
                # Qwen Omni: end-to-end speech-to-speech (best for global languages)
                qwen_lang_name = lang_config.get("qwen_name", lang_name)
                voice = dub_voice if dub_voice in QWEN_VOICES else "Ethan"
                out_video, log_text = dub_video_qwen(
                    video_path, qwen_lang_name, voice=voice,
                    chunk_seconds=chunk_seconds, progress_fn=progress,
                )
                results_log.append(log_text)
            else:
                # Local/YourVoic pipeline: ASR → NLLB → TTS
                out_video = _dub_video_local(
                    video_path, lang_name, lang_config, progress, results_log
                )

            if out_video:
                produced.append((lang_name, out_video))

        except Exception as e:
            # Boundary handler: keep going with the remaining languages.
            logger.exception("Dubbing %s failed", lang_name)
            results_log.append(f"Error: {str(e)}")

    progress(1.0, desc="Done!")

    if len(produced) > 1:
        # The output component holds a single video, so make the
        # otherwise-silent truncation visible to the user.
        others = ", ".join(name for name, _ in produced[1:])
        results_log.append(
            f"\nNote: the player shows **{produced[0][0]}** only; "
            f"also dubbed but not shown: {others}"
        )

    final_video = produced[0][1] if produced else None
    return final_video, "\n".join(results_log)
250
-
251
-
252
def update_voices(language):
    """Build the dropdown update applied when the selected language changes.

    The new choices are whatever ``get_voices_for_language`` reports for
    ``language``; the first of them becomes the selected value.
    """
    available = get_voices_for_language(language)
    first_voice = available[0]
    return gr.update(choices=available, value=first_voice)
256
-
257
-
258
- # =============================================================================
259
- # Gradio UI
260
- # =============================================================================
261
-
262
- EXAMPLES = [
263
- "And it's a brilliant goal from the striker!",
264
- "The referee has shown a yellow card. Corner kick for the home team.",
265
- "What a save by the goalkeeper! The match is heading into injury time.",
266
- "He dribbles past two defenders and shoots! The ball hits the back of the net!",
267
- ]
268
-
269
- CSS = """
270
- .main-header { text-align: center; margin-bottom: 0.5rem; }
271
- .main-header h1 { font-size: 1.8rem; font-weight: 700; margin: 0; }
272
- .main-header p { color: #666; font-size: 0.95rem; }
273
- .lang-group-label { font-weight: 600; font-size: 0.85rem; color: #888; text-transform: uppercase; letter-spacing: 0.05em; margin-top: 0.5rem; }
274
- """
275
-
276
- with gr.Blocks(
277
- title="PlotWeaver — Live Commentary Translation",
278
- theme=gr.themes.Soft(),
279
- css=CSS,
280
- ) as demo:
281
-
282
- gr.HTML("""
283
- <div class="main-header">
284
- <h1>PlotWeaver</h1>
285
- <p>Live commentary translation platform &mdash; English to 40+ languages</p>
286
- <p style="font-size:0.8rem; color:#999">ASR (Whisper) &rarr; MT (NLLB-200) &rarr; TTS (YourVoic + local models)</p>
287
- </div>
288
- """)
289
-
290
- with gr.Tabs():
291
-
292
- # ====== TAB 1: EVENT MANAGEMENT ======
293
- with gr.TabItem("Event Management"):
294
- gr.Markdown("### Create new event")
295
- gr.Markdown("Configure your live broadcast event with target languages and input source.")
296
-
297
- with gr.Row():
298
- with gr.Column(scale=2):
299
- event_name = gr.Textbox(
300
- label="Event name",
301
- placeholder="e.g. Premier League: Arsenal vs. Chelsea",
302
- )
303
- with gr.Row():
304
- start_time = gr.Textbox(label="Start time", placeholder="08:30 PM")
305
- end_time = gr.Textbox(label="End time", placeholder="10:30 PM")
306
- event_date = gr.Textbox(label="Date", placeholder="2026-06-06")
307
-
308
- gr.Markdown("#### Input source")
309
- input_method = gr.Radio(
310
- choices=["RTMP Stream", "WebRTC (Browser)", "Direct Audio Feed"],
311
- value="RTMP Stream",
312
- label="Input method",
313
- )
314
-
315
- gr.Markdown("#### Target languages")
316
- gr.Markdown("Select languages for simultaneous broadcast. Additional languages consume more stream minutes.")
317
-
318
- # Language checkboxes grouped by category
319
- target_langs = gr.CheckboxGroup(
320
- choices=ALL_LANGUAGE_NAMES,
321
- label="Languages",
322
- value=["Yoruba"],
323
- )
324
-
325
- with gr.Column(scale=1):
326
- gr.Markdown("#### Estimate summary")
327
- estimate_display = gr.Markdown(
328
- value="**Event:** Not configured\n\n**Languages:** 1 selected\n\n**Estimated duration:** --\n\n**Total estimate:** --"
329
- )
330
- create_event_btn = gr.Button("Create Event", variant="primary", size="lg")
331
- event_status = gr.Markdown("")
332
-
333
def update_estimate(name, langs, start, end):
    """Build the Markdown shown in the "Estimate summary" panel.

    Args:
        name: Event name (may be empty).
        langs: Selected language names, or ``None``/empty.
        start: Start time text, expected like ``"08:30 PM"``.
        end: End time text in the same format.

    Returns:
        Markdown string. The duration line shows ``--`` when either time
        is missing or not in the expected format; an end time earlier than
        the start is treated as falling on the next day.
    """
    from datetime import datetime, timedelta

    def _parse_clock(text):
        # Format follows the "08:30 PM" placeholders of the time fields.
        try:
            return datetime.strptime((text or "").strip().upper(), "%I:%M %p")
        except ValueError:
            return None

    begins, finishes = _parse_clock(start), _parse_clock(end)
    if begins is None or finishes is None:
        duration = "--"
    else:
        span = finishes - begins
        if span < timedelta(0):
            span += timedelta(days=1)  # event runs past midnight
        minutes = int(span.total_seconds() // 60)
        duration = f"{minutes // 60}h {minutes % 60:02d}m"

    n_langs = len(langs) if langs else 0
    lang_list = ", ".join(langs) if langs else "None"
    return (
        f"**Event:** {name or 'Not set'}\n\n"
        f"**Languages:** {n_langs} selected\n\n"
        f"{lang_list}\n\n"
        f"**Estimated duration:** {duration}\n\n"
        "**Input:** Configured\n\n"
        "**Rate:** 1x (Standard)"
    )
343
-
344
- for inp in [event_name, target_langs, start_time, end_time]:
345
- inp.change(
346
- fn=update_estimate,
347
- inputs=[event_name, target_langs, start_time, end_time],
348
- outputs=[estimate_display],
349
- )
350
-
351
def create_event(name, langs):
    """Validate the event form and return a Markdown status message.

    Args:
        name: Event name; ``None``, empty and whitespace-only values are
            rejected.
        langs: Selected language names; must contain at least one entry.

    Returns:
        A prompt describing the missing field, or a confirmation line that
        names the event and lists its languages.
    """
    title = (name or "").strip()
    if not title:
        return "Please enter an event name."
    if not langs:
        return "Please select at least one language."

    noun = "language" if len(langs) == 1 else "languages"
    return f"Event **{title}** created with {len(langs)} {noun}: {', '.join(langs)}"
357
-
358
- create_event_btn.click(
359
- fn=create_event,
360
- inputs=[event_name, target_langs],
361
- outputs=[event_status],
362
- )
363
-
364
- # ====== TAB 2: LIVE STUDIO ======
365
- with gr.TabItem("Live Studio"):
366
- gr.Markdown("### Live streaming translation")
367
- gr.Markdown("Record or stream English commentary and hear it translated in real-time.")
368
-
369
- with gr.Row():
370
- studio_language = gr.Dropdown(
371
- choices=ALL_LANGUAGE_NAMES,
372
- value="Yoruba",
373
- label="Target language",
374
- )
375
- studio_voice = gr.Dropdown(
376
- choices=get_voices_for_language("Yoruba"),
377
- value=get_voices_for_language("Yoruba")[0],
378
- label="Voice",
379
- )
380
-
381
- studio_language.change(
382
- fn=update_voices,
383
- inputs=[studio_language],
384
- outputs=[studio_voice],
385
- )
386
-
387
- with gr.Row():
388
- with gr.Column():
389
- studio_audio_in = gr.Audio(
390
- label="English commentary (upload or record)",
391
- type="numpy",
392
- sources=["upload", "microphone"],
393
- )
394
- studio_translate_btn = gr.Button("Translate", variant="primary", size="lg")
395
-
396
- with gr.Column():
397
- studio_audio_out = gr.Audio(label="Translated audio", type="numpy", autoplay=True)
398
- studio_log = gr.Markdown(label="Pipeline log")
399
-
400
- studio_translate_btn.click(
401
- fn=full_pipeline_audio,
402
- inputs=[studio_audio_in, studio_language],
403
- outputs=[studio_audio_out, studio_log],
404
- )
405
-
406
- # ====== TAB 3: VIDEO DUBBING ======
407
- with gr.TabItem("Video Dubbing"):
408
- gr.Markdown("### Video dubbing (English → multi-language)")
409
- gr.Markdown(
410
- "Upload a video with English commentary and get back a dubbed version. "
411
- "**Global languages** (Arabic, French, Spanish, etc.) use Qwen Omni for best quality. "
412
- "**African languages** (Yoruba, Hausa, etc.) use the local Whisper → NLLB → MMS-TTS pipeline."
413
- )
414
-
415
- with gr.Row():
416
- with gr.Column():
417
- dub_video_in = gr.Video(label="Upload English video", sources=["upload"])
418
- dub_languages = gr.CheckboxGroup(
419
- choices=ALL_LANGUAGE_NAMES,
420
- label="Target languages",
421
- value=["Yoruba"],
422
- )
423
- with gr.Row():
424
- dub_voice = gr.Dropdown(
425
- choices=QWEN_VOICES,
426
- value="Ethan",
427
- label="Voice (for Qwen languages)",
428
- info="Applies to Arabic, French, Spanish, etc. Local languages use default voice.",
429
- )
430
- dub_chunk_slider = gr.Slider(
431
- minimum=30, maximum=300, value=120, step=10,
432
- label="Chunk duration (seconds)",
433
- info="Shorter = more API calls but less timeout risk.",
434
- )
435
- dub_btn = gr.Button("Dub Video", variant="primary", size="lg")
436
-
437
- with gr.Column():
438
- dub_video_out = gr.Video(label="Dubbed video (download from player)")
439
- dub_log = gr.Markdown(
440
- label="Processing log",
441
- value="Upload a video and select languages to start."
442
- )
443
-
444
- dub_btn.click(
445
- fn=dub_video,
446
- inputs=[dub_video_in, dub_languages, dub_voice, dub_chunk_slider],
447
- outputs=[dub_video_out, dub_log],
448
- )
449
-
450
- # ====== TAB 4: TEXT TRANSLATION ======
451
- with gr.TabItem("Text \u2192 Audio"):
452
- gr.Markdown("### Text to translated speech")
453
- gr.Markdown("Type English text, choose a language, and hear the translated audio.")
454
-
455
- with gr.Row():
456
- text_language = gr.Dropdown(
457
- choices=ALL_LANGUAGE_NAMES,
458
- value="Yoruba",
459
- label="Target language",
460
- )
461
- text_voice = gr.Dropdown(
462
- choices=get_voices_for_language("Yoruba"),
463
- value=get_voices_for_language("Yoruba")[0],
464
- label="Voice",
465
- )
466
-
467
- text_language.change(
468
- fn=update_voices,
469
- inputs=[text_language],
470
- outputs=[text_voice],
471
- )
472
-
473
- with gr.Row():
474
- with gr.Column():
475
- text_input = gr.Textbox(
476
- label="English text",
477
- placeholder="Type English football commentary here...",
478
- lines=4,
479
- )
480
- text_btn = gr.Button("Translate to speech", variant="primary", size="lg")
481
- gr.Examples(
482
- examples=[[e] for e in EXAMPLES],
483
- inputs=[text_input],
484
- label="Example commentary",
485
- )
486
-
487
- with gr.Column():
488
- text_audio_out = gr.Audio(label="Translated audio", type="numpy", autoplay=True)
489
- text_log = gr.Markdown(label="Pipeline log")
490
-
491
- text_btn.click(
492
- fn=full_pipeline_text,
493
- inputs=[text_input, text_language, text_voice],
494
- outputs=[text_audio_out, text_log],
495
- )
496
-
497
- # ====== TAB 5: RECORDINGS ======
498
- with gr.TabItem("Recordings & Clips"):
499
- gr.Markdown("### Recordings management")
500
- gr.Markdown(
501
- "Past dubbed recordings will appear here. "
502
- "This feature is coming soon — for now, use Video Dubbing to create new recordings "
503
- "and download them from the player."
504
- )
505
-
506
- # ====== TAB 6: VOICE MODELS ======
507
- with gr.TabItem("Voice Models"):
508
- gr.Markdown("### Voice model library")
509
- gr.Markdown("Browse available voices for each language.")
510
-
511
- voice_lang_select = gr.Dropdown(
512
- choices=ALL_LANGUAGE_NAMES,
513
- value="Yoruba",
514
- label="Select language",
515
- )
516
- voice_info = gr.Markdown()
517
-
518
def show_voice_info(lang):
    """Return a Markdown description of the TTS engine and voices for ``lang``.

    The text depends on the language's ``"tts_engine"`` entry in
    ``LANGUAGES``: ``"qwen"`` and ``"yourvoic"`` get their own sections;
    any other value (including a missing language) gets the local-pipeline
    description.
    """
    config = LANGUAGES.get(lang, {})
    engine = config.get("tts_engine", "unknown")
    voices = config.get("yourvoic_voices", [])

    parts = [f"### {lang}\n\n"]
    if engine == "qwen":
        preview = QWEN_VOICES[:10]
        remaining = len(QWEN_VOICES) - len(preview)
        # Only mention "more" voices when the preview actually truncated
        # the list; otherwise the count would be zero or negative.
        more = f"... and {remaining} more" if remaining > 0 else ""
        parts += [
            "**Engine:** Qwen 3.5 Omni (end-to-end speech-to-speech)\n\n",
            "This is the highest quality option. Qwen handles ASR + translation + TTS in a single API call, ",
            "preserving tone, emotion, and pacing from the original speaker.\n\n",
            f"**Available voices ({len(QWEN_VOICES)}):** {', '.join(preview)}{more}\n\n",
            "All voices support all Qwen languages.",
        ]
    elif engine == "yourvoic":
        parts += [
            "**Engine:** YourVoic API (TTS) + NLLB-200 (translation)\n\n",
            f"**YourVoic language:** `{config.get('yourvoic_lang', 'N/A')}`\n\n",
            f"**Available voices:** {', '.join(voices) if voices else 'Peter (default)'}",
        ]
    else:
        parts += [
            "**Engine:** Local pipeline (Whisper ASR + NLLB MT + MMS-TTS)\n\n",
            f"**NLLB code:** `{config.get('nllb', 'N/A')}`\n\n",
            "Uses locally fine-tuned models on GPU. Voice selection not available.",
        ]

    return "".join(parts)
540
-
541
- voice_lang_select.change(fn=show_voice_info, inputs=[voice_lang_select], outputs=[voice_info])
542
- demo.load(fn=show_voice_info, inputs=[voice_lang_select], outputs=[voice_info])
543
-
544
- gr.Markdown("""
545
- ---
546
- **PlotWeaver** by PlotweaverAI | Models:
547
- [ASR](https://huggingface.co/PlotweaverAI/whisper-small-de-en) |
548
- [MT](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) |
549
- [TTS](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new) |
550
- [YourVoic API](https://yourvoic.com)
551
- """)
552
-
553
-
554
- if __name__ == "__main__":
555
- demo.launch()