Files changed (1) hide show
  1. app.py +99 -135
app.py CHANGED
@@ -2,8 +2,7 @@
2
  Live Football Commentary Pipeline — English → Yoruba
3
  =====================================================
4
  Gradio app for HuggingFace Spaces.
5
-
6
- Pipeline: ASR (Whisper) → MT (NLLB-200) → TTS (MMS-TTS Yoruba)
7
  """
8
 
9
  import torch
@@ -11,11 +10,9 @@ import numpy as np
11
  import re
12
  import time
13
  import gradio as gr
14
- from transformers import (
15
- pipeline as hf_pipeline,
16
- AutoTokenizer,
17
- AutoModelForSeq2SeqLM,
18
- )
19
 
20
  # =============================================================================
21
  # Configuration
@@ -24,19 +21,40 @@ from transformers import (
24
  ASR_MODEL_ID = "PlotweaverAI/whisper-small-de-en"
25
  MT_MODEL_ID = "PlotweaverAI/nllb-200-distilled-600M-african-6lang"
26
  TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new"
 
27
 
28
  MT_SRC_LANG = "eng_Latn"
29
  MT_TGT_LANG = "yor_Latn"
30
 
31
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
32
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  # =============================================================================
36
  # Load models (runs once at startup)
37
  # =============================================================================
38
 
39
- print(f"Device: {DEVICE} | Dtype: {TORCH_DTYPE}")
40
  print("Loading models...")
41
 
42
  # ASR
@@ -49,15 +67,16 @@ asr_pipe = hf_pipeline(
49
  )
50
  print(" ASR loaded βœ“")
51
 
52
- # MT
53
- print(f" Loading MT: {MT_MODEL_ID}")
54
  mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
55
- mt_model = AutoModelForSeq2SeqLM.from_pretrained(
56
- MT_MODEL_ID,
57
- torch_dtype=TORCH_DTYPE,
58
- ).to(DEVICE)
59
- mt_tokenizer.src_lang = MT_SRC_LANG
60
- print(" MT loaded βœ“")
 
61
 
62
  # TTS
63
  print(f" Loading TTS: {TTS_MODEL_ID}")
@@ -72,7 +91,7 @@ print("All models loaded!")
72
 
73
 
74
  # =============================================================================
75
- # Pipeline functions (from working Colab notebook)
76
  # =============================================================================
77
 
78
  def split_into_sentences(text):
@@ -80,16 +99,10 @@ def split_into_sentences(text):
80
  text = text.strip()
81
  if not text:
82
  return []
83
-
84
- # Normalize case
85
  text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
86
-
87
- # If text has punctuation, split on it
88
  if re.search(r'[.!?]', text):
89
  sentences = re.split(r'(?<=[.!?])\s+', text)
90
  return [s.strip() for s in sentences if s.strip()]
91
-
92
- # No punctuation β€” split into ~12 word chunks
93
  words = text.split()
94
  MAX_WORDS = 12
95
  sentences = []
@@ -113,31 +126,55 @@ def transcribe(audio_array, sample_rate=16000):
113
  return result["text"].strip()
114
 
115
 
116
- def translate_sentence(text, max_length=256):
117
- """MT: Translate a single sentence from English to Yoruba."""
118
- inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
119
- tgt_lang_id = mt_tokenizer.convert_tokens_to_ids(MT_TGT_LANG)
120
-
121
- with torch.no_grad():
122
- output_ids = mt_model.generate(
123
- **inputs,
124
- max_length=max_length,
125
- forced_bos_token_id=tgt_lang_id,
126
- repetition_penalty=1.5,
127
- no_repeat_ngram_size=3,
128
- num_beams=4,
129
- early_stopping=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  )
131
- return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 
132
 
133
 
134
  def translate_long_text(text):
135
- """Split into sentences and translate each individually."""
136
  sentences = split_into_sentences(text)
137
- translations = []
138
- for sent in sentences:
139
- yo = translate_sentence(sent)
140
- translations.append(yo)
141
  return ' '.join(translations), sentences, translations
142
 
143
 
@@ -154,99 +191,65 @@ def synthesize(text):
154
  # =============================================================================
155
 
156
  def process_audio(audio_input):
157
- """
158
- Full pipeline: English audio β†’ Yoruba audio.
159
- audio_input: tuple of (sample_rate, numpy_array) from Gradio.
160
- """
161
  if audio_input is None:
162
  return None, "⚠️ No audio provided. Please upload or record audio."
163
 
164
  sample_rate, audio_array = audio_input
165
-
166
- # Convert to float32 mono if needed
167
  audio_array = audio_array.astype(np.float32)
168
  if audio_array.ndim > 1:
169
  audio_array = audio_array.mean(axis=1)
170
-
171
- # Normalize to [-1, 1] if integer audio
172
  if audio_array.max() > 1.0 or audio_array.min() < -1.0:
173
  audio_array = audio_array / max(abs(audio_array.max()), abs(audio_array.min()))
174
 
175
  total_start = time.time()
176
  log_lines = []
177
 
178
- # Step 1: ASR
179
  t0 = time.time()
180
  english_text = transcribe(audio_array, sample_rate)
181
- asr_time = time.time() - t0
182
- log_lines.append(f"**🎀 ASR** ({asr_time:.2f}s)")
183
- log_lines.append(f"English: {english_text}")
184
- log_lines.append("")
185
-
186
  if not english_text:
187
- return None, "⚠️ ASR returned empty text. Please try with clearer audio."
188
 
189
- # Step 2: MT (sentence by sentence)
190
  t0 = time.time()
191
  yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text)
192
- mt_time = time.time() - t0
193
- log_lines.append(f"**πŸ”„ Translation** ({mt_time:.2f}s)")
194
  for en_s, yo_s in zip(en_sentences, yo_sentences):
195
  log_lines.append(f" EN: {en_s}")
196
  log_lines.append(f" YO: {yo_s}")
197
  log_lines.append("")
198
-
199
  if not yoruba_text:
200
  return None, "⚠️ Translation returned empty text."
201
 
202
- # Step 3: TTS
203
  t0 = time.time()
204
  yoruba_audio, output_sr = synthesize(yoruba_text)
205
- tts_time = time.time() - t0
206
- log_lines.append(f"**πŸ”Š TTS** ({tts_time:.2f}s) β†’ {len(yoruba_audio)/output_sr:.2f}s of audio")
207
-
208
- total = time.time() - total_start
209
- log_lines.append("")
210
- log_lines.append(f"**Total: {total:.2f}s**")
211
-
212
- log_output = "\n".join(log_lines)
213
 
214
- return (output_sr, yoruba_audio), log_output
215
 
216
 
217
  def process_text(english_text):
218
- """
219
- Text-only mode: English text β†’ Yoruba text + audio.
220
- Skips the ASR stage β€” useful for testing MT + TTS.
221
- """
222
  if not english_text or not english_text.strip():
223
  return None, "⚠️ Please enter some English text."
224
 
225
  total_start = time.time()
226
  log_lines = []
227
 
228
- # MT
229
  t0 = time.time()
230
  yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text.strip())
231
- mt_time = time.time() - t0
232
- log_lines.append(f"**πŸ”„ Translation** ({mt_time:.2f}s)")
233
  for en_s, yo_s in zip(en_sentences, yo_sentences):
234
  log_lines.append(f" EN: {en_s}")
235
  log_lines.append(f" YO: {yo_s}")
236
  log_lines.append("")
237
-
238
  if not yoruba_text:
239
  return None, "⚠️ Translation returned empty text."
240
 
241
- # TTS
242
  t0 = time.time()
243
  yoruba_audio, output_sr = synthesize(yoruba_text)
244
- tts_time = time.time() - t0
245
- log_lines.append(f"**πŸ”Š TTS** ({tts_time:.2f}s) β†’ {len(yoruba_audio)/output_sr:.2f}s of audio")
246
-
247
- total = time.time() - total_start
248
- log_lines.append("")
249
- log_lines.append(f"**Total: {total:.2f}s**")
250
 
251
  return (output_sr, yoruba_audio), "\n".join(log_lines)
252
 
@@ -257,12 +260,8 @@ def process_text(english_text):
257
 
258
  DESCRIPTION = """
259
  # 🏟️ Live Football Commentary β€” English β†’ Yoruba
260
-
261
  Translate English football commentary into Yoruba speech in real-time.
262
-
263
- **Pipeline:** ASR (Whisper) β†’ MT (NLLB-200) β†’ TTS (MMS-TTS Yoruba)
264
-
265
- Upload or record English commentary audio, and get back Yoruba audio + full transcript.
266
  """
267
 
268
  EXAMPLES_TEXT = [
@@ -272,66 +271,32 @@ EXAMPLES_TEXT = [
272
  "He dribbles past two defenders and shoots! The ball hits the back of the net!",
273
  ]
274
 
275
- with gr.Blocks(
276
- title="Football Commentary EN→YO",
277
- theme=gr.themes.Soft(),
278
- ) as demo:
279
-
280
  gr.Markdown(DESCRIPTION)
281
 
282
  with gr.Tabs():
283
-
284
- # ---- Tab 1: Audio β†’ Audio (Full Pipeline) ----
285
  with gr.TabItem("πŸŽ™οΈ Audio β†’ Audio (Full Pipeline)"):
286
  gr.Markdown("Upload or record English commentary. The pipeline will transcribe, translate, and synthesize Yoruba audio.")
287
-
288
  with gr.Row():
289
  with gr.Column():
290
- audio_input = gr.Audio(
291
- label="English Commentary Audio",
292
- type="numpy",
293
- sources=["upload", "microphone"],
294
- )
295
  audio_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
296
-
297
  with gr.Column():
298
  audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy")
299
  audio_log = gr.Markdown(label="Pipeline Log")
 
300
 
301
- audio_submit_btn.click(
302
- fn=process_audio,
303
- inputs=[audio_input],
304
- outputs=[audio_output, audio_log],
305
- )
306
-
307
- # ---- Tab 2: Text β†’ Audio (Skip ASR) ----
308
  with gr.TabItem("πŸ“ Text β†’ Audio (Translation + TTS)"):
309
- gr.Markdown("Type or paste English text to translate to Yoruba and hear the result. Useful for testing without audio.")
310
-
311
  with gr.Row():
312
  with gr.Column():
313
- text_input = gr.Textbox(
314
- label="English Text",
315
- placeholder="Type English football commentary here...",
316
- lines=4,
317
- )
318
  text_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
319
-
320
- gr.Examples(
321
- examples=[[e] for e in EXAMPLES_TEXT],
322
- inputs=[text_input],
323
- label="Example Commentary",
324
- )
325
-
326
  with gr.Column():
327
  text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy")
328
  text_log = gr.Markdown(label="Pipeline Log")
329
-
330
- text_submit_btn.click(
331
- fn=process_text,
332
- inputs=[text_input],
333
- outputs=[text_audio_output, text_log],
334
- )
335
 
336
  gr.Markdown("""
337
  ---
@@ -341,6 +306,5 @@ with gr.Blocks(
341
  [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new)
342
  """)
343
 
344
- # Launch
345
  if __name__ == "__main__":
346
- demo.launch()
 
2
  Live Football Commentary Pipeline — English → Yoruba
3
  =====================================================
4
  Gradio app for HuggingFace Spaces.
5
+ Pipeline: ASR (Whisper) → MT (NLLB-200 via CTranslate2) → TTS (MMS-TTS Yoruba)
 
6
  """
7
 
8
  import torch
 
10
  import re
11
  import time
12
  import gradio as gr
13
+ import ctranslate2
14
+ from transformers import AutoTokenizer
15
+ from transformers import pipeline as hf_pipeline
 
 
16
 
17
  # =============================================================================
18
  # Configuration
 
21
  ASR_MODEL_ID = "PlotweaverAI/whisper-small-de-en"
22
  MT_MODEL_ID = "PlotweaverAI/nllb-200-distilled-600M-african-6lang"
23
  TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new"
24
+ CT2_MODEL_DIR = "./nllb_ct2" # Local dir where converted model is saved
25
 
26
  MT_SRC_LANG = "eng_Latn"
27
  MT_TGT_LANG = "yor_Latn"
28
 
29
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
30
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
31
+ CT2_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
32
+ CT2_COMPUTE_TYPE = "int8_float16" if torch.cuda.is_available() else "int8"
33
+
34
+
35
+ # =============================================================================
36
+ # Convert MT model to CTranslate2 format (runs once at startup if needed)
37
+ # =============================================================================
38
+
39
# Startup step: make sure a CTranslate2 copy of the MT model exists on disk.
import os
import subprocess

# A converted CTranslate2 model directory contains a ``model.bin`` file.
# Testing for that file, rather than for the directory alone, means a
# directory left behind by an interrupted conversion is rebuilt instead of
# being handed to the loader in a broken state.
if not os.path.isfile(os.path.join(CT2_MODEL_DIR, "model.bin")):
    print(f"Converting {MT_MODEL_ID} to CTranslate2 format...")
    subprocess.run(
        [
            "ct2-transformers-converter",
            "--model", MT_MODEL_ID,
            "--output_dir", CT2_MODEL_DIR,
            "--quantization", "int8",  # int8 = fastest on CPU; use int8_float16 on GPU
            # Overwrite anything already present in the output directory
            # (needed when a previous, incomplete attempt left files behind).
            "--force",
        ],
        check=True,  # a failed conversion raises CalledProcessError at startup
    )
    # NOTE(review): the trailing glyph looks mis-encoded (probably a check
    # mark); left unchanged here — confirm the intended character.
    print("Conversion done βœ“")
51
 
52
 
53
  # =============================================================================
54
  # Load models (runs once at startup)
55
  # =============================================================================
56
 
57
+ print(f"Device: {DEVICE} | CT2 Compute: {CT2_COMPUTE_TYPE}")
58
  print("Loading models...")
59
 
60
  # ASR
 
67
  )
68
  print(" ASR loaded βœ“")
69
 
70
+ # MT — CTranslate2 Translator (replaces AutoModelForSeq2SeqLM)
71
+ print(f" Loading MT (CTranslate2): {CT2_MODEL_DIR}")
72
  mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
73
+ mt_translator = ctranslate2.Translator(
74
+ CT2_MODEL_DIR,
75
+ device=CT2_DEVICE,
76
+ compute_type=CT2_COMPUTE_TYPE,
77
+ inter_threads=2, # allows parallel sentence translations
78
+ )
79
+ print(" MT (CTranslate2) loaded βœ“")
80
 
81
  # TTS
82
  print(f" Loading TTS: {TTS_MODEL_ID}")
 
91
 
92
 
93
  # =============================================================================
94
+ # Pipeline functions
95
  # =============================================================================
96
 
97
  def split_into_sentences(text):
 
99
  text = text.strip()
100
  if not text:
101
  return []
 
 
102
  text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
 
 
103
  if re.search(r'[.!?]', text):
104
  sentences = re.split(r'(?<=[.!?])\s+', text)
105
  return [s.strip() for s in sentences if s.strip()]
 
 
106
  words = text.split()
107
  MAX_WORDS = 12
108
  sentences = []
 
126
  return result["text"].strip()
127
 
128
 
129
def translate_batch_ct2(sentences):
    """MT: translate a list of English sentences to Yoruba in one CTranslate2 call.

    Args:
        sentences: list of source sentences (strings).

    Returns:
        A list of translated strings, one per input sentence, in input order.

    Side effects:
        Sets ``mt_tokenizer.src_lang`` to ``MT_SRC_LANG`` on every call.
    """
    mt_tokenizer.src_lang = MT_SRC_LANG
    lang_tag = MT_TGT_LANG

    # CTranslate2 consumes token *strings*, so each sentence is encoded to ids
    # and then mapped back to the tokenizer's token strings.
    source_tokens = []
    for sentence in sentences:
        token_ids = mt_tokenizer.encode(sentence, add_special_tokens=True)
        source_tokens.append(mt_tokenizer.convert_ids_to_tokens(token_ids))

    # Every hypothesis is forced to start with the target-language token.
    batch_results = mt_translator.translate_batch(
        source_tokens,
        target_prefix=[[lang_tag]] * len(sentences),
        beam_size=4,
        repetition_penalty=1.5,
        no_repeat_ngram_size=3,
        max_decoding_length=256,
    )

    def _detokenize(hypothesis):
        # Drop the leading language token (when present) before decoding.
        if hypothesis and hypothesis[0] == lang_tag:
            hypothesis = hypothesis[1:]
        return mt_tokenizer.decode(
            mt_tokenizer.convert_tokens_to_ids(hypothesis),
            skip_special_tokens=True,
        )

    # Only the best-scoring hypothesis of each result is used.
    return [_detokenize(item.hypotheses[0]) for item in batch_results]
170
 
171
 
172
def translate_long_text(text):
    """Sentence-split ``text`` and translate all sentences as one batch.

    Returns:
        A 3-tuple ``(joined, sources, targets)``: the translations joined with
        single spaces, the list of source sentences, and the list of
        translated sentences. When splitting yields no sentences the result
        is ``("", [], [])`` and the translator is not called.
    """
    source_sentences = split_into_sentences(text)
    if source_sentences:
        target_sentences = translate_batch_ct2(source_sentences)
        return ' '.join(target_sentences), source_sentences, target_sentences
    return "", [], []
179
 
180
 
 
191
  # =============================================================================
192
 
193
def process_audio(audio_input):
    """Run the full pipeline on one clip: English audio in, Yoruba audio out.

    Args:
        audio_input: ``(sample_rate, samples)`` pair where ``samples`` is a
            NumPy array, or ``None`` when no audio was supplied.

    Returns:
        ``(audio, log)``. On success ``audio`` is ``(output_sr, yoruba_audio)``
        as returned by ``synthesize`` and ``log`` is the newline-joined stage
        log (timings, transcript, per-sentence translations). When there is no
        usable audio, or ASR / translation yields empty text, ``audio`` is
        ``None`` and ``log`` is a single warning line (the partial stage log
        is discarded).
    """
    no_audio_msg = "⚠️ No audio provided. Please upload or record audio."
    if audio_input is None:
        return None, no_audio_msg

    sample_rate, audio_array = audio_input
    # A zero-length recording would make the peak computation below raise
    # (NumPy cannot reduce a zero-size array), so treat it as "no audio".
    if audio_array is None or audio_array.size == 0:
        return None, no_audio_msg

    audio_array = audio_array.astype(np.float32)
    if audio_array.ndim > 1:
        # Collapse to one channel by averaging over axis 1
        # (assumes a (samples, channels) layout — TODO confirm).
        audio_array = audio_array.mean(axis=1)

    # Scale into [-1, 1] only when the signal exceeds that range
    # (e.g. integer PCM samples cast to float).
    peak = np.abs(audio_array).max()
    if peak > 1.0:
        audio_array = audio_array / peak

    total_start = time.time()
    log_lines = []

    # NOTE(review): several log labels below contain glyphs that look
    # mis-encoded (emoji/arrows decoded with the wrong code page). They are
    # kept byte-for-byte here — confirm the intended characters.

    # Stage 1: speech recognition.
    t0 = time.time()
    english_text = transcribe(audio_array, sample_rate)
    log_lines.append(f"**🎀 ASR** ({time.time()-t0:.2f}s)")
    log_lines.append(f"English: {english_text}\n")
    if not english_text:
        return None, "⚠️ ASR returned empty text."

    # Stage 2: translation (all sentences in one batch).
    t0 = time.time()
    yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text)
    log_lines.append(f"**πŸ”„ Translation (CTranslate2)** ({time.time()-t0:.2f}s)")
    for en_s, yo_s in zip(en_sentences, yo_sentences):
        log_lines.append(f" EN: {en_s}")
        log_lines.append(f" YO: {yo_s}")
    log_lines.append("")
    if not yoruba_text:
        return None, "⚠️ Translation returned empty text."

    # Stage 3: speech synthesis.
    t0 = time.time()
    yoruba_audio, output_sr = synthesize(yoruba_text)
    log_lines.append(f"**πŸ”Š TTS** ({time.time()-t0:.2f}s) β†’ {len(yoruba_audio)/output_sr:.2f}s of audio")
    log_lines.append(f"\n**Total: {time.time()-total_start:.2f}s**")

    return (output_sr, yoruba_audio), "\n".join(log_lines)
230
 
231
 
232
def process_text(english_text):
    """Text-only path: translate English text to Yoruba, then synthesize it.

    Args:
        english_text: English text to translate. ``None``, empty, or
            whitespace-only input is rejected with a warning.

    Returns:
        ``(audio, log)``. On success ``audio`` is ``(output_sr, yoruba_audio)``
        as returned by ``synthesize`` and ``log`` is the newline-joined stage
        log. When the input is blank or the translation is empty, ``audio`` is
        ``None`` and ``log`` is a single warning line.
    """
    cleaned = english_text.strip() if english_text else ""
    if not cleaned:
        return None, "⚠️ Please enter some English text."

    run_started = time.time()
    report = []

    # Translation stage: the stripped text is split and translated as a batch.
    stage_started = time.time()
    yoruba_text, en_sentences, yo_sentences = translate_long_text(cleaned)
    report.append(f"**πŸ”„ Translation (CTranslate2)** ({time.time()-stage_started:.2f}s)")
    for source, target in zip(en_sentences, yo_sentences):
        report.extend((f" EN: {source}", f" YO: {target}"))
    report.append("")
    if not yoruba_text:
        return None, "⚠️ Translation returned empty text."

    # Synthesis stage.
    stage_started = time.time()
    yoruba_audio, output_sr = synthesize(yoruba_text)
    report.append(f"**πŸ”Š TTS** ({time.time()-stage_started:.2f}s) β†’ {len(yoruba_audio)/output_sr:.2f}s of audio")
    report.append(f"\n**Total: {time.time()-run_started:.2f}s**")

    return (output_sr, yoruba_audio), "\n".join(report)
255
 
 
260
 
261
  DESCRIPTION = """
262
  # 🏟️ Live Football Commentary β€” English β†’ Yoruba
 
263
  Translate English football commentary into Yoruba speech in real-time.
264
+ **Pipeline:** ASR (Whisper) β†’ MT (NLLB-200 via CTranslate2) β†’ TTS (MMS-TTS Yoruba)
 
 
 
265
  """
266
 
267
  EXAMPLES_TEXT = [
 
271
  "He dribbles past two defenders and shoots! The ball hits the back of the net!",
272
  ]
273
 
274
+ with gr.Blocks(title="Football Commentary EN→YO", theme=gr.themes.Soft()) as demo:
 
 
 
 
275
  gr.Markdown(DESCRIPTION)
276
 
277
  with gr.Tabs():
 
 
278
  with gr.TabItem("πŸŽ™οΈ Audio β†’ Audio (Full Pipeline)"):
279
  gr.Markdown("Upload or record English commentary. The pipeline will transcribe, translate, and synthesize Yoruba audio.")
 
280
  with gr.Row():
281
  with gr.Column():
282
+ audio_input = gr.Audio(label="English Commentary Audio", type="numpy", sources=["upload", "microphone"])
 
 
 
 
283
  audio_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
 
284
  with gr.Column():
285
  audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy")
286
  audio_log = gr.Markdown(label="Pipeline Log")
287
+ audio_submit_btn.click(fn=process_audio, inputs=[audio_input], outputs=[audio_output, audio_log])
288
 
 
 
 
 
 
 
 
289
  with gr.TabItem("πŸ“ Text β†’ Audio (Translation + TTS)"):
290
+ gr.Markdown("Type or paste English text to translate to Yoruba and hear the result.")
 
291
  with gr.Row():
292
  with gr.Column():
293
+ text_input = gr.Textbox(label="English Text", placeholder="Type English football commentary here...", lines=4)
 
 
 
 
294
  text_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
295
+ gr.Examples(examples=[[e] for e in EXAMPLES_TEXT], inputs=[text_input], label="Example Commentary")
 
 
 
 
 
 
296
  with gr.Column():
297
  text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy")
298
  text_log = gr.Markdown(label="Pipeline Log")
299
+ text_submit_btn.click(fn=process_text, inputs=[text_input], outputs=[text_audio_output, text_log])
 
 
 
 
 
300
 
301
  gr.Markdown("""
302
  ---
 
306
  [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new)
307
  """)
308
 
 
309
  if __name__ == "__main__":
310
+ demo.launch()