app file update
Browse files
app.py
CHANGED
|
@@ -114,28 +114,45 @@ def transcribe(audio_array, sample_rate=16000):
|
|
| 114 |
return result["text"].strip()
|
| 115 |
|
| 116 |
|
| 117 |
-
def translate_sentence(text, max_length=256):
|
| 118 |
-
"""MT: Single sentence English to Yoruba.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
|
| 120 |
with torch.no_grad():
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 131 |
|
| 132 |
|
| 133 |
-
def translate_text(text):
    """Translate text one sentence at a time and rejoin the results.

    The input is split with ``split_into_sentences``; each sentence is passed
    to ``translate_sentence`` and the translations are joined with a single
    space. Returns an empty string when the splitter yields no sentences.
    """
    pieces = split_into_sentences(text)
    if pieces:
        # str.join consumes the whole map, so sentences are translated in order.
        return ' '.join(map(translate_sentence, pieces))
    return ""
|
| 140 |
|
| 141 |
|
|
@@ -158,8 +175,8 @@ def process_chunk(audio_array, sample_rate):
|
|
| 158 |
if not english:
|
| 159 |
return None, None, "", "", 0
|
| 160 |
|
| 161 |
-
# MT
|
| 162 |
-
yoruba = translate_text(english)
|
| 163 |
if not yoruba:
|
| 164 |
return None, None, english, "", 0
|
| 165 |
|
|
@@ -323,7 +340,10 @@ def streaming_process(audio_input, state):
|
|
| 323 |
state.transcript_yo.append(yoruba)
|
| 324 |
|
| 325 |
if audio_out is not None and len(audio_out) > 0:
|
| 326 |
-
|
|
|
|
|
|
|
|
|
|
| 327 |
else:
|
| 328 |
return None, format_live_log(state), state
|
| 329 |
|
|
@@ -362,8 +382,6 @@ DESCRIPTION = """
|
|
| 362 |
# Live Football Commentary \u2014 English \u2192 Yoruba
|
| 363 |
|
| 364 |
Translate English football commentary into Yoruba speech in real-time.
|
| 365 |
-
|
| 366 |
-
**Pipeline:** ASR (Whisper) \u2192 MT (NLLB-200) \u2192 TTS (MMS-TTS Yoruba)
|
| 367 |
"""
|
| 368 |
|
| 369 |
STREAMING_INSTRUCTIONS = """
|
|
@@ -374,7 +392,6 @@ STREAMING_INSTRUCTIONS = """
|
|
| 374 |
4. The transcript updates live below
|
| 375 |
5. Click **Clear** to reset
|
| 376 |
|
| 377 |
-
**Expected latency:** ~3\u20135 seconds behind your speech.
|
| 378 |
""".format(chunk_dur=CHUNK_DURATION_S)
|
| 379 |
|
| 380 |
EXAMPLES_TEXT = [
|
|
@@ -414,6 +431,7 @@ with gr.Blocks(
|
|
| 414 |
label="Yoruba Output",
|
| 415 |
type="numpy",
|
| 416 |
autoplay=True,
|
|
|
|
| 417 |
)
|
| 418 |
stream_log = gr.Markdown(
|
| 419 |
label="Live Transcript",
|
|
|
|
| 114 |
return result["text"].strip()
|
| 115 |
|
| 116 |
|
| 117 |
+
def translate_sentence(text, max_length=256, fast=False):
    """MT: Single sentence English to Yoruba.

    Args:
        text: Sentence to translate. Empty or whitespace-only input returns
            "" immediately, without invoking the tokenizer or the model.
        max_length: Forwarded to ``mt_model.generate`` as ``max_length``.
        fast: If True, use greedy decoding (``num_beams=1``,
            ``do_sample=False``), intended for streaming mode where latency
            matters. If False, use beam search (``num_beams=4`` with
            ``early_stopping=True``) for better quality in batch mode.

    Returns:
        The first generated sequence decoded by ``mt_tokenizer`` with special
        tokens skipped.
    """
    # Nothing to translate: skip the model call entirely.
    if not text or not text.strip():
        return ""

    # Settings shared by both decoding modes live in one place so the fast
    # and quality paths cannot drift apart.
    generate_kwargs = {
        "max_length": max_length,
        "forced_bos_token_id": tgt_lang_id,
        "repetition_penalty": 1.5,
        "no_repeat_ngram_size": 3,
    }
    if fast:
        # Greedy decoding - much faster, slightly lower quality
        generate_kwargs.update(num_beams=1, do_sample=False)
    else:
        # Beam search - better quality, slower
        generate_kwargs.update(num_beams=4, early_stopping=True)

    inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
    with torch.no_grad():
        output_ids = mt_model.generate(**inputs, **generate_kwargs)
    return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 148 |
|
| 149 |
|
| 150 |
+
def translate_text(text, fast=False):
    """Translate text one sentence at a time and rejoin the results.

    The input is split with ``split_into_sentences``; each sentence is passed
    to ``translate_sentence`` with ``fast`` forwarded unchanged, and the
    translations are joined with a single space. Returns an empty string when
    the splitter yields no sentences.
    """
    pieces = split_into_sentences(text)
    if pieces:
        # str.join consumes the whole generator, so sentences are translated in order.
        return ' '.join(translate_sentence(piece, fast=fast) for piece in pieces)
    return ""
|
| 157 |
|
| 158 |
|
|
|
|
| 175 |
if not english:
|
| 176 |
return None, None, "", "", 0
|
| 177 |
|
| 178 |
+
# MT (fast mode for streaming - greedy decoding)
|
| 179 |
+
yoruba = translate_text(english, fast=True)
|
| 180 |
if not yoruba:
|
| 181 |
return None, None, english, "", 0
|
| 182 |
|
|
|
|
| 340 |
state.transcript_yo.append(yoruba)
|
| 341 |
|
| 342 |
if audio_out is not None and len(audio_out) > 0:
|
| 343 |
+
# Convert to int16 PCM format for streaming Audio output
|
| 344 |
+
audio_out = np.clip(audio_out, -1.0, 1.0)
|
| 345 |
+
audio_int16 = (audio_out * 32767).astype(np.int16)
|
| 346 |
+
return (sr_out, audio_int16), format_live_log(state), state
|
| 347 |
else:
|
| 348 |
return None, format_live_log(state), state
|
| 349 |
|
|
|
|
| 382 |
# Live Football Commentary \u2014 English \u2192 Yoruba
|
| 383 |
|
| 384 |
Translate English football commentary into Yoruba speech in real-time.
|
|
|
|
|
|
|
| 385 |
"""
|
| 386 |
|
| 387 |
STREAMING_INSTRUCTIONS = """
|
|
|
|
| 392 |
4. The transcript updates live below
|
| 393 |
5. Click **Clear** to reset
|
| 394 |
|
|
|
|
| 395 |
""".format(chunk_dur=CHUNK_DURATION_S)
|
| 396 |
|
| 397 |
EXAMPLES_TEXT = [
|
|
|
|
| 431 |
label="Yoruba Output",
|
| 432 |
type="numpy",
|
| 433 |
autoplay=True,
|
| 434 |
+
streaming=True,
|
| 435 |
)
|
| 436 |
stream_log = gr.Markdown(
|
| 437 |
label="Live Transcript",
|