PlotweaverModel committed on
Commit
7f24b54
·
verified ·
1 Parent(s): 9cf39bc

app file update

Browse files
Files changed (1) hide show
  1. app.py +37 -19
app.py CHANGED
@@ -114,28 +114,45 @@ def transcribe(audio_array, sample_rate=16000):
114
  return result["text"].strip()
115
 
116
 
117
- def translate_sentence(text, max_length=256):
118
- """MT: Single sentence English to Yoruba."""
 
 
 
 
119
  inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
120
  with torch.no_grad():
121
- output_ids = mt_model.generate(
122
- **inputs,
123
- max_length=max_length,
124
- forced_bos_token_id=tgt_lang_id,
125
- repetition_penalty=1.5,
126
- no_repeat_ngram_size=3,
127
- num_beams=4,
128
- early_stopping=True,
129
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
131
 
132
 
133
- def translate_text(text):
134
  """Split and translate sentence by sentence."""
135
  sentences = split_into_sentences(text)
136
  if not sentences:
137
  return ""
138
- translations = [translate_sentence(s) for s in sentences]
139
  return ' '.join(translations)
140
 
141
 
@@ -158,8 +175,8 @@ def process_chunk(audio_array, sample_rate):
158
  if not english:
159
  return None, None, "", "", 0
160
 
161
- # MT
162
- yoruba = translate_text(english)
163
  if not yoruba:
164
  return None, None, english, "", 0
165
 
@@ -323,7 +340,10 @@ def streaming_process(audio_input, state):
323
  state.transcript_yo.append(yoruba)
324
 
325
  if audio_out is not None and len(audio_out) > 0:
326
- return (sr_out, audio_out), format_live_log(state), state
 
 
 
327
  else:
328
  return None, format_live_log(state), state
329
 
@@ -362,8 +382,6 @@ DESCRIPTION = """
362
  # Live Football Commentary \u2014 English \u2192 Yoruba
363
 
364
  Translate English football commentary into Yoruba speech in real-time.
365
-
366
- **Pipeline:** ASR (Whisper) \u2192 MT (NLLB-200) \u2192 TTS (MMS-TTS Yoruba)
367
  """
368
 
369
  STREAMING_INSTRUCTIONS = """
@@ -374,7 +392,6 @@ STREAMING_INSTRUCTIONS = """
374
  4. The transcript updates live below
375
  5. Click **Clear** to reset
376
 
377
- **Expected latency:** ~3\u20135 seconds behind your speech.
378
  """.format(chunk_dur=CHUNK_DURATION_S)
379
 
380
  EXAMPLES_TEXT = [
@@ -414,6 +431,7 @@ with gr.Blocks(
414
  label="Yoruba Output",
415
  type="numpy",
416
  autoplay=True,
 
417
  )
418
  stream_log = gr.Markdown(
419
  label="Live Transcript",
 
114
  return result["text"].strip()
115
 
116
 
117
+ def translate_sentence(text, max_length=256, fast=False):
118
+ """MT: Single sentence English to Yoruba.
119
+
120
+ fast=True uses greedy decoding (3-4x faster) for streaming mode.
121
+ fast=False uses beam search for better quality in batch mode.
122
+ """
123
  inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
124
  with torch.no_grad():
125
+ if fast:
126
+ # Greedy decoding - much faster, slightly lower quality
127
+ output_ids = mt_model.generate(
128
+ **inputs,
129
+ max_length=max_length,
130
+ forced_bos_token_id=tgt_lang_id,
131
+ repetition_penalty=1.5,
132
+ no_repeat_ngram_size=3,
133
+ num_beams=1,
134
+ do_sample=False,
135
+ )
136
+ else:
137
+ # Beam search - better quality, slower
138
+ output_ids = mt_model.generate(
139
+ **inputs,
140
+ max_length=max_length,
141
+ forced_bos_token_id=tgt_lang_id,
142
+ repetition_penalty=1.5,
143
+ no_repeat_ngram_size=3,
144
+ num_beams=4,
145
+ early_stopping=True,
146
+ )
147
  return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
148
 
149
 
150
+ def translate_text(text, fast=False):
151
  """Split and translate sentence by sentence."""
152
  sentences = split_into_sentences(text)
153
  if not sentences:
154
  return ""
155
+ translations = [translate_sentence(s, fast=fast) for s in sentences]
156
  return ' '.join(translations)
157
 
158
 
 
175
  if not english:
176
  return None, None, "", "", 0
177
 
178
+ # MT (fast mode for streaming - greedy decoding)
179
+ yoruba = translate_text(english, fast=True)
180
  if not yoruba:
181
  return None, None, english, "", 0
182
 
 
340
  state.transcript_yo.append(yoruba)
341
 
342
  if audio_out is not None and len(audio_out) > 0:
343
+ # Convert to int16 PCM format for streaming Audio output
344
+ audio_out = np.clip(audio_out, -1.0, 1.0)
345
+ audio_int16 = (audio_out * 32767).astype(np.int16)
346
+ return (sr_out, audio_int16), format_live_log(state), state
347
  else:
348
  return None, format_live_log(state), state
349
 
 
382
  # Live Football Commentary \u2014 English \u2192 Yoruba
383
 
384
  Translate English football commentary into Yoruba speech in real-time.
 
 
385
  """
386
 
387
  STREAMING_INSTRUCTIONS = """
 
392
  4. The transcript updates live below
393
  5. Click **Clear** to reset
394
 
 
395
  """.format(chunk_dur=CHUNK_DURATION_S)
396
 
397
  EXAMPLES_TEXT = [
 
431
  label="Yoruba Output",
432
  type="numpy",
433
  autoplay=True,
434
+ streaming=True,
435
  )
436
  stream_log = gr.Markdown(
437
  label="Live Transcript",