Upload 7 files
- app.py +19 -0
- examples/Example 01.mp3 +0 -0
- examples/Example 02.mp3 +0 -0
- examples/Example 03.mp3 +0 -0
- examples/Example 04.mp3 +0 -0
- examples/Example 05.mp3 +0 -0
- helper_function.py +40 -0
app.py
ADDED
@@ -0,0 +1,19 @@
+import gradio as gr
+from helper_function import speech_to_speech_translation, examples
+
+demo = gr.Blocks()
+
+title = 'Audio translator 🇧🇷 ➡️ 🇺🇸'
+description = 'A stacked approach for translating audio from Portuguese to English'
+
+translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(label='Input', sources=['upload', 'microphone'], type='filepath'),
+    outputs=[gr.Audio(label='Output', type='numpy'), gr.Textbox(label="Translation")],
+    flagging_mode='never',
+    examples=examples,
+    title=title,
+    description=description
+)
+
+translate.launch()
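For a quick check outside the web UI, the helper and the example list can be exercised directly. A minimal local smoke test (not part of the commit), assuming the examples/ folder from this upload sits in the working directory:

# Minimal local smoke test: run the pipeline on the first bundled example clip.
from helper_function import speech_to_speech_translation, examples

(rate, waveform), text = speech_to_speech_translation(examples[0])
print(rate, waveform.shape, text)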
examples/Example 01.mp3
ADDED
Binary file (19.7 kB)

examples/Example 02.mp3
ADDED
Binary file (20.8 kB)

examples/Example 03.mp3
ADDED
Binary file (54 kB)

examples/Example 04.mp3
ADDED
Binary file (33.7 kB)

examples/Example 05.mp3
ADDED
Binary file (43.9 kB)
helper_function.py
ADDED
@@ -0,0 +1,40 @@
+
+import numpy as np
+import torch
+from transformers import pipeline
+from transformers import VitsModel, VitsTokenizer
+from IPython.display import Audio
+from pathlib import Path
+
+examples = [str(p) for p in Path('examples').glob('*.mp3')]
+examples.sort()
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+pipe = pipeline(
+    'automatic-speech-recognition', model='openai/whisper-base', device=device,
+)
+
+model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+
+target_dtype = np.int16
+max_range = np.iinfo(target_dtype).max
+
+def speech_to_speech_translation(filepath):
+    print(filepath)
+    translation = pipe(filepath, max_new_tokens=256, generate_kwargs={'task': 'translate'})['text']
+
+    inputs = tokenizer(translation, return_tensors="pt")
+    input_ids = inputs["input_ids"]
+
+    model.eval()
+    with torch.inference_mode():
+        outputs = model(input_ids)
+
+    speech = outputs["waveform"]
+    synthesised_speech = speech / torch.max(torch.abs(speech))  # Normalise to [-1, 1]
+    synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
+
+    Audio(synthesised_speech, rate=16000)
+
+    return (16000, synthesised_speech.squeeze()), translation
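The last two waveform lines are where the Gradio-ready int16 audio is produced: the scaling must be applied to the peak-normalised tensor, not the raw model output, otherwise quiet synthesiser output maps to near-silent integers. A standalone illustration of that conversion, with hypothetical values:

# Illustration of the float -> int16 PCM conversion used above.
# The waveform values are made up; only the conversion logic matters.
import numpy as np
import torch

waveform = torch.tensor([[0.02, -0.05, 0.03]])          # raw synthesiser output
normalised = waveform / torch.max(torch.abs(waveform))  # peak-normalise to [-1, 1]
pcm16 = (normalised * np.iinfo(np.int16).max).numpy().astype(np.int16)
print(pcm16)  # roughly [[ 13106 -32767  19660]]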