Upload 7 files
- app.py +19 -0
- examples/Example 01.mp3 +0 -0
- examples/Example 02.mp3 +0 -0
- examples/Example 03.mp3 +0 -0
- examples/Example 04.mp3 +0 -0
- examples/Example 05.mp3 +0 -0
- helper_function.py +40 -0
app.py
ADDED
@@ -0,0 +1,19 @@
+import gradio as gr
+from helper_function import speech_to_speech_translation, examples
+
+demo = gr.Blocks()
+
+title = 'Audio translator 🇧🇷 ➡️ 🇺🇸'
+description = 'A stacked approach for translating audio from Portuguese to English'
+
+translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(label='Input', sources=['upload', 'microphone'], type='filepath'),
+    outputs=[gr.Audio(label='Output', type='numpy'), gr.Textbox(label="Translation")],
+    flagging_mode='never',
+    examples=examples,
+    title=title,
+    description=description
+)
+
+translate.launch()
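For a quick check outside the web UI, the helper and the example list can be exercised directly. A minimal local smoke test (not part of the commit), assuming the examples/ folder from this upload sits in the working directory:

# Minimal local smoke test: run the pipeline on the first bundled example clip.
from helper_function import speech_to_speech_translation, examples

(rate, waveform), text = speech_to_speech_translation(examples[0])
print(rate, waveform.shape, text)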
examples/Example 01.mp3
ADDED
Binary file (19.7 kB)

examples/Example 02.mp3
ADDED
Binary file (20.8 kB)

examples/Example 03.mp3
ADDED
Binary file (54 kB)

examples/Example 04.mp3
ADDED
Binary file (33.7 kB)

examples/Example 05.mp3
ADDED
Binary file (43.9 kB)
helper_function.py
ADDED
@@ -0,0 +1,40 @@
+
+import numpy as np
+import torch
+from transformers import pipeline
+from transformers import VitsModel, VitsTokenizer
+from IPython.display import Audio
+from pathlib import Path
+
+examples = [str(p) for p in Path('examples').glob('*.mp3')]
+examples.sort()
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+pipe = pipeline(
+    'automatic-speech-recognition', model='openai/whisper-base', device=device,
+)
+
+model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+
+target_dtype = np.int16
+max_range = np.iinfo(target_dtype).max
+
+def speech_to_speech_translation(filepath):
+    print(filepath)
+    translation = pipe(filepath, max_new_tokens=256, generate_kwargs={'task': 'translate'})['text']
+
+    inputs = tokenizer(translation, return_tensors="pt")
+    input_ids = inputs["input_ids"]
+
+    model.eval()
+    with torch.inference_mode():
+        outputs = model(input_ids)
+
+    speech = outputs["waveform"]
+    synthesised_speech = speech / torch.max(torch.abs(speech))  # Normalise to [-1, 1]
+    synthesised_speech = (synthesised_speech * max_range).numpy().astype(target_dtype)
+
+    Audio(synthesised_speech, rate=16000)
+
+    return (16000, synthesised_speech.squeeze()), translation
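The last two waveform lines are where the Gradio-ready int16 audio is produced: the scaling must be applied to the peak-normalised tensor, not the raw model output, otherwise quiet synthesiser output maps to near-silent integers. A standalone illustration of that conversion, with hypothetical values:

# Illustration of the float -> int16 PCM conversion used above.
# The waveform values are made up; only the conversion logic matters.
import numpy as np
import torch

waveform = torch.tensor([[0.02, -0.05, 0.03]])          # raw synthesiser output
normalised = waveform / torch.max(torch.abs(waveform))  # peak-normalise to [-1, 1]
pcm16 = (normalised * np.iinfo(np.int16).max).numpy().astype(np.int16)
print(pcm16)  # roughly [[ 13106 -32767  19660]]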