

import contextlib
import os
import tempfile

import gradio as gr
import numpy as np
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment


# Module-level recognizer, created once and reused by every transcription call.
recognizer = sr.Recognizer()


def _temp_path(suffix):
    """Create an empty temporary file, close it, and return its path."""
    # Closing the handle before anyone else writes to the path avoids sharing
    # violations on platforms that lock open files.
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def _remove_quietly(path):
    """Delete *path*, ignoring OS errors so cleanup never masks a real failure."""
    with contextlib.suppress(OSError):
        os.remove(path)


def _transcribe(audio_path):
    """Return the text recognized in the audio file at *audio_path*."""
    # Down-mix to mono at 16 kHz before handing the audio to the recognizer.
    sound = AudioSegment.from_file(audio_path).set_channels(1).set_frame_rate(16000)

    wav_path = _temp_path(".wav")
    try:
        sound.export(wav_path, format="wav")
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
    finally:
        # The recording is held in memory now; the file is no longer needed,
        # and must be removed even when export/record raised.
        _remove_quietly(wav_path)

    return recognizer.recognize_google(audio_data)


def _synthesize_french(text):
    """Speak *text* with gTTS (``lang="fr"``) and return ``(frame_rate, samples)``."""
    mp3_path = _temp_path(".mp3")
    try:
        gTTS(text=text, lang="fr").save(mp3_path)
        audio_out = AudioSegment.from_file(mp3_path, format="mp3")
    finally:
        _remove_quietly(mp3_path)

    samples = np.array(audio_out.get_array_of_samples()).astype(np.float32)

    # Peak-normalize, but leave silent or empty audio untouched instead of
    # dividing by zero (which would fill the output with NaN).
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    if peak > 0.0:
        samples /= peak

    # pydub interleaves the channels of multi-channel audio in one flat array;
    # expose them as (samples, channels) so the frame rate stays meaningful.
    if audio_out.channels > 1:
        samples = samples.reshape(-1, audio_out.channels)

    return audio_out.frame_rate, samples


def speech_to_speech_translation(audio_path):
    """Transcribe an audio file and return the transcript spoken by a French voice.

    NOTE(review): no translation step is performed here. The transcript
    returned by ``recognize_google`` is passed unchanged to gTTS with
    ``lang="fr"``, so the words are not converted to French. Adding a real
    translation step needs a translation backend that this file does not
    import.

    Args:
        audio_path: Path to the input audio file, or ``None`` when no audio
            was supplied.

    Returns:
        ``None`` when ``audio_path`` is ``None``. Otherwise a
        ``(frame_rate, samples)`` tuple, where ``samples`` is a float32 NumPy
        array scaled so that its largest magnitude is 1.0 (silent audio is
        returned unscaled).

    Raises:
        Any exception raised by pydub, the speech recognizer or gTTS is
        propagated unchanged; temporary files are removed first.
    """
    if audio_path is None:
        return None

    text = _transcribe(audio_path)
    return _synthesize_french(text)


# Gradio UI: the input audio reaches the callback as a file path, and the
# callback's (frame_rate, samples) tuple is played back as the output audio.
demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath", label="Input speech"),
    outputs=gr.Audio(type="numpy", label="French speech output"),
    title="Speech-to-Speech Translation (French)",
    allow_flagging="never",
)


# Launch the web UI only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()