lini / app.py
explorer7's picture
Update app.py
b240c75 verified
import gradio as gr
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
import numpy as np
import tempfile
import os
# Module-level recognizer instance, reused by every call to
# speech_to_speech_translation() instead of building a new one per request.
recognizer = sr.Recognizer()
def speech_to_speech_translation(audio_path):
    """Transcribe a recording and speak the transcript with a French TTS voice.

    The input is converted to mono 16 kHz WAV, transcribed with the Google
    Web Speech recognizer, synthesized with gTTS (``lang="fr"``) and decoded
    back into a NumPy array.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or ``None``
            when no audio was provided.

    Returns:
        ``None`` when ``audio_path`` is ``None``; otherwise a
        ``(frame_rate, samples)`` tuple where ``samples`` is a float32 array
        peak-normalized to the range [-1, 1] (left untouched when the audio
        is empty or entirely silent). Multi-channel output is shaped
        ``(n_frames, n_channels)``.

    Raises:
        gr.Error: If the speech could not be understood or the recognition
            service could not be reached.
    """
    if audio_path is None:
        return None

    # Every intermediate file lives in one throwaway directory, so cleanup
    # happens even when recognition, synthesis or decoding raises.
    with tempfile.TemporaryDirectory() as work_dir:
        wav_path = os.path.join(work_dir, "input.wav")
        mp3_path = os.path.join(work_dir, "output.mp3")

        # Convert input to mono 16 kHz wav for the recognizer.
        sound = AudioSegment.from_file(audio_path)
        sound = sound.set_channels(1).set_frame_rate(16000)
        # export() hands back the file object it opened; close it so no
        # handle outlives the temporary directory.
        sound.export(wav_path, format="wav").close()

        # Speech -> text (free Google STT)
        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data)
        except sr.UnknownValueError as err:
            raise gr.Error("Could not understand the audio.") from err
        except sr.RequestError as err:
            raise gr.Error(f"Speech recognition request failed: {err}") from err

        # Text -> French-voice speech.
        # NOTE(review): there is no translation step here; the recognized
        # text is passed to gTTS unchanged and only read with lang="fr".
        tts = gTTS(text=text, lang="fr")
        tts.save(mp3_path)
        audio_out = AudioSegment.from_file(mp3_path, format="mp3")

    samples = np.array(audio_out.get_array_of_samples()).astype(np.float32)

    # pydub interleaves channels in a flat array; expose them as columns.
    if audio_out.channels > 1:
        samples = samples.reshape(-1, audio_out.channels)

    # Peak-normalize, skipping empty/silent audio to avoid 0/0 -> NaN.
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    if peak > 0.0:
        samples /= peak

    return (audio_out.frame_rate, samples)
# Gradio UI: one audio input (delivered to the handler as a file path) and
# one audio output (returned by the handler as a (rate, ndarray) tuple).
_speech_input = gr.Audio(type="filepath", label="Input speech")
_speech_output = gr.Audio(type="numpy", label="French speech output")

demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=_speech_input,
    outputs=_speech_output,
    title="Speech-to-Speech Translation (French)",
    allow_flagging="never",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()