"""Gradio demo: transcribe speech with Google STT and speak it back with gTTS.

NOTE(review): despite the UI title, no text translation step exists here.
The recognised text is handed unchanged to gTTS with ``lang="fr"``, so it is
read aloud with a French voice rather than translated into French. Adding a
real translation step would require an extra dependency.
"""

import os
import tempfile

import gradio as gr
import numpy as np
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment

recognizer = sr.Recognizer()


def _reserve_temp_path(suffix):
    """Create an empty temp file with *suffix* and return its path.

    The handle is closed before returning so other libraries can reopen the
    path for writing (required on Windows). The caller must delete the file.
    """
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name


def _remove_quietly(path):
    """Delete *path*; a file that is already gone is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def speech_to_speech_translation(audio_path):
    """Transcribe the audio at *audio_path* and synthesise it as speech.

    Args:
        audio_path: Path to an audio file in any format pydub can decode,
            or ``None`` when the UI submitted no audio.

    Returns:
        ``None`` if *audio_path* is ``None``; otherwise a tuple
        ``(sample_rate, samples)`` where ``samples`` is a float32 NumPy array
        peak-normalised to [-1, 1] (left untouched if the audio is silent).
        Multi-channel audio is shaped ``(n_samples, n_channels)``.

    Raises:
        gr.Error: If the speech could not be understood or the recognition
            service could not be reached.
    """
    if audio_path is None:
        return None

    # Speech -> text (free Google STT). The recogniser needs a WAV file, so
    # the input is first converted to 16 kHz mono.
    wav_path = _reserve_temp_path(".wav")
    try:
        sound = AudioSegment.from_file(audio_path)
        sound = sound.set_channels(1).set_frame_rate(16000)
        # export() returns the file object it opened; close it right away so
        # the path can be reopened and later removed.
        sound.export(wav_path, format="wav").close()

        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
    except sr.UnknownValueError as err:
        raise gr.Error("Could not understand the audio.") from err
    except sr.RequestError as err:
        raise gr.Error(f"Speech recognition service failed: {err}") from err
    finally:
        # Always clean up, including when decoding or recognition fails.
        _remove_quietly(wav_path)

    # Text -> speech with a French voice (see module NOTE: no translation).
    mp3_path = _reserve_temp_path(".mp3")
    try:
        gTTS(text=text, lang="fr").save(mp3_path)
        # from_file loads the whole clip into memory, so the temp file can be
        # removed as soon as this returns.
        audio_out = AudioSegment.from_file(mp3_path, format="mp3")
    finally:
        _remove_quietly(mp3_path)

    samples = np.array(audio_out.get_array_of_samples()).astype(np.float32)
    if audio_out.channels > 1:
        # pydub interleaves channels; Gradio expects (samples, channels).
        samples = samples.reshape(-1, audio_out.channels)

    # Guard against empty or all-zero audio, which would otherwise raise or
    # fill the output with NaNs.
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples /= peak

    return (audio_out.frame_rate, samples)


demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath", label="Input speech"),
    outputs=gr.Audio(type="numpy", label="French speech output"),
    title="Speech-to-Speech Translation (French)",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()