import gradio as gr
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
import numpy as np
import tempfile
import os

# Shared recognizer instance, created once at import time and reused by
# every call to speech_to_speech_translation.
recognizer = sr.Recognizer()

def _make_temp_path(suffix):
    """Create an empty temporary file and return its path.

    The descriptor is closed immediately so other libraries can reopen the
    path on every platform. The caller is responsible for deleting the file.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    return path


def _remove_quietly(path):
    """Delete *path*, ignoring the case where it is already gone or locked."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: a leftover temp file must not mask the real
        # result (or the real exception) of the request being processed.
        pass


def speech_to_speech_translation(audio_path):
    """Transcribe the speech in an audio file and re-synthesize it with gTTS.

    The input is converted to 16 kHz mono WAV, transcribed with the free
    Google Web Speech API, and the resulting text is spoken by gTTS using its
    French voice.

    NOTE(review): no text translation step happens here -- the recognized
    text is passed to gTTS unchanged, so English input is read aloud with a
    French voice rather than translated into French. Confirm whether a
    translation step is missing.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or ``None``
            when no audio was provided.

    Returns:
        ``None`` when ``audio_path`` is ``None``; otherwise a tuple
        ``(frame_rate, samples)`` where ``samples`` is a float32 NumPy array
        peak-normalized to the range [-1, 1] (all zeros stay zeros).

    Raises:
        Whatever the underlying libraries raise (for example
        ``sr.UnknownValueError`` when the speech cannot be understood or
        ``sr.RequestError`` when the recognition service is unreachable).
        Temporary files are removed even when an exception propagates.
    """
    if audio_path is None:
        return None

    # Speech -> text (free Google STT) on a 16 kHz mono WAV copy of the input.
    wav_path = _make_temp_path(".wav")
    try:
        sound = AudioSegment.from_file(audio_path)
        sound = sound.set_channels(1).set_frame_rate(16000)
        # export() returns the file object it opened; close it so the handle
        # is not leaked and the file can be deleted afterwards.
        sound.export(wav_path, format="wav").close()

        with sr.AudioFile(wav_path) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
    finally:
        _remove_quietly(wav_path)

    # Text -> speech using the French gTTS voice.
    mp3_path = _make_temp_path(".mp3")
    try:
        gTTS(text=text, lang="fr").save(mp3_path)
        audio_out = AudioSegment.from_file(mp3_path, format="mp3")
        samples = np.array(audio_out.get_array_of_samples()).astype(np.float32)
    finally:
        _remove_quietly(mp3_path)

    # Peak-normalize, guarding against empty or all-zero (silent) audio,
    # which would otherwise raise or fill the output with NaN.
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    if peak > 0.0:
        samples /= peak

    return (audio_out.frame_rate, samples)

# Gradio UI: one audio input (handed to the callback as a file path) and one
# audio output (built from the (sample_rate, ndarray) tuple the callback
# returns). The components are created input-first, as before.
_input_audio = gr.Audio(type="filepath", label="Input speech")
_output_audio = gr.Audio(type="numpy", label="French speech output")

demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=_input_audio,
    outputs=_output_audio,
    title="Speech-to-Speech Translation (French)",
    allow_flagging="never",
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()