"""Gradio demo: transcribe microphone audio with Whisper, then speak the text with a Hindi TTS voice."""

import tempfile
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline

# Load models once at import time so every request reuses them.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
tts = pipeline("text-to-speech", model="facebook/mms-tts-hin")


def _to_mono_float32(audio_data: np.ndarray) -> np.ndarray:
    """Return *audio_data* as a mono float32 array scaled to a peak of 1.0."""
    samples = np.asarray(audio_data)
    # Multi-channel input is averaged over axis 1 (assumed to be the
    # channel axis, i.e. shape (n_samples, n_channels)).
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    samples = samples.astype(np.float32)

    # Peak-normalize; skip pure silence to avoid dividing by zero.
    peak = float(np.max(np.abs(samples))) if samples.size else 0.0
    if peak > 0.0:
        samples = samples / peak
    return samples


def speech_to_speech(
    audio: Optional[Tuple[int, np.ndarray]],
) -> Tuple[str, Optional[str]]:
    """Transcribe the recorded audio and synthesize the transcript as speech.

    Args:
        audio: ``(sample_rate, samples)`` as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.

    Returns:
        ``(text, wav_path)`` where ``text`` is the recognized transcript and
        ``wav_path`` is the path of the synthesized WAV file. ``wav_path`` is
        ``None`` when there was no audio or nothing was recognized.
    """
    # Submitting without a recording hands us None instead of a tuple.
    if audio is None:
        return "", None

    sample_rate, audio_data = audio
    samples = _to_mono_float32(audio_data)
    if samples.size == 0:
        return "", None

    # Speech -> text. The dict form carries the real sampling rate so the
    # pipeline can resample to what the model expects; a bare array would be
    # assumed to already be at the model's rate.
    # NOTE(review): resampling inside the pipeline needs torchaudio installed
    # -- confirm it is in the deployment requirements.
    result = asr({"sampling_rate": int(sample_rate), "raw": samples})
    text = result["text"].strip()
    if not text:
        return "", None

    # Text -> speech (Hindi voice).
    # NOTE(review): Whisper transcribes in the spoken language, while this TTS
    # model presumably expects Hindi text -- confirm whether non-Hindi speech
    # should be translated or rejected before synthesis.
    speech = tts(text)

    # The TTS pipeline may return the waveform with a leading batch axis,
    # e.g. (1, n_samples); wavfile.write would treat that as one frame with
    # n_samples channels, so flatten to 1-D first.
    waveform = np.squeeze(np.asarray(speech["audio"]))

    # Use a unique file per request so concurrent users do not overwrite each
    # other's output. delete=False keeps the file around after this function
    # returns so the UI can still read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wavfile.write(tmp, speech["sampling_rate"], waveform)
        output_path = tmp.name

    return text, output_path


demo = gr.Interface(
    fn=speech_to_speech,
    inputs=gr.Audio(type="numpy", label="Speak here"),
    outputs=[
        gr.Textbox(label="Recognized Text"),
        gr.Audio(label="Hindi Speech Output"),
    ],
    title="Speech to Speech AI (Hindi)",
    description="Speak into the mic, AI listens and replies in Hindi",
)

if __name__ == "__main__":
    demo.launch()