# TestS2S / app.py
# Dinoking's picture
# commit 4a0144d verified
import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline
# Build both Hugging Face pipelines at module level so they are created once
# and shared by every call to the handler below.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)
tts = pipeline(
    task="text-to-speech",
    model="facebook/mms-tts-hin",
)
def speech_to_speech(audio):
    """Transcribe recorded speech and speak the transcript with the Hindi TTS voice.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.
            ``samples`` is either 1-D (mono) or 2-D with channels on axis 1.

    Returns:
        A ``(text, wav_path)`` tuple: the recognized text and the path of the
        synthesized WAV file. ``wav_path`` is ``None`` (and no file is written)
        when there is nothing to synthesize, i.e. no recording, an empty
        recording, or an empty transcription.
    """
    # Gradio hands over None when the user submits without recording anything.
    if audio is None:
        return "", None

    sample_rate, audio_data = audio
    audio_data = np.asarray(audio_data)
    # A zero-length recording cannot be normalized (np.max would raise).
    if audio_data.size == 0:
        return "", None

    # ---- Bring the recording into the format the ASR pipeline expects ----
    # Down-mix multi-channel audio to mono.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    audio_data = audio_data.astype(np.float32)
    # Peak-normalize to [-1, 1]. The epsilon avoids division by zero on pure
    # silence; using a Python float keeps the result float32 on every NumPy
    # version (a NumPy float64 scalar would promote the array under NumPy 2).
    peak = float(np.max(np.abs(audio_data)))
    audio_data = audio_data / (peak + 1e-9)

    # Speech -> Text. The sampling rate must travel with the samples in the
    # dict input form; it is not a keyword argument of the pipeline call. The
    # pipeline resamples to the model's rate when the two differ.
    result = asr({"sampling_rate": sample_rate, "raw": audio_data})
    text = result["text"]

    # Nothing recognized: skip synthesis instead of feeding TTS an empty string.
    if not text.strip():
        return text, None

    # Text -> Speech (Hindi voice)
    speech = tts(text)

    # NOTE(review): the TTS waveform may carry a leading batch axis, e.g.
    # (1, N); scipy would read that as one sample with N channels. Squeezing
    # is a no-op for an already 1-D waveform.
    waveform = np.squeeze(np.asarray(speech["audio"]))

    # Save output
    wavfile.write("output.wav", speech["sampling_rate"], waveform)
    return text, "output.wav"
# Gradio UI: a single audio input wired to speech_to_speech, whose two return
# values fill the text box and the audio player, in that order.
demo = gr.Interface(
    title="Speech to Speech AI (Hindi)",
    description="Speak into the mic, AI listens and replies in Hindi",
    fn=speech_to_speech,
    inputs=gr.Audio(label="Speak here", type="numpy"),
    outputs=[
        gr.Textbox(label="Recognized Text"),
        gr.Audio(label="Hindi Speech Output"),
    ],
)

# Start serving the interface.
demo.launch()