|
|
import gradio as gr |
|
|
import numpy as np |
|
|
import scipy.io.wavfile as wavfile |
|
|
from transformers import pipeline |
|
|
|
|
|
|
|
|
# Whisper-small: multilingual automatic speech recognition for the mic input.
# Loaded once at import time; model weights are downloaded on first run.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")


# Facebook MMS text-to-speech, Hindi variant: synthesizes the reply audio.
tts = pipeline("text-to-speech", model="facebook/mms-tts-hin")
|
|
|
|
|
def speech_to_speech(audio):
    """Transcribe microphone audio with Whisper, then speak it back in Hindi.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` pair from a Gradio ``type="numpy"`` Audio
        input, or ``None`` when the user submitted without recording.

    Returns
    -------
    tuple[str, str | None]
        The recognized text and the path of the synthesized WAV file
        (``None`` when there was no input audio).
    """
    # Gradio delivers None if the mic was never used — fail soft, not with
    # a TypeError on unpacking.
    if audio is None:
        return "", None

    sample_rate, audio_data = audio

    # Down-mix stereo/multichannel to mono: the ASR pipeline expects a
    # 1-D waveform.
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    audio_data = audio_data.astype(np.float32)

    # Peak-normalize to [-1, 1]; the epsilon on the peak (not inside the
    # max) avoids division by zero on silent input.
    audio_data = audio_data / (np.max(np.abs(audio_data)) + 1e-9)

    # The ASR pipeline takes the sampling rate via a dict input and
    # resamples to 16 kHz itself. Passing `sampling_rate=` as a call kwarg
    # is not part of the pipeline API, and a bare array is assumed to
    # already be at the model's 16 kHz rate.
    result = asr({"raw": audio_data, "sampling_rate": sample_rate})
    text = result["text"]

    speech = tts(text)

    # MMS-TTS returns audio shaped (1, n); squeeze so wavfile writes a
    # normal 1-D mono track instead of a one-frame n-channel file.
    wavfile.write(
        "output.wav",
        speech["sampling_rate"],
        np.squeeze(speech["audio"]),
    )

    return text, "output.wav"
|
|
|
|
|
# Build the UI: one microphone input, two outputs (transcript + Hindi audio).
text_out = gr.Textbox(label="Recognized Text")
audio_out = gr.Audio(label="Hindi Speech Output")

demo = gr.Interface(
    fn=speech_to_speech,
    inputs=gr.Audio(type="numpy", label="Speak here"),
    outputs=[text_out, audio_out],
    title="Speech to Speech AI (Hindi)",
    description="Speak into the mic, AI listens and replies in Hindi",
)

demo.launch()