File size: 1,300 Bytes
cc77ee4
4a0144d
cc77ee4
 
 
 
 
5e9c08a
cc77ee4
 
b688c52
 
 
4a0144d
 
 
 
 
 
 
 
 
 
 
cc77ee4
b688c52
5e9c08a
cc77ee4
5e9c08a
cc77ee4
 
b688c52
cc77ee4
 
 
 
 
 
b688c52
cc77ee4
 
5e9c08a
cc77ee4
5e9c08a
b688c52
cc77ee4
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import tempfile

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from transformers import pipeline

# Load models once at import time (both download weights on first run).
# Whisper-small handles the speech -> text step; mms-tts-hin synthesizes
# Hindi speech from text. NOTE(review): loading at module level means the
# app blocks on model download before the UI starts — confirm acceptable.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
tts = pipeline("text-to-speech", model="facebook/mms-tts-hin")

def speech_to_speech(audio):
    """Transcribe recorded audio with Whisper, then speak the text in Hindi.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        Gradio ``type="numpy"`` audio input: ``(sample_rate, samples)``.
        ``None`` when the user submits without recording anything.

    Returns
    -------
    tuple[str, str | None]
        The recognized text and the path to the synthesized WAV file
        (``None`` when no audio was provided).
    """
    # Guard: Gradio passes None if the user never recorded anything.
    if audio is None:
        return "No audio received - please record something.", None

    sample_rate, audio_data = audio

    # ---- FIX AUDIO FORMAT ----
    # Downmix stereo to mono: the ASR pipeline expects a 1-D waveform.
    if len(audio_data.shape) > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Whisper expects float32 samples roughly in [-1, 1].
    audio_data = audio_data.astype(np.float32)

    # Peak-normalize; the +1e-9 avoids division by zero on silent input.
    audio_data = audio_data / np.max(np.abs(audio_data) + 1e-9)

    # Speech -> Text. Raw samples plus their rate must be passed as a dict;
    # `sampling_rate=` is not an accepted keyword of the pipeline __call__,
    # so the original call would raise (or mis-resample the audio).
    result = asr({"raw": audio_data, "sampling_rate": sample_rate})
    text = result["text"]

    # Text -> Speech (Hindi voice).
    speech = tts(text)

    # TTS pipelines return audio shaped (1, n); squeeze to 1-D so
    # wavfile.write emits n mono samples rather than one n-channel frame.
    audio_out = np.squeeze(np.asarray(speech["audio"]))

    # Write to a unique temp file: a fixed "output.wav" is clobbered when
    # two requests run concurrently under Gradio's request threading.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        out_path = f.name
    wavfile.write(out_path, speech["sampling_rate"], audio_out)

    return text, out_path

# Wire the pipeline function into a simple Gradio UI: one microphone input,
# two outputs (the transcription text and the synthesized Hindi audio).
demo = gr.Interface(
    fn=speech_to_speech,
    # type="numpy" delivers (sample_rate, ndarray), matching speech_to_speech.
    inputs=gr.Audio(type="numpy", label="Speak here"),
    outputs=[
        gr.Textbox(label="Recognized Text"),
        # Receives the WAV file path returned by speech_to_speech.
        gr.Audio(label="Hindi Speech Output")
    ],
    title="Speech to Speech AI (Hindi)",
    description="Speak into the mic, AI listens and replies in Hindi"
)

# Start the local Gradio server (blocks until interrupted).
demo.launch()