# Voice-Assistant / app.py
# Udyan's picture
# Update app.py
# 2f853b3 verified
import gradio as gr
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
import torch
import numpy as np
# Speech → Text: Whisper-based automatic speech recognition pipeline.
stt = pipeline(task="automatic-speech-recognition", model="openai/whisper-base")

# LLM: small GPT-2 variant used to generate the assistant's reply.
llm = pipeline(task="text-generation", model="distilgpt2")

# Text → Speech: SpeechT5 tokenizer/feature processor and acoustic model,
# both loaded from the same checkpoint.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Simple default speaker embedding: one random 512-dimensional vector drawn
# once at start-up.
# NOTE(review): this is random noise, not a real speaker x-vector, so the
# synthesized voice differs on every restart — consider loading a fixed
# embedding instead.
speaker_embeddings = torch.randn((1, 512))
# Sample rate the Whisper pipeline assumes when it is handed a bare ndarray.
_STT_SAMPLE_RATE = 16000
# Sample rate reported for the synthesized reply (SpeechT5 + HiFi-GAN).
_TTS_SAMPLE_RATE = 16000
# Result returned whenever there is nothing to transcribe.
_NO_AUDIO = ("No audio", "No audio", None)

# Lazily-filled cache so the vocoder is loaded at most once per process.
_vocoder_cache = {}


def _get_vocoder():
    """Return the SpeechT5 HiFi-GAN vocoder, loading it on first use."""
    if "model" not in _vocoder_cache:
        # Local import keeps this block independent of the file's import list.
        from transformers import SpeechT5HifiGan

        _vocoder_cache["model"] = SpeechT5HifiGan.from_pretrained(
            "microsoft/speecht5_hifigan"
        )
    return _vocoder_cache["model"]


def _prepare_for_stt(sample_rate, samples):
    """Convert a recording to mono float32 in [-1, 1] at 16 kHz."""
    samples = np.asarray(samples).astype(np.float32)

    # Stereo recordings arrive as (n_samples, n_channels); the ASR pipeline
    # only accepts a single channel, so average the channels.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Peak-normalize; skip the division for pure silence to avoid 0/0.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples /= peak

    # A bare ndarray is taken by the pipeline to already be at its expected
    # rate, so resample here. Linear interpolation has no anti-aliasing
    # filter, but it needs no extra dependency.
    if sample_rate != _STT_SAMPLE_RATE:
        n_in = samples.shape[0]
        n_out = max(1, int(round(n_in * _STT_SAMPLE_RATE / sample_rate)))
        src_times = np.arange(n_in) / sample_rate
        dst_times = np.arange(n_out) / _STT_SAMPLE_RATE
        samples = np.interp(dst_times, src_times, samples).astype(np.float32)

    return samples


def voice_assistant(audio):
    """Transcribe a recording, generate a text reply, and speak it.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple, where
            ``samples`` is a NumPy array of shape ``(n,)`` or
            ``(n, channels)``.

    Returns:
        A ``(recognized_text, reply_text, reply_audio)`` tuple.
        ``reply_audio`` is ``(16000, int16_waveform)``, or ``None`` when
        there was no input audio or the model produced an empty reply.
    """
    if audio is None:
        return _NO_AUDIO

    # 1. Unpack the tuple
    sr, y = audio
    y = np.asarray(y)
    if y.size == 0:
        # An empty recording would make np.max() raise; treat it as no audio.
        return _NO_AUDIO

    # 2. Mono, float32, normalized, 16 kHz
    y = _prepare_for_stt(sr, y)

    # 3. Speech → Text
    speech_text = stt(y)["text"]

    # AI response. return_full_text=False drops the prompt from the output so
    # the reply does not start by repeating the user's own words.
    response = llm(
        speech_text,
        max_new_tokens=60,
        return_full_text=False,
    )[0]["generated_text"].strip()

    if not response:
        # Nothing to say, so nothing to synthesize.
        return speech_text, response, None

    # Prepare text for TTS
    inputs = processor(text=response, return_tensors="pt")

    # Without a vocoder generate_speech() returns a mel spectrogram, not a
    # waveform; the HiFi-GAN vocoder converts it to audio samples.
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=_get_vocoder(),
    )
    audio_output = speech.cpu().numpy()

    # Keep samples inside [-1, 1] so the int16 conversion cannot wrap around.
    audio_output = np.clip(audio_output, -1.0, 1.0)

    # Scale to 16-bit PCM
    audio_output = (audio_output * 32767).astype(np.int16)

    return speech_text, response, (_TTS_SAMPLE_RATE, audio_output)
# Input widget: microphone recording, handed to the callback in Gradio's
# "numpy" format (unpacked by voice_assistant as a (rate, samples) pair).
_mic_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak here")

# Output widgets, in the same order as voice_assistant's return tuple:
# transcript, generated reply text, synthesized reply audio.
_reply_outputs = [
    gr.Textbox(label="Recognized Speech"),
    gr.Textbox(label="AI Response"),
    gr.Audio(label="Voice Reply"),
]

# Wire the callback and widgets into a single-page UI.
iface = gr.Interface(
    fn=voice_assistant,
    inputs=_mic_input,
    outputs=_reply_outputs,
    title="Voice AI Assistant",
    description="Speak and the assistant will respond with voice",
)

# Serve on all network interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)