Spaces:
Sleeping
Sleeping
File size: 2,043 Bytes
import gradio as gr
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
import torch
import numpy as np
# Speech recognition: Whisper (base checkpoint) behind the ASR pipeline.
stt = pipeline(task="automatic-speech-recognition", model="openai/whisper-base")

# Text generation: DistilGPT-2 continues whatever prompt it is given.
llm = pipeline(task="text-generation", model="distilgpt2")

# Speech synthesis: the processor tokenizes the text and the model generates
# speech from it; both come from the same SpeechT5 checkpoint.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Placeholder speaker embedding: a (1, 512) tensor of random normal values,
# drawn once when the module loads, so it differs on every start.
# NOTE(review): this is not a real speaker x-vector; the SpeechT5 examples load
# one from a speaker-embedding dataset. Consider doing the same for a stable,
# natural-sounding voice.
speaker_embeddings = torch.randn(1, 512)
_vocoder = None


def _get_vocoder():
    """Return the SpeechT5 HiFi-GAN vocoder, loading it on first use."""
    global _vocoder
    if _vocoder is None:
        # Local import keeps this block independent of the file-level imports.
        from transformers import SpeechT5HifiGan

        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return _vocoder


def _prepare_waveform(samples):
    """Convert recorded samples to a mono float32 array with a peak of 1.

    Multi-dimensional input is averaged over axis 1 (assumes Gradio's
    ``(samples, channels)`` layout). Silent or empty input is returned
    unscaled so no division by zero occurs. The caller's array is never
    modified in place.
    """
    wave = np.asarray(samples, dtype=np.float32)
    if wave.ndim > 1:
        wave = wave.mean(axis=1)
    peak = np.max(np.abs(wave)) if wave.size else 0.0
    if peak > 0:
        wave = wave / peak
    return wave


def voice_assistant(audio):
    """Transcribe a recording, generate a text reply, and speak the reply.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by the Gradio
            audio input, or ``None`` when nothing was recorded.

    Returns:
        A ``(recognized_text, response_text, reply_audio)`` tuple.
        ``reply_audio`` is ``(16000, int16_waveform)``, or ``None`` when
        there was no usable input audio (both texts are then ``"No audio"``).
    """
    if audio is None:
        return "No audio", "No audio", None

    sr, y = audio
    y = _prepare_waveform(y)
    if y.size == 0:
        # A zero-length recording has nothing to transcribe.
        return "No audio", "No audio", None

    # Pass the real sample rate along with the samples: a bare array is taken
    # to be at the model's own rate already, which microphone captures
    # usually are not.
    # NOTE(review): the pipeline's resampling path is understood to need
    # torchaudio -- confirm it is installed in the deployment environment.
    speech_text = stt({"sampling_rate": sr, "raw": y})["text"]

    # The generated text is used as-is for both the display and the voice reply.
    response = llm(speech_text, max_new_tokens=60)[0]["generated_text"]

    inputs = processor(text=response, return_tensors="pt")
    # Without a vocoder, generate_speech returns a mel spectrogram instead of
    # a waveform; the vocoder turns it into playable audio.
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=_get_vocoder(),
    )
    audio_output = speech.cpu().numpy()

    # Clamp to [-1, 1] so the int16 scaling below cannot overflow.
    audio_output = np.clip(audio_output, -1.0, 1.0)
    # Scale to 16-bit PCM.
    audio_output = (audio_output * 32767).astype(np.int16)

    # SpeechT5 outputs at 16000Hz
    return speech_text, response, (16000, audio_output)
# Gradio UI: one microphone input feeding voice_assistant, and three outputs
# matching its (recognized text, response text, reply audio) return tuple.
iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(
        sources=["microphone"],
        type="numpy",  # handler receives (sample_rate, ndarray)
        label="Speak here",
    ),
    outputs=[
        gr.Textbox(label="Recognized Speech"),
        gr.Textbox(label="AI Response"),
        gr.Audio(label="Voice Reply"),
    ],
    title="Voice AI Assistant",
    description="Speak and the assistant will respond with voice",
)

# Listen on all interfaces, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)