"""Gradio voice assistant: speech -> text -> language-model reply -> speech."""

import gradio as gr
import numpy as np
import torch
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)

# Sample rate reported to Gradio for the synthesized reply.
TTS_SAMPLE_RATE = 16000

# Speech -> Text
stt = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
)

# LLM
llm = pipeline(
    "text-generation",
    model="distilgpt2",
)

# Text -> Speech
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
# Without a vocoder, generate_speech() returns a mel spectrogram rather than
# a playable waveform, so the HiFi-GAN vocoder is required here.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Simple default speaker embedding: random values drawn once at start-up.
# NOTE(review): SpeechT5 is normally driven by a real speaker x-vector; a
# random vector presumably gives an unnatural voice -- replace when one is
# available.
speaker_embeddings = torch.randn(1, 512)


def _prepare_audio(audio):
    """Return ``(sample_rate, samples)`` as mono, peak-normalized float32.

    ``audio`` is the ``(sample_rate, ndarray)`` tuple produced by
    ``gr.Audio(type="numpy")``. Multi-channel input is averaged down to a
    single channel; silent or empty input is returned without scaling.
    """
    sample_rate, samples = audio
    # astype() copies, so the in-place scaling below never touches the
    # caller's array.
    samples = np.asarray(samples).astype(np.float32)
    if samples.ndim > 1:
        # Recordings with several channels arrive as (samples, channels).
        samples = samples.mean(axis=1)
    if samples.size == 0:
        return sample_rate, samples
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples /= peak
    return sample_rate, samples


def _to_pcm16(waveform):
    """Convert a float waveform to 16-bit PCM, clipping to [-1.0, 1.0]."""
    clipped = np.clip(waveform, -1.0, 1.0)  # keep the int16 scaling in range
    return (clipped * 32767).astype(np.int16)


def voice_assistant(audio):
    """Transcribe ``audio``, generate a text reply and synthesize it.

    Args:
        audio: ``(sample_rate, ndarray)`` tuple from the Gradio microphone
            component, or ``None`` when nothing was recorded.

    Returns:
        A ``(recognized_text, reply_text, reply_audio)`` tuple, where
        ``reply_audio`` is ``(TTS_SAMPLE_RATE, int16 ndarray)`` or ``None``
        when there is nothing to speak.
    """
    if audio is None:
        return "No audio", "No audio", None

    sample_rate, samples = _prepare_audio(audio)
    if samples.size == 0:
        # A zero-length recording carries nothing to transcribe.
        return "No audio", "No audio", None

    # Speech -> Text. Passing the sampling rate lets the pipeline resample to
    # the rate the model expects instead of assuming the input already matches
    # (resampling inside the pipeline relies on torchaudio).
    speech_text = stt({"sampling_rate": sample_rate, "raw": samples})["text"]
    if not speech_text.strip():
        # Nothing was recognized; do not prompt the LLM with empty text.
        return speech_text, "", None

    # AI response. return_full_text=False drops the prompt from the output so
    # the reply is not prefixed with the user's own words.
    response = llm(
        speech_text,
        max_new_tokens=60,
        return_full_text=False,
    )[0]["generated_text"]
    if not response.strip():
        return speech_text, response, None

    # Text -> Speech
    inputs = processor(text=response, return_tensors="pt")
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=vocoder,
    )
    audio_output = _to_pcm16(speech.cpu().numpy())

    return speech_text, response, (TTS_SAMPLE_RATE, audio_output)


iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(
        sources=["microphone"],
        type="numpy",
        label="Speak here",
    ),
    outputs=[
        gr.Textbox(label="Recognized Speech"),
        gr.Textbox(label="AI Response"),
        gr.Audio(label="Voice Reply"),
    ],
    title="Voice AI Assistant",
    description="Speak and the assistant will respond with voice",
)

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)