Spaces:
Sleeping
Sleeping
import gradio as gr
import numpy as np
import torch
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)
# Checkpoints used by the three stages of the assistant.
_STT_CHECKPOINT = "openai/whisper-base"
_LLM_CHECKPOINT = "distilgpt2"
_TTS_CHECKPOINT = "microsoft/speecht5_tts"

# Speech -> text
stt = pipeline(task="automatic-speech-recognition", model=_STT_CHECKPOINT)

# Text -> text (the "assistant" reply)
llm = pipeline(task="text-generation", model=_LLM_CHECKPOINT)

# Text -> speech
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
tts_model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)

# Placeholder speaker embedding: one row of 512 random values.
# NOTE(review): the values are unseeded, so they differ on every start, and a
# random vector is presumably not a real speaker identity -- consider loading
# a proper x-vector instead; confirm against the SpeechT5 documentation.
speaker_embeddings = torch.randn(1, 512)
# Sample rate reported to Gradio for the synthesised reply (SpeechT5 emits
# 16 kHz audio).
_REPLY_SAMPLE_RATE = 16000

# Result returned whenever there is nothing to transcribe.
_NO_AUDIO = ("No audio", "No audio", None)

# HiFi-GAN vocoder, loaded on first use and then reused by later requests.
_vocoder = None


def _get_vocoder():
    """Return the SpeechT5 HiFi-GAN vocoder, loading it on the first call."""
    global _vocoder
    if _vocoder is None:
        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return _vocoder


def _to_mono_float(samples):
    """Return *samples* as a peak-normalised, single-channel float32 array."""
    mono = np.asarray(samples).astype(np.float32)
    if mono.ndim > 1:
        # Gradio delivers multi-channel recordings as (samples, channels);
        # the speech recogniser expects a single channel.
        mono = mono.mean(axis=1)
    peak = np.max(np.abs(mono)) if mono.size else 0.0
    if peak > 0:
        mono = mono / peak
    return mono


def voice_assistant(audio):
    """Transcribe a recording, generate a text reply and speak it.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``. ``samples`` may be 1-D (mono) or 2-D
            ``(samples, channels)``.

    Returns:
        A ``(recognized_text, reply_text, reply_audio)`` tuple. ``reply_audio``
        is ``(16000, int16_samples)``, or ``None`` when there was no input
        audio or the language model produced no text.
    """
    if audio is None:
        return _NO_AUDIO

    sample_rate, samples = audio
    samples = _to_mono_float(samples)
    if samples.size == 0:
        # A zero-length recording has nothing to transcribe (and np.max would
        # raise on it), so treat it like a missing recording.
        return _NO_AUDIO

    # Pass the real sample rate along with the samples: a bare array is
    # assumed to already be at the model's rate, which is rarely true for
    # microphone input. The pipeline resamples when the rates differ
    # (transformers relies on torchaudio for that step).
    speech_text = stt({"sampling_rate": sample_rate, "raw": samples})["text"]

    # return_full_text=False keeps only the newly generated continuation;
    # otherwise the reply would start by repeating the user's own words.
    response = llm(
        speech_text,
        max_new_tokens=60,
        return_full_text=False,
    )[0]["generated_text"].strip()
    if not response:
        # Nothing to speak.
        return speech_text, response, None

    inputs = processor(text=response, return_tensors="pt")
    # Without a vocoder generate_speech returns a log-mel spectrogram rather
    # than a waveform, which is not playable audio.
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=_get_vocoder(),
    )
    waveform = speech.cpu().numpy()

    # Keep samples inside [-1, 1] so the int16 conversion below cannot wrap.
    waveform = np.clip(waveform, -1.0, 1.0)
    # Scale to 16-bit PCM for playback.
    pcm = (waveform * 32767).astype(np.int16)

    return speech_text, response, (_REPLY_SAMPLE_RATE, pcm)
# Input: a microphone recording handed to the callback as (sample_rate, ndarray).
microphone_input = gr.Audio(
    sources=["microphone"],
    type="numpy",
    label="Speak here",
)

# Outputs, in the order voice_assistant returns them.
result_components = [
    gr.Textbox(label="Recognized Speech"),
    gr.Textbox(label="AI Response"),
    gr.Audio(label="Voice Reply"),
]

iface = gr.Interface(
    fn=voice_assistant,
    inputs=microphone_input,
    outputs=result_components,
    title="Voice AI Assistant",
    description="Speak and the assistant will respond with voice",
)

# Listen on all interfaces on port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)