File size: 2,043 Bytes
c6b9d3a
b165f66
 
29b525c
c6b9d3a
f2fca2e
7096802
 
 
c6b9d3a
 
b165f66
7096802
f2fca2e
 
7096802
 
29b525c
b165f66
 
 
29b525c
 
7096802
 
 
528c943
3a3acd6
 
 
 
 
 
 
 
 
 
 
528c943
 
7096802
 
b165f66
7096802
 
20f9cf6
b165f66
 
 
 
 
 
550e262
528c943
1ddb113
2f853b3
 
 
 
 
 
 
b165f66
1ddb113
550e262
 
7096802
d39dd52
 
 
 
 
7096802
 
 
 
 
 
3487b88
550e262
 
baeecda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
import torch
import numpy as np

# Speech → Text: Whisper (base checkpoint) behind the generic ASR pipeline.
stt = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
)

# LLM: small GPT-2 variant used to produce the assistant's textual reply.
llm = pipeline(
    "text-generation",
    model="distilgpt2",
)

# Text → Speech: the processor tokenises the reply text, the model turns the
# tokens into speech features conditioned on a speaker embedding.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Placeholder speaker embedding of shape (1, 512).
# - A dedicated, seeded generator keeps the synthetic "voice" identical across
#   restarts without touching torch's global RNG state.
# - The vector is scaled to unit L2 norm because the x-vectors SpeechT5 is
#   normally conditioned on are L2-normalised; a raw randn sample has a norm
#   of roughly sqrt(512) ≈ 22.6, far outside that range.
# TODO(review): replace with a real x-vector for a natural-sounding speaker.
_speaker_rng = torch.Generator().manual_seed(0)
speaker_embeddings = torch.nn.functional.normalize(
    torch.randn(1, 512, generator=_speaker_rng),
    dim=-1,
)


# Lazily-created HiFi-GAN vocoder shared by all requests (see _get_vocoder).
_vocoder = None


def _get_vocoder():
    """Return the SpeechT5 HiFi-GAN vocoder, loading it on first use."""
    global _vocoder
    if _vocoder is None:
        # Function-scope import keeps this block self-contained; transformers
        # is already a dependency of this file.
        from transformers import SpeechT5HifiGan

        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return _vocoder


def _prepare_waveform(samples):
    """Return *samples* as mono float32 with its peak scaled to 1.0."""
    samples = np.asarray(samples, dtype=np.float32)
    if samples.ndim > 1:
        # Multi-channel recordings arrive as (samples, channels); mix to mono.
        samples = samples.mean(axis=1)
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        # Out-of-place division so the caller's array is never mutated.
        samples = samples / peak
    return samples


def voice_assistant(audio):
    """Transcribe a recording, generate a text reply and speak it.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by a
            ``gr.Audio(type="numpy")`` input, or ``None`` when nothing was
            recorded.

    Returns:
        A 3-tuple ``(recognized_text, reply_text, reply_audio)``.
        ``reply_audio`` is ``(16000, int16 ndarray)`` or ``None`` when there
        is nothing to speak (no recording, no recognised speech, or an empty
        reply).
    """
    if audio is None:
        return "No audio", "No audio", None

    # 1. Unpack the tuple and bring the samples into the form Whisper expects.
    sr, y = audio
    y = _prepare_waveform(y)
    if y.size == 0:
        # A zero-length recording carries no speech; treat it like no input.
        return "No audio", "No audio", None

    # 2. Speech → Text. The recording's own sample rate is passed along so the
    #    pipeline can resample to the model's rate; a bare array would be
    #    interpreted as already being at the model's rate.
    #    NOTE(review): resampling inside the pipeline relies on torchaudio
    #    being installed when sr differs from the model's rate — confirm.
    speech_text = stt({"sampling_rate": sr, "raw": y})["text"].strip()
    if not speech_text:
        return speech_text, "No speech detected", None

    # 3. AI response. return_full_text=False keeps only the continuation;
    #    otherwise the reply would start by repeating the user's own words.
    response = llm(
        speech_text,
        max_new_tokens=60,
        return_full_text=False,
    )[0]["generated_text"].strip()
    if not response:
        return speech_text, response, None

    # 4. Text → Speech. Without a vocoder generate_speech returns a mel
    #    spectrogram rather than a waveform, so the vocoder must be supplied.
    inputs = processor(text=response, return_tensors="pt")
    speech = tts_model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=_get_vocoder(),
    )
    audio_output = speech.cpu().numpy()

    # 5. Limit samples to [-1, 1] so the int16 conversion below cannot wrap.
    audio_output = np.clip(audio_output, -1.0, 1.0)

    # 6. Scale to 16-bit PCM, which audio players handle reliably.
    audio_output = (audio_output * 32767).astype(np.int16)

    # 7. SpeechT5 / HiFi-GAN output is sampled at 16000 Hz.
    return speech_text, response, (16000, audio_output)


# Widgets are declared up front so the Interface call below reads as wiring.
# Input: a microphone recorder that hands the callback a numpy payload.
_mic_input = gr.Audio(sources=["microphone"], type="numpy", label="Speak here")

# Outputs, in the order voice_assistant returns them:
# transcription, text reply, spoken reply.
_result_widgets = [
    gr.Textbox(label="Recognized Speech"),
    gr.Textbox(label="AI Response"),
    gr.Audio(label="Voice Reply"),
]

iface = gr.Interface(
    fn=voice_assistant,
    inputs=_mic_input,
    outputs=_result_widgets,
    title="Voice AI Assistant",
    description="Speak and the assistant will respond with voice",
)

# Listen on every network interface, port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)