File size: 2,961 Bytes
a350173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
import numpy as np
from utils.speech_to_text import SpeechRecognizer
from utils.text_to_speech import TextToSpeech

class VoiceChatApp:
    """Gradio voice chat demo: microphone audio in, spoken canned reply out.

    Transcription is delegated to ``SpeechRecognizer.transcribe`` and
    synthesis to ``TextToSpeech.synthesize``; both calls are awaited.
    """

    # Format of the placeholder clip returned when there is nothing to answer.
    SILENCE_SAMPLE_RATE = 24000
    SILENCE_NUM_SAMPLES = 24000  # one second at SILENCE_SAMPLE_RATE

    def __init__(self):
        """Create the speech engines and the greeting spoken on page load."""
        self.speech_recognizer = SpeechRecognizer()
        self.tts_engine = TextToSpeech()
        self.welcome_message = "Hello, this is GenCent AI calling. This is a follow-up call. Am I speaking to Alex?"
        # Initial history of the most recent page load; the live per-session
        # history travels through the Chatbot component, not this attribute.
        self.chat_history = []

    @classmethod
    def _silence(cls):
        """Return a ``(sample_rate, samples)`` tuple holding an all-zero int16 clip."""
        return (
            cls.SILENCE_SAMPLE_RATE,
            np.zeros(cls.SILENCE_NUM_SAMPLES, dtype=np.int16),
        )

    @staticmethod
    def _to_int16(audio_data):
        """Convert synthesized samples to int16 PCM.

        Floating-point input whose peak magnitude is at most 1.0 is treated
        as normalized audio and scaled to the int16 range first; a plain
        cast would truncate every such sample to -1, 0 or 1. Any other input
        is cast unchanged.
        """
        samples = np.asarray(audio_data)
        looks_normalized = (
            np.issubdtype(samples.dtype, np.floating)
            and samples.size > 0
            and np.abs(samples).max() <= 1.0
        )
        if looks_normalized:
            samples = samples * np.iinfo(np.int16).max
        return samples.astype(np.int16)

    async def welcome_audio(self):
        """Synthesize the welcome message for a freshly loaded page.

        Returns:
            A ``(history, audio)`` pair: a new chat history containing only
            the welcome message as an assistant turn, and the synthesized
            clip as ``(sample_rate, int16 samples)``.
        """
        sample_rate, audio_data = await self.tts_engine.synthesize(self.welcome_message)
        # Rebind rather than append: this runs on every page load, so
        # appending to one shared list would repeat the greeting and leak it
        # between sessions.
        self.chat_history = [(None, self.welcome_message)]
        # Hand out a copy so callers cannot mutate the instance attribute.
        return list(self.chat_history), (sample_rate, self._to_int16(audio_data))

    async def process_audio(self, audio, history):
        """Transcribe one user recording and answer it with speech.

        Args:
            audio: The recording to transcribe, passed straight to
                ``SpeechRecognizer.transcribe``; ``None`` means no recording.
            history: Current list of ``(user_text, reply_text)`` pairs, or
                ``None`` when the chat is empty. It is not modified.

        Returns:
            A ``(history, audio, None)`` triple. When there is no recording
            or the transcription is empty, ``history`` is returned as given
            together with a silent clip. Otherwise a new history list with
            the exchange appended is returned with the synthesized reply.
            The trailing ``None`` is wired to the microphone input in
            ``launch`` to reset it.
        """
        if audio is None:
            return history, self._silence(), None

        # Speech to text
        text_input = await self.speech_recognizer.transcribe(audio)
        if not text_input:
            return history, self._silence(), None

        # Generate response (fixed placeholder text for now)
        response = "This is a test response. Please confirm if you can hear this clearly."

        # Text to speech
        sample_rate, audio_data = await self.tts_engine.synthesize(response)
        audio_response = (sample_rate, self._to_int16(audio_data))

        # Build a new history instead of mutating the argument; an empty
        # chat may arrive as None.
        updated_history = list(history or [])
        updated_history.append((text_input, response))

        return updated_history, audio_response, None

    def launch(self):
        """Build the Gradio Blocks UI, wire its events and start the server."""
        with gr.Blocks(title="Voice-Enabled Chatbot") as interface:
            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(label="Chat History", height=400)
                    audio_input = gr.Audio(sources=["microphone"], type="numpy",
                                        label="Speak Here", interactive=True)
                    audio_output = gr.Audio(label="Assistant Response", autoplay=True, elem_classes="compact-audio")

            # Initial welcome message, generated on every page load
            interface.load(
                fn=self.welcome_audio,
                outputs=[chatbot, audio_output]
            )

            # Audio processing chain: answer the recording, then clear the
            # microphone input.
            # NOTE(review): clearing audio_input may itself re-fire `change`
            # with audio=None, in which case process_audio answers with the
            # silent clip - confirm against the Gradio version in use.
            audio_input.change(
                fn=self.process_audio,
                inputs=[audio_input, chatbot],
                outputs=[chatbot, audio_output, audio_input],
                api_name="process_audio"
            ).then(
                lambda: None,
                None,
                audio_input,
                queue=False
            )

        interface.launch(
            server_name="127.0.0.1",
            server_port=7860,
            share=True,
            debug=True
        )

if __name__ == "__main__":
    # Script entry point: build the app and start serving the UI.
    VoiceChatApp().launch()