|
|
import gradio as gr |
|
|
import numpy as np |
|
|
from utils.speech_to_text import SpeechRecognizer |
|
|
from utils.text_to_speech import TextToSpeech |
|
|
|
|
|
class VoiceChatApp:
    """Gradio voice-chat demo: transcribes microphone input and replies with TTS audio."""

    # Sample rate (Hz) of the silent placeholder clip returned when there is
    # nothing to say (no recording / empty transcription).
    _SILENCE_SAMPLE_RATE = 24000

    def __init__(self):
        # Project-local speech-to-text / text-to-speech engines.
        self.speech_recognizer = SpeechRecognizer()
        self.tts_engine = TextToSpeech()
        # Scripted greeting spoken when the page loads.
        self.welcome_message = (
            "Hello, this is GenCent AI calling. This is a follow-up call. "
            "Am I speaking to Alex?"
        )
        # NOTE(review): history lives on the app instance, so it is shared by
        # every browser session connected to this process — confirm single-user
        # use, or move it into gr.State for per-session isolation.
        self.chat_history = []

    @staticmethod
    def _silence():
        """Return a one-second silent clip as (sample_rate, int16 samples)."""
        rate = VoiceChatApp._SILENCE_SAMPLE_RATE
        return rate, np.zeros(rate, dtype=np.int16)

    async def welcome_audio(self):
        """Synthesize the welcome message and seed the chat history.

        Returns:
            (chat_history, (sample_rate, int16 samples)) for the Chatbot and
            Audio output components.
        """
        sample_rate, audio_data = await self.tts_engine.synthesize(self.welcome_message)
        audio_response = (sample_rate, audio_data.astype(np.int16))
        # Rebuild (rather than append to) the history: interface.load fires on
        # every page load, and appending stacked duplicate welcome turns on
        # reload. `None` in the user slot marks a bot-only turn.
        self.chat_history = [(None, self.welcome_message)]
        return self.chat_history, audio_response

    async def process_audio(self, audio, history):
        """Transcribe user audio, produce a reply, and return updated outputs.

        Args:
            audio: (sample_rate, samples) tuple from the gr.Audio microphone
                input, or None when the component was cleared.
            history: current Chatbot history as (user, bot) pairs.

        Returns:
            (history, (sample_rate, int16 samples), None) — the trailing None
            clears the microphone component for the next turn.
        """
        # Nothing recorded (e.g. the input was just cleared): keep the history
        # unchanged and play silence instead of raising.
        if audio is None:
            return history, self._silence(), None

        text_input = await self.speech_recognizer.transcribe(audio)
        if not text_input:
            # Transcription produced no text; treat the same as empty input.
            return history, self._silence(), None

        # TODO: replace this canned reply with a real response generator.
        response = "This is a test response. Please confirm if you can hear this clearly."

        sample_rate, audio_data = await self.tts_engine.synthesize(response)
        audio_response = (sample_rate, audio_data.astype(np.int16))

        history.append((text_input, response))
        return history, audio_response, None

    def launch(self):
        """Build the Gradio Blocks UI and start the server (blocking call)."""
        with gr.Blocks(title="Voice-Enabled Chatbot") as interface:
            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(label="Chat History", height=400)
                    audio_input = gr.Audio(sources=["microphone"], type="numpy",
                                           label="Speak Here", interactive=True)
                    audio_output = gr.Audio(label="Assistant Response", autoplay=True,
                                            elem_classes="compact-audio")

            # Greet the caller as soon as the page loads.
            interface.load(
                fn=self.welcome_audio,
                outputs=[chatbot, audio_output]
            )

            # A new recording triggers transcription + reply; the chained
            # .then(...) step clears the microphone for the next turn.
            audio_input.change(
                fn=self.process_audio,
                inputs=[audio_input, chatbot],
                outputs=[chatbot, audio_output, audio_input],
                api_name="process_audio"
            ).then(
                lambda: None,
                None,
                audio_input,
                queue=False
            )

        # NOTE(review): share=True publishes a public gradio.live tunnel even
        # though the server binds to 127.0.0.1 — confirm this exposure is
        # intended before deploying.
        interface.launch(
            server_name="127.0.0.1",
            server_port=7860,
            share=True,
            debug=True
        )
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the app and serve it (blocks until exit).
    VoiceChatApp().launch()