gencent / app /main.py
hjaved202's picture
Upload folder using huggingface_hub
a350173 verified
import gradio as gr
import numpy as np
from utils.speech_to_text import SpeechRecognizer
from utils.text_to_speech import TextToSpeech
class VoiceChatApp:
    """Gradio voice chat demo: microphone in, transcript and spoken reply out.

    The app wires a ``SpeechRecognizer`` (audio -> text) and a ``TextToSpeech``
    engine (text -> audio) into a Gradio Blocks UI. The reply text is currently
    a fixed placeholder string.
    """

    # Sample rate (Hz) and length (samples) of the silent clip returned when
    # there is nothing to say; 24000 samples at 24000 Hz is one second.
    _SILENCE_SAMPLE_RATE = 24000
    _SILENCE_NUM_SAMPLES = 24000

    def __init__(self):
        """Create the speech engines and the initial conversation state."""
        self.speech_recognizer = SpeechRecognizer()
        self.tts_engine = TextToSpeech()
        self.welcome_message = "Hello, this is GenCent AI calling. This is a follow-up call. Am I speaking to Alex?"
        # Holds the history handed out by the most recent welcome_audio() call.
        self.chat_history = []

    @classmethod
    def _silence(cls):
        """Return a ``(sample_rate, samples)`` tuple holding one second of silence."""
        return (
            cls._SILENCE_SAMPLE_RATE,
            np.zeros(cls._SILENCE_NUM_SAMPLES, dtype=np.int16),
        )

    async def welcome_audio(self):
        """Synthesize the welcome message for a freshly loaded page.

        Returns:
            A ``(history, audio)`` pair: ``history`` is a new list containing
            only the welcome message as an assistant turn, and ``audio`` is a
            ``(sample_rate, int16 samples)`` tuple for the audio output.
        """
        sample_rate, audio_data = await self.tts_engine.synthesize(self.welcome_message)
        # NOTE(review): the cast assumes synthesize() yields samples already in
        # the int16 range; float audio in [-1, 1] would truncate to silence
        # here -- confirm against TextToSpeech.
        audio_response = (sample_rate, audio_data.astype(np.int16))

        # This handler runs on every page load. Appending to a list shared by
        # the whole app made each reload (and each additional visitor) see one
        # more copy of the greeting, so start every session from a fresh list.
        self.chat_history = [(None, self.welcome_message)]
        return list(self.chat_history), audio_response

    async def process_audio(self, audio, history):
        """Transcribe the user's recording and answer with synthesized speech.

        Args:
            audio: Value of the microphone component, or ``None`` when the
                component is empty.
            history: Current chatbot history as a list of
                ``(user_text, assistant_text)`` pairs; ``None`` is treated as
                an empty history. The list passed in is not modified.

        Returns:
            A ``(history, audio_output, audio_input)`` triple. ``audio_input``
            is always ``None`` so the microphone component is cleared.
        """
        # Work on a private copy: tolerates a missing history and avoids
        # mutating the caller's list.
        history = list(history or [])

        if audio is None:
            # Gradio's `change` event also fires when a function updates the
            # component, so clearing the microphone below re-enters this
            # handler with audio=None. Returning a no-op update keeps the reply
            # that is currently playing instead of replacing it with silence.
            return history, gr.update(), None

        # Speech to text
        text_input = await self.speech_recognizer.transcribe(audio)
        if not text_input:
            # Nothing intelligible was recognised: answer with silence.
            return history, self._silence(), None

        # Generate response (placeholder until a real dialogue model is wired in)
        response = "This is a test response. Please confirm if you can hear this clearly."

        # Text to speech
        sample_rate, audio_data = await self.tts_engine.synthesize(response)
        # NOTE(review): same int16-range assumption as in welcome_audio().
        audio_response = (sample_rate, audio_data.astype(np.int16))

        # Update chat history
        history.append((text_input, response))
        return history, audio_response, None

    def launch(self):
        """Build the Gradio interface and start serving it."""
        with gr.Blocks(title="Voice-Enabled Chatbot") as interface:
            with gr.Row():
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(label="Chat History", height=400)
                    audio_input = gr.Audio(sources=["microphone"], type="numpy",
                                           label="Speak Here", interactive=True)
                    audio_output = gr.Audio(label="Assistant Response", autoplay=True, elem_classes="compact-audio")

            # Initial welcome message
            interface.load(
                fn=self.welcome_audio,
                outputs=[chatbot, audio_output]
            )

            # Audio processing chain: handle the recording, then clear the
            # microphone component once more so it is ready for the next turn.
            audio_input.change(
                fn=self.process_audio,
                inputs=[audio_input, chatbot],
                outputs=[chatbot, audio_output, audio_input],
                api_name="process_audio"
            ).then(
                lambda: None,
                None,
                audio_input,
                queue=False
            )

        interface.launch(
            server_name="127.0.0.1",
            server_port=7860,
            share=True,
            debug=True
        )
if __name__ == "__main__":
    # Script entry point: build the voice chat app and start serving the UI.
    VoiceChatApp().launch()