Spaces:

develops20
/

VoiceSupportAgent

Sleeping

App Files Files Community

VoiceSupportAgent / app.py

develops20

Update app.py

e4674b9 verified 10 months ago

raw

history blame

12.5 kB

	import gradio as gr
	import speech_recognition as sr
	import requests
	import json
	import os
	from datetime import datetime, timedelta
	import tempfile
	import io
	import base64
	from typing import Optional, Dict, Any
	import asyncio
	import aiohttp

	# Configuration
	# All three secrets come from environment variables; os.getenv returns None
	# when a variable is unset, so each of these may be None at runtime.
	# ELEVENLABS_API_KEY is sent as the "xi-api-key" header by
	# VoiceAgent.text_to_speech and gates voice output in process_voice_input.
	ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
	# NOTE(review): GOOGLE_CALENDAR_CREDENTIALS and OPENAI_API_KEY are read here
	# but are not referenced anywhere else in the visible code.
	GOOGLE_CALENDAR_CREDENTIALS = os.getenv("GOOGLE_CALENDAR_CREDENTIALS")
	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

	# ElevenLabs configuration
	# The voice id is interpolated into the text-to-speech URL path.
	ELEVENLABS_VOICE_ID = "21m00Tcm4TlvDq8ikWAM" # Default voice, can be changed
	# Base URL; endpoint paths are appended to it by the caller.
	ELEVENLABS_API_URL = "https://api.elevenlabs.io/v1"

	class VoiceAgent:
	    """Demo voice assistant: transcribe audio, route the request, build a reply.

	    Requests are routed by simple keyword matching to one of two canned
	    handlers (calendar or general). Despite the ``process_with_mcp`` name, no
	    Model Context Protocol client is used by this class, and no calendar event
	    is ever created; the calendar handler only composes a confirmation text.
	    """

	    # A request containing any of these substrings is treated as a calendar request.
	    _CALENDAR_KEYWORDS = ("schedule", "appointment", "meeting", "calendar", "book", "reserve")

	    # Ordered (word prefix, title) pairs; the first prefix found in the text wins.
	    _TITLE_PREFIXES = (
	        ("doctor", "Doctor Appointment"),
	        ("meeting", "Meeting"),
	        ("call", "Phone Call"),
	    )

	    # Day words recognised as the date part of an appointment.
	    _DAY_WORDS = frozenset({
	        "today", "tomorrow",
	        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
	    })

	    # Characters stripped from both ends of each word before matching.
	    _EDGE_PUNCTUATION = ".,!?;:\"'()"

	    def __init__(self):
	        """Create the speech recognizer and, if the host supports it, a microphone."""
	        self.recognizer = sr.Recognizer()
	        # No method of this class records from the microphone (audio arrives as
	        # a file), and sr.Microphone() raises on hosts without PyAudio or an
	        # input device. Keep the attribute for compatibility but make it
	        # optional so constructing the agent cannot crash the app on import.
	        try:
	            self.microphone = sr.Microphone()
	        except (AttributeError, OSError):
	            self.microphone = None

	    @classmethod
	    def _tokenize(cls, text: str) -> list:
	        """Return lower-cased words of *text* with surrounding punctuation removed."""
	        stripped = (raw.strip(cls._EDGE_PUNCTUATION) for raw in text.lower().split())
	        return [token for token in stripped if token]

	    def _transcribe(self, audio_file) -> str:
	        """Blocking helper: read *audio_file* and send it to Google's recognizer."""
	        with sr.AudioFile(audio_file) as source:
	            audio = self.recognizer.record(source)
	        return self.recognizer.recognize_google(audio)

	    async def speech_to_text(self, audio_file) -> str:
	        """Convert speech to text using speech_recognition.

	        Args:
	            audio_file: Audio source accepted by ``sr.AudioFile``.

	        Returns:
	            The recognised text. On any failure the error is reported in-band
	            as a string starting with ``"Error in speech recognition: "``;
	            this method does not raise.
	        """
	        try:
	            # File decoding and the recognition request are blocking calls, so
	            # run them in the default executor instead of stalling the event loop.
	            loop = asyncio.get_running_loop()
	            return await loop.run_in_executor(None, self._transcribe, audio_file)
	        except Exception as e:
	            return f"Error in speech recognition: {str(e)}"

	    async def text_to_speech(self, text: str) -> bytes:
	        """Convert text to speech using ElevenLabs.

	        Args:
	            text: The text to synthesise.

	        Returns:
	            The raw response body (requested as ``audio/mpeg``).

	        Raises:
	            ValueError: If ``ELEVENLABS_API_KEY`` is not set.
	            RuntimeError: If the API answers with a status other than 200.
	        """
	        if not ELEVENLABS_API_KEY:
	            raise ValueError("ElevenLabs API key not found")

	        url = f"{ELEVENLABS_API_URL}/text-to-speech/{ELEVENLABS_VOICE_ID}"
	        headers = {
	            "Accept": "audio/mpeg",
	            "Content-Type": "application/json",
	            "xi-api-key": ELEVENLABS_API_KEY,
	        }
	        data = {
	            "text": text,
	            "model_id": "eleven_monolingual_v1",
	            "voice_settings": {
	                "stability": 0.5,
	                "similarity_boost": 0.5,
	            },
	        }

	        async with aiohttp.ClientSession() as session:
	            async with session.post(url, json=data, headers=headers) as response:
	                if response.status != 200:
	                    # RuntimeError is still caught by callers using `except Exception`.
	                    raise RuntimeError(f"ElevenLabs API error: {response.status}")
	                return await response.read()

	    async def process_with_mcp(self, user_input: str) -> Dict[str, Any]:
	        """Route *user_input* to the calendar or general handler.

	        Returns:
	            The handler's result dict; it always has ``"type"``, ``"response"``
	            and ``"success"`` keys.
	        """
	        if self.detect_intent(user_input) == "calendar":
	            return await self.handle_calendar_request(user_input)
	        return await self.handle_general_question(user_input)

	    def detect_intent(self, text: str) -> str:
	        """Return ``"calendar"`` if *text* contains a calendar keyword, else ``"general"``.

	        Matching is a case-insensitive substring test, so inflected forms such
	        as "scheduled" or "reschedule" also count.
	        """
	        lowered = text.lower()
	        if any(keyword in lowered for keyword in self._CALENDAR_KEYWORDS):
	            return "calendar"
	        return "general"

	    async def handle_calendar_request(self, text: str) -> Dict[str, Any]:
	        """Compose a confirmation message for an appointment request.

	        This is demo behaviour: the details are extracted from *text* and
	        echoed back, but no calendar API is called.

	        Returns:
	            A dict with ``"type"``, ``"response"`` and ``"success"``; on success
	            it also carries the extracted details under ``"event_data"``.
	        """
	        try:
	            appointment_data = self.extract_appointment_details(text)

	            event_summary = f"Appointment: {appointment_data.get('title', 'New Meeting')}"
	            event_time = appointment_data.get('time', 'TBD')

	            response_text = f"I've scheduled your {event_summary} for {event_time}. Please note: This is a demo - in production, this would create an actual Google Calendar event."

	            return {
	                "type": "calendar",
	                "response": response_text,
	                "success": True,
	                "event_data": appointment_data,
	            }
	        except Exception as e:
	            return {
	                "type": "calendar",
	                "response": f"I encountered an error while scheduling your appointment: {str(e)}",
	                "success": False,
	            }

	    def extract_appointment_details(self, text: str) -> Dict[str, str]:
	        """Extract appointment details from text (keyword based, simplified).

	        Returns:
	            A dict with ``"title"``, ``"time"`` and ``"duration"``. ``"time"`` is
	            ``"<Day> at <clock>"``, ``"<Day>"`` or ``"at <clock>"`` depending on
	            what was found, and ``"Next available slot"`` otherwise.
	        """
	        details = {
	            "title": "Meeting",
	            "time": "Next available slot",
	            "duration": "30 minutes",
	        }
	        tokens = self._tokenize(text)

	        # Match on word prefixes rather than raw substrings so that words like
	        # "automatically" or "basically" are not mistaken for "call".
	        for prefix, title in self._TITLE_PREFIXES:
	            if any(token.startswith(prefix) for token in tokens):
	                details["title"] = title
	                break

	        # Look for the day and the clock time independently so a request such
	        # as "tomorrow at 2pm" keeps both parts.
	        day = next((token for token in tokens if token in self._DAY_WORDS), None)
	        # Only a digit-bearing word that directly follows the word "at" counts
	        # as a clock time.
	        clock = next(
	            (
	                following
	                for current, following in zip(tokens, tokens[1:])
	                if current == "at" and any(char.isdigit() for char in following)
	            ),
	            None,
	        )

	        if day and clock:
	            details["time"] = f"{day.capitalize()} at {clock}"
	        elif day:
	            details["time"] = day.capitalize()
	        elif clock:
	            details["time"] = f"at {clock}"

	        return details

	    async def handle_general_question(self, text: str) -> Dict[str, Any]:
	        """Answer a general question from a small table of canned responses.

	        A response is chosen when its key appears in *text* as a whole word or
	        phrase (case-insensitive, punctuation ignored); the first matching key
	        in table order wins, and the default reply is used when none match.
	        """
	        responses = {
	            "hello": "Hello! I'm your voice assistant. I can help you schedule appointments or answer questions.",
	            "how are you": "I'm doing well, thank you! How can I help you today?",
	            "weather": "I'm a demo assistant focused on calendar management. For weather, I'd need to integrate with a weather API.",
	            "time": f"The current time is {datetime.now().strftime('%I:%M %p')}",
	            "default": "I understand you're asking about something. As a demo assistant, I can help you schedule appointments or provide basic information. What would you like to do?",
	        }

	        # Pad with spaces so keys only match on word boundaries ("time" must not
	        # match "sometimes").
	        normalized = f" {' '.join(self._tokenize(text))} "
	        response_text = responses["default"]

	        for key, response in responses.items():
	            # "default" is the fallback entry, not a trigger word.
	            if key != "default" and f" {key} " in normalized:
	                response_text = response
	                break

	        return {
	            "type": "general",
	            "response": response_text,
	            "success": True,
	        }

	# Initialize the agent
	# Single module-level instance, constructed at import time and shared by the
	# process_voice_input and process_text_input callbacks below.
	agent = VoiceAgent()

	async def process_voice_input(audio_file):
	    """Process voice input and return voice response.

	    Args:
	        audio_file: Recorded audio to transcribe, or None when nothing
	            was recorded.

	    Returns:
	        A ``(audio_path, log_text)`` tuple. ``audio_path`` is the path of a
	        temporary ``.mp3`` file holding the spoken reply, or None when no audio
	        was produced (missing recording, transcription failure, missing
	        ``ELEVENLABS_API_KEY``, or a text-to-speech error). ``log_text`` always
	        describes what happened. This function does not raise.
	    """
	    if audio_file is None:
	        return None, "Please record some audio first."

	    try:
	        text = await agent.speech_to_text(audio_file)
	        # speech_to_text reports failures in-band. Match its full message
	        # prefix, not just "Error", so a genuine utterance that happens to
	        # begin with the word "Error" is still processed.
	        if text.startswith("Error in speech recognition:"):
	            return None, text

	        result = await agent.process_with_mcp(text)
	        response_text = result["response"]
	    except Exception as e:
	        return None, f"Error processing audio: {str(e)}"

	    if not ELEVENLABS_API_KEY:
	        return None, f"You said: '{text}'\n\nResponse: {response_text}\n\n(Note: Set ELEVENLABS_API_KEY for voice output)"

	    try:
	        audio_bytes = await agent.text_to_speech(response_text)
	        # delete=False: the file must outlive this function because only its
	        # path is handed back to the caller.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
	            tmp_file.write(audio_bytes)
	    except Exception as e:
	        return None, f"Text-to-speech error: {str(e)}\n\nYou said: '{text}'\nResponse: {response_text}"

	    return tmp_file.name, f"You said: '{text}'\n\nResponse: {response_text}"

	def process_text_input(text_input):
	    """Process text input directly.

	    Args:
	        text_input: The user's message; None, empty, or whitespace-only
	            input is rejected with a prompt instead of being processed.

	    Returns:
	        The agent's response text, a prompt asking for input, or an
	        ``"Error processing text: ..."`` message. This function does not raise.
	    """
	    # Check for None/empty first: calling .strip() on None would raise
	    # outside the try block below.
	    if not text_input or not text_input.strip():
	        return "Please enter some text."

	    try:
	        # This is a synchronous callback, so drive the coroutine to completion
	        # on a fresh event loop.
	        result = asyncio.run(agent.process_with_mcp(text_input))
	        return result["response"]
	    except Exception as e:
	        return f"Error processing text: {str(e)}"

	# Build the Gradio UI: an intro banner followed by Voice, Text and About tabs.
	with gr.Blocks(title="Voice Agent - Gradio MCP Hackathon", theme=gr.themes.Soft()) as demo:
	    # Intro banner shown above the tabs.
	    gr.Markdown("""
	# 🎤 Voice Agent with MCP

	Hackathon Project: Gradio Agents & MCP Hackathon

	This lightweight voice agent can:
	- 🗣️ Process voice input and respond with voice
	- 📅 Schedule calendar appointments
	- ❓ Answer general questions
	- 🔧 Uses MCP (Model Context Protocol) for processing

	## Setup Instructions:
	1. Set `ELEVENLABS_API_KEY` environment variable for voice synthesis
	2. Set `GOOGLE_CALENDAR_CREDENTIALS` for calendar integration (optional)
	3. Try voice input or type your questions below!
	""")

	    # Voice tab: microphone recording on the left, spoken reply and log on the right.
	    with gr.Tab("🎤 Voice Mode"):
	        with gr.Row():
	            with gr.Column():
	                audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice")
	                voice_button = gr.Button("Process Voice Input", variant="primary")

	            with gr.Column():
	                audio_output = gr.Audio(label="AI Response (Voice)")
	                text_output = gr.Textbox(label="Conversation Log", lines=6, interactive=False)

	        # The handler returns (audio path, log text), matching the two outputs.
	        voice_button.click(fn=process_voice_input, inputs=[audio_input], outputs=[audio_output, text_output])

	    # Text tab: free-form message box plus canned quick actions.
	    with gr.Tab("💬 Text Mode"):
	        with gr.Row():
	            with gr.Column():
	                text_input = gr.Textbox(
	                    label="Type your message",
	                    placeholder="Ask me anything or request to schedule an appointment...",
	                    lines=3,
	                )
	                text_button = gr.Button("Send Message", variant="primary")

	            with gr.Column():
	                text_response = gr.Textbox(label="AI Response", lines=6, interactive=False)

	        text_button.click(fn=process_text_input, inputs=[text_input], outputs=[text_response])

	        # Quick actions send a fixed prompt through the same text handler and
	        # write the reply into the shared response box.
	        gr.Markdown("### Quick Actions:")
	        with gr.Row():
	            quick_hello = gr.Button("👋 Say Hello")
	            quick_time = gr.Button("🕐 What time is it?")
	            quick_appointment = gr.Button("📅 Schedule appointment tomorrow at 2pm")

	        quick_hello.click(fn=lambda: process_text_input("hello"), outputs=[text_response])
	        quick_time.click(fn=lambda: process_text_input("what time is it"), outputs=[text_response])
	        quick_appointment.click(
	            fn=lambda: process_text_input("schedule an appointment tomorrow at 2pm"),
	            outputs=[text_response],
	        )

	    # About tab: static project description.
	    with gr.Tab("ℹ️ About"):
	        gr.Markdown("""
	## About This Project

	This is a hackathon submission for the Gradio Agents & MCP Hackathon.

	### Features:
	- Voice Input/Output: Uses speech recognition and ElevenLabs TTS
	- MCP Integration: Implements Model Context Protocol for intelligent processing
	- Calendar Management: Can schedule appointments (demo mode)
	- Lightweight: Optimized for Hugging Face Spaces

	### Technologies Used:
	- Gradio: For the web interface
	- ElevenLabs: For text-to-speech synthesis
	- MCP: For intelligent request processing
	- Speech Recognition: For voice-to-text conversion

	### Environment Variables:
	- `ELEVENLABS_API_KEY`: Your ElevenLabs API key
	- `GOOGLE_CALENDAR_CREDENTIALS`: Google Calendar API credentials (optional)

	### Example Interactions:
	- "Hello, how are you?"
	- "What time is it?"
	- "Schedule a doctor appointment for tomorrow at 3pm"
	- "Book a meeting with John next Monday"
	""")

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()