develops20's picture
Update app.py
e4674b9 verified
raw
history blame
12.5 kB
import gradio as gr
import speech_recognition as sr
import requests
import json
import os
from datetime import datetime, timedelta
import tempfile
import io
import base64
from typing import Optional, Dict, Any
import asyncio
import aiohttp
# Configuration
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
GOOGLE_CALENDAR_CREDENTIALS = os.getenv("GOOGLE_CALENDAR_CREDENTIALS")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# ElevenLabs configuration
ELEVENLABS_VOICE_ID = "21m00Tcm4TlvDq8ikWAM" # Default voice, can be changed
ELEVENLABS_API_URL = "https://api.elevenlabs.io/v1"
class VoiceAgent:
def __init__(self):
self.recognizer = sr.Recognizer()
self.microphone = sr.Microphone()
async def speech_to_text(self, audio_file) -> str:
"""Convert speech to text using speech_recognition"""
try:
with sr.AudioFile(audio_file) as source:
audio = self.recognizer.record(source)
text = self.recognizer.recognize_google(audio)
return text
except Exception as e:
return f"Error in speech recognition: {str(e)}"
async def text_to_speech(self, text: str) -> bytes:
"""Convert text to speech using ElevenLabs"""
if not ELEVENLABS_API_KEY:
raise ValueError("ElevenLabs API key not found")
url = f"{ELEVENLABS_API_URL}/text-to-speech/{ELEVENLABS_VOICE_ID}"
headers = {
"Accept": "audio/mpeg",
"Content-Type": "application/json",
"xi-api-key": ELEVENLABS_API_KEY
}
data = {
"text": text,
"model_id": "eleven_monolingual_v1",
"voice_settings": {
"stability": 0.5,
"similarity_boost": 0.5
}
}
async with aiohttp.ClientSession() as session:
async with session.post(url, json=data, headers=headers) as response:
if response.status == 200:
return await response.read()
else:
raise Exception(f"ElevenLabs API error: {response.status}")
async def process_with_mcp(self, user_input: str) -> Dict[str, Any]:
"""Process user input using MCP (Model Context Protocol)"""
# Detect intent
intent = self.detect_intent(user_input)
if intent == "calendar":
return await self.handle_calendar_request(user_input)
else:
return await self.handle_general_question(user_input)
def detect_intent(self, text: str) -> str:
"""Simple intent detection"""
calendar_keywords = ["schedule", "appointment", "meeting", "calendar", "book", "reserve"]
if any(keyword in text.lower() for keyword in calendar_keywords):
return "calendar"
return "general"
async def handle_calendar_request(self, text: str) -> Dict[str, Any]:
"""Handle calendar appointment creation"""
try:
# Extract appointment details using simple parsing
# In a real implementation, you'd use NLP or LLM for better extraction
appointment_data = self.extract_appointment_details(text)
# Create calendar event (simplified - would use Google Calendar API)
event_summary = f"Appointment: {appointment_data.get('title', 'New Meeting')}"
event_time = appointment_data.get('time', 'TBD')
response_text = f"I've scheduled your {event_summary} for {event_time}. Please note: This is a demo - in production, this would create an actual Google Calendar event."
return {
"type": "calendar",
"response": response_text,
"success": True,
"event_data": appointment_data
}
except Exception as e:
return {
"type": "calendar",
"response": f"I encountered an error while scheduling your appointment: {str(e)}",
"success": False
}
def extract_appointment_details(self, text: str) -> Dict[str, str]:
"""Extract appointment details from text (simplified)"""
# This is a basic implementation - in production, use NLP/LLM
details = {
"title": "Meeting",
"time": "Next available slot",
"duration": "30 minutes"
}
# Simple keyword extraction
if "doctor" in text.lower():
details["title"] = "Doctor Appointment"
elif "meeting" in text.lower():
details["title"] = "Meeting"
elif "call" in text.lower():
details["title"] = "Phone Call"
# Extract time mentions (basic)
words = text.lower().split()
for i, word in enumerate(words):
if word in ["tomorrow", "today", "monday", "tuesday", "wednesday", "thursday", "friday"]:
details["time"] = word.capitalize()
break
elif "at" in words and i < len(words) - 1:
if any(char.isdigit() for char in words[i + 1]):
details["time"] = f"at {words[i + 1]}"
break
return details
async def handle_general_question(self, text: str) -> Dict[str, Any]:
"""Handle general questions"""
# Simple responses - in production, integrate with LLM
responses = {
"hello": "Hello! I'm your voice assistant. I can help you schedule appointments or answer questions.",
"how are you": "I'm doing well, thank you! How can I help you today?",
"weather": "I'm a demo assistant focused on calendar management. For weather, I'd need to integrate with a weather API.",
"time": f"The current time is {datetime.now().strftime('%I:%M %p')}",
"default": "I understand you're asking about something. As a demo assistant, I can help you schedule appointments or provide basic information. What would you like to do?"
}
text_lower = text.lower()
response_text = responses.get("default")
for key, response in responses.items():
if key in text_lower:
response_text = response
break
return {
"type": "general",
"response": response_text,
"success": True
}
# Initialize the agent
agent = VoiceAgent()
async def process_voice_input(audio_file):
"""Process voice input and return voice response"""
if audio_file is None:
return None, "Please record some audio first."
try:
# Convert speech to text
text = await agent.speech_to_text(audio_file)
if text.startswith("Error"):
return None, text
# Process with MCP
result = await agent.process_with_mcp(text)
response_text = result["response"]
# Convert response to speech
if ELEVENLABS_API_KEY:
try:
audio_bytes = await agent.text_to_speech(response_text)
# Save to temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
tmp_file.write(audio_bytes)
return tmp_file.name, f"You said: '{text}'\n\nResponse: {response_text}"
except Exception as e:
return None, f"Text-to-speech error: {str(e)}\n\nYou said: '{text}'\nResponse: {response_text}"
else:
return None, f"You said: '{text}'\n\nResponse: {response_text}\n\n(Note: Set ELEVENLABS_API_KEY for voice output)"
except Exception as e:
return None, f"Error processing audio: {str(e)}"
def process_text_input(text_input):
"""Process text input directly"""
if not text_input.strip():
return "Please enter some text."
try:
# Process with MCP
result = asyncio.run(agent.process_with_mcp(text_input))
return result["response"]
except Exception as e:
return f"Error processing text: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Voice Agent - Gradio MCP Hackathon", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# 🎀 Voice Agent with MCP
**Hackathon Project**: Gradio Agents & MCP Hackathon
This lightweight voice agent can:
- πŸ—£οΈ Process voice input and respond with voice
- πŸ“… Schedule calendar appointments
- ❓ Answer general questions
- πŸ”§ Uses MCP (Model Context Protocol) for processing
## Setup Instructions:
1. Set `ELEVENLABS_API_KEY` environment variable for voice synthesis
2. Set `GOOGLE_CALENDAR_CREDENTIALS` for calendar integration (optional)
3. Try voice input or type your questions below!
""")
with gr.Tab("🎀 Voice Mode"):
with gr.Row():
with gr.Column():
audio_input = gr.Audio(
sources=["microphone"],
type="filepath",
label="Record your voice"
)
voice_button = gr.Button("Process Voice Input", variant="primary")
with gr.Column():
audio_output = gr.Audio(label="AI Response (Voice)")
text_output = gr.Textbox(
label="Conversation Log",
lines=6,
interactive=False
)
voice_button.click(
fn=process_voice_input,
inputs=[audio_input],
outputs=[audio_output, text_output]
)
with gr.Tab("πŸ’¬ Text Mode"):
with gr.Row():
with gr.Column():
text_input = gr.Textbox(
label="Type your message",
placeholder="Ask me anything or request to schedule an appointment...",
lines=3
)
text_button = gr.Button("Send Message", variant="primary")
with gr.Column():
text_response = gr.Textbox(
label="AI Response",
lines=6,
interactive=False
)
text_button.click(
fn=process_text_input,
inputs=[text_input],
outputs=[text_response]
)
# Quick action buttons
gr.Markdown("### Quick Actions:")
with gr.Row():
quick_hello = gr.Button("πŸ‘‹ Say Hello")
quick_time = gr.Button("πŸ• What time is it?")
quick_appointment = gr.Button("πŸ“… Schedule appointment tomorrow at 2pm")
quick_hello.click(
fn=lambda: process_text_input("hello"),
outputs=[text_response]
)
quick_time.click(
fn=lambda: process_text_input("what time is it"),
outputs=[text_response]
)
quick_appointment.click(
fn=lambda: process_text_input("schedule an appointment tomorrow at 2pm"),
outputs=[text_response]
)
with gr.Tab("ℹ️ About"):
gr.Markdown("""
## About This Project
This is a hackathon submission for the **Gradio Agents & MCP Hackathon**.
### Features:
- **Voice Input/Output**: Uses speech recognition and ElevenLabs TTS
- **MCP Integration**: Implements Model Context Protocol for intelligent processing
- **Calendar Management**: Can schedule appointments (demo mode)
- **Lightweight**: Optimized for Hugging Face Spaces
### Technologies Used:
- **Gradio**: For the web interface
- **ElevenLabs**: For text-to-speech synthesis
- **MCP**: For intelligent request processing
- **Speech Recognition**: For voice-to-text conversion
### Environment Variables:
- `ELEVENLABS_API_KEY`: Your ElevenLabs API key
- `GOOGLE_CALENDAR_CREDENTIALS`: Google Calendar API credentials (optional)
### Example Interactions:
- "Hello, how are you?"
- "What time is it?"
- "Schedule a doctor appointment for tomorrow at 3pm"
- "Book a meeting with John next Monday"
""")
if __name__ == "__main__":
demo.launch()