Spaces:
Sleeping
Sleeping
fix issues
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import io
|
|
| 3 |
import json
|
| 4 |
import asyncio
|
| 5 |
import base64
|
|
|
|
| 6 |
from typing import Optional
|
| 7 |
|
| 8 |
import gradio as gr
|
|
@@ -15,21 +16,13 @@ try:
|
|
| 15 |
except Exception:
|
| 16 |
OPENAI_AVAILABLE = False
|
| 17 |
|
| 18 |
-
# Optional: HF transformers fallbacks
|
| 19 |
-
try:
|
| 20 |
-
from PIL import Image
|
| 21 |
-
import requests
|
| 22 |
-
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 23 |
-
HF_BLIP_AVAILABLE = True
|
| 24 |
-
except Exception:
|
| 25 |
-
HF_BLIP_AVAILABLE = False
|
| 26 |
-
|
| 27 |
# -----------------------------
|
| 28 |
# Configuration
|
| 29 |
# -----------------------------
|
| 30 |
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
| 31 |
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
|
| 32 |
HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
|
|
|
|
| 33 |
|
| 34 |
if OPENAI_API_KEY and OPENAI_AVAILABLE:
|
| 35 |
openai.api_key = OPENAI_API_KEY
|
|
@@ -38,6 +31,9 @@ if OPENAI_API_KEY and OPENAI_AVAILABLE:
|
|
| 38 |
ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL") # placeholder
|
| 39 |
ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
|
| 40 |
|
|
|
|
|
|
|
|
|
|
| 41 |
# -----------------------------
|
| 42 |
# Minimal MCP Server shim
|
| 43 |
# -----------------------------
|
|
@@ -83,16 +79,14 @@ def transcribe_with_openai(audio_file_path: str) -> str:
|
|
| 83 |
"""Transcribe audio using OpenAI Whisper (if available)."""
|
| 84 |
if not OPENAI_AVAILABLE:
|
| 85 |
return "OpenAI library not available"
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
try:
|
| 89 |
transcript = openai.Audio.transcriptions.create(model="whisper-1", file=f)
|
| 90 |
-
# Some SDKs return .text
|
| 91 |
if isinstance(transcript, dict):
|
| 92 |
return transcript.get("text", "")
|
| 93 |
return getattr(transcript, "text", "")
|
| 94 |
-
|
| 95 |
-
|
| 96 |
|
| 97 |
|
| 98 |
def transcribe_fallback(audio_file_path: str) -> str:
|
|
@@ -107,9 +101,10 @@ def transcribe_fallback(audio_file_path: str) -> str:
|
|
| 107 |
|
| 108 |
|
| 109 |
def tts_elevenlabs(text: str) -> bytes:
|
| 110 |
-
"""Call ElevenLabs API to synthesize speech. Returns raw audio bytes
|
| 111 |
if not ELEVENLABS_API_KEY:
|
| 112 |
raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
|
|
|
|
| 113 |
url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
|
| 114 |
headers = {
|
| 115 |
"xi-api-key": ELEVENLABS_API_KEY,
|
|
@@ -125,44 +120,47 @@ def tts_elevenlabs(text: str) -> bytes:
|
|
| 125 |
return resp.content
|
| 126 |
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
# Gemini Image Description
|
| 131 |
-
# -----------------------------
|
| 132 |
-
|
| 133 |
-
def describe_image_gemini(image_path: str) -> str:
|
| 134 |
-
"""Describe an image using Google Gemini Vision."""
|
| 135 |
try:
|
| 136 |
-
import
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
return "GOOGLE_GEMINI_API_KEY not set"
|
| 140 |
-
|
| 141 |
-
genai.configure(api_key=GEMINI_KEY)
|
| 142 |
-
model = genai.GenerativeModel("gemini-1.5-flash")
|
| 143 |
-
|
| 144 |
with open(image_path, "rb") as f:
|
| 145 |
image_bytes = f.read()
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
except Exception as e:
|
| 150 |
-
return f"
|
| 151 |
|
| 152 |
-
|
| 153 |
-
(image_path: str) -> str:
|
| 154 |
-
"""Attempt to describe an image using OpenAI vision
|
| 155 |
if not OPENAI_AVAILABLE:
|
| 156 |
return "OpenAI not available for image captioning"
|
| 157 |
try:
|
| 158 |
with open(image_path, "rb") as f:
|
| 159 |
-
# Example using the OpenAI image understanding endpoints (SDKs vary)
|
| 160 |
-
# We'll call the Chat Completions with system prompt and base64 image as a fallback
|
| 161 |
b64 = base64.b64encode(f.read()).decode("utf-8")
|
| 162 |
prompt = (
|
| 163 |
"You are an assistant that describes images for visually impaired users. "
|
| 164 |
-
"Provide a concise, vivid, and accessible description of the image.
|
| 165 |
-
|
| 166 |
Image(base64):" + b64
|
| 167 |
)
|
| 168 |
resp = openai.ChatCompletion.create(
|
|
@@ -172,21 +170,6 @@ Image(base64):" + b64
|
|
| 172 |
except Exception as e:
|
| 173 |
return f"OpenAI image describe error: {e}"
|
| 174 |
|
| 175 |
-
|
| 176 |
-
def describe_image_blip(image_path: str) -> str:
|
| 177 |
-
if not HF_BLIP_AVAILABLE:
|
| 178 |
-
return "HF BLIP not available in this runtime"
|
| 179 |
-
try:
|
| 180 |
-
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 181 |
-
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
|
| 182 |
-
raw_image = Image.open(image_path).convert("RGB")
|
| 183 |
-
inputs = processor(raw_image, return_tensors="pt")
|
| 184 |
-
out = model.generate(**inputs)
|
| 185 |
-
caption = processor.decode(out[0], skip_special_tokens=True)
|
| 186 |
-
return caption
|
| 187 |
-
except Exception as e:
|
| 188 |
-
return f"BLIP caption error: {e}"
|
| 189 |
-
|
| 190 |
# -----------------------------
|
| 191 |
# MCP Tools
|
| 192 |
# -----------------------------
|
|
@@ -202,25 +185,41 @@ def speak_text_tool(text: str) -> ToolResult:
|
|
| 202 |
|
| 203 |
@server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
|
| 204 |
def describe_image_tool(image_path: str) -> ToolResult:
|
| 205 |
-
#
|
| 206 |
if OPENAI_AVAILABLE:
|
| 207 |
desc = describe_image_openai(image_path)
|
| 208 |
if desc and not desc.startswith("OpenAI image describe error"):
|
| 209 |
-
return ToolResult(content=desc)
|
| 210 |
-
if
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
|
| 216 |
@server.tool(name="transcribe_audio", description="Transcribe user audio to text")
|
| 217 |
def transcribe_audio_tool(audio_path: str) -> ToolResult:
|
|
|
|
| 218 |
if OPENAI_AVAILABLE:
|
| 219 |
text = transcribe_with_openai(audio_path)
|
| 220 |
-
|
|
|
|
| 221 |
else:
|
| 222 |
text = transcribe_fallback(audio_path)
|
| 223 |
-
|
|
|
|
| 224 |
|
| 225 |
# -----------------------------
|
| 226 |
# Gradio UI (client)
|
|
@@ -229,28 +228,46 @@ def transcribe_audio_tool(audio_path: str) -> ToolResult:
|
|
| 229 |
def decode_base64_audio(b64: str) -> bytes:
|
| 230 |
return base64.b64decode(b64)
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
return "
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
|
|
|
| 249 |
gr.Markdown("# Accessibility Voice Agent β MCP Tools")
|
| 250 |
|
| 251 |
with gr.Row():
|
| 252 |
-
with gr.Column(scale=
|
| 253 |
-
chatbox = gr.Chatbot(label="Assistant")
|
| 254 |
user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
|
| 255 |
|
| 256 |
with gr.Row():
|
|
@@ -264,59 +281,106 @@ with gr.Accordion("π API Keys (stored only in session)", open=False):
|
|
| 264 |
img_upload = gr.File(label="Upload image (for description)")
|
| 265 |
img_btn = gr.Button("Describe image")
|
| 266 |
|
| 267 |
-
with gr.Column(scale=
|
| 268 |
-
gr.Markdown("###
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
# Callbacks
|
| 272 |
-
def on_send_text(text, chat_history, mic_file):
|
| 273 |
-
|
| 274 |
if mic_file:
|
| 275 |
-
|
| 276 |
-
tools_log_val = (tools_log_val + "
|
| 277 |
-
Transcribing audio...")
|
| 278 |
-
# transcribe
|
| 279 |
tr = transcribe_audio_tool(mic_file)
|
| 280 |
user_text = tr.content
|
|
|
|
|
|
|
| 281 |
else:
|
| 282 |
user_text = text
|
| 283 |
-
# append user->assistant exchange
|
| 284 |
chat_history = chat_history or []
|
| 285 |
chat_history.append((user_text, "..."))
|
| 286 |
-
|
| 287 |
-
|
|
|
|
| 288 |
# expects: "describe image: filename"
|
| 289 |
_, _, fname = user_text.partition(":")
|
| 290 |
fname = fname.strip()
|
| 291 |
if fname:
|
| 292 |
-
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
| 294 |
else:
|
| 295 |
-
assistant = "Please upload an image using the Describe Image tool."
|
| 296 |
else:
|
| 297 |
-
assistant = "I heard: " + user_text
|
|
|
|
| 298 |
chat_history[-1] = (user_text, assistant)
|
| 299 |
-
return chat_history, tools_log_val
|
| 300 |
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
-
def on_tts(text):
|
|
|
|
|
|
|
| 304 |
res = speak_text_tool(text)
|
| 305 |
if res.meta and res.meta.get("format") == "base64-audio":
|
| 306 |
audio_bytes = decode_base64_audio(res.content)
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
-
tts_btn.click(on_tts, inputs=[tts_text], outputs=[gr.Audio(label="TTS Output")])
|
| 311 |
|
| 312 |
-
def on_describe_image(file_obj):
|
| 313 |
if not file_obj:
|
| 314 |
return "No file uploaded"
|
| 315 |
-
# file_obj
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
|
| 321 |
if __name__ == "__main__":
|
| 322 |
-
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|
|
|
|
| 3 |
import json
|
| 4 |
import asyncio
|
| 5 |
import base64
|
| 6 |
+
import time
|
| 7 |
from typing import Optional
|
| 8 |
|
| 9 |
import gradio as gr
|
|
|
|
| 16 |
except Exception:
|
| 17 |
OPENAI_AVAILABLE = False
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# -----------------------------
|
| 20 |
# Configuration
|
| 21 |
# -----------------------------
|
| 22 |
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
| 23 |
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
|
| 24 |
HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
|
| 25 |
+
GOOGLE_GEMINI_API_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
|
| 26 |
|
| 27 |
if OPENAI_API_KEY and OPENAI_AVAILABLE:
|
| 28 |
openai.api_key = OPENAI_API_KEY
|
|
|
|
| 31 |
ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL") # placeholder
|
| 32 |
ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
|
| 33 |
|
| 34 |
+
# Hugging Face Inference API endpoint (for image captioning fallback)
|
| 35 |
+
HF_INFERENCE_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
|
| 36 |
+
|
| 37 |
# -----------------------------
|
| 38 |
# Minimal MCP Server shim
|
| 39 |
# -----------------------------
|
|
|
|
| 79 |
"""Transcribe audio using OpenAI Whisper (if available)."""
|
| 80 |
if not OPENAI_AVAILABLE:
|
| 81 |
return "OpenAI library not available"
|
| 82 |
+
try:
|
| 83 |
+
with open(audio_file_path, "rb") as f:
|
|
|
|
| 84 |
transcript = openai.Audio.transcriptions.create(model="whisper-1", file=f)
|
|
|
|
| 85 |
if isinstance(transcript, dict):
|
| 86 |
return transcript.get("text", "")
|
| 87 |
return getattr(transcript, "text", "")
|
| 88 |
+
except Exception as e:
|
| 89 |
+
return f"OpenAI transcription error: {e}"
|
| 90 |
|
| 91 |
|
| 92 |
def transcribe_fallback(audio_file_path: str) -> str:
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
def tts_elevenlabs(text: str) -> bytes:
|
| 104 |
+
"""Call ElevenLabs API to synthesize speech. Returns raw audio bytes."""
|
| 105 |
if not ELEVENLABS_API_KEY:
|
| 106 |
raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
|
| 107 |
+
import requests
|
| 108 |
url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
|
| 109 |
headers = {
|
| 110 |
"xi-api-key": ELEVENLABS_API_KEY,
|
|
|
|
| 120 |
return resp.content
|
| 121 |
|
| 122 |
|
| 123 |
+
def describe_image_hf(image_path: str, timeout: float = 60.0) -> str:
    """Describe an image using the hosted Hugging Face Inference API (BLIP).

    Failures are reported as text rather than raised, so callers always
    receive a string.

    Args:
        image_path: Path of the image file to caption.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The generated caption when the response carries a ``generated_text``
        field; otherwise the stringified JSON payload or the raw response
        text. On failure one of these messages is returned instead:
        ``"HUGGINGFACE_API_TOKEN not set"``, ``"HF Inference error: ..."``
        (non-200 response) or ``"HF describe error: ..."`` (any exception).
    """
    try:
        import requests

        if not HUGGINGFACE_API_TOKEN:
            return "HUGGINGFACE_API_TOKEN not set"
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        headers = {"Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}"}
        # The HF Inference API accepts files as binary. Without a timeout a
        # stalled connection would block the calling UI handler indefinitely.
        resp = requests.post(
            HF_INFERENCE_URL,
            headers=headers,
            data=image_bytes,
            timeout=timeout,
        )
        if resp.status_code != 200:
            return f"HF Inference error: {resp.status_code} {resp.text}"
        try:
            payload = resp.json()
        except ValueError:
            # Body was not JSON; hand back whatever text the endpoint sent.
            return resp.text
        # Some endpoints return [{'generated_text': '...'}]
        if isinstance(payload, list) and payload:
            first = payload[0]
            if isinstance(first, dict) and "generated_text" in first:
                return first["generated_text"]
        if isinstance(payload, dict) and "generated_text" in payload:
            return payload["generated_text"]
        # Otherwise return the payload as text
        return str(payload)
    except Exception as e:
        return f"HF describe error: {e}"
|
| 152 |
|
| 153 |
+
|
| 154 |
+
def describe_image_openai(image_path: str) -> str:
|
| 155 |
+
"""Attempt to describe an image using OpenAI vision if available."""
|
| 156 |
if not OPENAI_AVAILABLE:
|
| 157 |
return "OpenAI not available for image captioning"
|
| 158 |
try:
|
| 159 |
with open(image_path, "rb") as f:
|
|
|
|
|
|
|
| 160 |
b64 = base64.b64encode(f.read()).decode("utf-8")
|
| 161 |
prompt = (
|
| 162 |
"You are an assistant that describes images for visually impaired users. "
|
| 163 |
+
"Provide a concise, vivid, and accessible description of the image.
|
|
|
|
| 164 |
Image(base64):" + b64
|
| 165 |
)
|
| 166 |
resp = openai.ChatCompletion.create(
|
|
|
|
| 170 |
except Exception as e:
|
| 171 |
return f"OpenAI image describe error: {e}"
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
# -----------------------------
|
| 174 |
# MCP Tools
|
| 175 |
# -----------------------------
|
|
|
|
| 185 |
|
| 186 |
@server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
def describe_image_tool(image_path: str) -> ToolResult:
    """Describe an image, trying each captioning backend in priority order.

    Priority: OpenAI -> Gemini -> Hugging Face Inference -> error message.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        A ToolResult whose ``content`` is the description (or an error
        message). When a backend produced the content, ``meta["backend"]``
        is ``"openai"``, ``"gemini"`` or ``"huggingface"``; the final
        "no backend" result carries no meta.
    """
    import logging
    import mimetypes

    if OPENAI_AVAILABLE:
        desc = describe_image_openai(image_path)
        if desc and not desc.startswith("OpenAI image describe error"):
            return ToolResult(content=desc, meta={"backend": "openai"})

    # Gemini (if configured)
    if GOOGLE_GEMINI_API_KEY:
        try:
            import google.generativeai as genai

            genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
            model = genai.GenerativeModel("gemini-1.5-flash")
            with open(image_path, "rb") as f:
                image_bytes = f.read()
            # Report the real image type when the file name reveals it;
            # fall back to the previously hard-coded JPEG otherwise.
            guessed_type = mimetypes.guess_type(image_path)[0]
            if guessed_type and guessed_type.startswith("image/"):
                mime_type = guessed_type
            else:
                mime_type = "image/jpeg"
            response = model.generate_content(
                [
                    "Describe this image for a visually impaired user.",
                    {"mime_type": mime_type, "data": image_bytes},
                ]
            )
            return ToolResult(content=response.text, meta={"backend": "gemini"})
        except Exception as exc:
            # Falling through to the next backend is deliberate, but the
            # failure is logged so it does not vanish silently.
            logging.getLogger(__name__).warning(
                "Gemini image description failed: %s", exc
            )

    # Hugging Face Inference
    desc = describe_image_hf(image_path)
    # describe_image_hf reports a missing token as plain text; treating that
    # as a description would make the "no backend" message below unreachable.
    if desc and desc != "HUGGINGFACE_API_TOKEN not set":
        return ToolResult(content=desc, meta={"backend": "huggingface"})
    return ToolResult(content="No image captioning backend available. Set OPENAI_API_KEY, GOOGLE_GEMINI_API_KEY, or HUGGINGFACE_API_TOKEN.")
|
| 210 |
|
| 211 |
|
| 212 |
@server.tool(name="transcribe_audio", description="Transcribe user audio to text")
def transcribe_audio_tool(audio_path: str) -> ToolResult:
    """Transcribe an audio file and report which backend handled it.

    Uses ``transcribe_with_openai`` when ``OPENAI_AVAILABLE`` is true and
    ``transcribe_fallback`` otherwise.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ToolResult whose ``content`` is the text returned by the chosen
        transcriber and whose ``meta`` holds ``"backend"`` (``"openai"`` or
        ``"local_whisper"``) and ``"duration"`` (elapsed seconds).
    """
    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump if the system clock is adjusted mid-call.
    start = time.perf_counter()
    if OPENAI_AVAILABLE:
        backend = "openai"
        text = transcribe_with_openai(audio_path)
    else:
        backend = "local_whisper"
        text = transcribe_fallback(audio_path)
    duration = time.perf_counter() - start
    return ToolResult(content=text, meta={"backend": backend, "duration": duration})
|
| 223 |
|
| 224 |
# -----------------------------
|
| 225 |
# Gradio UI (client)
|
|
|
|
| 228 |
def decode_base64_audio(b64: str) -> bytes:
    """Decode a base64-encoded payload into its raw bytes."""
    raw_audio = base64.b64decode(b64)
    return raw_audio
|
| 230 |
|
| 231 |
+
# Hue names for the UI theme.
# NOTE(review): no use of app_theme is visible in this part of the file — confirm it is wired in.
app_theme = dict(primary_hue="blue", secondary_hue="slate")
|
| 235 |
+
|
| 236 |
+
# Helper to format tool-call explanations
|
| 237 |
+
def format_tool_log(tool_name, reason, meta, output, style="A"):
    """Format the explanation of a single tool call for the tools log.

    Args:
        tool_name: Name of the tool that ran.
        reason: Short explanation of why the tool was called.
        meta: Optional mapping; the ``"backend"`` and ``"duration"`` keys are
            read. ``None``, an empty mapping or a missing/empty backend all
            yield the backend label ``"unknown"``.
        output: Tool output; rendered with ``str()``.
        style: ``"A"`` one line with the output cut to 200 characters,
            ``"B"`` one labelled line per field, ``"C"`` a summary line
            followed by the output line. Any other value (the UI uses
            ``"D"``) returns both the simple and the detailed form.

    Returns:
        A string for styles ``"A"``, ``"B"`` and ``"C"``; otherwise a dict
        with the keys ``"simple"`` and ``"detailed"``.
    """
    meta = meta or {}
    # A meta dict without "backend" (e.g. one that only carries "format")
    # used to print "None"; fall back to "unknown" instead.
    backend = meta.get("backend") or "unknown"
    duration = meta.get("duration")

    # NOTE(review): the label prefixes below look like mis-decoded emoji
    # (mojibake) — confirm against the original file before shipping.
    simple = f"[{tool_name}] {backend} -> {str(output)[:200]}"
    if style == "A":
        return simple

    if style == "C":
        # Ultra-visual: one summary line, then the output on its own line
        summary = f"π§ {tool_name} β’ Reason: {reason} β’ Backend: {backend}"
        if duration is not None:
            summary += f" β’ {duration:.2f}s"
        summary += f"\nβ {str(output)}"
        return summary

    # Detailed human-readable form, shared by style "B" and the dict result
    lines = [f"π§ Tool: {tool_name}", f"π― Why: {reason}", f"βοΈ Backend: {backend}"]
    if duration is not None:
        lines.append(f"β± Duration: {duration:.2f}s")
    lines.append(f"π Output: {str(output)}")
    detailed = "\n".join(lines)
    if style == "B":
        return detailed

    # D -> both
    return {"simple": simple, "detailed": detailed}
|
| 264 |
|
| 265 |
+
with gr.Blocks(css=".gradio-container {background:#f7fafc}") as demo:
|
| 266 |
gr.Markdown("# Accessibility Voice Agent β MCP Tools")
|
| 267 |
|
| 268 |
with gr.Row():
|
| 269 |
+
with gr.Column(scale=3):
|
| 270 |
+
chatbox = gr.Chatbot(label="Assistant", elem_id="chatbox")
|
| 271 |
user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
|
| 272 |
|
| 273 |
with gr.Row():
|
|
|
|
| 281 |
img_upload = gr.File(label="Upload image (for description)")
|
| 282 |
img_btn = gr.Button("Describe image")
|
| 283 |
|
| 284 |
+
with gr.Column(scale=2):
|
| 285 |
+
gr.Markdown("### Tool Call Log & Explanations")
|
| 286 |
+
log_style = gr.Radio(choices=["A","B","C","D"], value="B", label="Log style (A:Simple B:Detailed C:Visual D:Both)")
|
| 287 |
+
tools_log = gr.Textbox(value="Ready.", lines=20, interactive=False, label="Tools Log")
|
| 288 |
+
tools_panel = gr.HTML("<div id='tools_panel' style='max-height:400px;overflow:auto;background:#ffffff;padding:8px;border-radius:8px;'></div>")
|
| 289 |
+
gr.Markdown("---")
|
| 290 |
+
gr.Markdown("**Tool explanations appear here each time a tool runs.**")
|
| 291 |
|
| 292 |
# Callbacks
|
| 293 |
+
def on_send_text(text, chat_history, mic_file, style):
    """Handle the Send button: transcribe audio if given, reply, and log tool calls.

    Args:
        text: Typed message; used only when ``mic_file`` is falsy.
        chat_history: Existing list of (user, assistant) pairs, or ``None``.
        mic_file: Audio file to transcribe; when truthy its transcript
            replaces ``text`` as the user message.
        style: Log style forwarded to ``format_tool_log``.

    Returns:
        A 3-tuple ``(chat_history, tools_log_text, panel_update)`` matching
        the three outputs wired to this callback: the updated chat pairs, the
        plain-text log for the textbox, and a ``gr.update`` carrying the HTML
        for the tools panel.
    """
    import html

    tools_entries = []
    if mic_file:
        # transcribe audio
        tr = transcribe_audio_tool(mic_file)
        user_text = tr.content
        tools_entries.append(
            format_tool_log("transcribe_audio", "User provided microphone audio", tr.meta or {}, tr.content, style)
        )
    else:
        user_text = text

    # demo assistant behavior
    if user_text and user_text.strip().lower().startswith("describe image:"):
        # expects: "describe image: filename"
        _, _, fname = user_text.partition(":")
        fname = fname.strip()
        if fname:
            # We assume the image was uploaded earlier and path provided.
            # NOTE(review): fname comes straight from user input and is opened
            # as a server-side path by the captioning backends — consider
            # restricting it to uploaded files.
            res = describe_image_tool(fname)
            assistant = res.content
            tools_entries.append(
                format_tool_log("describe_image", "User requested image description", res.meta or {}, res.content, style)
            )
        else:
            assistant = "Please upload an image using the Describe Image tool or provide a path like: describe image: /path/to/image.jpg"
    else:
        assistant = "I heard: " + (user_text or "(empty)")

    chat_history = chat_history or []
    chat_history.append((user_text, assistant))

    # Render every collected entry. Iterating the list (rather than reading
    # the last assigned log variable) also covers the case where no tool ran.
    text_parts = []
    html_parts = []
    for entry in tools_entries:
        if isinstance(entry, dict):
            # D style returns dict with both forms
            detailed = str(entry["detailed"])
            simple = str(entry["simple"])
            text_parts.extend([detailed, simple])
            html_parts.append(
                f"<pre>{html.escape(detailed)}</pre><hr><pre>{html.escape(simple)}</pre>"
            )
        else:
            text_parts.append(str(entry))
            # Tool output is untrusted text; escape it before embedding in HTML.
            html_parts.append(
                "<pre style='background:#f1f5f9;border-radius:6px;padding:8px;margin-bottom:8px;'>"
                f"{html.escape(str(entry))}</pre>"
            )

    if text_parts:
        log_text = "\n\n".join(text_parts)
    else:
        log_text = "No tools were called for this message."
    panel_html = "".join(html_parts)
    # The textbox output needs its new text, not the component object itself.
    return chat_history, log_text, gr.update(value=panel_html)
|
| 334 |
+
|
| 335 |
+
send_btn.click(on_send_text, inputs=[user_input, chatbox, mic, log_style], outputs=[chatbox, tools_log, tools_panel])
|
| 336 |
|
| 337 |
+
def on_tts(text, style):
    """Handle the TTS button: synthesize speech and log the tool call.

    Args:
        text: Text to speak; when empty no tool is called.
        style: Log style forwarded to ``format_tool_log``.

    Returns:
        A 2-tuple ``(audio, panel_update)``: ``audio`` is the path of a file
        holding the synthesized audio, or ``None`` when there is nothing to
        play; ``panel_update`` is a ``gr.update`` for the tools panel.
    """
    import html
    import tempfile

    if not text:
        return None, gr.update(value="No text provided")

    res = speak_text_tool(text)
    is_audio = bool(res.meta) and res.meta.get("format") == "base64-audio"
    # Never dump the base64 payload into the log; show a placeholder instead.
    shown_output = "<audio bytes>" if is_audio else res.content
    log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, shown_output, style)
    if isinstance(log, dict):
        # The "both" log style yields a dict; flatten it rather than showing its repr.
        log = f"{log['detailed']}\n\n{log['simple']}"

    background = "#eef2ff" if is_audio else "#fee2e2"
    # Escaping keeps text such as "<audio bytes>" visible instead of being
    # parsed as an HTML tag.
    panel_html = (
        f"<pre style='background:{background};padding:8px;border-radius:6px;'>"
        f"{html.escape(str(log))}</pre>"
    )
    if not is_audio:
        return None, gr.update(value=panel_html)

    audio_bytes = decode_base64_audio(res.content)
    # A (bytes, rate) pair is not a valid gr.Audio value; hand over a file
    # path instead.
    # NOTE(review): assumes the synthesized audio is MP3 — confirm against the
    # format requested in tts_elevenlabs. The temp file is not removed here.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        tmp.write(audio_bytes)
    return tmp.name, gr.update(value=panel_html)
|
| 350 |
|
| 351 |
+
tts_btn.click(on_tts, inputs=[tts_text, log_style], outputs=[gr.Audio(label="TTS Output"), tools_panel])
|
| 352 |
|
| 353 |
+
def on_describe_image(file_obj, style):
    """Handle the Describe-image button: caption the upload and log the tool call.

    Args:
        file_obj: Uploaded file; may be a tempfile-like object exposing
            ``.name`` or a plain path. Falsy when nothing was uploaded.
        style: Log style forwarded to ``format_tool_log``.

    Returns:
        A 2-tuple ``(chat_pairs, panel_update)`` matching the two outputs
        wired to this callback: a one-element list of (user, assistant) pairs
        for the chatbox and a ``gr.update`` for the tools panel.
    """
    import html

    if not file_obj:
        notice = "No file uploaded"
        # Two outputs are wired to this callback, so both must be returned
        # even on the early exit.
        return [(None, notice)], gr.update(value=f"<pre>{notice}</pre>")

    # file_obj may be a tempfile object or path
    path = getattr(file_obj, "name", file_obj)
    res = describe_image_tool(path)
    log = format_tool_log("describe_image", "User uploaded an image for description", res.meta or {}, res.content, style)
    if isinstance(log, dict):
        # The "both" log style yields a dict; flatten it rather than showing its repr.
        log = f"{log['detailed']}\n\n{log['simple']}"
    # The description is model output; escape it before embedding in HTML.
    panel_html = (
        "<pre style='background:#ecfdf5;padding:8px;border-radius:6px;'>"
        f"{html.escape(str(log))}</pre>"
    )
    # show result in chatbox as assistant reply
    return [("<image uploaded>", res.content)], gr.update(value=panel_html)
|
| 363 |
+
|
| 364 |
+
img_btn.click(on_describe_image, inputs=[img_upload, log_style], outputs=[chatbox, tools_panel])
|
| 365 |
+
|
| 366 |
+
# API Keys accordion (session-only)
|
| 367 |
+
with gr.Accordion("π API Keys (stored only in session)", open=False):
|
| 368 |
+
openai_key = gr.Textbox(label="OpenAI API Key", type="password")
|
| 369 |
+
eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
|
| 370 |
+
hf_key = gr.Textbox(label="Hugging Face API Token", type="password")
|
| 371 |
+
|
| 372 |
+
def set_keys(ok, ek, hk):
    """Store API keys entered in the UI so the backends actually use them.

    Each non-empty key is written to ``os.environ`` and to the matching
    module-level constant. The constants are what the backend helpers check,
    and they were captured at import time, so updating the environment alone
    has no effect on them.

    NOTE: ``os.environ`` and module globals are process-wide, so a saved key
    is shared by every user served by this process.

    Args:
        ok: OpenAI API key, or empty to leave it unchanged.
        ek: ElevenLabs API key, or empty to leave it unchanged.
        hk: Hugging Face API token, or empty to leave it unchanged.

    Returns:
        A status message for the UI.
    """
    global OPENAI_API_KEY, ELEVENLABS_API_KEY, HUGGINGFACE_API_TOKEN

    # Pasted keys often carry stray whitespace that would break auth headers.
    ok = (ok or "").strip()
    ek = (ek or "").strip()
    hk = (hk or "").strip()

    if ok:
        os.environ["OPENAI_API_KEY"] = ok
        OPENAI_API_KEY = ok
        if OPENAI_AVAILABLE:
            # Mirror the import-time setup, which assigns openai.api_key.
            openai.api_key = ok
    if ek:
        os.environ["ELEVENLABS_API_KEY"] = ek
        ELEVENLABS_API_KEY = ek
    if hk:
        os.environ["HUGGINGFACE_API_TOKEN"] = hk
        HUGGINGFACE_API_TOKEN = hk
    return "API keys set for this session. Refresh the page to pick them up in all runtimes."
|
| 380 |
+
|
| 381 |
+
set_btn = gr.Button("Save API Keys")
|
| 382 |
+
set_output = gr.Textbox(label="Status")
|
| 383 |
+
set_btn.click(set_keys, [openai_key, eleven_key, hf_key], [set_output])
|
| 384 |
|
| 385 |
if __name__ == "__main__":
|
| 386 |
+
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|