subhash4face committed on
Commit
ede6a4f
·
verified ·
1 Parent(s): 420030c

fix issues

Browse files
Files changed (1) hide show
  1. app.py +176 -112
app.py CHANGED
@@ -3,6 +3,7 @@ import io
3
  import json
4
  import asyncio
5
  import base64
 
6
  from typing import Optional
7
 
8
  import gradio as gr
@@ -15,21 +16,13 @@ try:
15
  except Exception:
16
  OPENAI_AVAILABLE = False
17
 
18
- # Optional: HF transformers fallbacks
19
- try:
20
- from PIL import Image
21
- import requests
22
- from transformers import BlipProcessor, BlipForConditionalGeneration
23
- HF_BLIP_AVAILABLE = True
24
- except Exception:
25
- HF_BLIP_AVAILABLE = False
26
-
27
  # -----------------------------
28
  # Configuration
29
  # -----------------------------
30
  OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
31
  ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
32
  HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
 
33
 
34
  if OPENAI_API_KEY and OPENAI_AVAILABLE:
35
  openai.api_key = OPENAI_API_KEY
@@ -38,6 +31,9 @@ if OPENAI_API_KEY and OPENAI_AVAILABLE:
38
  ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL") # placeholder
39
  ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
40
 
 
 
 
41
  # -----------------------------
42
  # Minimal MCP Server shim
43
  # -----------------------------
@@ -83,16 +79,14 @@ def transcribe_with_openai(audio_file_path: str) -> str:
83
  """Transcribe audio using OpenAI Whisper (if available)."""
84
  if not OPENAI_AVAILABLE:
85
  return "OpenAI library not available"
86
- with open(audio_file_path, "rb") as f:
87
- # Uses the OpenAI Audio transcription API (may vary by SDK version)
88
- try:
89
  transcript = openai.Audio.transcriptions.create(model="whisper-1", file=f)
90
- # Some SDKs return .text
91
  if isinstance(transcript, dict):
92
  return transcript.get("text", "")
93
  return getattr(transcript, "text", "")
94
- except Exception as e:
95
- return f"OpenAI transcription error: {e}"
96
 
97
 
98
  def transcribe_fallback(audio_file_path: str) -> str:
@@ -107,9 +101,10 @@ def transcribe_fallback(audio_file_path: str) -> str:
107
 
108
 
109
  def tts_elevenlabs(text: str) -> bytes:
110
- """Call ElevenLabs API to synthesize speech. Returns raw audio bytes (wav/mp3 depending on API)."""
111
  if not ELEVENLABS_API_KEY:
112
  raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
 
113
  url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
114
  headers = {
115
  "xi-api-key": ELEVENLABS_API_KEY,
@@ -125,44 +120,47 @@ def tts_elevenlabs(text: str) -> bytes:
125
  return resp.content
126
 
127
 
128
-
129
- def # -----------------------------
130
- # Gemini Image Description
131
- # -----------------------------
132
-
133
- def describe_image_gemini(image_path: str) -> str:
134
- """Describe an image using Google Gemini Vision."""
135
  try:
136
- import google.generativeai as genai
137
- GEMINI_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
138
- if not GEMINI_KEY:
139
- return "GOOGLE_GEMINI_API_KEY not set"
140
-
141
- genai.configure(api_key=GEMINI_KEY)
142
- model = genai.GenerativeModel("gemini-1.5-flash")
143
-
144
  with open(image_path, "rb") as f:
145
  image_bytes = f.read()
146
-
147
- response = model.generate_content(["Describe this image for a visually impaired user.", {"mime_type":"image/jpeg", "data": image_bytes}])
148
- return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  except Exception as e:
150
- return f"Gemini describe error: {e}"
151
 
152
- # (OpenAI code removed for simplicity)
153
- (image_path: str) -> str:
154
- """Attempt to describe an image using OpenAI vision (if available)."""
155
  if not OPENAI_AVAILABLE:
156
  return "OpenAI not available for image captioning"
157
  try:
158
  with open(image_path, "rb") as f:
159
- # Example using the OpenAI image understanding endpoints (SDKs vary)
160
- # We'll call the Chat Completions with system prompt and base64 image as a fallback
161
  b64 = base64.b64encode(f.read()).decode("utf-8")
162
  prompt = (
163
  "You are an assistant that describes images for visually impaired users. "
164
- "Provide a concise, vivid, and accessible description of the image."
165
-
166
  Image(base64):" + b64
167
  )
168
  resp = openai.ChatCompletion.create(
@@ -172,21 +170,6 @@ Image(base64):" + b64
172
  except Exception as e:
173
  return f"OpenAI image describe error: {e}"
174
 
175
-
176
- def describe_image_blip(image_path: str) -> str:
177
- if not HF_BLIP_AVAILABLE:
178
- return "HF BLIP not available in this runtime"
179
- try:
180
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
181
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
182
- raw_image = Image.open(image_path).convert("RGB")
183
- inputs = processor(raw_image, return_tensors="pt")
184
- out = model.generate(**inputs)
185
- caption = processor.decode(out[0], skip_special_tokens=True)
186
- return caption
187
- except Exception as e:
188
- return f"BLIP caption error: {e}"
189
-
190
  # -----------------------------
191
  # MCP Tools
192
  # -----------------------------
@@ -202,25 +185,41 @@ def speak_text_tool(text: str) -> ToolResult:
202
 
203
  @server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
204
  def describe_image_tool(image_path: str) -> ToolResult:
205
- # Prioritize OpenAI -> HF BLIP -> error
206
  if OPENAI_AVAILABLE:
207
  desc = describe_image_openai(image_path)
208
  if desc and not desc.startswith("OpenAI image describe error"):
209
- return ToolResult(content=desc)
210
- if HF_BLIP_AVAILABLE:
211
- desc = describe_image_blip(image_path)
212
- return ToolResult(content=desc)
213
- return ToolResult(content="No image captioning backend available. Set OPENAI_API_KEY or install transformers + pillow.")
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
 
216
  @server.tool(name="transcribe_audio", description="Transcribe user audio to text")
217
  def transcribe_audio_tool(audio_path: str) -> ToolResult:
 
218
  if OPENAI_AVAILABLE:
219
  text = transcribe_with_openai(audio_path)
220
- return ToolResult(content=text)
 
221
  else:
222
  text = transcribe_fallback(audio_path)
223
- return ToolResult(content=text)
 
224
 
225
  # -----------------------------
226
  # Gradio UI (client)
@@ -229,28 +228,46 @@ def transcribe_audio_tool(audio_path: str) -> ToolResult:
229
  def decode_base64_audio(b64: str) -> bytes:
230
  return base64.b64decode(b64)
231
 
232
- with gr.Blocks() as demo:
233
-
234
- with gr.Accordion("πŸ”‘ API Keys (stored only in session)", open=False):
235
- openai_key = gr.Textbox(label="OpenAI API Key", type="password")
236
- eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
237
- gemini_key = gr.Textbox(label="Gemini API Key", type="password")
238
-
239
- def set_keys(ok, ek, gk):
240
- if ok: os.environ["OPENAI_API_KEY"] = ok
241
- if ek: os.environ["ELEVENLABS_API_KEY"] = ek
242
- if gk: os.environ["GOOGLE_GEMINI_API_KEY"] = gk
243
- return "API keys set for this session."
244
-
245
- set_btn = gr.Button("Save API Keys")
246
- set_output = gr.Textbox(label="Status")
247
- set_btn.click(set_keys, [openai_key, eleven_key, gemini_key], [set_output])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
 
249
  gr.Markdown("# Accessibility Voice Agent β€” MCP Tools")
250
 
251
  with gr.Row():
252
- with gr.Column(scale=2):
253
- chatbox = gr.Chatbot(label="Assistant")
254
  user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
255
 
256
  with gr.Row():
@@ -264,59 +281,106 @@ with gr.Accordion("πŸ”‘ API Keys (stored only in session)", open=False):
264
  img_upload = gr.File(label="Upload image (for description)")
265
  img_btn = gr.Button("Describe image")
266
 
267
- with gr.Column(scale=1):
268
- gr.Markdown("### Tools Log")
269
- tools_log = gr.Textbox(value="Ready.", lines=20)
 
 
 
 
270
 
271
  # Callbacks
272
- def on_send_text(text, chat_history, mic_file):
273
- # If there's a mic file, prefer transcribing audio
274
  if mic_file:
275
- tools_log_val = tools_log.value if hasattr(tools_log, 'value') else ''
276
- tools_log_val = (tools_log_val + "
277
- Transcribing audio...")
278
- # transcribe
279
  tr = transcribe_audio_tool(mic_file)
280
  user_text = tr.content
 
 
281
  else:
282
  user_text = text
283
- # append user->assistant exchange
284
  chat_history = chat_history or []
285
  chat_history.append((user_text, "..."))
286
- # For demo: assistant echoes + uses describe_image if commands detected
287
- if user_text.strip().lower().startswith("describe image:"):
 
288
  # expects: "describe image: filename"
289
  _, _, fname = user_text.partition(":")
290
  fname = fname.strip()
291
  if fname:
292
- desc = describe_image_tool(fname)
293
- assistant = desc.content
 
 
 
294
  else:
295
- assistant = "Please upload an image using the Describe Image tool."
296
  else:
297
- assistant = "I heard: " + user_text
 
298
  chat_history[-1] = (user_text, assistant)
299
- return chat_history, tools_log_val
300
 
301
- send_btn.click(on_send_text, inputs=[user_input, chatbox, mic], outputs=[chatbox, tools_log])
 
 
 
 
 
 
 
 
 
 
 
302
 
303
- def on_tts(text):
 
 
304
  res = speak_text_tool(text)
305
  if res.meta and res.meta.get("format") == "base64-audio":
306
  audio_bytes = decode_base64_audio(res.content)
307
- return (audio_bytes, 16000)
308
- return None
 
 
 
 
 
309
 
310
- tts_btn.click(on_tts, inputs=[tts_text], outputs=[gr.Audio(label="TTS Output")])
311
 
312
- def on_describe_image(file_obj):
313
  if not file_obj:
314
  return "No file uploaded"
315
- # file_obj is a tempfile path in hf spaces; pass path to tool
316
- desc = describe_image_tool(file_obj.name if hasattr(file_obj, 'name') else file_obj)
317
- return desc.content
318
-
319
- img_btn.click(on_describe_image, inputs=[img_upload], outputs=[chatbox])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
  if __name__ == "__main__":
322
- demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
 
3
  import json
4
  import asyncio
5
  import base64
6
+ import time
7
  from typing import Optional
8
 
9
  import gradio as gr
 
16
  except Exception:
17
  OPENAI_AVAILABLE = False
18
 
 
 
 
 
 
 
 
 
 
19
  # -----------------------------
20
  # Configuration
21
  # -----------------------------
22
  OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
23
  ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
24
  HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
25
+ GOOGLE_GEMINI_API_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
26
 
27
  if OPENAI_API_KEY and OPENAI_AVAILABLE:
28
  openai.api_key = OPENAI_API_KEY
 
31
  ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL") # placeholder
32
  ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
33
 
34
+ # Hugging Face Inference API endpoint (for image captioning fallback)
35
+ HF_INFERENCE_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
36
+
37
  # -----------------------------
38
  # Minimal MCP Server shim
39
  # -----------------------------
 
79
  """Transcribe audio using OpenAI Whisper (if available)."""
80
  if not OPENAI_AVAILABLE:
81
  return "OpenAI library not available"
82
+ try:
83
+ with open(audio_file_path, "rb") as f:
 
84
  transcript = openai.Audio.transcriptions.create(model="whisper-1", file=f)
 
85
  if isinstance(transcript, dict):
86
  return transcript.get("text", "")
87
  return getattr(transcript, "text", "")
88
+ except Exception as e:
89
+ return f"OpenAI transcription error: {e}"
90
 
91
 
92
  def transcribe_fallback(audio_file_path: str) -> str:
 
101
 
102
 
103
  def tts_elevenlabs(text: str) -> bytes:
104
+ """Call ElevenLabs API to synthesize speech. Returns raw audio bytes."""
105
  if not ELEVENLABS_API_KEY:
106
  raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
107
+ import requests
108
  url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
109
  headers = {
110
  "xi-api-key": ELEVENLABS_API_KEY,
 
120
  return resp.content
121
 
122
 
123
+ def describe_image_hf(image_path: str) -> str:
124
+ """Describe an image using Hugging Face Inference API (BLIP model hosted)."""
 
 
 
 
 
125
  try:
126
+ import requests
127
+ if not HUGGINGFACE_API_TOKEN:
128
+ return "HUGGINGFACE_API_TOKEN not set"
 
 
 
 
 
129
  with open(image_path, "rb") as f:
130
  image_bytes = f.read()
131
+ headers = {
132
+ "Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}"
133
+ }
134
+ # The HF Inference API accepts files as binary
135
+ resp = requests.post(HF_INFERENCE_URL, headers=headers, data=image_bytes)
136
+ if resp.status_code != 200:
137
+ return f"HF Inference error: {resp.status_code} {resp.text}"
138
+ # Model returns JSON with 'generated_text' or a simple string depending on model
139
+ try:
140
+ j = resp.json()
141
+ # Some endpoints return [{'generated_text': '...'}]
142
+ if isinstance(j, list) and j and 'generated_text' in j[0]:
143
+ return j[0]['generated_text']
144
+ if isinstance(j, dict) and 'generated_text' in j:
145
+ return j['generated_text']
146
+ # Otherwise return text
147
+ return str(j)
148
+ except Exception:
149
+ return resp.text
150
  except Exception as e:
151
+ return f"HF describe error: {e}"
152
 
153
+
154
+ def describe_image_openai(image_path: str) -> str:
155
+ """Attempt to describe an image using OpenAI vision if available."""
156
  if not OPENAI_AVAILABLE:
157
  return "OpenAI not available for image captioning"
158
  try:
159
  with open(image_path, "rb") as f:
 
 
160
  b64 = base64.b64encode(f.read()).decode("utf-8")
161
  prompt = (
162
  "You are an assistant that describes images for visually impaired users. "
163
+ "Provide a concise, vivid, and accessible description of the image.
 
164
  Image(base64):" + b64
165
  )
166
  resp = openai.ChatCompletion.create(
 
170
  except Exception as e:
171
  return f"OpenAI image describe error: {e}"
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  # -----------------------------
174
  # MCP Tools
175
  # -----------------------------
 
185
 
186
  @server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
187
  def describe_image_tool(image_path: str) -> ToolResult:
188
+ # Priority: OpenAI -> Gemini -> Hugging Face Inference -> error
189
  if OPENAI_AVAILABLE:
190
  desc = describe_image_openai(image_path)
191
  if desc and not desc.startswith("OpenAI image describe error"):
192
+ return ToolResult(content=desc, meta={"backend":"openai"})
193
+ # Gemini (if configured)
194
+ if GOOGLE_GEMINI_API_KEY:
195
+ try:
196
+ import google.generativeai as genai
197
+ genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
198
+ model = genai.GenerativeModel("gemini-1.5-flash")
199
+ with open(image_path, "rb") as f:
200
+ image_bytes = f.read()
201
+ response = model.generate_content(["Describe this image for a visually impaired user.", {"mime_type":"image/jpeg", "data": image_bytes}])
202
+ return ToolResult(content=response.text, meta={"backend":"gemini"})
203
+ except Exception:
204
+ pass
205
+ # Hugging Face Inference
206
+ desc = describe_image_hf(image_path)
207
+ if desc:
208
+ return ToolResult(content=desc, meta={"backend":"huggingface"})
209
+ return ToolResult(content="No image captioning backend available. Set OPENAI_API_KEY, GOOGLE_GEMINI_API_KEY, or HUGGINGFACE_API_TOKEN.")
210
 
211
 
212
  @server.tool(name="transcribe_audio", description="Transcribe user audio to text")
213
  def transcribe_audio_tool(audio_path: str) -> ToolResult:
214
+ start = time.time()
215
  if OPENAI_AVAILABLE:
216
  text = transcribe_with_openai(audio_path)
217
+ duration = time.time() - start
218
+ return ToolResult(content=text, meta={"backend":"openai","duration":duration})
219
  else:
220
  text = transcribe_fallback(audio_path)
221
+ duration = time.time() - start
222
+ return ToolResult(content=text, meta={"backend":"local_whisper","duration":duration})
223
 
224
  # -----------------------------
225
  # Gradio UI (client)
 
228
  def decode_base64_audio(b64: str) -> bytes:
229
  return base64.b64decode(b64)
230
 
231
+ app_theme = {
232
+ "primary_hue": "blue",
233
+ "secondary_hue": "slate",
234
+ }
235
+
236
+ # Helper to format tool-call explanations
237
+ def format_tool_log(tool_name, reason, meta, output, style="A"):
238
+ backend = meta.get("backend") if meta else "unknown"
239
+ duration = meta.get("duration") if meta else None
240
+ if style == "A":
241
+ # Simple
242
+ return f"[{tool_name}] {backend} -> {str(output)[:200]}"
243
+ if style == "B":
244
+ # Detailed human-readable
245
+ lines = [f"πŸ”§ Tool: {tool_name}", f"🎯 Why: {reason}", f"βš™οΈ Backend: {backend}"]
246
+ if duration is not None:
247
+ lines.append(f"⏱ Duration: {duration:.2f}s")
248
+ lines.append(f"πŸ“ Output: {str(output)}")
249
+ return "
250
+ ".join(lines)
251
+ if style == "C":
252
+ # Ultra-visual
253
+ s = f"πŸ”§ {tool_name} β€’ Reason: {reason} β€’ Backend: {backend}"
254
+ if duration is not None:
255
+ s += f" β€’ {duration:.2f}s"
256
+ s += f"
257
+ β†’ {str(output)}"
258
+ return s
259
+ # D -> both
260
+ return {
261
+ "simple": f"[{tool_name}] {backend} -> {str(output)[:200]}",
262
+ "detailed": format_tool_log(tool_name, reason, meta, output, style="B")
263
+ }
264
 
265
+ with gr.Blocks(css=".gradio-container {background:#f7fafc}") as demo:
266
  gr.Markdown("# Accessibility Voice Agent β€” MCP Tools")
267
 
268
  with gr.Row():
269
+ with gr.Column(scale=3):
270
+ chatbox = gr.Chatbot(label="Assistant", elem_id="chatbox")
271
  user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
272
 
273
  with gr.Row():
 
281
  img_upload = gr.File(label="Upload image (for description)")
282
  img_btn = gr.Button("Describe image")
283
 
284
+ with gr.Column(scale=2):
285
+ gr.Markdown("### Tool Call Log & Explanations")
286
+ log_style = gr.Radio(choices=["A","B","C","D"], value="B", label="Log style (A:Simple B:Detailed C:Visual D:Both)")
287
+ tools_log = gr.Textbox(value="Ready.", lines=20, interactive=False, label="Tools Log")
288
+ tools_panel = gr.HTML("<div id='tools_panel' style='max-height:400px;overflow:auto;background:#ffffff;padding:8px;border-radius:8px;'></div>")
289
+ gr.Markdown("---")
290
+ gr.Markdown("**Tool explanations appear here each time a tool runs.**")
291
 
292
  # Callbacks
293
+ def on_send_text(text, chat_history, mic_file, style):
294
+ tools_entries = []
295
  if mic_file:
296
+ # transcribe audio
 
 
 
297
  tr = transcribe_audio_tool(mic_file)
298
  user_text = tr.content
299
+ log = format_tool_log("transcribe_audio", "User provided microphone audio", tr.meta or {}, tr.content, style)
300
+ tools_entries.append(log)
301
  else:
302
  user_text = text
 
303
  chat_history = chat_history or []
304
  chat_history.append((user_text, "..."))
305
+
306
+ # demo assistant behavior
307
+ if user_text and user_text.strip().lower().startswith("describe image:"):
308
  # expects: "describe image: filename"
309
  _, _, fname = user_text.partition(":")
310
  fname = fname.strip()
311
  if fname:
312
+ # We assume the image was uploaded earlier and path provided
313
+ res = describe_image_tool(fname)
314
+ assistant = res.content
315
+ log = format_tool_log("describe_image", "User requested image description", res.meta or {}, res.content, style)
316
+ tools_entries.append(log)
317
  else:
318
+ assistant = "Please upload an image using the Describe Image tool or provide a path like: describe image: /path/to/image.jpg"
319
  else:
320
+ assistant = "I heard: " + (user_text or "(empty)")
321
+
322
  chat_history[-1] = (user_text, assistant)
 
323
 
324
+ # update tools panel content
325
+ panel_html = ''
326
+ if isinstance(log, dict):
327
+ # D style returns dict
328
+ panel_html += f"<pre>{log['detailed']}</pre>"
329
+ panel_html += f"<hr><pre>{log['simple']}</pre>"
330
+ else:
331
+ for e in tools_entries:
332
+ panel_html += f"<pre style='background:#f1f5f9;border-radius:6px;padding:8px;margin-bottom:8px;'>{e}</pre>"
333
+ return chat_history, tools_log, gr.update(value=panel_html)
334
+
335
+ send_btn.click(on_send_text, inputs=[user_input, chatbox, mic, log_style], outputs=[chatbox, tools_log, tools_panel])
336
 
337
+ def on_tts(text, style):
338
+ if not text:
339
+ return None, gr.update(value="No text provided")
340
  res = speak_text_tool(text)
341
  if res.meta and res.meta.get("format") == "base64-audio":
342
  audio_bytes = decode_base64_audio(res.content)
343
+ log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, "<audio bytes>", style)
344
+ panel_html = f"<pre style='background:#eef2ff;padding:8px;border-radius:6px;'>{log}</pre>"
345
+ return (audio_bytes, 16000), gr.update(value=panel_html)
346
+ else:
347
+ log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, res.content, style)
348
+ panel_html = f"<pre style='background:#fee2e2;padding:8px;border-radius:6px;'>{log}</pre>"
349
+ return None, gr.update(value=panel_html)
350
 
351
+ tts_btn.click(on_tts, inputs=[tts_text, log_style], outputs=[gr.Audio(label="TTS Output"), tools_panel])
352
 
353
+ def on_describe_image(file_obj, style):
354
  if not file_obj:
355
  return "No file uploaded"
356
+ # file_obj may be a tempfile object or path
357
+ path = getattr(file_obj, 'name', file_obj)
358
+ res = describe_image_tool(path)
359
+ log = format_tool_log("describe_image", "User uploaded an image for description", res.meta or {}, res.content, style)
360
+ panel_html = f"<pre style='background:#ecfdf5;padding:8px;border-radius:6px;'>{log}</pre>"
361
+ # show result in chatbox as assistant reply
362
+ return [("<image uploaded>", res.content)], gr.update(value=panel_html)
363
+
364
+ img_btn.click(on_describe_image, inputs=[img_upload, log_style], outputs=[chatbox, tools_panel])
365
+
366
+ # API Keys accordion (session-only)
367
+ with gr.Accordion("πŸ”‘ API Keys (stored only in session)", open=False):
368
+ openai_key = gr.Textbox(label="OpenAI API Key", type="password")
369
+ eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
370
+ hf_key = gr.Textbox(label="Hugging Face API Token", type="password")
371
+
372
+ def set_keys(ok, ek, hk):
373
+ if ok:
374
+ os.environ["OPENAI_API_KEY"] = ok
375
+ if ek:
376
+ os.environ["ELEVENLABS_API_KEY"] = ek
377
+ if hk:
378
+ os.environ["HUGGINGFACE_API_TOKEN"] = hk
379
+ return "API keys set for this session. Refresh the page to pick them up in all runtimes."
380
+
381
+ set_btn = gr.Button("Save API Keys")
382
+ set_output = gr.Textbox(label="Status")
383
+ set_btn.click(set_keys, [openai_key, eleven_key, hf_key], [set_output])
384
 
385
  if __name__ == "__main__":
386
+ demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))