Spaces:

MCP-1st-Birthday
/

VisionPro

Sleeping

App Files Files Community

subhash4face committed on Nov 30, 2025

Commit

ede6a4f

verified ·

1 Parent(s): 420030c

fix issues

Browse files

Files changed (1) hide show

app.py +176 -112

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import io
 import json
 import asyncio
 import base64
 from typing import Optional
 import gradio as gr
@@ -15,21 +16,13 @@ try:
 except Exception:
     OPENAI_AVAILABLE = False
-# Optional: HF transformers fallbacks
-try:
-    from PIL import Image
-    import requests
-    from transformers import BlipProcessor, BlipForConditionalGeneration
-    HF_BLIP_AVAILABLE = True
-except Exception:
-    HF_BLIP_AVAILABLE = False
 # -----------------------------
 # Configuration
 # -----------------------------
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
 HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
 if OPENAI_API_KEY and OPENAI_AVAILABLE:
     openai.api_key = OPENAI_API_KEY
@@ -38,6 +31,9 @@ if OPENAI_API_KEY and OPENAI_AVAILABLE:
 ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL")  # placeholder
 ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
 # -----------------------------
 # Minimal MCP Server shim
 # -----------------------------
@@ -83,16 +79,14 @@ def transcribe_with_openai(audio_file_path: str) -> str:
     """Transcribe audio using OpenAI Whisper (if available)."""
     if not OPENAI_AVAILABLE:
         return "OpenAI library not available"
-    with open(audio_file_path, "rb") as f:
-        # Uses the OpenAI Audio transcription API (may vary by SDK version)
-        try:
             transcript = openai.Audio.transcriptions.create(model="whisper-1", file=f)
-            # Some SDKs return .text
             if isinstance(transcript, dict):
                 return transcript.get("text", "")
             return getattr(transcript, "text", "")
-        except Exception as e:
-            return f"OpenAI transcription error: {e}"
 def transcribe_fallback(audio_file_path: str) -> str:
@@ -107,9 +101,10 @@ def transcribe_fallback(audio_file_path: str) -> str:
 def tts_elevenlabs(text: str) -> bytes:
-    """Call ElevenLabs API to synthesize speech. Returns raw audio bytes (wav/mp3 depending on API)."""
     if not ELEVENLABS_API_KEY:
         raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
     url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
     headers = {
         "xi-api-key": ELEVENLABS_API_KEY,
@@ -125,44 +120,47 @@ def tts_elevenlabs(text: str) -> bytes:
     return resp.content
-def # -----------------------------
-# Gemini Image Description
-# -----------------------------
-def describe_image_gemini(image_path: str) -> str:
-    """Describe an image using Google Gemini Vision."""
     try:
-        import google.generativeai as genai
-        GEMINI_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
-        if not GEMINI_KEY:
-            return "GOOGLE_GEMINI_API_KEY not set"
-        genai.configure(api_key=GEMINI_KEY)
-        model = genai.GenerativeModel("gemini-1.5-flash")
         with open(image_path, "rb") as f:
             image_bytes = f.read()
-        response = model.generate_content(["Describe this image for a visually impaired user.", {"mime_type":"image/jpeg", "data": image_bytes}])
-        return response.text
     except Exception as e:
-        return f"Gemini describe error: {e}"
-# (OpenAI code removed for simplicity)
-(image_path: str) -> str:
-    """Attempt to describe an image using OpenAI vision (if available)."""
     if not OPENAI_AVAILABLE:
         return "OpenAI not available for image captioning"
     try:
         with open(image_path, "rb") as f:
-            # Example using the OpenAI image understanding endpoints (SDKs vary)
-            # We'll call the Chat Completions with system prompt and base64 image as a fallback
             b64 = base64.b64encode(f.read()).decode("utf-8")
             prompt = (
                 "You are an assistant that describes images for visually impaired users. "
-                "Provide a concise, vivid, and accessible description of the image."
 Image(base64):" + b64
             )
             resp = openai.ChatCompletion.create(
@@ -172,21 +170,6 @@ Image(base64):" + b64
     except Exception as e:
         return f"OpenAI image describe error: {e}"
-def describe_image_blip(image_path: str) -> str:
-    if not HF_BLIP_AVAILABLE:
-        return "HF BLIP not available in this runtime"
-    try:
-        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-        raw_image = Image.open(image_path).convert("RGB")
-        inputs = processor(raw_image, return_tensors="pt")
-        out = model.generate(**inputs)
-        caption = processor.decode(out[0], skip_special_tokens=True)
-        return caption
-    except Exception as e:
-        return f"BLIP caption error: {e}"
 # -----------------------------
 # MCP Tools
 # -----------------------------
@@ -202,25 +185,41 @@ def speak_text_tool(text: str) -> ToolResult:
 @server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
 def describe_image_tool(image_path: str) -> ToolResult:
-    # Prioritize OpenAI -> HF BLIP -> error
     if OPENAI_AVAILABLE:
         desc = describe_image_openai(image_path)
         if desc and not desc.startswith("OpenAI image describe error"):
-            return ToolResult(content=desc)
-    if HF_BLIP_AVAILABLE:
-        desc = describe_image_blip(image_path)
-        return ToolResult(content=desc)
-    return ToolResult(content="No image captioning backend available. Set OPENAI_API_KEY or install transformers + pillow.")
 @server.tool(name="transcribe_audio", description="Transcribe user audio to text")
 def transcribe_audio_tool(audio_path: str) -> ToolResult:
     if OPENAI_AVAILABLE:
         text = transcribe_with_openai(audio_path)
-        return ToolResult(content=text)
     else:
         text = transcribe_fallback(audio_path)
-        return ToolResult(content=text)
 # -----------------------------
 # Gradio UI (client)
@@ -229,28 +228,46 @@ def transcribe_audio_tool(audio_path: str) -> ToolResult:
 def decode_base64_audio(b64: str) -> bytes:
     return base64.b64decode(b64)
-with gr.Blocks() as demo:
-with gr.Accordion("🔑 API Keys (stored only in session)", open=False):
-    openai_key = gr.Textbox(label="OpenAI API Key", type="password")
-    eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
-    gemini_key = gr.Textbox(label="Gemini API Key", type="password")
-    def set_keys(ok, ek, gk):
-        if ok: os.environ["OPENAI_API_KEY"] = ok
-        if ek: os.environ["ELEVENLABS_API_KEY"] = ek
-        if gk: os.environ["GOOGLE_GEMINI_API_KEY"] = gk
-        return "API keys set for this session."
-    set_btn = gr.Button("Save API Keys")
-    set_output = gr.Textbox(label="Status")
-    set_btn.click(set_keys, [openai_key, eleven_key, gemini_key], [set_output])
     gr.Markdown("# Accessibility Voice Agent — MCP Tools")
     with gr.Row():
-        with gr.Column(scale=2):
-            chatbox = gr.Chatbot(label="Assistant")
             user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
             with gr.Row():
@@ -264,59 +281,106 @@ with gr.Accordion("🔑 API Keys (stored only in session)", open=False):
                 img_upload = gr.File(label="Upload image (for description)")
                 img_btn = gr.Button("Describe image")
-        with gr.Column(scale=1):
-            gr.Markdown("### Tools Log")
-            tools_log = gr.Textbox(value="Ready.", lines=20)
     # Callbacks
-    def on_send_text(text, chat_history, mic_file):
-        # If there's a mic file, prefer transcribing audio
         if mic_file:
-            tools_log_val = tools_log.value if hasattr(tools_log, 'value') else ''
-            tools_log_val = (tools_log_val + "
-Transcribing audio...")
-            # transcribe
             tr = transcribe_audio_tool(mic_file)
             user_text = tr.content
         else:
             user_text = text
-        # append user->assistant exchange
         chat_history = chat_history or []
         chat_history.append((user_text, "..."))
-        # For demo: assistant echoes + uses describe_image if commands detected
-        if user_text.strip().lower().startswith("describe image:"):
             # expects: "describe image: filename"
             _, _, fname = user_text.partition(":")
             fname = fname.strip()
             if fname:
-                desc = describe_image_tool(fname)
-                assistant = desc.content
             else:
-                assistant = "Please upload an image using the Describe Image tool."
         else:
-            assistant = "I heard: " + user_text
         chat_history[-1] = (user_text, assistant)
-        return chat_history, tools_log_val
-    send_btn.click(on_send_text, inputs=[user_input, chatbox, mic], outputs=[chatbox, tools_log])
-    def on_tts(text):
         res = speak_text_tool(text)
         if res.meta and res.meta.get("format") == "base64-audio":
             audio_bytes = decode_base64_audio(res.content)
-            return (audio_bytes, 16000)
-        return None
-    tts_btn.click(on_tts, inputs=[tts_text], outputs=[gr.Audio(label="TTS Output")])
-    def on_describe_image(file_obj):
         if not file_obj:
             return "No file uploaded"
-        # file_obj is a tempfile path in hf spaces; pass path to tool
-        desc = describe_image_tool(file_obj.name if hasattr(file_obj, 'name') else file_obj)
-        return desc.content
-    img_btn.click(on_describe_image, inputs=[img_upload], outputs=[chatbox])
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 import json
 import asyncio
 import base64
+import time
 from typing import Optional
 import gradio as gr
 except Exception:
     OPENAI_AVAILABLE = False
 # -----------------------------
 # Configuration
 # -----------------------------
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
 HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
+GOOGLE_GEMINI_API_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
 if OPENAI_API_KEY and OPENAI_AVAILABLE:
     openai.api_key = OPENAI_API_KEY
 ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL")  # placeholder
 ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
+# Hugging Face Inference API endpoint (for image captioning fallback)
+HF_INFERENCE_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
 # -----------------------------
 # Minimal MCP Server shim
 # -----------------------------
     """Transcribe audio using OpenAI Whisper (if available)."""
     if not OPENAI_AVAILABLE:
         return "OpenAI library not available"
+    try:
+        with open(audio_file_path, "rb") as f:
             transcript = openai.Audio.transcriptions.create(model="whisper-1", file=f)
             if isinstance(transcript, dict):
                 return transcript.get("text", "")
             return getattr(transcript, "text", "")
+    except Exception as e:
+        return f"OpenAI transcription error: {e}"
 def transcribe_fallback(audio_file_path: str) -> str:
 def tts_elevenlabs(text: str) -> bytes:
+    """Call ElevenLabs API to synthesize speech. Returns raw audio bytes."""
     if not ELEVENLABS_API_KEY:
         raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
+    import requests
     url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
     headers = {
         "xi-api-key": ELEVENLABS_API_KEY,
     return resp.content
def describe_image_hf(image_path: str) -> str:
    """Describe an image using the Hugging Face Inference API (hosted BLIP model).

    The image file is read as raw bytes and POSTed to ``HF_INFERENCE_URL``
    with a bearer token.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The generated caption on success. On failure a human-readable error
        string is returned instead ("HUGGINGFACE_API_TOKEN not set",
        "HF Inference error: ..." or "HF describe error: ..."); every
        exception is converted to such a string rather than propagated.
    """
    try:
        import requests

        # Look the token up at call time so a key saved through the UI (which
        # writes os.environ) is honoured; fall back to the import-time value.
        token = os.environ.get("HUGGINGFACE_API_TOKEN") or HUGGINGFACE_API_TOKEN
        if not token:
            return "HUGGINGFACE_API_TOKEN not set"

        with open(image_path, "rb") as f:
            image_bytes = f.read()

        headers = {"Authorization": f"Bearer {token}"}
        # The HF Inference API accepts the file as a binary request body.
        # A timeout keeps a stalled request from blocking the UI forever.
        resp = requests.post(
            HF_INFERENCE_URL,
            headers=headers,
            data=image_bytes,
            timeout=60,
        )
        if resp.status_code != 200:
            return f"HF Inference error: {resp.status_code} {resp.text}"

        try:
            payload = resp.json()
        except ValueError:
            # Body was not JSON; hand back the raw text.
            return resp.text

        # Some endpoints return [{'generated_text': '...'}], others a bare dict.
        first = payload[0] if isinstance(payload, list) and payload else payload
        if isinstance(first, dict) and "generated_text" in first:
            return first["generated_text"]
        return str(payload)
    except Exception as e:
        return f"HF describe error: {e}"
+def describe_image_openai(image_path: str) -> str:
+    """Attempt to describe an image using OpenAI vision if available."""
     if not OPENAI_AVAILABLE:
         return "OpenAI not available for image captioning"
     try:
         with open(image_path, "rb") as f:
             b64 = base64.b64encode(f.read()).decode("utf-8")
             prompt = (
                 "You are an assistant that describes images for visually impaired users. "
+                "Provide a concise, vivid, and accessible description of the image.
 Image(base64):" + b64
             )
             resp = openai.ChatCompletion.create(
     except Exception as e:
         return f"OpenAI image describe error: {e}"
 # -----------------------------
 # MCP Tools
 # -----------------------------
 @server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
 def describe_image_tool(image_path: str) -> ToolResult:
+    # Priority: OpenAI -> Gemini -> Hugging Face Inference -> error
     if OPENAI_AVAILABLE:
         desc = describe_image_openai(image_path)
         if desc and not desc.startswith("OpenAI image describe error"):
+            return ToolResult(content=desc, meta={"backend":"openai"})
+    # Gemini (if configured)
+    if GOOGLE_GEMINI_API_KEY:
+        try:
+            import google.generativeai as genai
+            genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
+            model = genai.GenerativeModel("gemini-1.5-flash")
+            with open(image_path, "rb") as f:
+                image_bytes = f.read()
+            response = model.generate_content(["Describe this image for a visually impaired user.", {"mime_type":"image/jpeg", "data": image_bytes}])
+            return ToolResult(content=response.text, meta={"backend":"gemini"})
+        except Exception:
+            pass
+    # Hugging Face Inference
+    desc = describe_image_hf(image_path)
+    if desc:
+        return ToolResult(content=desc, meta={"backend":"huggingface"})
+    return ToolResult(content="No image captioning backend available. Set OPENAI_API_KEY, GOOGLE_GEMINI_API_KEY, or HUGGINGFACE_API_TOKEN.")
 @server.tool(name="transcribe_audio", description="Transcribe user audio to text")
 def transcribe_audio_tool(audio_path: str) -> ToolResult:
+    start = time.time()
     if OPENAI_AVAILABLE:
         text = transcribe_with_openai(audio_path)
+        duration = time.time() - start
+        return ToolResult(content=text, meta={"backend":"openai","duration":duration})
     else:
         text = transcribe_fallback(audio_path)
+        duration = time.time() - start
+        return ToolResult(content=text, meta={"backend":"local_whisper","duration":duration})
 # -----------------------------
 # Gradio UI (client)
 def decode_base64_audio(b64: str) -> bytes:
     return base64.b64decode(b64)
+app_theme = {
+    "primary_hue": "blue",
+    "secondary_hue": "slate",
+}
+# Helper to format tool-call explanations
def format_tool_log(tool_name, reason, meta, output, style="A"):
    """Format one tool invocation as a log entry.

    Args:
        tool_name: Name of the tool that ran.
        reason: Short explanation of why the tool was called.
        meta: Optional mapping; its "backend" and "duration" keys are read.
            A missing or empty backend is shown as "unknown".
        output: Tool output; converted with ``str()``.
        style: "A" for a one-line summary (output truncated to 200
            characters), "B" for a multi-line detailed entry, "C" for a
            compact two-line entry. Any other value (the UI calls it "D")
            returns both the "A" and "B" renderings.

    Returns:
        A string for styles "A", "B" and "C"; otherwise a dict with the keys
        "simple" and "detailed".
    """
    meta = meta or {}
    backend = meta.get("backend") or "unknown"
    duration = meta.get("duration")
    text = str(output)

    simple = f"[{tool_name}] {backend} -> {text[:200]}"
    if style == "A":
        return simple

    if style == "C":
        # Ultra-visual: a header line, then the output on its own line.
        header = f"🔧 {tool_name} • Reason: {reason} • Backend: {backend}"
        if duration is not None:
            header += f" • {duration:.2f}s"
        return f"{header}\n→ {text}"

    # Detailed human-readable form, used by "B" and as half of "D".
    lines = [f"🔧 Tool: {tool_name}", f"🎯 Why: {reason}", f"⚙️ Backend: {backend}"]
    if duration is not None:
        lines.append(f"⏱ Duration: {duration:.2f}s")
    lines.append(f"📝 Output: {text}")
    detailed = "\n".join(lines)
    if style == "B":
        return detailed

    # D -> both
    return {"simple": simple, "detailed": detailed}
+with gr.Blocks(css=".gradio-container {background:#f7fafc}") as demo:
     gr.Markdown("# Accessibility Voice Agent — MCP Tools")
     with gr.Row():
+        with gr.Column(scale=3):
+            chatbox = gr.Chatbot(label="Assistant", elem_id="chatbox")
             user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
             with gr.Row():
                 img_upload = gr.File(label="Upload image (for description)")
                 img_btn = gr.Button("Describe image")
+        with gr.Column(scale=2):
+            gr.Markdown("### Tool Call Log & Explanations")
+            log_style = gr.Radio(choices=["A","B","C","D"], value="B", label="Log style (A:Simple B:Detailed C:Visual D:Both)")
+            tools_log = gr.Textbox(value="Ready.", lines=20, interactive=False, label="Tools Log")
+            tools_panel = gr.HTML("<div id='tools_panel' style='max-height:400px;overflow:auto;background:#ffffff;padding:8px;border-radius:8px;'></div>")
+            gr.Markdown("---")
+            gr.Markdown("**Tool explanations appear here each time a tool runs.**")
     # Callbacks
+    def on_send_text(text, chat_history, mic_file, style):
+        tools_entries = []
         if mic_file:
+            # transcribe audio
             tr = transcribe_audio_tool(mic_file)
             user_text = tr.content
+            log = format_tool_log("transcribe_audio", "User provided microphone audio", tr.meta or {}, tr.content, style)
+            tools_entries.append(log)
         else:
             user_text = text
         chat_history = chat_history or []
         chat_history.append((user_text, "..."))
+        # demo assistant behavior
+        if user_text and user_text.strip().lower().startswith("describe image:"):
             # expects: "describe image: filename"
             _, _, fname = user_text.partition(":")
             fname = fname.strip()
             if fname:
+                # We assume the image was uploaded earlier and path provided
+                res = describe_image_tool(fname)
+                assistant = res.content
+                log = format_tool_log("describe_image", "User requested image description", res.meta or {}, res.content, style)
+                tools_entries.append(log)
             else:
+                assistant = "Please upload an image using the Describe Image tool or provide a path like: describe image: /path/to/image.jpg"
         else:
+            assistant = "I heard: " + (user_text or "(empty)")
         chat_history[-1] = (user_text, assistant)
+        # update tools panel content
+        panel_html = ''
+        if isinstance(log, dict):
+            # D style returns dict
+            panel_html += f"<pre>{log['detailed']}</pre>"
+            panel_html += f"<hr><pre>{log['simple']}</pre>"
+        else:
+            for e in tools_entries:
+                panel_html += f"<pre style='background:#f1f5f9;border-radius:6px;padding:8px;margin-bottom:8px;'>{e}</pre>"
+        return chat_history, tools_log, gr.update(value=panel_html)
+    send_btn.click(on_send_text, inputs=[user_input, chatbox, mic, log_style], outputs=[chatbox, tools_log, tools_panel])
+    def on_tts(text, style):
+        if not text:
+            return None, gr.update(value="No text provided")
         res = speak_text_tool(text)
         if res.meta and res.meta.get("format") == "base64-audio":
             audio_bytes = decode_base64_audio(res.content)
+            log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, "<audio bytes>", style)
+            panel_html = f"<pre style='background:#eef2ff;padding:8px;border-radius:6px;'>{log}</pre>"
+            return (audio_bytes, 16000), gr.update(value=panel_html)
+        else:
+            log = format_tool_log("speak_text", "User requested text-to-speech", res.meta or {}, res.content, style)
+            panel_html = f"<pre style='background:#fee2e2;padding:8px;border-radius:6px;'>{log}</pre>"
+            return None, gr.update(value=panel_html)
+    tts_btn.click(on_tts, inputs=[tts_text, log_style], outputs=[gr.Audio(label="TTS Output"), tools_panel])
+    def on_describe_image(file_obj, style):
         if not file_obj:
             return "No file uploaded"
+        # file_obj may be a tempfile object or path
+        path = getattr(file_obj, 'name', file_obj)
+        res = describe_image_tool(path)
+        log = format_tool_log("describe_image", "User uploaded an image for description", res.meta or {}, res.content, style)
+        panel_html = f"<pre style='background:#ecfdf5;padding:8px;border-radius:6px;'>{log}</pre>"
+        # show result in chatbox as assistant reply
+        return [("<image uploaded>", res.content)], gr.update(value=panel_html)
+    img_btn.click(on_describe_image, inputs=[img_upload, log_style], outputs=[chatbox, tools_panel])
+    # API Keys accordion (session-only)
+    with gr.Accordion("🔑 API Keys (stored only in session)", open=False):
+        openai_key = gr.Textbox(label="OpenAI API Key", type="password")
+        eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
+        hf_key = gr.Textbox(label="Hugging Face API Token", type="password")
+        def set_keys(ok, ek, hk):
+            if ok:
+                os.environ["OPENAI_API_KEY"] = ok
+            if ek:
+                os.environ["ELEVENLABS_API_KEY"] = ek
+            if hk:
+                os.environ["HUGGINGFACE_API_TOKEN"] = hk
+            return "API keys set for this session. Refresh the page to pick them up in all runtimes."
+        set_btn = gr.Button("Save API Keys")
+        set_output = gr.Textbox(label="Status")
+        set_btn.click(set_keys, [openai_key, eleven_key, hf_key], [set_output])
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))