subhash4face committed on
Commit
420030c
·
verified ·
1 Parent(s): 125ea2c

Updated to include API keys

Browse files
Files changed (1) hide show
  1. app.py +197 -83
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import os
3
  import io
4
  import json
@@ -9,13 +8,14 @@ from typing import Optional
9
  import gradio as gr
10
  from pydantic import BaseModel
11
 
12
- # Try optional dependencies
13
  try:
14
  import openai
15
  OPENAI_AVAILABLE = True
16
  except Exception:
17
  OPENAI_AVAILABLE = False
18
 
 
19
  try:
20
  from PIL import Image
21
  import requests
@@ -24,18 +24,23 @@ try:
24
  except Exception:
25
  HF_BLIP_AVAILABLE = False
26
 
27
- # Config
 
 
28
  OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
29
  ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
 
30
 
31
  if OPENAI_API_KEY and OPENAI_AVAILABLE:
32
  openai.api_key = OPENAI_API_KEY
33
 
34
- ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL")
 
35
  ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
36
 
37
-
38
- # MCP server shim
 
39
  class ToolResult(BaseModel):
40
  content: str
41
  meta: Optional[dict] = None
@@ -48,161 +53,270 @@ class MCPServer:
48
 
49
  def tool(self, name: str, description: str = ""):
50
  def decorator(fn):
51
- self.tools[name] = {"fn": fn, "description": description}
 
 
 
52
  return fn
53
  return decorator
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  server = MCPServer("accessibility_voice_mcp")
56
 
57
- # STT utilities
 
 
 
58
  def transcribe_with_openai(audio_file_path: str) -> str:
 
59
  if not OPENAI_AVAILABLE:
60
- return "OpenAI not available"
61
- try:
62
- with open(audio_file_path, "rb") as f:
63
- tr = openai.Audio.transcriptions.create(model="whisper-1", file=f)
64
- if isinstance(tr, dict):
65
- return tr.get("text", "")
66
- return getattr(tr, "text", "")
67
- except Exception as e:
68
- return f"OpenAI transcription error: {e}"
 
 
 
69
 
70
  def transcribe_fallback(audio_file_path: str) -> str:
 
71
  try:
72
  import whisper
73
  model = whisper.load_model("small")
74
  res = model.transcribe(audio_file_path)
75
  return res.get("text", "")
76
  except Exception as e:
77
- return f"Local STT fallback failed: {e}"
 
78
 
79
- # TTS
80
  def tts_elevenlabs(text: str) -> bytes:
 
81
  if not ELEVENLABS_API_KEY:
82
- raise RuntimeError("ELEVENLABS_API_KEY missing")
83
  url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
84
- headers = {"xi-api-key": ELEVENLABS_API_KEY, "Content-Type": "application/json"}
85
- payload = {"text": text, "voice_settings": {"stability": 0.5, "similarity_boost": 0.75}}
86
- r = requests.post(url, headers=headers, json=payload, stream=True)
87
- if r.status_code != 200:
88
- raise RuntimeError(f"ElevenLabs error {r.status_code}: {r.text}")
89
- return r.content
90
-
91
- # Gemini Vision
 
 
 
 
 
 
 
 
 
 
 
92
  def describe_image_gemini(image_path: str) -> str:
 
93
  try:
94
  import google.generativeai as genai
95
- key = os.environ.get("GOOGLE_GEMINI_API_KEY")
96
- if not key:
97
  return "GOOGLE_GEMINI_API_KEY not set"
98
 
99
- genai.configure(api_key=key)
100
  model = genai.GenerativeModel("gemini-1.5-flash")
101
 
102
  with open(image_path, "rb") as f:
103
- img_bytes = f.read()
104
 
105
- resp = model.generate_content([
106
- "Describe this image for a visually impaired user.",
107
- {"mime_type": "image/jpeg", "data": img_bytes}
108
- ])
109
- return resp.text
110
  except Exception as e:
111
  return f"Gemini describe error: {e}"
112
 
113
- # BLIP fallback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  def describe_image_blip(image_path: str) -> str:
115
  if not HF_BLIP_AVAILABLE:
116
- return "BLIP not available"
117
  try:
118
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
119
  model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
120
- raw = Image.open(image_path).convert("RGB")
121
- inputs = processor(raw, return_tensors="pt")
122
  out = model.generate(**inputs)
123
- return processor.decode(out[0], skip_special_tokens=True)
 
124
  except Exception as e:
125
  return f"BLIP caption error: {e}"
126
 
 
127
  # MCP Tools
128
- @server.tool("speak_text", "Convert text to speech using ElevenLabs")
 
129
  def speak_text_tool(text: str) -> ToolResult:
130
  try:
131
- audio = tts_elevenlabs(text)
132
- enc = base64.b64encode(audio).decode("utf-8")
133
- return ToolResult(content=enc, meta={"format": "base64-audio"})
134
  except Exception as e:
135
- return ToolResult(content=f"TTS error: {e}")
136
 
137
- @server.tool("describe_image", "Describe an uploaded image")
 
138
  def describe_image_tool(image_path: str) -> ToolResult:
139
- desc = describe_image_gemini(image_path)
140
- if "error" not in desc.lower() and "not set" not in desc.lower():
 
 
 
 
 
141
  return ToolResult(content=desc)
142
- desc = describe_image_blip(image_path)
143
- return ToolResult(content=desc)
144
 
145
- @server.tool("transcribe_audio", "Transcribe audio to text")
146
- def transcribe_audio_tool(path: str) -> ToolResult:
147
  if OPENAI_AVAILABLE:
148
- return ToolResult(content=transcribe_with_openai(path))
149
- return ToolResult(content=transcribe_fallback(path))
 
 
 
 
 
 
 
150
 
151
- # Gradio UI
152
  def decode_base64_audio(b64: str) -> bytes:
153
  return base64.b64decode(b64)
154
 
155
  with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  gr.Markdown("# Accessibility Voice Agent — MCP Tools")
157
 
158
  with gr.Row():
159
  with gr.Column(scale=2):
160
- chatbox = gr.Chatbot(type="messages")
161
- user_input = gr.Textbox(placeholder="Type or speak...", show_label=False)
162
 
163
  with gr.Row():
164
- mic = gr.Audio(sources=["microphone"], type="filepath")
165
  send_btn = gr.Button("Send")
166
 
167
- with gr.Accordion("Tools"):
168
- tts_text = gr.Textbox(label="Text to speak")
169
- tts_btn = gr.Button("Speak")
170
 
171
- img_upload = gr.File(label="Upload image")
172
  img_btn = gr.Button("Describe image")
173
 
174
  with gr.Column(scale=1):
 
175
  tools_log = gr.Textbox(value="Ready.", lines=20)
176
 
177
- def on_send_text(text, history, mic_file):
178
- log = tools_log.value
 
179
  if mic_file:
180
- log += "\nTranscribing..."
 
 
 
181
  tr = transcribe_audio_tool(mic_file)
182
- text = tr.content
183
-
184
- history = history or []
185
- history.append({"role": "user", "content": text})
186
- history.append({"role": "assistant", "content": "You said: " + text})
187
- return history, log
188
-
189
- send_btn.click(on_send_text, [user_input, chatbox, mic], [chatbox, tools_log])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  def on_tts(text):
192
  res = speak_text_tool(text)
193
  if res.meta and res.meta.get("format") == "base64-audio":
194
- audio = decode_base64_audio(res.content)
195
- return (audio, 16000)
 
196
 
197
- tts_btn.click(on_tts, [tts_text], [gr.Audio()])
198
 
199
- def on_describe(file_obj):
200
  if not file_obj:
201
  return "No file uploaded"
202
- desc = describe_image_tool(file_obj.name)
203
- return {"role": "assistant", "content": desc.content}
 
204
 
205
- img_btn.click(on_describe, [img_upload], [chatbox])
206
 
207
  if __name__ == "__main__":
208
- demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
 
 
1
  import os
2
  import io
3
  import json
 
8
  import gradio as gr
9
  from pydantic import BaseModel
10
 
11
+ # Optional: use openai if available for transcription and image captioning
12
  try:
13
  import openai
14
  OPENAI_AVAILABLE = True
15
  except Exception:
16
  OPENAI_AVAILABLE = False
17
 
18
+ # Optional: HF transformers fallbacks
19
  try:
20
  from PIL import Image
21
  import requests
 
24
  except Exception:
25
  HF_BLIP_AVAILABLE = False
26
 
27
+ # -----------------------------
28
+ # Configuration
29
+ # -----------------------------
30
  OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
31
  ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
32
+ HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")
33
 
34
  if OPENAI_API_KEY and OPENAI_AVAILABLE:
35
  openai.api_key = OPENAI_API_KEY
36
 
37
+ # ElevenLabs defaults
38
+ ELEVEN_VOICE_ID = os.environ.get("ELEVEN_VOICE_ID", "EXAVITQu4vr4xnSDxMaL") # placeholder
39
  ELEVEN_API_URL = "https://api.elevenlabs.io/v1/text-to-speech"
40
 
41
+ # -----------------------------
42
+ # Minimal MCP Server shim
43
+ # -----------------------------
44
  class ToolResult(BaseModel):
45
  content: str
46
  meta: Optional[dict] = None
 
53
 
54
  def tool(self, name: str, description: str = ""):
55
  def decorator(fn):
56
+ self.tools[name] = {
57
+ "fn": fn,
58
+ "description": description,
59
+ }
60
  return fn
61
  return decorator
62
 
63
+ async def run_tool(self, name: str, *args, **kwargs):
64
+ tool = self.tools.get(name)
65
+ if not tool:
66
+ raise ValueError(f"Tool {name} not found")
67
+ fn = tool["fn"]
68
+ if asyncio.iscoroutinefunction(fn):
69
+ res = await fn(*args, **kwargs)
70
+ else:
71
+ res = fn(*args, **kwargs)
72
+ if isinstance(res, ToolResult):
73
+ return res
74
+ return ToolResult(content=str(res))
75
+
76
  server = MCPServer("accessibility_voice_mcp")
77
 
78
+ # -----------------------------
79
+ # Utilities: STT, TTS, Image describe
80
+ # -----------------------------
81
+
82
  def transcribe_with_openai(audio_file_path: str) -> str:
83
+ """Transcribe audio using OpenAI Whisper (if available)."""
84
  if not OPENAI_AVAILABLE:
85
+ return "OpenAI library not available"
86
+ with open(audio_file_path, "rb") as f:
87
+ # Uses the OpenAI Audio transcription API (may vary by SDK version)
88
+ try:
89
+ transcript = openai.Audio.transcriptions.create(model="whisper-1", file=f)
90
+ # Some SDKs return .text
91
+ if isinstance(transcript, dict):
92
+ return transcript.get("text", "")
93
+ return getattr(transcript, "text", "")
94
+ except Exception as e:
95
+ return f"OpenAI transcription error: {e}"
96
+
97
 
98
  def transcribe_fallback(audio_file_path: str) -> str:
99
+ """Fallback: invoke whisper from local package (if installed)."""
100
  try:
101
  import whisper
102
  model = whisper.load_model("small")
103
  res = model.transcribe(audio_file_path)
104
  return res.get("text", "")
105
  except Exception as e:
106
+ return f"Local transcription fallback failed: {e}"
107
+
108
 
 
109
  def tts_elevenlabs(text: str) -> bytes:
110
+ """Call ElevenLabs API to synthesize speech. Returns raw audio bytes (wav/mp3 depending on API)."""
111
  if not ELEVENLABS_API_KEY:
112
+ raise RuntimeError("ELEVENLABS_API_KEY not set in environment")
113
  url = f"{ELEVEN_API_URL}/{ELEVEN_VOICE_ID}"
114
+ headers = {
115
+ "xi-api-key": ELEVENLABS_API_KEY,
116
+ "Content-Type": "application/json",
117
+ }
118
+ payload = {
119
+ "text": text,
120
+ "voice_settings": {"stability": 0.5, "similarity_boost": 0.75}
121
+ }
122
+ resp = requests.post(url, headers=headers, json=payload, stream=True)
123
+ if resp.status_code != 200:
124
+ raise RuntimeError(f"ElevenLabs TTS failed: {resp.status_code} {resp.text}")
125
+ return resp.content
126
+
127
+
128
+
129
+ def # -----------------------------
130
+ # Gemini Image Description
131
+ # -----------------------------
132
+
133
  def describe_image_gemini(image_path: str) -> str:
134
+ """Describe an image using Google Gemini Vision."""
135
  try:
136
  import google.generativeai as genai
137
+ GEMINI_KEY = os.environ.get("GOOGLE_GEMINI_API_KEY")
138
+ if not GEMINI_KEY:
139
  return "GOOGLE_GEMINI_API_KEY not set"
140
 
141
+ genai.configure(api_key=GEMINI_KEY)
142
  model = genai.GenerativeModel("gemini-1.5-flash")
143
 
144
  with open(image_path, "rb") as f:
145
+ image_bytes = f.read()
146
 
147
+ response = model.generate_content(["Describe this image for a visually impaired user.", {"mime_type":"image/jpeg", "data": image_bytes}])
148
+ return response.text
 
 
 
149
  except Exception as e:
150
  return f"Gemini describe error: {e}"
151
 
152
+ # (OpenAI code removed for simplicity)
153
+ (image_path: str) -> str:
154
+ """Attempt to describe an image using OpenAI vision (if available)."""
155
+ if not OPENAI_AVAILABLE:
156
+ return "OpenAI not available for image captioning"
157
+ try:
158
+ with open(image_path, "rb") as f:
159
+ # Example using the OpenAI image understanding endpoints (SDKs vary)
160
+ # We'll call the Chat Completions with system prompt and base64 image as a fallback
161
+ b64 = base64.b64encode(f.read()).decode("utf-8")
162
+ prompt = (
163
+ "You are an assistant that describes images for visually impaired users. "
164
+ "Provide a concise, vivid, and accessible description of the image."
165
+
166
+ Image(base64):" + b64
167
+ )
168
+ resp = openai.ChatCompletion.create(
169
+ model="gpt-4o-mini", messages=[{"role":"user","content":prompt}], max_tokens=300
170
+ )
171
+ return resp.choices[0].message.content.strip()
172
+ except Exception as e:
173
+ return f"OpenAI image describe error: {e}"
174
+
175
+
176
  def describe_image_blip(image_path: str) -> str:
177
  if not HF_BLIP_AVAILABLE:
178
+ return "HF BLIP not available in this runtime"
179
  try:
180
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
181
  model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
182
+ raw_image = Image.open(image_path).convert("RGB")
183
+ inputs = processor(raw_image, return_tensors="pt")
184
  out = model.generate(**inputs)
185
+ caption = processor.decode(out[0], skip_special_tokens=True)
186
+ return caption
187
  except Exception as e:
188
  return f"BLIP caption error: {e}"
189
 
190
+ # -----------------------------
191
  # MCP Tools
192
+ # -----------------------------
193
+ @server.tool(name="speak_text", description="Convert text to speech using ElevenLabs")
194
  def speak_text_tool(text: str) -> ToolResult:
195
  try:
196
+ audio_bytes = tts_elevenlabs(text)
197
+ encoded = base64.b64encode(audio_bytes).decode("utf-8")
198
+ return ToolResult(content=encoded, meta={"format": "base64-audio"})
199
  except Exception as e:
200
+ return ToolResult(content=f"TTS Error: {e}")
201
 
202
+
203
+ @server.tool(name="describe_image", description="Describe an uploaded image for visually impaired users")
204
  def describe_image_tool(image_path: str) -> ToolResult:
205
+ # Prioritize OpenAI -> HF BLIP -> error
206
+ if OPENAI_AVAILABLE:
207
+ desc = describe_image_openai(image_path)
208
+ if desc and not desc.startswith("OpenAI image describe error"):
209
+ return ToolResult(content=desc)
210
+ if HF_BLIP_AVAILABLE:
211
+ desc = describe_image_blip(image_path)
212
  return ToolResult(content=desc)
213
+ return ToolResult(content="No image captioning backend available. Set OPENAI_API_KEY or install transformers + pillow.")
214
+
215
 
216
+ @server.tool(name="transcribe_audio", description="Transcribe user audio to text")
217
+ def transcribe_audio_tool(audio_path: str) -> ToolResult:
218
  if OPENAI_AVAILABLE:
219
+ text = transcribe_with_openai(audio_path)
220
+ return ToolResult(content=text)
221
+ else:
222
+ text = transcribe_fallback(audio_path)
223
+ return ToolResult(content=text)
224
+
225
+ # -----------------------------
226
+ # Gradio UI (client)
227
+ # -----------------------------
228
 
 
229
  def decode_base64_audio(b64: str) -> bytes:
230
  return base64.b64decode(b64)
231
 
232
  with gr.Blocks() as demo:
233
+
234
+ with gr.Accordion("🔑 API Keys (stored only in session)", open=False):
235
+ openai_key = gr.Textbox(label="OpenAI API Key", type="password")
236
+ eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
237
+ gemini_key = gr.Textbox(label="Gemini API Key", type="password")
238
+
239
+ def set_keys(ok, ek, gk):
240
+ if ok: os.environ["OPENAI_API_KEY"] = ok
241
+ if ek: os.environ["ELEVENLABS_API_KEY"] = ek
242
+ if gk: os.environ["GOOGLE_GEMINI_API_KEY"] = gk
243
+ return "API keys set for this session."
244
+
245
+ set_btn = gr.Button("Save API Keys")
246
+ set_output = gr.Textbox(label="Status")
247
+ set_btn.click(set_keys, [openai_key, eleven_key, gemini_key], [set_output])
248
+
249
  gr.Markdown("# Accessibility Voice Agent — MCP Tools")
250
 
251
  with gr.Row():
252
  with gr.Column(scale=2):
253
+ chatbox = gr.Chatbot(label="Assistant")
254
+ user_input = gr.Textbox(placeholder="Type or press the microphone to speak...", show_label=False)
255
 
256
  with gr.Row():
257
+ mic = gr.Audio(source="microphone", type="filepath", label="Record voice (press to record)")
258
  send_btn = gr.Button("Send")
259
 
260
+ with gr.Accordion("Advanced / Tools", open=False):
261
+ tts_text = gr.Textbox(label="Text to speak (ElevenLabs)")
262
+ tts_btn = gr.Button("Speak (TTS)")
263
 
264
+ img_upload = gr.File(label="Upload image (for description)")
265
  img_btn = gr.Button("Describe image")
266
 
267
  with gr.Column(scale=1):
268
+ gr.Markdown("### Tools Log")
269
  tools_log = gr.Textbox(value="Ready.", lines=20)
270
 
271
+ # Callbacks
272
+ def on_send_text(text, chat_history, mic_file):
273
+ # If there's a mic file, prefer transcribing audio
274
  if mic_file:
275
+ tools_log_val = tools_log.value if hasattr(tools_log, 'value') else ''
276
+ tools_log_val = (tools_log_val + "
277
+ Transcribing audio...")
278
+ # transcribe
279
  tr = transcribe_audio_tool(mic_file)
280
+ user_text = tr.content
281
+ else:
282
+ user_text = text
283
+ # append user->assistant exchange
284
+ chat_history = chat_history or []
285
+ chat_history.append((user_text, "..."))
286
+ # For demo: assistant echoes + uses describe_image if commands detected
287
+ if user_text.strip().lower().startswith("describe image:"):
288
+ # expects: "describe image: filename"
289
+ _, _, fname = user_text.partition(":")
290
+ fname = fname.strip()
291
+ if fname:
292
+ desc = describe_image_tool(fname)
293
+ assistant = desc.content
294
+ else:
295
+ assistant = "Please upload an image using the Describe Image tool."
296
+ else:
297
+ assistant = "I heard: " + user_text
298
+ chat_history[-1] = (user_text, assistant)
299
+ return chat_history, tools_log_val
300
+
301
+ send_btn.click(on_send_text, inputs=[user_input, chatbox, mic], outputs=[chatbox, tools_log])
302
 
303
  def on_tts(text):
304
  res = speak_text_tool(text)
305
  if res.meta and res.meta.get("format") == "base64-audio":
306
+ audio_bytes = decode_base64_audio(res.content)
307
+ return (audio_bytes, 16000)
308
+ return None
309
 
310
+ tts_btn.click(on_tts, inputs=[tts_text], outputs=[gr.Audio(label="TTS Output")])
311
 
312
+ def on_describe_image(file_obj):
313
  if not file_obj:
314
  return "No file uploaded"
315
+ # file_obj is a tempfile path in hf spaces; pass path to tool
316
+ desc = describe_image_tool(file_obj.name if hasattr(file_obj, 'name') else file_obj)
317
+ return desc.content
318
 
319
+ img_btn.click(on_describe_image, inputs=[img_upload], outputs=[chatbox])
320
 
321
  if __name__ == "__main__":
322
+ demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))