OppaAI committed
Commit 9d41b1d · verified · 1 Parent(s): d4bf33b

Update app.py

Files changed (1): app.py +24 -39
app.py CHANGED
@@ -5,26 +5,31 @@ import requests
 import tempfile
 import secrets
 import gradio as gr
-from huggingface_hub import upload_file
-from dashscope import MultiModalConversation
+from huggingface_hub import upload_file, InferenceClient
+from PIL import Image
 
 # --- Config ---
 HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
-MODEL = "qwen2.5-vl-7b-instruct"
+# Model specifically for VLM (image-to-text) tasks on Hugging Face
+HF_VLM_MODEL = "llava-hf/llava-interleave-qwen-0.5b-hf"  # A suitable VLM model
 
 if not HF_TOKEN:
     raise ValueError("HF_TOKEN environment variable not set.")
 
+# Initialize the Hugging Face Inference Client
+hf_client = InferenceClient(token=HF_TOKEN)
+
 # --- Helper Functions ---
 def save_and_upload_image(image_b64):
     """Save image to /tmp and upload to HF dataset."""
     image_bytes = base64.b64decode(image_b64)
-    local_tmp_path = "/tmp/tmp.jpg"
+    # Use a unique filename to prevent conflicts in /tmp
+    local_tmp_path = f"/tmp/uploaded_image_{secrets.token_hex(8)}.jpg"
     with open(local_tmp_path, "wb") as f:
         f.write(image_bytes)
 
-    path_in_repo = f"images/uploaded_image_{len(image_bytes)}.jpg"
+    path_in_repo = f"images/uploaded_image_{secrets.token_hex(8)}.jpg"
     upload_file(
         path_or_fileobj=local_tmp_path,
         path_in_repo=path_in_repo,
@@ -36,22 +41,6 @@ def save_and_upload_image(image_b64):
     hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
     return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
 
-def prepare_vlm_message(image_path, text="Describe this image in detail."):
-    """Read local image, encode to base64, and prepare VLM message."""
-    with open(image_path, "rb") as f:
-        image_b64 = base64.b64encode(f.read()).decode("utf-8")
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": text},
-                {"type": "image_data", "image_data": {"b64": image_b64}}
-            ]
-        }
-    ]
-    return messages
-
 # --- Main MCP function ---
 def process_and_describe(payload: dict):
     try:
@@ -61,26 +50,20 @@ def process_and_describe(payload: dict):
         # 1️⃣ Save & upload image
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
 
-        # 2️⃣ Prepare VLM message
-        messages = prepare_vlm_message(local_tmp_path)
-
-        # 3️⃣ Call VLM using MultiModalConversation
-        responses = MultiModalConversation.call(
-            model=MODEL,
-            messages=messages,
-            stream=True
+        # 2️⃣ Prepare prompt
+        prompt = "Describe this image in detail."
+
+        # Open the image using PIL for the InferenceClient
+        image = Image.open(local_tmp_path)
+
+        # 3️⃣ Call VLM using Hugging Face Inference Client
+        # The client automatically handles the API call and authentication
+        vlm_text = hf_client.image_to_text(
+            image=image,
+            model=HF_VLM_MODEL,
+            details=True,  # Set details=True for more comprehensive output if available
         )
 
-        vlm_text = ""
-        for resp in responses:
-            if resp.status_code != 200:
-                return {"error": f"VLM call failed: {resp.status_code}"}
-            content = resp.output.choices[0].message.content
-            # Extract text from response
-            for ele in content:
-                if "text" in ele:
-                    vlm_text += ele["text"]
-
         return {
             "saved_to_hf_hub": True,
             "repo_id": HF_DATASET_REPO,
@@ -103,4 +86,6 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
+    # You will need to install the required libraries:
+    # pip install gradio huggingface_hub Pillow requests
    demo.launch(mcp_server=True)
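
For context, a minimal standalone sketch of the new inference path (not part of the commit): it assumes HF_CV_ROBOT_TOKEN is set and a test image exists at the hypothetical path /tmp/example.jpg. The client also accepts a local file path directly, and depending on the installed huggingface_hub version, image_to_text returns either a plain string or an ImageToTextOutput dataclass, so the sketch handles both.

# Standalone sketch (assumptions: HF_CV_ROBOT_TOKEN is set,
# /tmp/example.jpg is a hypothetical test image).
import os
from huggingface_hub import InferenceClient

client = InferenceClient(token=os.environ["HF_CV_ROBOT_TOKEN"])

# image_to_text accepts a local file path, raw bytes, or a URL.
result = client.image_to_text(
    "/tmp/example.jpg",
    model="llava-hf/llava-interleave-qwen-0.5b-hf",
)

# Older huggingface_hub versions return a plain string; newer ones
# return an ImageToTextOutput dataclass with a .generated_text field.
caption = getattr(result, "generated_text", result)
print(caption)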
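
And a hedged driver for the MCP tool itself: the payload key "image_b64" is an assumption, since the lines of process_and_describe that unpack the payload fall outside the hunks shown above.

# Hypothetical local driver for process_and_describe.
# The payload key "image_b64" is assumed -- the real key is defined
# in app.py outside the hunks shown in this diff.
import base64

with open("robot_snapshot.jpg", "rb") as f:  # hypothetical test image
    payload = {"image_b64": base64.b64encode(f.read()).decode("utf-8")}

print(process_and_describe(payload))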