import os
import base64
import uuid
import gradio as gr
from huggingface_hub import upload_file, InferenceClient
# --- Config ---
HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Model specifically for VLM (image-to-text) tasks on Hugging Face
HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
if not HF_TOKEN:
    raise ValueError("HF_CV_ROBOT_TOKEN environment variable not set.")
# Initialize the Hugging Face Inference Client
hf_client = InferenceClient(token=HF_TOKEN)
# --- Helper Functions ---
def save_and_upload_image(image_b64):
    """Save the decoded image to /tmp and upload it to the HF dataset repo."""
    image_bytes = base64.b64decode(image_b64)
    # Use a unique filename to prevent conflicts in /tmp and in the dataset repo
    filename = f"{uuid.uuid4().hex}.jpg"
    local_tmp_path = f"/tmp/{filename}"
    with open(local_tmp_path, "wb") as f:
        f.write(image_bytes)
    path_in_repo = f"images/{filename}"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        token=HF_TOKEN,
        repo_type="dataset",
    )
    hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
# --- Main MCP function ---
def process_and_describe(payload: dict):
    """Receive a payload with a Base64 image, archive it, and return a VLM description."""
    try:
        robot_id = payload.get("robot_id", "unknown")
        image_b64 = payload["image_b64"]
        # 1️⃣ Save & upload image (kept for tracking; the Base64 string itself is used for the VLM call)
        local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
        # 2️⃣ Prepare the multimodal message payload for the conversational API
        messages_payload = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in detail."},
                    # Pass the original Base64 string directly in the required data-URL format
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
                ],
            }
        ]
        # 3️⃣ Call the VLM via hf_client.chat.completions.create (the correct method for the 'conversational' task)
        chat_completion = hf_client.chat.completions.create(
            model=HF_VLM_MODEL,
            messages=messages_payload,
            max_tokens=150,  # This method expects max_tokens, not max_new_tokens
        )
        # Extract the text content from the response object
        vlm_text = chat_completion.choices[0].message.content.strip()
        return {
            "saved_to_hf_hub": True,
            "repo_id": HF_DATASET_REPO,
            "path_in_repo": path_in_repo,
            "image_url": hf_url,
            "file_size_bytes": size_bytes,
            "robot_id": robot_id,
            "vlm_description": vlm_text
        }
    except Exception as e:
        # Return the error to the caller instead of crashing the endpoint
        return {"error": f"An API error occurred: {str(e)}"}
# --- Gradio MCP Interface ---
demo = gr.Interface(
    fn=process_and_describe,
    inputs=gr.JSON(label="Input Payload (Dict format with 'image_b64')"),
    outputs=gr.JSON(label="Reply to Jetson"),
    api_name="predict"
)
if __name__ == "__main__":
    # Requires gradio with MCP support and an up-to-date hub client:
    # pip install --upgrade "gradio[mcp]" huggingface-hub
    demo.launch(mcp_server=True)
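
# --- Example client call (sketch) ---
# A caller such as the Jetson could hit the "/predict" endpoint with
# gradio_client. The Space id and capture filename below are hypothetical:
#
#   import base64
#   from gradio_client import Client
#
#   client = Client("OppaAI/<space-id>")  # hypothetical Space id
#   with open("frame.jpg", "rb") as f:    # hypothetical camera capture
#       image_b64 = base64.b64encode(f.read()).decode("utf-8")
#   result = client.predict(
#       {"robot_id": "jetson-01", "image_b64": image_b64},
#       api_name="/predict",
#   )
#   print(result["vlm_description"])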