File size: 3,110 Bytes
0ef482f
938f609
48607b7
1f8048b
938f609
9d41b1d
 
48607b7
1f8048b
 
 
c5129eb
1f8048b
 
406e27f
1f8048b
 
49eb2ad
1f8048b
 
 
49eb2ad
1f8048b
 
 
 
406e27f
1f8048b
 
 
 
 
9d41b1d
406e27f
938f609
 
0ef482f
dac9550
 
 
 
 
48607b7
17438da
48607b7
dac9550
406e27f
dac9550
 
 
 
 
c5129eb
 
 
 
 
 
 
 
 
9d41b1d
dac9550
c5129eb
9d41b1d
c5129eb
dac9550
48607b7
dac9550
c5129eb
48607b7
dd3451f
53af268
 
 
938f609
 
48607b7
c5129eb
dd3451f
ec3d9e7
0ef482f
cd798bc
d081bf3
9a56bc2
 
 
 
 
 
 
444e2a5
0ef482f
c5129eb
17438da
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import base64
import requests
import tempfile
import gradio as gr
from huggingface_hub import upload_file, InferenceClient
from PIL import Image

# --- Config ---
# Dataset repo on the HF Hub where incoming robot images are mirrored.
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Model specifically for VLM (image-to-text) tasks on Hugging Face
HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" 

# --- Helper Functions ---
def save_and_upload_image(image_b64, hf_token, filename="tmp.jpg"):
    """Decode a base64 image, write it to a local temp file, and upload it to the HF dataset.

    Args:
        image_b64: Base64-encoded JPEG image data.
        hf_token: Hugging Face access token used for the upload.
        filename: File name inside the dataset's ``images/`` folder.
            Defaults to ``"tmp.jpg"``, preserving the original behavior of
            overwriting a single tracking image per upload.

    Returns:
        Tuple of ``(local_tmp_path, hf_image_url, path_in_repo, size_bytes)``.

    Raises:
        binascii.Error: If ``image_b64`` is not valid base64.
    """
    image_bytes = base64.b64decode(image_b64)

    # Use tempfile instead of a hard-coded "/tmp/tmp.jpg": portable across
    # platforms and collision-free when two requests arrive concurrently.
    fd, local_tmp_path = tempfile.mkstemp(suffix=".jpg")
    with os.fdopen(fd, "wb") as f:
        f.write(image_bytes)

    path_in_repo = f"images/{filename}"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        token=hf_token,  # token comes from the request payload, not the server env
        repo_type="dataset",
    )

    hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)


# --- Main MCP function ---
def process_and_describe(payload: dict) -> dict:
    """MCP entry point: mirror the robot's image to the HF Hub and describe it with a VLM.

    Args:
        payload: Dict from the robot with keys ``hf_token`` (required),
            ``image_b64`` (required, base64 JPEG) and ``robot_id`` (optional).

    Returns:
        On success, a dict with the upload metadata and ``vlm_description``.
        On failure, a dict with a single ``error`` key — errors are returned
        as JSON rather than raised, so the robot always gets a reply.
    """
    try:
        # 1️⃣ Require the robot-sent token; nothing downstream works without it.
        hf_token = payload.get("hf_token")
        if not hf_token:
            return {"error": "HF token not provided in payload."}

        # 2️⃣ Validate the image up front so the caller gets a clear message
        # instead of a raw KeyError wrapped as "An API error occurred: 'image_b64'".
        image_b64 = payload.get("image_b64")
        if not image_b64:
            return {"error": "image_b64 not provided in payload."}

        robot_id = payload.get("robot_id", "unknown")

        # 3️⃣ Save image temporarily and mirror it to the HF dataset (for tracking)
        local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)

        # 4️⃣ Initialize HF client per request so each robot's own token is used
        hf_client = InferenceClient(token=hf_token)

        # 5️⃣ Prepare multimodal message payload. The image goes inline as a
        # data URL, so the VLM call does not depend on the dataset upload
        # having propagated yet.
        messages_payload = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in detail."},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
                ],
            }
        ]

        # 6️⃣ Call VLM
        chat_completion = hf_client.chat.completions.create(
            model=HF_VLM_MODEL,
            messages=messages_payload,
            max_tokens=150,
        )

        vlm_text = chat_completion.choices[0].message.content.strip()

        return {
            "saved_to_hf_hub": True,
            "repo_id": HF_DATASET_REPO,
            "path_in_repo": path_in_repo,
            "image_url": hf_url,
            "file_size_bytes": size_bytes,
            "robot_id": robot_id,
            "vlm_description": vlm_text
        }

    except Exception as e:
        # Boundary handler: surface the failure to the robot as JSON rather
        # than letting Gradio return a traceback.
        return {"error": f"An API error occurred: {str(e)}"}

# --- Gradio MCP Interface ---
# JSON-in / JSON-out wrapper around process_and_describe; api_name="predict"
# fixes the endpoint path clients (the Jetson robot) call.
demo = gr.Interface(
    fn=process_and_describe,
    inputs=gr.JSON(label="Input Payload (Dict format with 'image_b64')"),
    outputs=gr.JSON(label="Reply to Jetson"),
    api_name="predict"
)

if __name__ == "__main__":
    # Ensure you have the latest huggingface-hub: pip install --upgrade huggingface-hub Pillow requests
    # mcp_server=True additionally exposes the app as an MCP server endpoint.
    demo.launch(mcp_server=True)