File size: 2,998 Bytes
0ef482f
938f609
 
48607b7
938f609
 
 
9d41b1d
 
48607b7
938f609
886787d
938f609
9d41b1d
 
48607b7
938f609
 
 
9d41b1d
 
 
938f609
 
 
 
9d41b1d
 
938f609
 
 
9d41b1d
938f609
 
 
 
 
 
 
 
 
 
 
 
 
0ef482f
48607b7
17438da
48607b7
938f609
 
 
68ac715
 
9d41b1d
 
 
 
 
68ac715
9d41b1d
 
 
68ac715
48607b7
 
dd3451f
53af268
 
 
938f609
 
48607b7
53af268
dd3451f
ec3d9e7
0ef482f
938f609
d081bf3
938f609
17438da
53af268
 
48607b7
17438da
 
444e2a5
0ef482f
9d41b1d
 
17438da
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import copy
import base64
import requests
import tempfile
import secrets
import gradio as gr
from huggingface_hub import upload_file, InferenceClient
from PIL import Image

# --- Config ---
# Hugging Face access token, read from the HF_CV_ROBOT_TOKEN environment variable.
HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
# Dataset repository on the Hub that receives the uploaded images.
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Model used for the VLM (image-to-text) call on Hugging Face.
HF_VLM_MODEL = "llava-hf/llava-interleave-qwen-0.5b-hf"

# Fail fast at import time, and name the variable that is actually read so the
# operator knows exactly which one to set.
if not HF_TOKEN:
    raise ValueError("HF_CV_ROBOT_TOKEN environment variable not set.")

# Shared Hugging Face Inference Client, authenticated with the token above.
hf_client = InferenceClient(token=HF_TOKEN)

# --- Helper Functions ---
def save_and_upload_image(image_b64):
    """Decode a base64 image, save it to a temp file and upload it to the HF dataset.

    Args:
        image_b64: Base64-encoded image data.

    Returns:
        A tuple ``(local_tmp_path, hf_image_url, path_in_repo, size_bytes)``:
        the path of the local temporary copy, the ``resolve/main`` URL of the
        uploaded file, its path inside the dataset repo, and the decoded size
        in bytes. The caller owns the temporary file and should delete it
        once it is no longer needed.

    Raises:
        binascii.Error: If ``image_b64`` is not valid base64.
        Exception: Anything raised while writing or uploading the file is
            re-raised after the temporary file has been removed.
    """
    image_bytes = base64.b64decode(image_b64)

    # mkstemp creates a uniquely named file in the platform's temp directory,
    # so there is no hard-coded /tmp and no filename collision.
    fd, local_tmp_path = tempfile.mkstemp(prefix="uploaded_image_", suffix=".jpg")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(image_bytes)

        # Random name so uploads never overwrite each other in the repo.
        path_in_repo = f"images/uploaded_image_{secrets.token_hex(8)}.jpg"
        upload_file(
            path_or_fileobj=local_tmp_path,
            path_in_repo=path_in_repo,
            repo_id=HF_DATASET_REPO,
            token=HF_TOKEN,
            repo_type="dataset",
        )
    except Exception:
        # The caller never receives the path when we fail, so clean up here
        # instead of leaking the file.
        try:
            os.remove(local_tmp_path)
        except OSError:
            pass  # best-effort cleanup; the original error matters more
        raise

    hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)

# --- Main MCP function ---
def process_and_describe(payload: dict) -> dict:
    """Upload a robot camera image to the Hub and describe it with the VLM.

    Args:
        payload: Request dictionary. Must contain ``image_b64`` (the
            base64-encoded image); may contain ``robot_id``, which defaults
            to ``"unknown"``.

    Returns:
        On success, a dict with the keys ``saved_to_hf_hub``, ``repo_id``,
        ``path_in_repo``, ``image_url``, ``file_size_bytes``, ``robot_id``
        and ``vlm_description``. On any failure, ``{"error": <message>}``;
        exceptions are never propagated to the caller.
    """
    local_tmp_path = None
    try:
        # Validate up front so the client gets a readable message instead of
        # a bare KeyError/AttributeError string.
        if not isinstance(payload, dict):
            raise ValueError("Payload must be a JSON object (dict).")
        image_b64 = payload.get("image_b64")
        if not image_b64:
            raise ValueError("Payload is missing required field 'image_b64'.")
        robot_id = payload.get("robot_id", "unknown")

        # Step 1: save a local copy and upload the image to the dataset repo.
        local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)

        # Step 2: run the VLM on the local copy. The context manager closes
        # the underlying file handle as soon as the call returns.
        with Image.open(local_tmp_path) as image:
            vlm_text = hf_client.image_to_text(
                image=image,
                model=HF_VLM_MODEL,
            )

        return {
            "saved_to_hf_hub": True,
            "repo_id": HF_DATASET_REPO,
            "path_in_repo": path_in_repo,
            "image_url": hf_url,
            "file_size_bytes": size_bytes,
            "robot_id": robot_id,
            "vlm_description": vlm_text,
        }

    except Exception as e:
        # API boundary: report every failure to the client as an error payload.
        return {"error": str(e)}

    finally:
        # The local copy is only needed for the VLM call; remove it so the
        # temp directory does not grow with every request.
        if local_tmp_path is not None:
            try:
                os.remove(local_tmp_path)
            except OSError:
                pass  # best-effort cleanup

# --- Gradio MCP Interface ---
# Both sides of the interface are raw JSON: the request payload goes in, the
# reply dict produced by process_and_describe comes out.
_payload_input = gr.JSON(label="Input Payload (Dict format with 'image_b64')")
_reply_output = gr.JSON(label="Reply to Jetson")

demo = gr.Interface(
    fn=process_and_describe,
    inputs=_payload_input,
    outputs=_reply_output,
    api_name="predict",
)

if __name__ == "__main__":
    # Required packages:
    #   pip install gradio huggingface_hub Pillow requests
    # Launch the app with Gradio's MCP server option enabled.
    demo.launch(mcp_server=True)