import os
import json
import base64
import secrets
import gradio as gr
from huggingface_hub import upload_file, InferenceClient

# --- Config ---
HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Model specifically for VLM (image-to-text) tasks on Hugging Face
HF_VLM_MODEL = "llava-hf/llava-interleave-qwen-0.5b-hf" # A suitable VLM model

if not HF_TOKEN:
    raise ValueError("HF_CV_ROBOT_TOKEN environment variable not set.")

# Initialize the Hugging Face Inference Client
hf_client = InferenceClient(token=HF_TOKEN)

# --- Helper Functions ---
def save_and_upload_image(image_b64):
    """Save image to /tmp and upload to HF dataset."""
    image_bytes = base64.b64decode(image_b64)
    # Use one unique token for both the local and repo filenames so they match
    token = secrets.token_hex(8)
    local_tmp_path = f"/tmp/uploaded_image_{token}.jpg"
    with open(local_tmp_path, "wb") as f:
        f.write(image_bytes)

    path_in_repo = f"images/uploaded_image_{token}.jpg"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        token=HF_TOKEN,
        repo_type="dataset"
    )

    hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
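
# Example (hypothetical) usage; "frame.jpg" is an illustrative path, not part
# of the original code:
#   b64 = base64.b64encode(open("frame.jpg", "rb").read()).decode("utf-8")
#   local_tmp_path, hf_url, path_in_repo, size = save_and_upload_image(b64)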

# --- Main MCP function ---
def process_and_describe(payload: dict):
    """Upload a robot camera frame to the HF dataset and describe it with the VLM.

    Expects payload = {"robot_id": str, "image_b64": str (base64-encoded JPEG)}.
    """
    try:
        robot_id = payload.get("robot_id", "unknown")
        image_b64 = payload["image_b64"]

        # 1️⃣ Save & upload image
        local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
        
        # --- VLM CALL LOGIC ---
        # 2️⃣ Prepare the prompt: the image is embedded inline in an HTML <img>
        # tag as a base64 data URI, followed by the text instruction.
        prompt_text = "Describe this image in detail."

        # Re-encode the saved file as base64 for the data URI
        with open(local_tmp_path, "rb") as f:
            image_b64_encoded_string = base64.b64encode(f.read()).decode("utf-8")

        # Full prompt in the format the model's API endpoint expects
        full_prompt = f'<img src="data:image/jpeg;base64,{image_b64_encoded_string}">{prompt_text}'

        # 3️⃣ Call the VLM via hf_client.post (low-level call against the
        # 'text-generation' task endpoint, as indicated by the earlier API error).
        # Note: InferenceClient.post() returns raw response bytes, and it has
        # been removed in recent huggingface_hub releases, so this assumes a
        # version that still exposes it.
        raw_response = hf_client.post(
            json={"inputs": full_prompt, "parameters": {"max_new_tokens": 150}},
            model=HF_VLM_MODEL,
            task="text-generation",
        )

        # Decode the raw bytes; the endpoint usually returns a list of dicts,
        # e.g. [{'generated_text': '... description ...'}]
        api_response = json.loads(raw_response)
        if isinstance(api_response, list) and len(api_response) > 0:
            vlm_text = api_response[0].get("generated_text", "").strip()
        else:
            vlm_text = "Failed to parse VLM response."

        # --- END VLM CALL LOGIC ---

        return {
            "saved_to_hf_hub": True,
            "repo_id": HF_DATASET_REPO,
            "path_in_repo": path_in_repo,
            "image_url": hf_url,
            "file_size_bytes": size_bytes,
            "robot_id": robot_id,
            "vlm_description": vlm_text
        }

    except Exception as e:
        # Surface failures as a structured error payload instead of raising
        return {"error": str(e)}

# ... (Gradio Interface code remains the same) ...
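
# A minimal sketch of the elided Gradio interface (an assumption, since the
# original code is not shown here): a JSON-in / JSON-out wrapper around
# process_and_describe. Labels and title are illustrative only.
demo = gr.Interface(
    fn=process_and_describe,
    inputs=gr.JSON(label="MCP payload: {robot_id, image_b64}"),
    outputs=gr.JSON(label="Result"),
    title="Robot MCP: image upload + VLM description",
)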

if __name__ == "__main__":
    demo.launch(mcp_server=True)