Spaces:
Sleeping
Sleeping
File size: 3,223 Bytes
0ef482f 938f609 48607b7 938f609 53af268 938f609 48607b7 938f609 886787d 938f609 48607b7 938f609 48607b7 938f609 0ef482f 48607b7 17438da 48607b7 938f609 48607b7 938f609 48607b7 dd3451f 53af268 938f609 48607b7 53af268 dd3451f ec3d9e7 0ef482f 938f609 d081bf3 938f609 17438da 53af268 48607b7 17438da 444e2a5 0ef482f 17438da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import os
import copy
import base64
import requests
import tempfile
import secrets
import gradio as gr
from huggingface_hub import upload_file
from dashscope import MultiModalConversation
# --- Config ---
# Hugging Face token used for dataset uploads; read from HF_CV_ROBOT_TOKEN.
HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
# Dataset repository that receives the uploaded images.
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Model name passed to MultiModalConversation.call.
MODEL = "qwen2.5-vl-7b-instruct"

# Fail fast at import time: nothing below can upload without the token.
if not HF_TOKEN:
    # Name the variable that is actually read, so the operator knows what to set.
    raise ValueError("HF_CV_ROBOT_TOKEN environment variable not set.")
# --- Helper Functions ---
def save_and_upload_image(image_b64):
    """Decode a base64 image, save it under /tmp, and upload it to the HF dataset.

    Args:
        image_b64: Base64-encoded image data.

    Returns:
        A tuple ``(local_tmp_path, hf_image_url, path_in_repo, size_bytes)``:
        the local scratch file, the ``resolve/main`` URL of the uploaded file,
        its path inside the dataset repo, and the decoded size in bytes.

    Raises:
        ValueError: If the payload decodes to zero bytes. ``base64.b64decode``
            raises ``binascii.Error`` (a ``ValueError`` subclass) for
            incorrectly padded input.
    """
    # Function-scope import keeps this block self-contained.
    import hashlib

    image_bytes = base64.b64decode(image_b64)
    if not image_bytes:
        # Refuse to write and upload an empty file.
        raise ValueError("image_b64 decoded to an empty image.")
    size_bytes = len(image_bytes)

    # NOTE: this scratch path is shared by every call, so overlapping calls
    # would overwrite each other's file before the caller reads it back.
    local_tmp_path = "/tmp/tmp.jpg"
    with open(local_tmp_path, "wb") as f:
        f.write(image_bytes)

    # Name the upload by size AND content digest. Size alone collides for any
    # two different images with the same byte count, and the later upload
    # would replace the earlier file in the dataset.
    digest = hashlib.sha256(image_bytes).hexdigest()[:16]
    path_in_repo = f"images/uploaded_image_{size_bytes}_{digest}.jpg"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        token=HF_TOKEN,
        repo_type="dataset",
    )
    hf_image_url = (
        f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    )
    return local_tmp_path, hf_image_url, path_in_repo, size_bytes
def prepare_vlm_message(image_path, text="Describe this image in detail."):
    """Build a single-turn user message holding a prompt and a base64 image.

    Args:
        image_path: Path of the local image file to embed.
        text: Prompt placed before the image in the message content.

    Returns:
        A one-element list containing a ``"user"`` message whose content is
        the text part followed by the image part.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_image = base64.b64encode(raw_bytes).decode("utf-8")

    text_part = {"type": "text", "text": text}
    # NOTE(review): confirm this part schema is what the VLM endpoint expects.
    image_part = {"type": "image_data", "image_data": {"b64": encoded_image}}

    return [{"role": "user", "content": [text_part, image_part]}]
# --- Main MCP function ---
def process_and_describe(payload: dict):
    """Upload an image to the HF dataset and return a VLM description of it.

    Args:
        payload: Dict with a required ``"image_b64"`` entry (base64-encoded
            image) and an optional ``"robot_id"`` entry (defaults to
            ``"unknown"``).

    Returns:
        On success, a dict with the upload details (``saved_to_hf_hub``,
        ``repo_id``, ``path_in_repo``, ``image_url``, ``file_size_bytes``),
        the ``robot_id`` and the ``vlm_description`` text.
        On any failure, ``{"error": <message>}``; exceptions are not raised.
    """
    # Validate up front so a malformed request gets a readable error rather
    # than the bare text of a KeyError or AttributeError.
    if not isinstance(payload, dict):
        return {"error": "Payload must be a JSON object containing 'image_b64'."}
    image_b64 = payload.get("image_b64")
    if not image_b64:
        return {"error": "Payload is missing required field 'image_b64'."}
    robot_id = payload.get("robot_id", "unknown")

    try:
        # Step 1: save the image locally and upload it to the dataset repo.
        local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)

        # Step 2: build the VLM request from the saved file.
        messages = prepare_vlm_message(local_tmp_path)

        # Step 3: stream the VLM reply. incremental_output=True makes each
        # chunk carry only the new text; without it every chunk repeats the
        # text generated so far and concatenating chunks duplicates it.
        responses = MultiModalConversation.call(
            model=MODEL,
            messages=messages,
            stream=True,
            incremental_output=True,
        )

        text_parts = []
        for resp in responses:
            if resp.status_code != 200:
                return {"error": f"VLM call failed: {resp.status_code}"}
            content = resp.output.choices[0].message.content
            # Keep only the elements of the reply that carry text.
            text_parts.extend(ele["text"] for ele in content if "text" in ele)

        return {
            "saved_to_hf_hub": True,
            "repo_id": HF_DATASET_REPO,
            "path_in_repo": path_in_repo,
            "image_url": hf_url,
            "file_size_bytes": size_bytes,
            "robot_id": robot_id,
            "vlm_description": "".join(text_parts),
        }
    except Exception as e:
        # Top-level boundary: report the failure to the client as data.
        return {"error": str(e)}
# --- Gradio MCP Interface ---
# One JSON component for the request and one for the reply; the interface
# wires them to process_and_describe under the "predict" API name.
_payload_input = gr.JSON(label="Input Payload (Dict format with 'image_b64')")
_reply_output = gr.JSON(label="Reply to Jetson")

demo = gr.Interface(
    fn=process_and_describe,
    inputs=_payload_input,
    outputs=_reply_output,
    api_name="predict",
)

if __name__ == "__main__":
    # Only start the app (with the MCP server enabled) when run as a script.
    demo.launch(mcp_server=True)
|