import os
import base64
import uuid
import gradio as gr
from huggingface_hub import upload_file, InferenceClient
# --- Config ---
HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Model specifically for VLM (image-to-text) tasks on Hugging Face
HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
if not HF_TOKEN:
    raise ValueError("HF_CV_ROBOT_TOKEN environment variable not set.")
# Initialize the Hugging Face Inference Client
hf_client = InferenceClient(token=HF_TOKEN)
# --- Helper Functions ---
def save_and_upload_image(image_b64):
    """Save the decoded image to /tmp and upload it to the HF dataset repo."""
    image_bytes = base64.b64decode(image_b64)
    # Use a unique filename to prevent conflicts in /tmp and in the dataset repo
    filename = f"{uuid.uuid4().hex}.jpg"
    local_tmp_path = f"/tmp/{filename}"
    with open(local_tmp_path, "wb") as f:
        f.write(image_bytes)
    path_in_repo = f"images/{filename}"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        token=HF_TOKEN,
        repo_type="dataset",
    )
    hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
# --- Main MCP function ---
def process_and_describe(payload: dict):
    """Receive a payload with a Base64 image, archive it, and return a VLM description."""
    try:
        robot_id = payload.get("robot_id", "unknown")
        image_b64 = payload["image_b64"]
        # 1️⃣ Save & upload image (kept for tracking; the Base64 string itself is used for the VLM call)
        local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
        # 2️⃣ Prepare the multimodal message payload for the conversational API
        messages_payload = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in detail."},
                    # Pass the original Base64 string directly in the required data-URL format
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
                ],
            }
        ]
        # 3️⃣ Call the VLM via hf_client.chat.completions.create (the correct method for the 'conversational' task)
        chat_completion = hf_client.chat.completions.create(
            model=HF_VLM_MODEL,
            messages=messages_payload,
            max_tokens=150,  # This method expects max_tokens, not max_new_tokens
        )
        # Extract the text content from the response object
        vlm_text = chat_completion.choices[0].message.content.strip()
        return {
            "saved_to_hf_hub": True,
            "repo_id": HF_DATASET_REPO,
            "path_in_repo": path_in_repo,
            "image_url": hf_url,
            "file_size_bytes": size_bytes,
            "robot_id": robot_id,
            "vlm_description": vlm_text
        }
    except Exception as e:
        # Return the error to the caller instead of crashing the endpoint
        return {"error": f"An API error occurred: {str(e)}"}
# --- Gradio MCP Interface ---
demo = gr.Interface(
    fn=process_and_describe,
    inputs=gr.JSON(label="Input Payload (Dict format with 'image_b64')"),
    outputs=gr.JSON(label="Reply to Jetson"),
    api_name="predict"
)
if __name__ == "__main__":
    # Requires gradio with MCP support and an up-to-date hub client:
    # pip install --upgrade "gradio[mcp]" huggingface-hub
    demo.launch(mcp_server=True)
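
# --- Example client call (sketch) ---
# A caller such as the Jetson could hit the "/predict" endpoint with
# gradio_client. The Space id and capture filename below are hypothetical:
#
#   import base64
#   from gradio_client import Client
#
#   client = Client("OppaAI/<space-id>")  # hypothetical Space id
#   with open("frame.jpg", "rb") as f:    # hypothetical camera capture
#       image_b64 = base64.b64encode(f.read()).decode("utf-8")
#   result = client.predict(
#       {"robot_id": "jetson-01", "image_b64": image_b64},
#       api_name="/predict",
#   )
#   print(result["vlm_description"])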