import os
import base64
import secrets

import gradio as gr
from huggingface_hub import upload_file, InferenceClient
# --- Config ---
HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Vision-language model (image-to-text) served on Hugging Face
HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
if not HF_TOKEN:
    raise ValueError("HF_CV_ROBOT_TOKEN environment variable not set.")
# Initialize the Hugging Face Inference Client
hf_client = InferenceClient(token=HF_TOKEN)
# --- Helper Functions ---
def save_and_upload_image(image_b64):
    """Save the decoded image to /tmp and upload it to the HF dataset repo."""
    image_bytes = base64.b64decode(image_b64)
    # Generate the unique token once so the local and repo filenames match
    file_id = secrets.token_hex(8)
    local_tmp_path = f"/tmp/uploaded_image_{file_id}.jpg"
    with open(local_tmp_path, "wb") as f:
        f.write(image_bytes)
    path_in_repo = f"images/uploaded_image_{file_id}.jpg"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        token=HF_TOKEN,
        repo_type="dataset",
    )
    hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
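# Usage sketch (hypothetical values; each call generates a fresh hex token):
#   local_path, url, repo_path, nbytes = save_and_upload_image(b64_string)
#   # url -> https://huggingface.co/datasets/OppaAI/Robot_MCP/resolve/main/images/uploaded_image_<hex>.jpg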
# --- Main MCP function ---
def process_and_describe(payload: dict):
    try:
        robot_id = payload.get("robot_id", "unknown")
        image_b64 = payload["image_b64"]
        # 1️⃣ Save & upload image
        _local_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
        # 2️⃣ Prepare the VLM request. Qwen2.5-VL is served through the
        # chat-completion API; text_generation has no way to attach an image,
        # so the image travels as a base64 data URL in an OpenAI-style message.
        prompt_text = "Describe this image in detail."
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
                    },
                    {"type": "text", "text": prompt_text},
                ],
            }
        ]
        # 3️⃣ Call the VLM
        response = hf_client.chat_completion(
            model=HF_VLM_MODEL,
            messages=messages,
            max_tokens=150,
            # Other parameters like temperature can be added here if needed
        )
        vlm_text = response.choices[0].message.content
        return {
            "saved_to_hf_hub": True,
            "repo_id": HF_DATASET_REPO,
            "path_in_repo": path_in_repo,
            "image_url": hf_url,
            "file_size_bytes": size_bytes,
            "robot_id": robot_id,
            "vlm_description": vlm_text.strip(),
        }
    except Exception as e:
        # Return the failure to the caller instead of crashing the server
        return {"error": f"An API error occurred: {str(e)}"}
# --- Gradio MCP Interface ---
demo = gr.Interface(
    fn=process_and_describe,
    inputs=gr.JSON(label="Input Payload (Dict format with 'image_b64')"),
    outputs=gr.JSON(label="Reply to Jetson"),
    api_name="predict",
)
if __name__ == "__main__":
    # Required libraries: pip install gradio huggingface_hub
    demo.launch(mcp_server=True)
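
# Example client call from the robot side (a minimal sketch — the Space id,
# filename, and robot_id below are placeholders, not values from this repo):
#
#   from gradio_client import Client
#   import base64
#
#   client = Client("OppaAI/<your-space-id>")  # hypothetical Space id
#   with open("frame.jpg", "rb") as f:
#       payload = {
#           "robot_id": "jetson-01",
#           "image_b64": base64.b64encode(f.read()).decode("utf-8"),
#       }
#   print(client.predict(payload, api_name="/predict"))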