OppaAI committed
Commit 9d41b1d · verified · 1 Parent(s): d4bf33b

Update app.py

Files changed (1): app.py +24 -39
app.py CHANGED
@@ -5,26 +5,31 @@ import requests
 import tempfile
 import secrets
 import gradio as gr
-from huggingface_hub import upload_file
-from dashscope import MultiModalConversation
+from huggingface_hub import upload_file, InferenceClient
+from PIL import Image
 
 # --- Config ---
 HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
-MODEL = "qwen2.5-vl-7b-instruct"
+# Model specifically for VLM (image-to-text) tasks on Hugging Face
+HF_VLM_MODEL = "llava-hf/llava-interleave-qwen-0.5b-hf"  # A suitable VLM model
 
 if not HF_TOKEN:
     raise ValueError("HF_TOKEN environment variable not set.")
 
+# Initialize the Hugging Face Inference Client
+hf_client = InferenceClient(token=HF_TOKEN)
+
 # --- Helper Functions ---
 def save_and_upload_image(image_b64):
     """Save image to /tmp and upload to HF dataset."""
     image_bytes = base64.b64decode(image_b64)
-    local_tmp_path = "/tmp/tmp.jpg"
+    # Use a unique filename to prevent conflicts in /tmp
+    local_tmp_path = f"/tmp/uploaded_image_{secrets.token_hex(8)}.jpg"
     with open(local_tmp_path, "wb") as f:
         f.write(image_bytes)
 
-    path_in_repo = f"images/uploaded_image_{len(image_bytes)}.jpg"
+    path_in_repo = f"images/uploaded_image_{secrets.token_hex(8)}.jpg"
     upload_file(
         path_or_fileobj=local_tmp_path,
         path_in_repo=path_in_repo,
@@ -36,22 +41,6 @@ def save_and_upload_image(image_b64):
     hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
     return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
 
-def prepare_vlm_message(image_path, text="Describe this image in detail."):
-    """Read local image, encode to base64, and prepare VLM message."""
-    with open(image_path, "rb") as f:
-        image_b64 = base64.b64encode(f.read()).decode("utf-8")
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": text},
-                {"type": "image_data", "image_data": {"b64": image_b64}}
-            ]
-        }
-    ]
-    return messages
-
 # --- Main MCP function ---
 def process_and_describe(payload: dict):
     try:
@@ -61,26 +50,20 @@ def process_and_describe(payload: dict):
         # 1️⃣ Save & upload image
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
 
-        # 2️⃣ Prepare VLM message
-        messages = prepare_vlm_message(local_tmp_path)
-
-        # 3️⃣ Call VLM using MultiModalConversation
-        responses = MultiModalConversation.call(
-            model=MODEL,
-            messages=messages,
-            stream=True
+        # 2️⃣ Prepare prompt
+        prompt = "Describe this image in detail."
+
+        # Open the image using PIL for the InferenceClient
+        image = Image.open(local_tmp_path)
+
+        # 3️⃣ Call VLM using Hugging Face Inference Client
+        # The client automatically handles the API call and authentication
+        vlm_text = hf_client.image_to_text(
+            image=image,
+            model=HF_VLM_MODEL,
+            details=True,  # Set details=True for more comprehensive output if available
         )
 
-        vlm_text = ""
-        for resp in responses:
-            if resp.status_code != 200:
-                return {"error": f"VLM call failed: {resp.status_code}"}
-            content = resp.output.choices[0].message.content
-            # Extract text from response
-            for ele in content:
-                if "text" in ele:
-                    vlm_text += ele["text"]
-
         return {
             "saved_to_hf_hub": True,
             "repo_id": HF_DATASET_REPO,
@@ -103,4 +86,6 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
+    # You will need to install the required libraries:
+    # pip install gradio huggingface_hub Pillow requests
    demo.launch(mcp_server=True)
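
For context, a minimal standalone sketch of the new inference path (not part of the commit): it assumes HF_CV_ROBOT_TOKEN is set and a test image exists at the hypothetical path /tmp/example.jpg. The client also accepts a local file path directly, and depending on the installed huggingface_hub version, image_to_text returns either a plain string or an ImageToTextOutput dataclass, so the sketch handles both.

# Standalone sketch (assumptions: HF_CV_ROBOT_TOKEN is set,
# /tmp/example.jpg is a hypothetical test image).
import os
from huggingface_hub import InferenceClient

client = InferenceClient(token=os.environ["HF_CV_ROBOT_TOKEN"])

# image_to_text accepts a local file path, raw bytes, or a URL.
result = client.image_to_text(
    "/tmp/example.jpg",
    model="llava-hf/llava-interleave-qwen-0.5b-hf",
)

# Older huggingface_hub versions return a plain string; newer ones
# return an ImageToTextOutput dataclass with a .generated_text field.
caption = getattr(result, "generated_text", result)
print(caption)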
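
And a hedged driver for the MCP tool itself: the payload key "image_b64" is an assumption, since the lines of process_and_describe that unpack the payload fall outside the hunks shown above.

# Hypothetical local driver for process_and_describe.
# The payload key "image_b64" is assumed -- the real key is defined
# in app.py outside the hunks shown in this diff.
import base64

with open("robot_snapshot.jpg", "rb") as f:  # hypothetical test image
    payload = {"image_b64": base64.b64encode(f.read()).decode("utf-8")}

print(process_and_describe(payload))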