OppaAI committed
Commit dac9550 · verified · 1 Parent(s): 51deb36

Update app.py

Files changed (1):
app.py (+14 −17)
app.py CHANGED
@@ -7,17 +7,10 @@ from huggingface_hub import upload_file, InferenceClient
 from PIL import Image
 
 # --- Config ---
-#HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 # Model specifically for VLM (image-to-text) tasks on Hugging Face
 HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 
-#if not HF_TOKEN:
-#    raise ValueError("HF_TOKEN environment variable not set.")
-
-# Initialize the Hugging Face Inference Client
-hf_client = InferenceClient(token=HF_TOKEN)
-
 # --- Helper Functions ---
 def save_and_upload_image(image_b64):
     """Save image to /tmp and upload to HF dataset."""
@@ -42,33 +35,38 @@ def save_and_upload_image(image_b64):
 # --- Main MCP function ---
 def process_and_describe(payload: dict):
     try:
-        hf_token = payload.get("hf_token", HF_TOKEN)
+        # 1️⃣ Use robot-sent token if available, otherwise fallback
+        hf_token = payload.get("hf_token")
+        if not hf_token:
+            return {"error": "HF token not provided in payload."}
+
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload["image_b64"]
 
-        # 1️⃣ Save & upload image (needed for tracking, but B64 is used for VLM call)
+        # 2️⃣ Save image temporarily (for tracking)
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
-
-        # 2️⃣ Prepare the multimodal message payload for the conversational API
+
+        # 3️⃣ Initialize HF client per request
+        hf_client = InferenceClient(token=hf_token)
+
+        # 4️⃣ Prepare multimodal message payload
         messages_payload = [
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": "Describe this image in detail."},
-                    # Pass the original Base64 string directly in the required format
                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
                 ],
             }
         ]
 
-        # 3️⃣ Call VLM using hf_client.chat.completions.create (The correct method for 'conversational' task)
+        # 5️⃣ Call VLM
         chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
             messages=messages_payload,
-            max_tokens=150,  # Use max_tokens instead of max_new_tokens for this method
+            max_tokens=150,
         )
-
-        # Extract the text content from the response object
+
         vlm_text = chat_completion.choices[0].message.content.strip()
 
         return {
@@ -82,7 +80,6 @@ def process_and_describe(payload: dict):
         }
 
     except Exception as e:
-        # Added better error handling
        return {"error": f"An API error occurred: {str(e)}"}
 
 # --- Gradio MCP Interface ---
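After this commit the Space no longer reads a server-side HF_TOKEN at import time; each request must carry its own token, and the `InferenceClient` is created per request. Below is a minimal sketch of what a robot-side caller might look like. Only the payload keys (`hf_token`, `robot_id`, `image_b64`) come from app.py; the helper name, file path, and token value are illustrative placeholders.

```python
# Hypothetical caller sketch for process_and_describe (not part of this commit).
import base64

def build_payload(image_path: str, hf_token: str, robot_id: str = "robot_01") -> dict:
    """Base64-encode a JPEG and wrap it in the payload shape app.py expects."""
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    return {
        "hf_token": hf_token,    # required: the server now rejects requests without it
        "robot_id": robot_id,    # optional: server defaults to "unknown"
        "image_b64": image_b64,  # raw Base64; the "data:image/jpeg;base64," prefix is added server-side
    }

# payload = build_payload("frame.jpg", hf_token="hf_...")
# result = process_and_describe(payload)  # dict with the VLM description, or {"error": ...}
```

Constructing the client per request means the Space holds no shared credential; each caller's quota and permissions apply to its own VLM call, at the cost of a small per-request setup overhead.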