OppaAI committed (verified)
Commit c5129eb · Parent(s): cd798bc

Update app.py

Files changed (1):
  1. app.py +21 -21
app.py CHANGED
@@ -12,7 +12,7 @@ from PIL import Image
 HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 # Model specifically for VLM (image-to-text) tasks on Hugging Face
-HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" # A suitable VLM model
+HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 
 if not HF_TOKEN:
     raise ValueError("HF_TOKEN environment variable not set.")
@@ -47,29 +47,30 @@ def process_and_describe(payload: dict):
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload["image_b64"]
 
-        # 1️⃣ Save & upload image
+        # 1️⃣ Save & upload image (needed for tracking, but B64 is used for VLM call)
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
 
-        # 2️⃣ Prepare prompt in the Qwen specific format (using Markdown for image embedding)
-        prompt_text = "Describe this image in detail."
-
-        # Base64 encode the image for embedding in the prompt
-        with open(local_tmp_path, "rb") as f:
-            image_b64_encoded_string = base64.b64encode(f.read()).decode("utf-8")
-
-        # The full prompt format required by Qwen, embedded in a chat-like structure for the API
-        full_prompt = f'<img src="data:image/jpeg;base64,{image_b64_encoded_string}"> {prompt_text}'
+        # 2️⃣ Prepare the multimodal message payload for the conversational API
+        messages_payload = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this image in detail."},
+                    # Pass the original Base64 string directly in the required format
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
+                ],
+            }
+        ]
 
-        # 3️⃣ Call VLM using hf_client.text_generation (the preferred method for general LLMs)
-        # This sends the custom prompt string to the model endpoint.
-        vlm_text = hf_client.text_generation(
+        # 3️⃣ Call VLM using hf_client.chat.completions.create (the correct method for the 'conversational' task)
+        chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
-            prompt=full_prompt,
-            max_new_tokens=150,
-            # Other parameters like temperature can be added here if needed
+            messages=messages_payload,
+            max_tokens=150,  # use max_tokens instead of max_new_tokens for this method
         )
 
-        # The response from text_generation is already the cleaned string
+        # Extract the text content from the response object
+        vlm_text = chat_completion.choices[0].message.content.strip()
 
         return {
             "saved_to_hf_hub": True,
@@ -78,7 +79,7 @@ def process_and_describe(payload: dict):
             "image_url": hf_url,
             "file_size_bytes": size_bytes,
             "robot_id": robot_id,
-            "vlm_description": vlm_text.strip()
+            "vlm_description": vlm_text
         }
 
     except Exception as e:
@@ -94,6 +95,5 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    # You will need to install the required libraries:
-    # pip install gradio huggingface_hub Pillow requests
+    # Ensure you have the latest huggingface-hub: pip install --upgrade huggingface-hub Pillow requests
     demo.launch(mcp_server=True)
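For reference, the new calling pattern can be tried standalone. Below is a minimal sketch, assuming hf_client is a huggingface_hub.InferenceClient created with the same token (the client construction is outside this diff) and a local test.jpg stands in for the robot's frame:

import base64
import os

from huggingface_hub import InferenceClient

HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"

# Assumption: app.py constructs its client roughly like this; the diff does not show it.
hf_client = InferenceClient(token=HF_TOKEN)

# Base64-encode a local image the same way the robot payload would arrive.
with open("test.jpg", "rb") as f:  # hypothetical sample image
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

chat_completion = hf_client.chat.completions.create(
    model=HF_VLM_MODEL,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ],
        }
    ],
    max_tokens=150,
)
print(chat_completion.choices[0].message.content.strip())

The OpenAI-style chat.completions.create syntax requires a recent huggingface_hub release, which is why the commit also changes the install hint to pip install --upgrade.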
 
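The diff calls save_and_upload_image() without showing its body. For context, here is a hypothetical reconstruction consistent with its call site (it must return local_tmp_path, hf_url, path_in_repo, size_bytes and push the image to HF_DATASET_REPO); the temp-file handling and the images/ path scheme are assumptions, not the repo's actual code:

import base64
import os
import tempfile
import uuid

from huggingface_hub import upload_file

HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_DATASET_REPO = "OppaAI/Robot_MCP"

def save_and_upload_image(image_b64: str):
    # Decode the incoming Base64 payload to raw JPEG bytes.
    image_bytes = base64.b64decode(image_b64)

    # Write the bytes to a local temp file (kept around for the caller).
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        tmp.write(image_bytes)
        local_tmp_path = tmp.name

    # Assumed layout inside the dataset repo.
    path_in_repo = f"images/{uuid.uuid4().hex}.jpg"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )

    # Standard resolve URL for a file in a Hugging Face dataset repo.
    hf_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_url, path_in_repo, len(image_bytes)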
 
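Finally, since demo.launch(mcp_server=True) also serves the function as an ordinary Gradio endpoint, a robot-side caller could look roughly like the sketch below. The Space id and api_name are assumptions (only the dataset repo name appears in this diff), so substitute the real values:

import base64

from gradio_client import Client

# Assumption: the Space shares the dataset's name; replace with the real Space id.
client = Client("OppaAI/Robot_MCP")

# Encode a captured frame exactly as process_and_describe expects it.
with open("frame.jpg", "rb") as f:  # hypothetical camera frame
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"robot_id": "robot_01", "image_b64": image_b64}

# Assumption: the endpoint is named after the wrapped function.
result = client.predict(payload, api_name="/process_and_describe")
print(result["vlm_description"])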