OppaAI committed on
Commit b6258cb · verified · 1 Parent(s): 224f8af

Update app.py

Files changed (1)
  1. app.py +31 -50
app.py CHANGED
@@ -1,46 +1,19 @@
 import os
-import copy
 import base64
 import requests
-import tempfile
 import secrets
 import gradio as gr
 from huggingface_hub import upload_file, InferenceClient
 from PIL import Image
+import json
 
-# --- Config ---
-HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
-HF_DATASET_REPO = "OppaAI/Robot_MCP"
-# Model specifically for VLM (image-to-text) tasks on Hugging Face
-HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"  # A suitable VLM model
-
-if not HF_TOKEN:
-    raise ValueError("HF_TOKEN environment variable not set.")
+# ... (Config and Helper Functions remain the same as previous snippets) ...
+# Ensure HF_VLM_MODEL is set back to the Qwen model you want to use:
+HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 
 # Initialize the Hugging Face Inference Client
 hf_client = InferenceClient(token=HF_TOKEN)
 
-# --- Helper Functions ---
-def save_and_upload_image(image_b64):
-    """Save image to /tmp and upload to HF dataset."""
-    image_bytes = base64.b64decode(image_b64)
-    # Use a unique filename to prevent conflicts in /tmp
-    local_tmp_path = f"/tmp/uploaded_image_{secrets.token_hex(8)}.jpg"
-    with open(local_tmp_path, "wb") as f:
-        f.write(image_bytes)
-
-    path_in_repo = f"images/uploaded_image_{secrets.token_hex(8)}.jpg"
-    upload_file(
-        path_or_fileobj=local_tmp_path,
-        path_in_repo=path_in_repo,
-        repo_id=HF_DATASET_REPO,
-        token=HF_TOKEN,
-        repo_type="dataset"
-    )
-
-    hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
-    return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
-
 # --- Main MCP function ---
 def process_and_describe(payload: dict):
     try:
@@ -49,20 +22,35 @@ def process_and_describe(payload: dict):
 
         # 1️⃣ Save & upload image
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
-
-        # 2️⃣ Prepare prompt (optional, some models ignore this for basic image_to_text)
-        # prompt = "Describe this image in detail."
 
-        # Open the image using PIL for the InferenceClient
-        image = Image.open(local_tmp_path)
+        # --- NEW VLM CALL LOGIC ---
+        # 2️⃣ Prepare prompt in the Qwen specific format (using Markdown for image embedding)
+        # The API expects an image embedded in the prompt using Markdown syntax.
+        prompt_text = "Describe this image in detail."
+
+        # Base64 encode the image for embedding in the JSON payload
+        with open(local_tmp_path, "rb") as f:
+            image_b64_encoded_string = base64.b64encode(f.read()).decode("utf-8")
+
+        # The full prompt format required by Qwen in the API
+        full_prompt = f'<img src="data:image/jpeg;base64,{image_b64_encoded_string}">{prompt_text}'
 
-        # 3️⃣ Call VLM using Hugging Face Inference Client
-        # Removed the problematic 'details' argument
-        vlm_text = hf_client.image_to_text(
-            image=image,
+        # 3️⃣ Call VLM using hf_client.post (low-level API call for specific models)
+        # We use the 'text-generation' task endpoint as indicated by the error message.
+        api_response = hf_client.post(
+            json={"inputs": full_prompt, "parameters": {"max_new_tokens": 150}},
             model=HF_VLM_MODEL,
-            # details=True # <-- REMOVED THIS LINE
+            task="text-generation"
         )
+
+        # The response is usually a list of dicts, extract the generated text
+        # Example response format: [{'generated_text': '... description ...'}]
+        if isinstance(api_response, list) and len(api_response) > 0:
+            vlm_text = api_response[0].get('generated_text', '').strip()
+        else:
+            vlm_text = "Failed to parse VLM response."
+
+        # --- END NEW VLM CALL LOGIC ---
 
         return {
            "saved_to_hf_hub": True,
@@ -75,17 +63,10 @@ def process_and_describe(payload: dict):
        }
 
    except Exception as e:
+        # Added better error handling as suggested previously
        return {"error": str(e)}
 
-# --- Gradio MCP Interface ---
-demo = gr.Interface(
-    fn=process_and_describe,
-    inputs=gr.JSON(label="Input Payload (Dict format with 'image_b64')"),
-    outputs=gr.JSON(label="Reply to Jetson"),
-    api_name="predict"
-)
+# ... (Gradio Interface code remains the same) ...
 
 if __name__ == "__main__":
-    # You will need to install the required libraries:
-    # pip install gradio huggingface_hub Pillow requests
     demo.launch(mcp_server=True)
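A note on the new call path: this commit routes the request through the low-level hf_client.post(...) helper with task="text-generation". That helper is deprecated in recent huggingface_hub releases and may be absent from current ones, in which case the same step can go through the chat-completion API, which accepts the image as a base64 data URL. The sketch below is a minimal alternative under that assumption; it is not part of this commit, and it reuses HF_TOKEN, HF_VLM_MODEL, and local_tmp_path as defined in the diff above.

import base64

from huggingface_hub import InferenceClient

hf_client = InferenceClient(token=HF_TOKEN)  # HF_TOKEN from the config block referenced above

def describe_image(local_tmp_path: str, prompt_text: str = "Describe this image in detail.") -> str:
    # Re-encode the saved JPEG as a data URL so it can travel inside the chat message
    with open(local_tmp_path, "rb") as f:
        data_url = "data:image/jpeg;base64," + base64.b64encode(f.read()).decode("utf-8")

    # Ask the VLM for a description via the chat-completion task
    response = hf_client.chat_completion(
        model=HF_VLM_MODEL,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": data_url}},
                {"type": "text", "text": prompt_text},
            ],
        }],
        max_tokens=150,  # mirrors max_new_tokens=150 in the committed code
    )
    return (response.choices[0].message.content or "").strip()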
 
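Because the Gradio interface is only referenced by a placeholder comment in this commit, the serving side is presumably still the gr.Interface removed in the diff above (a gr.JSON input expecting an 'image_b64' key, a gr.JSON output labeled "Reply to Jetson", api_name="predict"). Under that assumption, a caller such as the Jetson could reach the endpoint with gradio_client roughly as sketched below; the Space id is a placeholder, not something stated in this commit.

import base64

from gradio_client import Client

# Placeholder Space id; substitute the actual Space serving this app.py
client = Client("OppaAI/your-space-name")

# Build the payload the interface expects: a dict with a base64-encoded image
with open("frame.jpg", "rb") as f:
    payload = {"image_b64": base64.b64encode(f.read()).decode("utf-8")}

# api_name="predict" on the removed gr.Interface maps to the /predict endpoint
reply = client.predict(payload, api_name="/predict")
print(reply)  # the "Reply to Jetson" dict: saved_to_hf_hub plus the fields elided in the diff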