OppaAI committed (verified)
Commit c5129eb · Parent(s): cd798bc

Update app.py

Files changed (1):
  1. app.py +21 -21
app.py CHANGED
@@ -12,7 +12,7 @@ from PIL import Image
 HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 # Model specifically for VLM (image-to-text) tasks on Hugging Face
-HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct" # A suitable VLM model
+HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 
 if not HF_TOKEN:
     raise ValueError("HF_TOKEN environment variable not set.")
@@ -47,29 +47,30 @@ def process_and_describe(payload: dict):
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload["image_b64"]
 
-        # 1️⃣ Save & upload image
+        # 1️⃣ Save & upload image (needed for tracking, but B64 is used for VLM call)
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
 
-        # 2️⃣ Prepare prompt in the Qwen specific format (using Markdown for image embedding)
-        prompt_text = "Describe this image in detail."
-
-        # Base64 encode the image for embedding in the prompt
-        with open(local_tmp_path, "rb") as f:
-            image_b64_encoded_string = base64.b64encode(f.read()).decode("utf-8")
-
-        # The full prompt format required by Qwen, embedded in a chat-like structure for the API
-        full_prompt = f'<img src="data:image/jpeg;base64,{image_b64_encoded_string}"> {prompt_text}'
+        # 2️⃣ Prepare the multimodal message payload for the conversational API
+        messages_payload = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this image in detail."},
+                    # Pass the original Base64 string directly in the required format
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
+                ],
+            }
+        ]
 
-        # 3️⃣ Call VLM using hf_client.text_generation (the preferred method for general LLMs)
-        # This sends the custom prompt string to the model endpoint.
-        vlm_text = hf_client.text_generation(
+        # 3️⃣ Call VLM using hf_client.chat.completions.create (the correct method for the 'conversational' task)
+        chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
-            prompt=full_prompt,
-            max_new_tokens=150,
-            # Other parameters like temperature can be added here if needed
+            messages=messages_payload,
+            max_tokens=150,  # use max_tokens instead of max_new_tokens for this method
         )
 
-        # The response from text_generation is already the cleaned string
+        # Extract the text content from the response object
+        vlm_text = chat_completion.choices[0].message.content.strip()
 
         return {
             "saved_to_hf_hub": True,
@@ -78,7 +79,7 @@ def process_and_describe(payload: dict):
             "image_url": hf_url,
             "file_size_bytes": size_bytes,
             "robot_id": robot_id,
-            "vlm_description": vlm_text.strip()
+            "vlm_description": vlm_text
         }
 
     except Exception as e:
@@ -94,6 +95,5 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    # You will need to install the required libraries:
-    # pip install gradio huggingface_hub Pillow requests
+    # Ensure you have the latest huggingface-hub: pip install --upgrade huggingface-hub Pillow requests
     demo.launch(mcp_server=True)
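For reference, the new calling pattern can be tried standalone. Below is a minimal sketch, assuming hf_client is a huggingface_hub.InferenceClient created with the same token (the client construction is outside this diff) and a local test.jpg stands in for the robot's frame:

import base64
import os

from huggingface_hub import InferenceClient

HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"

# Assumption: app.py constructs its client roughly like this; the diff does not show it.
hf_client = InferenceClient(token=HF_TOKEN)

# Base64-encode a local image the same way the robot payload would arrive.
with open("test.jpg", "rb") as f:  # hypothetical sample image
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

chat_completion = hf_client.chat.completions.create(
    model=HF_VLM_MODEL,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ],
        }
    ],
    max_tokens=150,
)
print(chat_completion.choices[0].message.content.strip())

The OpenAI-style chat.completions.create syntax requires a recent huggingface_hub release, which is why the commit also changes the install hint to pip install --upgrade.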
 
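The diff calls save_and_upload_image() without showing its body. For context, here is a hypothetical reconstruction consistent with its call site (it must return local_tmp_path, hf_url, path_in_repo, size_bytes and push the image to HF_DATASET_REPO); the temp-file handling and the images/ path scheme are assumptions, not the repo's actual code:

import base64
import os
import tempfile
import uuid

from huggingface_hub import upload_file

HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
HF_DATASET_REPO = "OppaAI/Robot_MCP"

def save_and_upload_image(image_b64: str):
    # Decode the incoming Base64 payload to raw JPEG bytes.
    image_bytes = base64.b64decode(image_b64)

    # Write the bytes to a local temp file (kept around for the caller).
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        tmp.write(image_bytes)
        local_tmp_path = tmp.name

    # Assumed layout inside the dataset repo.
    path_in_repo = f"images/{uuid.uuid4().hex}.jpg"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
    )

    # Standard resolve URL for a file in a Hugging Face dataset repo.
    hf_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_url, path_in_repo, len(image_bytes)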
 
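Finally, since demo.launch(mcp_server=True) also serves the function as an ordinary Gradio endpoint, a robot-side caller could look roughly like the sketch below. The Space id and api_name are assumptions (only the dataset repo name appears in this diff), so substitute the real values:

import base64

from gradio_client import Client

# Assumption: the Space shares the dataset's name; replace with the real Space id.
client = Client("OppaAI/Robot_MCP")

# Encode a captured frame exactly as process_and_describe expects it.
with open("frame.jpg", "rb") as f:  # hypothetical camera frame
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"robot_id": "robot_01", "image_b64": image_b64}

# Assumption: the endpoint is named after the wrapped function.
result = client.predict(payload, api_name="/process_and_describe")
print(result["vlm_description"])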