Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -50,34 +50,26 @@ def process_and_describe(payload: dict):
|
|
| 50 |
# 1️⃣ Save & upload image
|
| 51 |
local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
|
| 52 |
|
| 53 |
-
# --- NEW VLM CALL LOGIC ---
|
| 54 |
# 2️⃣ Prepare prompt in the Qwen specific format (using Markdown for image embedding)
|
| 55 |
-
# The API expects an image embedded in the prompt using Markdown syntax.
|
| 56 |
prompt_text = "Describe this image in detail."
|
| 57 |
|
| 58 |
-
# Base64 encode the image for embedding in the
|
| 59 |
with open(local_tmp_path, "rb") as f:
|
| 60 |
image_b64_encoded_string = base64.b64encode(f.read()).decode("utf-8")
|
| 61 |
|
| 62 |
-
# The full prompt format required by Qwen in the API
|
| 63 |
-
full_prompt = f'<img src="data:image/jpeg;base64,{image_b64_encoded_string}">{prompt_text}'
|
| 64 |
|
| 65 |
-
# 3️⃣ Call VLM using hf_client.
|
| 66 |
-
#
|
| 67 |
-
|
| 68 |
-
json={"inputs": full_prompt, "parameters": {"max_new_tokens": 150}},
|
| 69 |
model=HF_VLM_MODEL,
|
| 70 |
-
|
|
|
|
|
|
|
| 71 |
)
|
| 72 |
|
| 73 |
-
# The response
|
| 74 |
-
# Example response format: [{'generated_text': '... description ...'}]
|
| 75 |
-
if isinstance(api_response, list) and len(api_response) > 0:
|
| 76 |
-
vlm_text = api_response[0].get('generated_text', '').strip()
|
| 77 |
-
else:
|
| 78 |
-
vlm_text = "Failed to parse VLM response."
|
| 79 |
-
|
| 80 |
-
# --- END NEW VLM CALL LOGIC ---
|
| 81 |
|
| 82 |
return {
|
| 83 |
"saved_to_hf_hub": True,
|
|
@@ -86,12 +78,12 @@ def process_and_describe(payload: dict):
|
|
| 86 |
"image_url": hf_url,
|
| 87 |
"file_size_bytes": size_bytes,
|
| 88 |
"robot_id": robot_id,
|
| 89 |
-
"vlm_description": vlm_text
|
| 90 |
}
|
| 91 |
|
| 92 |
except Exception as e:
|
| 93 |
-
# Added better error handling
|
| 94 |
-
return {"error": str(e)}
|
| 95 |
|
| 96 |
# --- Gradio MCP Interface ---
|
| 97 |
demo = gr.Interface(
|
|
|
|
| 50 |
# 1️⃣ Save & upload image
|
| 51 |
local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
|
| 52 |
|
|
|
|
| 53 |
# 2️⃣ Prepare prompt in the Qwen specific format (using Markdown for image embedding)
|
|
|
|
| 54 |
prompt_text = "Describe this image in detail."
|
| 55 |
|
| 56 |
+
# Base64 encode the image for embedding in the prompt
|
| 57 |
with open(local_tmp_path, "rb") as f:
|
| 58 |
image_b64_encoded_string = base64.b64encode(f.read()).decode("utf-8")
|
| 59 |
|
| 60 |
+
# Build the full prompt as one string: the image as a base64 data URI inside an <img> tag (MIME type hard-coded to image/jpeg), followed by the instruction text
|
| 61 |
+
full_prompt = f'<img src="data:image/jpeg;base64,{image_b64_encoded_string}"> {prompt_text}'
|
| 62 |
|
| 63 |
+
# 3️⃣ Call VLM using hf_client.text_generation (the preferred method for general LLMs)
|
| 64 |
+
# This sends the custom prompt string to the model endpoint.
|
| 65 |
+
vlm_text = hf_client.text_generation(
|
|
|
|
| 66 |
model=HF_VLM_MODEL,
|
| 67 |
+
prompt=full_prompt,
|
| 68 |
+
max_new_tokens=150,
|
| 69 |
+
# Other parameters like temperature can be added here if needed
|
| 70 |
)
|
| 71 |
|
| 72 |
+
# The value returned by text_generation is used directly as the description; surrounding whitespace is stripped when the result dict is built below
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
return {
|
| 75 |
"saved_to_hf_hub": True,
|
|
|
|
| 78 |
"image_url": hf_url,
|
| 79 |
"file_size_bytes": size_bytes,
|
| 80 |
"robot_id": robot_id,
|
| 81 |
+
"vlm_description": vlm_text.strip()
|
| 82 |
}
|
| 83 |
|
| 84 |
except Exception as e:
|
| 85 |
+
# Report any failure to the caller as an error payload instead of letting the exception propagate
|
| 86 |
+
return {"error": f"An API error occurred: {str(e)}"}
|
| 87 |
|
| 88 |
# --- Gradio MCP Interface ---
|
| 89 |
demo = gr.Interface(
|