Spaces:
Runtime error
change label format
Browse files
app.py
CHANGED
|
@@ -57,9 +57,11 @@ import spaces
|
|
| 57 |
@spaces.GPU
|
| 58 |
def process_image_and_text(image, text):
|
| 59 |
"""Process image and text input, return thinking process and bbox"""
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
-
question = question +
|
|
|
|
| 63 |
|
| 64 |
messages = [
|
| 65 |
{
|
|
@@ -116,7 +118,7 @@ def process_image_and_text(image, text):
|
|
| 116 |
if __name__ == "__main__":
|
| 117 |
import gradio as gr
|
| 118 |
|
| 119 |
-
model_path = "
|
| 120 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 121 |
device = "cuda"
|
| 122 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
|
|
@@ -131,7 +133,7 @@ if __name__ == "__main__":
|
|
| 131 |
fn=gradio_interface,
|
| 132 |
inputs=[
|
| 133 |
gr.Image(type="pil", label="Input Image"),
|
| 134 |
-
gr.Textbox(label="
|
| 135 |
],
|
| 136 |
outputs=[
|
| 137 |
gr.Textbox(label="Thinking Process"),
|
|
@@ -142,8 +144,8 @@ if __name__ == "__main__":
|
|
| 142 |
description="Upload an image and input description text, the system will return the thinking process and region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
|
| 143 |
examples=[
|
| 144 |
["examples/image1.jpg", "person"],
|
| 145 |
-
["examples/image2.jpg", "drink,
|
| 146 |
-
["examples/image3.png", "keyboard,
|
| 147 |
],
|
| 148 |
cache_examples=False,
|
| 149 |
examples_per_page=10
|
|
|
|
| 57 |
@spaces.GPU
|
| 58 |
def process_image_and_text(image, text):
|
| 59 |
"""Process image and text input, return thinking process and bbox"""
|
| 60 |
+
labels = text.split(",")
|
| 61 |
+
question = f"First thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. Please carefully check the image and detect the following objects: {labels}. "
|
| 62 |
|
| 63 |
+
question = question + "Output the bbox coordinates of detected objects in <answer></answer>. The bbox coordinates in Markdown format should be: \n```json\n[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n If no targets are detected in the image, simply respond with \"None\"."
|
| 64 |
+
print("question: ", question)
|
| 65 |
|
| 66 |
messages = [
|
| 67 |
{
|
|
|
|
| 118 |
if __name__ == "__main__":
|
| 119 |
import gradio as gr
|
| 120 |
|
| 121 |
+
model_path = "omlab/VLM-R1-Qwen2.5VL-3B-OVD-0321"
|
| 122 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 123 |
device = "cuda"
|
| 124 |
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
|
|
|
|
| 133 |
fn=gradio_interface,
|
| 134 |
inputs=[
|
| 135 |
gr.Image(type="pil", label="Input Image"),
|
| 136 |
+
gr.Textbox(label="Objects to detect (separated by ,)")
|
| 137 |
],
|
| 138 |
outputs=[
|
| 139 |
gr.Textbox(label="Thinking Process"),
|
|
|
|
| 144 |
description="Upload an image and input description text, the system will return the thinking process and region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
|
| 145 |
examples=[
|
| 146 |
["examples/image1.jpg", "person"],
|
| 147 |
+
["examples/image2.jpg", "drink,fruit"],
|
| 148 |
+
["examples/image3.png", "keyboard,white cup,laptop"],
|
| 149 |
],
|
| 150 |
cache_examples=False,
|
| 151 |
examples_per_page=10
|