Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -8,6 +8,7 @@ from threading import Thread
 import base64
 from io import BytesIO
 import re
+from typing import Literal
 
 import gradio as gr
 import spaces
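The new `typing.Literal` import is consumed further down by `run_moondream`, whose `mode` parameter is restricted to two string values. As a minimal illustration of the pattern (the function below is hypothetical, not part of this commit):

from typing import Literal

def describe(mode: Literal["point", "object_detection"]) -> str:
    # A static type checker rejects any call site passing a value other than
    # these two string literals; at runtime, mode is an ordinary str.
    return "points (x, y)" if mode == "point" else "boxes (xmin, ymin, xmax, ymax)"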
@@ -22,7 +23,6 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForCausalLM,
-    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -69,17 +69,17 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load
-
-
-
-
-
-
-
-)
+# Load moondream2
+MODEL_ID_MD = "vikhyatk/moondream2"
+model_md = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_MD,
+    revision="2025-06-21",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+).to(device).eval()
 
-
+
+# Helper functions for object detection and drawing
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
     buffered = BytesIO()
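This hunk replaces the previous on-demand loader with a single eager load of moondream2 at startup, pinned to the 2025-06-21 revision. A quick smoke-test sketch of the loaded model, assuming the dict-shaped, normalized-coordinate results documented on the vikhyatk/moondream2 model card (result keys may differ between revisions):

from PIL import Image

# Hypothetical smoke test; images/11.png is one of the example images in this Space.
img = Image.open("images/11.png").convert("RGB")

det = model_md.detect(img, "green car")  # expected: {"objects": [{"x_min": ..., "y_min": ..., "x_max": ..., "y_max": ...}]}
pts = model_md.point(img, "green car")   # expected: {"points": [{"x": ..., "y": ...}]}
print(det.get("objects"), pts.get("points"))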
@@ -95,12 +95,12 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-def draw_points(image, points, color="
-    """Draw points on an image."""
+def draw_points(image, points, color="lime", radius=10):
+    """Draw points (circles) on an image."""
     draw = ImageDraw.Draw(image)
     for point in points:
         x, y = point
-        draw.ellipse(
+        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
     return image
 
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
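A short usage sketch for the reworked `draw_points` helper. It expects absolute pixel coordinates, so normalized model outputs must be rescaled first, as `run_moondream` does later in the diff:

from PIL import Image

img = Image.new("RGB", (640, 480), "white")         # stand-in image
marked = draw_points(img, [(120, 80), (300, 210)])  # pixel-space (x, y) pairs
marked.save("points_preview.png")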
@@ -127,11 +127,11 @@ default_system_prompt = (
     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 )
 
-# Function for object detection
+# Function for ViLaSR object detection
 @spaces.GPU
 def run_example(image, text_input, system_prompt):
     """Detect objects in an image and return bounding box annotations."""
-    model = model_x
+    model = model_x
     processor = processor_x
 
     messages = [
@@ -172,6 +172,39 @@ def run_example(image, text_input, system_prompt):
     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
     return output_text[0], str(parsed_boxes), annotated_image
 
+# Function for Moondream object pointing/detection
+@spaces.GPU
+def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
+    """
+    Open Vocabulary Detection/Pointing using moondream2.
+    """
+    if image is None:
+        return "Please upload an image.", None
+
+    original_width, original_height = image.size
+    annotated_image = image.copy()
+    json_output = {}
+
+    if mode == "point":
+        result = model_md.point(im=image, prompt=prompt)
+        points = result.get("points", [])
+        json_output = result
+        if points:
+            rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
+            annotated_image = draw_points(annotated_image, rescaled_points)
+
+    elif mode == "object_detection":
+        result = model_md.detect(im=image, prompt=prompt)
+        boxes = result.get("objects", [])
+        json_output = result
+        if boxes:
+            rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
+            annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
+    else:
+        return "Invalid mode selected.", None
+
+    return json_output, annotated_image
+
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
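One caveat worth flagging: `run_moondream` indexes points and boxes positionally (`p[0]`, `b[0]`), whereas recent moondream2 revisions document dict-shaped results ({"x": ...} for points, {"x_min": ...} for boxes). If the pinned revision returns dicts, a small adapter such as the sketch below (an assumption-labeled addition, not part of the commit) keeps the rescaling code working either way:

def as_xy(p):
    """Normalize a point to an (x, y) pair, accepting dicts or sequences."""
    return (p["x"], p["y"]) if isinstance(p, dict) else (p[0], p[1])

def as_xyxy(b):
    """Normalize a box to (xmin, ymin, xmax, ymax), accepting dicts or sequences."""
    if isinstance(b, dict):
        return (b["x_min"], b["y_min"], b["x_max"], b["y_max"])
    return (b[0], b[1], b[2], b[3])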
@@ -316,34 +349,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-#
-@spaces.GPU
-def detect_moondream(im: Image.Image, object_name: str, mode: str):
-    """
-    Open Vocabulary Detection using moondream2
-    Args:
-        im: Pillow Image
-        object_name: the object you would like to detect
-        mode: point or object_detection
-    Returns:
-        list: a list of bounding boxes (xyxy) or points (xy) coordinates that are normalized
-        annotated_image: Image with detections drawn
-    """
-    model = load_moondream_model()
-    if mode == "point":
-        points = model.point(im, object_name)["points"]
-        annotated_image = draw_points(im.copy(), points)
-        return points, annotated_image
-    elif mode == "object_detection":
-        boxes = model.detect(im, object_name)["objects"]
-        annotated_image = draw_bounding_boxes(im.copy(), boxes)
-        return boxes, annotated_image
-
-# Define examples for image and video inference
+# Define examples
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 
@@ -352,18 +362,11 @@ video_examples = [
     ["explain the video in detail.", "videos/2.mp4"]
 ]
 
-# Define examples for object detection
 object_detection_examples = [
     ["Detect Spider-Man T-shirt.", "images/22.png"],
    ["Detect Green Car.", "images/11.png"]
 ]
 
-# Define examples for Moondream Vision
-moondream_examples = [
-    ["Spider-Man T-shirt", "images/22.png", "point"],
-    ["Green Car", "images/11.png", "object_detection"]
-]
-
 # Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
@@ -423,27 +426,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     inputs=[input_img, text_input, system_prompt],
                     outputs=[model_output_text, parsed_boxes, annotated_image]
                 )
-
+        # NEW MOONDREAM TAB
+        with gr.TabItem("moondream-vision"):
+            gr.Markdown("## Moondream Vision: Object Pointing & Detection")
             with gr.Row():
                 with gr.Column():
-
-
-
-
-                    gr.Examples(
-                        examples=moondream_examples,
-                        inputs=[moon_object, moon_image, moon_mode]
-                    )
+                    moondream_input_img = gr.Image(label="Input Image", type="pil")
+                    moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
+                    moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
+                    moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
                 with gr.Column():
-
-
+                    moondream_json_output = gr.JSON(label="Output JSON")
+                    moondream_annotated_image = gr.Image(label="Detection Result")
 
-
-                fn=
-                inputs=[
-                outputs=[
+            moondream_submit_btn.click(
+                fn=run_moondream,
+                inputs=[moondream_input_img, moondream_text_input, moondream_mode],
+                outputs=[moondream_json_output, moondream_annotated_image]
             )
 
+
     with gr.Accordion("Advanced options", open=False):
         max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
         temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
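The new tab follows the standard Gradio Blocks pattern: declare components inside layout contexts, then connect them to the callback with `.click()`. A stripped-down, self-contained sketch of the same wiring (all names here are illustrative, not the Space's own):

import gradio as gr

def fake_run(img, prompt, mode):
    # Placeholder with the same (image, text, mode) -> (json, image) contract as run_moondream.
    return {"prompt": prompt, "mode": mode}, img

with gr.Blocks() as demo:
    with gr.Tab("moondream-vision"):
        with gr.Row():
            with gr.Column():
                in_img = gr.Image(type="pil")
                in_txt = gr.Textbox(label="Object to Detect")
                in_mode = gr.Dropdown(choices=["point", "object_detection"], value="object_detection")
                btn = gr.Button("Submit")
            with gr.Column():
                out_json = gr.JSON(label="Output JSON")
                out_img = gr.Image(label="Detection Result")
        btn.click(fn=fake_run, inputs=[in_img, in_txt, in_mode], outputs=[out_json, out_img])

demo.launch()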
@@ -456,7 +458,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("## Result.Md")
     output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
 
-    with gr.Accordion("Formatted Result (Result.Md)", open=False):
+    with gr.Accordion("Formatted Result (Result.Md)", open=False):
         markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
     model_choice = gr.Radio(
@@ -470,8 +472,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],