Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,6 @@ from threading import Thread
|
|
| 8 |
import base64
|
| 9 |
from io import BytesIO
|
| 10 |
import re
|
| 11 |
-
from typing import Literal
|
| 12 |
|
| 13 |
import gradio as gr
|
| 14 |
import spaces
|
|
@@ -23,6 +22,7 @@ from transformers import (
|
|
| 23 |
AutoProcessor,
|
| 24 |
TextIteratorStreamer,
|
| 25 |
AutoModelForCausalLM,
|
|
|
|
| 26 |
)
|
| 27 |
from qwen_vl_utils import process_vision_info
|
| 28 |
|
|
@@ -71,15 +71,15 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
|
| 71 |
|
| 72 |
# Load moondream2
|
| 73 |
MODEL_ID_MD = "vikhyatk/moondream2"
|
|
|
|
| 74 |
model_md = AutoModelForCausalLM.from_pretrained(
|
| 75 |
MODEL_ID_MD,
|
| 76 |
revision="2025-06-21",
|
| 77 |
trust_remote_code=True,
|
| 78 |
-
torch_dtype=torch.float16
|
| 79 |
).to(device).eval()
|
| 80 |
|
| 81 |
-
|
| 82 |
-
# Helper functions for object detection and drawing
|
| 83 |
def image_to_base64(image):
|
| 84 |
"""Convert a PIL image to a base64-encoded string."""
|
| 85 |
buffered = BytesIO()
|
|
@@ -95,14 +95,6 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
|
|
| 95 |
draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
|
| 96 |
return image
|
| 97 |
|
| 98 |
-
def draw_points(image, points, color="lime", radius=10):
|
| 99 |
-
"""Draw points (circles) on an image."""
|
| 100 |
-
draw = ImageDraw.Draw(image)
|
| 101 |
-
for point in points:
|
| 102 |
-
x, y = point
|
| 103 |
-
draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
|
| 104 |
-
return image
|
| 105 |
-
|
| 106 |
def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
|
| 107 |
"""Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
|
| 108 |
x_scale = original_width / scaled_width
|
|
@@ -127,11 +119,11 @@ default_system_prompt = (
|
|
| 127 |
"of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
|
| 128 |
)
|
| 129 |
|
| 130 |
-
# Function for
|
| 131 |
@spaces.GPU
|
| 132 |
def run_example(image, text_input, system_prompt):
|
| 133 |
"""Detect objects in an image and return bounding box annotations."""
|
| 134 |
-
model = model_x
|
| 135 |
processor = processor_x
|
| 136 |
|
| 137 |
messages = [
|
|
@@ -172,41 +164,6 @@ def run_example(image, text_input, system_prompt):
|
|
| 172 |
annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
|
| 173 |
return output_text[0], str(parsed_boxes), annotated_image
|
| 174 |
|
| 175 |
-
# Function for Moondream object pointing/detection
|
| 176 |
-
@spaces.GPU
|
| 177 |
-
def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
|
| 178 |
-
"""
|
| 179 |
-
Open Vocabulary Detection/Pointing using moondream2.
|
| 180 |
-
"""
|
| 181 |
-
if image is None:
|
| 182 |
-
return "Please upload an image.", None
|
| 183 |
-
|
| 184 |
-
original_width, original_height = image.size
|
| 185 |
-
annotated_image = image.copy()
|
| 186 |
-
json_output = {}
|
| 187 |
-
|
| 188 |
-
if mode == "point":
|
| 189 |
-
# FIX: Changed 'im' to 'image'
|
| 190 |
-
result = model_md.point(image=image, prompt=prompt)
|
| 191 |
-
points = result.get("points", [])
|
| 192 |
-
json_output = result
|
| 193 |
-
if points:
|
| 194 |
-
rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
|
| 195 |
-
annotated_image = draw_points(annotated_image, rescaled_points)
|
| 196 |
-
|
| 197 |
-
elif mode == "object_detection":
|
| 198 |
-
# FIX: Changed 'im' to 'image'
|
| 199 |
-
result = model_md.detect(image=image, prompt=prompt)
|
| 200 |
-
boxes = result.get("objects", [])
|
| 201 |
-
json_output = result
|
| 202 |
-
if boxes:
|
| 203 |
-
rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
|
| 204 |
-
annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
|
| 205 |
-
else:
|
| 206 |
-
return "Invalid mode selected.", None
|
| 207 |
-
|
| 208 |
-
return json_output, annotated_image
|
| 209 |
-
|
| 210 |
def downsample_video(video_path):
|
| 211 |
"""
|
| 212 |
Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
|
|
@@ -249,6 +206,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
|
|
| 249 |
elif model_name == "ShotVL-7B":
|
| 250 |
processor = processor_s
|
| 251 |
model = model_s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
else:
|
| 253 |
yield "Invalid model selected.", "Invalid model selected."
|
| 254 |
return
|
|
@@ -305,6 +281,31 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 305 |
elif model_name == "ShotVL-7B":
|
| 306 |
processor = processor_s
|
| 307 |
model = model_s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
else:
|
| 309 |
yield "Invalid model selected.", "Invalid model selected."
|
| 310 |
return
|
|
@@ -351,11 +352,11 @@ def generate_video(model_name: str, text: str, video_path: str,
|
|
| 351 |
time.sleep(0.01)
|
| 352 |
yield buffer, buffer
|
| 353 |
|
| 354 |
-
# Define examples
|
| 355 |
image_examples = [
|
| 356 |
["convert this page to doc [text] precisely for markdown.", "images/1.png"],
|
| 357 |
["convert this page to doc [table] precisely for markdown.", "images/2.png"],
|
| 358 |
-
["explain the movie shot in detail.", "images/3.png"],
|
| 359 |
["fill the correct numbers.", "images/4.png"]
|
| 360 |
]
|
| 361 |
|
|
@@ -364,8 +365,9 @@ video_examples = [
|
|
| 364 |
["explain the video in detail.", "videos/2.mp4"]
|
| 365 |
]
|
| 366 |
|
|
|
|
| 367 |
object_detection_examples = [
|
| 368 |
-
["Detect Spider-Man T-shirt.", "images/22.png"],
|
| 369 |
["Detect Green Car.", "images/11.png"]
|
| 370 |
]
|
| 371 |
|
|
@@ -428,25 +430,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 428 |
inputs=[input_img, text_input, system_prompt],
|
| 429 |
outputs=[model_output_text, parsed_boxes, annotated_image]
|
| 430 |
)
|
| 431 |
-
# NEW MOONDREAM TAB
|
| 432 |
-
with gr.TabItem("moondream-vision"):
|
| 433 |
-
gr.Markdown("## Moondream Vision: Object Pointing & Detection")
|
| 434 |
-
with gr.Row():
|
| 435 |
-
with gr.Column():
|
| 436 |
-
moondream_input_img = gr.Image(label="Input Image", type="pil")
|
| 437 |
-
moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
|
| 438 |
-
moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
|
| 439 |
-
moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
|
| 440 |
-
with gr.Column():
|
| 441 |
-
moondream_json_output = gr.JSON(label="Output JSON")
|
| 442 |
-
moondream_annotated_image = gr.Image(label="Detection Result")
|
| 443 |
-
|
| 444 |
-
moondream_submit_btn.click(
|
| 445 |
-
fn=run_moondream,
|
| 446 |
-
inputs=[moondream_input_img, moondream_text_input, moondream_mode],
|
| 447 |
-
outputs=[moondream_json_output, moondream_annotated_image]
|
| 448 |
-
)
|
| 449 |
-
|
| 450 |
|
| 451 |
with gr.Accordion("Advanced options", open=False):
|
| 452 |
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
|
|
@@ -460,22 +443,23 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 460 |
gr.Markdown("## Result.Md")
|
| 461 |
output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
|
| 462 |
|
| 463 |
-
with gr.Accordion("Formatted Result (Result.Md)", open=False):
|
| 464 |
markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
|
| 465 |
|
| 466 |
model_choice = gr.Radio(
|
| 467 |
-
choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"],
|
| 468 |
label="Select Model",
|
| 469 |
value="Camel-Doc-OCR-062825"
|
| 470 |
)
|
| 471 |
|
| 472 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
|
| 473 |
gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
|
| 474 |
-
gr.Markdown("> [OCRFlux-3B](https://
|
| 475 |
gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
|
| 476 |
gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
|
| 477 |
-
gr.Markdown("
|
| 478 |
-
|
|
|
|
| 479 |
image_submit.click(
|
| 480 |
fn=generate_image,
|
| 481 |
inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|
|
|
|
| 8 |
import base64
|
| 9 |
from io import BytesIO
|
| 10 |
import re
|
|
|
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import spaces
|
|
|
|
| 22 |
AutoProcessor,
|
| 23 |
TextIteratorStreamer,
|
| 24 |
AutoModelForCausalLM,
|
| 25 |
+
AutoTokenizer
|
| 26 |
)
|
| 27 |
from qwen_vl_utils import process_vision_info
|
| 28 |
|
|
|
|
| 71 |
|
| 72 |
# Load moondream2
|
| 73 |
MODEL_ID_MD = "vikhyatk/moondream2"
|
| 74 |
+
tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
|
| 75 |
model_md = AutoModelForCausalLM.from_pretrained(
|
| 76 |
MODEL_ID_MD,
|
| 77 |
revision="2025-06-21",
|
| 78 |
trust_remote_code=True,
|
| 79 |
+
torch_dtype=torch.float16
|
| 80 |
).to(device).eval()
|
| 81 |
|
| 82 |
+
# Helper functions for object detection
|
|
|
|
| 83 |
def image_to_base64(image):
|
| 84 |
"""Convert a PIL image to a base64-encoded string."""
|
| 85 |
buffered = BytesIO()
|
|
|
|
| 95 |
draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
|
| 96 |
return image
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
|
| 99 |
"""Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
|
| 100 |
x_scale = original_width / scaled_width
|
|
|
|
| 119 |
"of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
|
| 120 |
)
|
| 121 |
|
| 122 |
+
# Function for object detection
|
| 123 |
@spaces.GPU
|
| 124 |
def run_example(image, text_input, system_prompt):
|
| 125 |
"""Detect objects in an image and return bounding box annotations."""
|
| 126 |
+
model = model_x
|
| 127 |
processor = processor_x
|
| 128 |
|
| 129 |
messages = [
|
|
|
|
| 164 |
annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
|
| 165 |
return output_text[0], str(parsed_boxes), annotated_image
|
| 166 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
def downsample_video(video_path):
|
| 168 |
"""
|
| 169 |
Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
|
|
|
|
| 206 |
elif model_name == "ShotVL-7B":
|
| 207 |
processor = processor_s
|
| 208 |
model = model_s
|
| 209 |
+
elif model_name == "moondream2":
|
| 210 |
+
model = model_md
|
| 211 |
+
tokenizer = tokenizer_md
|
| 212 |
+
image_embeds = model.encode_image(image)
|
| 213 |
+
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 214 |
+
thread = Thread(target=model.answer_question, kwargs={
|
| 215 |
+
"image_embeds": image_embeds,
|
| 216 |
+
"question": text,
|
| 217 |
+
"tokenizer": tokenizer,
|
| 218 |
+
"max_new_tokens": max_new_tokens,
|
| 219 |
+
"streamer": streamer,
|
| 220 |
+
})
|
| 221 |
+
thread.start()
|
| 222 |
+
buffer = ""
|
| 223 |
+
for new_text in streamer:
|
| 224 |
+
buffer += new_text
|
| 225 |
+
time.sleep(0.01)
|
| 226 |
+
yield buffer, buffer
|
| 227 |
+
return
|
| 228 |
else:
|
| 229 |
yield "Invalid model selected.", "Invalid model selected."
|
| 230 |
return
|
|
|
|
| 281 |
elif model_name == "ShotVL-7B":
|
| 282 |
processor = processor_s
|
| 283 |
model = model_s
|
| 284 |
+
elif model_name == "moondream2":
|
| 285 |
+
model = model_md
|
| 286 |
+
tokenizer = tokenizer_md
|
| 287 |
+
frames = downsample_video(video_path)
|
| 288 |
+
buffer = ""
|
| 289 |
+
for frame in frames:
|
| 290 |
+
image, timestamp = frame
|
| 291 |
+
image_embeds = model.encode_image(image)
|
| 292 |
+
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 293 |
+
thread = Thread(target=model.answer_question, kwargs={
|
| 294 |
+
"image_embeds": image_embeds,
|
| 295 |
+
"question": text,
|
| 296 |
+
"tokenizer": tokenizer,
|
| 297 |
+
"max_new_tokens": max_new_tokens,
|
| 298 |
+
"streamer": streamer,
|
| 299 |
+
})
|
| 300 |
+
thread.start()
|
| 301 |
+
frame_buffer = f"Frame {timestamp}:\n"
|
| 302 |
+
for new_text in streamer:
|
| 303 |
+
frame_buffer += new_text
|
| 304 |
+
buffer += new_text
|
| 305 |
+
time.sleep(0.01)
|
| 306 |
+
yield buffer, buffer
|
| 307 |
+
buffer += "\n\n"
|
| 308 |
+
return
|
| 309 |
else:
|
| 310 |
yield "Invalid model selected.", "Invalid model selected."
|
| 311 |
return
|
|
|
|
| 352 |
time.sleep(0.01)
|
| 353 |
yield buffer, buffer
|
| 354 |
|
| 355 |
+
# Define examples for image and video inference
|
| 356 |
image_examples = [
|
| 357 |
["convert this page to doc [text] precisely for markdown.", "images/1.png"],
|
| 358 |
["convert this page to doc [table] precisely for markdown.", "images/2.png"],
|
| 359 |
+
["explain the movie shot in detail.", "images/3.png"],
|
| 360 |
["fill the correct numbers.", "images/4.png"]
|
| 361 |
]
|
| 362 |
|
|
|
|
| 365 |
["explain the video in detail.", "videos/2.mp4"]
|
| 366 |
]
|
| 367 |
|
| 368 |
+
# Define examples for object detection
|
| 369 |
object_detection_examples = [
|
| 370 |
+
["Detect Spider-Man T-shirt.", "images/22.png"],
|
| 371 |
["Detect Green Car.", "images/11.png"]
|
| 372 |
]
|
| 373 |
|
|
|
|
| 430 |
inputs=[input_img, text_input, system_prompt],
|
| 431 |
outputs=[model_output_text, parsed_boxes, annotated_image]
|
| 432 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
| 434 |
with gr.Accordion("Advanced options", open=False):
|
| 435 |
max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
|
|
|
|
| 443 |
gr.Markdown("## Result.Md")
|
| 444 |
output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
|
| 445 |
|
| 446 |
+
with gr.Accordion("Formatted Result (Result.Md)", open=False):
|
| 447 |
markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
|
| 448 |
|
| 449 |
model_choice = gr.Radio(
|
| 450 |
+
choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B", "moondream2"],
|
| 451 |
label="Select Model",
|
| 452 |
value="Camel-Doc-OCR-062825"
|
| 453 |
)
|
| 454 |
|
| 455 |
gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
|
| 456 |
gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
|
| 457 |
+
gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
|
| 458 |
gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
|
| 459 |
gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
|
| 460 |
+
gr.Markdown("> [moondream2](https://huggingface.co/vikhyatk/moondream2) : A small vision language model that can be run on edge devices. Capable of captioning, visual querying, object detection, and more.")
|
| 461 |
+
gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
|
| 462 |
+
|
| 463 |
image_submit.click(
|
| 464 |
fn=generate_image,
|
| 465 |
inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
|