Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -8,6 +8,7 @@ from threading import Thread
 import base64
 from io import BytesIO
 import re
+from typing import Literal
 
 import gradio as gr
 import spaces
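The new `typing.Literal` import is consumed further down by `run_moondream`, whose `mode` parameter is restricted to two string values. As a minimal illustration of the pattern (the function below is hypothetical, not part of this commit):

from typing import Literal

def describe(mode: Literal["point", "object_detection"]) -> str:
    # A static type checker rejects any call site passing a value other than
    # these two string literals; at runtime, mode is an ordinary str.
    return "points (x, y)" if mode == "point" else "boxes (xmin, ymin, xmax, ymax)"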
@@ -22,7 +23,6 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForCausalLM,
-    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -69,17 +69,17 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load
-
-
-
-
-
-
-
-)
+# Load moondream2
+MODEL_ID_MD = "vikhyatk/moondream2"
+model_md = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_MD,
+    revision="2025-06-21",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+).to(device).eval()
 
-
+
+# Helper functions for object detection and drawing
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
     buffered = BytesIO()
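This hunk replaces the previous on-demand loader with a single eager load of moondream2 at startup, pinned to the 2025-06-21 revision. A quick smoke-test sketch of the loaded model, assuming the dict-shaped, normalized-coordinate results documented on the vikhyatk/moondream2 model card (result keys may differ between revisions):

from PIL import Image

# Hypothetical smoke test; images/11.png is one of the example images in this Space.
img = Image.open("images/11.png").convert("RGB")

det = model_md.detect(img, "green car")  # expected: {"objects": [{"x_min": ..., "y_min": ..., "x_max": ..., "y_max": ...}]}
pts = model_md.point(img, "green car")   # expected: {"points": [{"x": ..., "y": ...}]}
print(det.get("objects"), pts.get("points"))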
@@ -95,12 +95,12 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-def draw_points(image, points, color="
-    """Draw points on an image."""
+def draw_points(image, points, color="lime", radius=10):
+    """Draw points (circles) on an image."""
     draw = ImageDraw.Draw(image)
     for point in points:
         x, y = point
-        draw.ellipse(
+        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
     return image
 
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
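A short usage sketch for the reworked `draw_points` helper. It expects absolute pixel coordinates, so normalized model outputs must be rescaled first, as `run_moondream` does later in the diff:

from PIL import Image

img = Image.new("RGB", (640, 480), "white")         # stand-in image
marked = draw_points(img, [(120, 80), (300, 210)])  # pixel-space (x, y) pairs
marked.save("points_preview.png")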
@@ -127,11 +127,11 @@ default_system_prompt = (
     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 )
 
-# Function for object detection
+# Function for ViLaSR object detection
 @spaces.GPU
 def run_example(image, text_input, system_prompt):
     """Detect objects in an image and return bounding box annotations."""
-    model = model_x
+    model = model_x
     processor = processor_x
 
     messages = [
@@ -172,6 +172,39 @@ def run_example(image, text_input, system_prompt):
     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
     return output_text[0], str(parsed_boxes), annotated_image
 
+# Function for Moondream object pointing/detection
+@spaces.GPU
+def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
+    """
+    Open Vocabulary Detection/Pointing using moondream2.
+    """
+    if image is None:
+        return "Please upload an image.", None
+
+    original_width, original_height = image.size
+    annotated_image = image.copy()
+    json_output = {}
+
+    if mode == "point":
+        result = model_md.point(im=image, prompt=prompt)
+        points = result.get("points", [])
+        json_output = result
+        if points:
+            rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
+            annotated_image = draw_points(annotated_image, rescaled_points)
+
+    elif mode == "object_detection":
+        result = model_md.detect(im=image, prompt=prompt)
+        boxes = result.get("objects", [])
+        json_output = result
+        if boxes:
+            rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
+            annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
+    else:
+        return "Invalid mode selected.", None
+
+    return json_output, annotated_image
+
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
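One caveat worth flagging: `run_moondream` indexes points and boxes positionally (`p[0]`, `b[0]`), whereas recent moondream2 revisions document dict-shaped results ({"x": ...} for points, {"x_min": ...} for boxes). If the pinned revision returns dicts, a small adapter such as the sketch below (an assumption-labeled addition, not part of the commit) keeps the rescaling code working either way:

def as_xy(p):
    """Normalize a point to an (x, y) pair, accepting dicts or sequences."""
    return (p["x"], p["y"]) if isinstance(p, dict) else (p[0], p[1])

def as_xyxy(b):
    """Normalize a box to (xmin, ymin, xmax, ymax), accepting dicts or sequences."""
    if isinstance(b, dict):
        return (b["x_min"], b["y_min"], b["x_max"], b["y_max"])
    return (b[0], b[1], b[2], b[3])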
@@ -316,34 +349,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-#
-@spaces.GPU
-def detect_moondream(im: Image.Image, object_name: str, mode: str):
-    """
-    Open Vocabulary Detection using moondream2
-    Args:
-        im: Pillow Image
-        object_name: the object you would like to detect
-        mode: point or object_detection
-    Returns:
-        list: a list of bounding boxes (xyxy) or points (xy) coordinates that are normalized
-        annotated_image: Image with detections drawn
-    """
-    model = load_moondream_model()
-    if mode == "point":
-        points = model.point(im, object_name)["points"]
-        annotated_image = draw_points(im.copy(), points)
-        return points, annotated_image
-    elif mode == "object_detection":
-        boxes = model.detect(im, object_name)["objects"]
-        annotated_image = draw_bounding_boxes(im.copy(), boxes)
-        return boxes, annotated_image
-
-# Define examples for image and video inference
+# Define examples
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 
@@ -352,18 +362,11 @@ video_examples = [
     ["explain the video in detail.", "videos/2.mp4"]
 ]
 
-# Define examples for object detection
 object_detection_examples = [
     ["Detect Spider-Man T-shirt.", "images/22.png"],
    ["Detect Green Car.", "images/11.png"]
 ]
 
-# Define examples for Moondream Vision
-moondream_examples = [
-    ["Spider-Man T-shirt", "images/22.png", "point"],
-    ["Green Car", "images/11.png", "object_detection"]
-]
-
 # Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
@@ -423,27 +426,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     inputs=[input_img, text_input, system_prompt],
                     outputs=[model_output_text, parsed_boxes, annotated_image]
                 )
-
+        # NEW MOONDREAM TAB
+        with gr.TabItem("moondream-vision"):
+            gr.Markdown("## Moondream Vision: Object Pointing & Detection")
             with gr.Row():
                 with gr.Column():
-
-
-
-
-                    gr.Examples(
-                        examples=moondream_examples,
-                        inputs=[moon_object, moon_image, moon_mode]
-                    )
+                    moondream_input_img = gr.Image(label="Input Image", type="pil")
+                    moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
+                    moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
+                    moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
                 with gr.Column():
-
-
+                    moondream_json_output = gr.JSON(label="Output JSON")
+                    moondream_annotated_image = gr.Image(label="Detection Result")
 
-
-                fn=
-                inputs=[
-                outputs=[
+            moondream_submit_btn.click(
+                fn=run_moondream,
+                inputs=[moondream_input_img, moondream_text_input, moondream_mode],
+                outputs=[moondream_json_output, moondream_annotated_image]
             )
 
+
     with gr.Accordion("Advanced options", open=False):
         max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
         temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
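The new tab follows the standard Gradio Blocks pattern: declare components inside layout contexts, then connect them to the callback with `.click()`. A stripped-down, self-contained sketch of the same wiring (all names here are illustrative, not the Space's own):

import gradio as gr

def fake_run(img, prompt, mode):
    # Placeholder with the same (image, text, mode) -> (json, image) contract as run_moondream.
    return {"prompt": prompt, "mode": mode}, img

with gr.Blocks() as demo:
    with gr.Tab("moondream-vision"):
        with gr.Row():
            with gr.Column():
                in_img = gr.Image(type="pil")
                in_txt = gr.Textbox(label="Object to Detect")
                in_mode = gr.Dropdown(choices=["point", "object_detection"], value="object_detection")
                btn = gr.Button("Submit")
            with gr.Column():
                out_json = gr.JSON(label="Output JSON")
                out_img = gr.Image(label="Detection Result")
        btn.click(fn=fake_run, inputs=[in_img, in_txt, in_mode], outputs=[out_json, out_img])

demo.launch()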
@@ -456,7 +458,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("## Result.Md")
     output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
 
-    with gr.Accordion("Formatted Result (Result.Md)", open=False):
+    with gr.Accordion("Formatted Result (Result.Md)", open=False):
         markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
     model_choice = gr.Radio(
@@ -470,8 +472,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],