prithivMLmods committed on
Commit 7480fb9 · verified · 1 Parent(s): 1719f16

Update app.py

Files changed (1)
  1. app.py +47 -181
app.py CHANGED
@@ -1,17 +1,13 @@
 import spaces
 import json
-import math
 import os
 import traceback
 from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Dict
 import re
 import time
 from threading import Thread
-from io import BytesIO
-import uuid
 import tempfile
-import cv2
 
 import gradio as gr
 import numpy as np
@@ -96,7 +92,8 @@ print("moondream3-preview loaded and compiled.")
 
 # --- Moondream3 Utility Functions ---
 
-def create_annotated_image(image, detection_result, object_name="Object"):
+def create_annotated_image(image: Image.Image, detection_result: Dict, object_name: str = "Object") -> Image.Image:
+    """Draws bounding boxes on an image based on detection results."""
     if not isinstance(detection_result, dict) or "objects" not in detection_result:
         return image
 
@@ -112,6 +109,7 @@ def create_annotated_image(image, detection_result, object_name="Object"):
         x_max = int(obj["x_max"] * original_width)
         y_max = int(obj["y_max"] * original_height)
 
+        # Clamp coordinates to be within image dimensions
         x_min = max(0, min(x_min, original_width))
         y_min = max(0, min(y_min, original_height))
         x_max = max(0, min(x_max, original_width))
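The comment added here documents a real hazard: the model's coordinates are normalized to [0, 1] but can drift slightly outside that range, so the pixel values are snapped back into the frame. A worked example with hypothetical numbers:

```python
# Hypothetical detection on a 640x480 image; x_min and y_max fall outside [0, 1]
obj = {"x_min": -0.02, "y_min": 0.10, "x_max": 0.55, "y_max": 1.03}
original_width, original_height = 640, 480

x_min = int(obj["x_min"] * original_width)    # -12, off the left edge
y_max = int(obj["y_max"] * original_height)   # 494, past the bottom edge

x_min = max(0, min(x_min, original_width))    # clamped to 0
y_max = max(0, min(y_max, original_height))   # clamped to 480
```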
@@ -129,112 +127,16 @@ def create_annotated_image(image, detection_result, object_name="Object"):
         class_id=np.arange(len(bboxes))
     )
 
-    bounding_box_annotator = sv.BoxAnnotator(
-        thickness=3,
-        color_lookup=sv.ColorLookup.INDEX
-    )
-    label_annotator = sv.LabelAnnotator(
-        text_thickness=2,
-        text_scale=0.6,
-        color_lookup=sv.ColorLookup.INDEX
-    )
+    bounding_box_annotator = sv.BoxAnnotator(thickness=3)
+    label_annotator = sv.LabelAnnotator(text_thickness=2, text_scale=0.6)
 
-    annotated_image = bounding_box_annotator.annotate(
-        scene=annotated_image, detections=detections
-    )
-    annotated_image = label_annotator.annotate(
-        scene=annotated_image, detections=detections, labels=labels
-    )
+    annotated_image = bounding_box_annotator.annotate(scene=annotated_image, detections=detections)
+    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
 
     return Image.fromarray(annotated_image)
 
-
-@spaces.GPU()
-def process_video_with_tracking(video_path, prompt, detection_interval=3):
-    cap = cv2.VideoCapture(video_path)
-    fps = int(cap.get(cv2.CAP_PROP_FPS))
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-
-    byte_tracker = sv.ByteTrack()
-
-    temp_dir = tempfile.mkdtemp()
-    output_path = os.path.join(temp_dir, "tracked_video.mp4")
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
-
-    frame_count = 0
-    detection_count = 0
-
-    try:
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-
-            run_detection = (frame_count % detection_interval == 0)
-            detections = sv.Detections.empty()
-
-            if run_detection:
-                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                pil_image = Image.fromarray(frame_rgb)
-
-                result = model_md3.detect(pil_image, prompt)
-                detection_count += 1
-
-                if "objects" in result and result["objects"]:
-                    bboxes = []
-                    confidences = []
-
-                    for obj in result["objects"]:
-                        x_min = max(0.0, min(1.0, obj["x_min"])) * width
-                        y_min = max(0.0, min(1.0, obj["y_min"])) * height
-                        x_max = max(0.0, min(1.0, obj["x_max"])) * width
-                        y_max = max(0.0, min(1.0, obj["y_max"])) * height
-
-                        if x_max > x_min and y_max > y_min:
-                            bboxes.append([x_min, y_min, x_max, y_max])
-                            confidences.append(0.8)
-
-                    if bboxes:
-                        detections = sv.Detections(
-                            xyxy=np.array(bboxes, dtype=np.float32),
-                            confidence=np.array(confidences, dtype=np.float32),
-                            class_id=np.zeros(len(bboxes), dtype=int)
-                        )
-
-            detections = byte_tracker.update_with_detections(detections)
-
-            if len(detections) > 0:
-                box_annotator = sv.BoxAnnotator(thickness=3, color_lookup=sv.ColorLookup.TRACK)
-                label_annotator = sv.LabelAnnotator(text_scale=0.6, text_thickness=2, color_lookup=sv.ColorLookup.TRACK)
-
-                labels = [f"{prompt} ID: {tracker_id}" for tracker_id in detections.tracker_id]
-
-                frame = box_annotator.annotate(scene=frame, detections=detections)
-                frame = label_annotator.annotate(scene=frame, detections=detections, labels=labels)
-
-            out.write(frame)
-            frame_count += 1
-
-            if frame_count % 30 == 0:
-                progress = (frame_count / total_frames) * 100
-                print(f"Processing: {progress:.1f}% ({frame_count}/{total_frames}) - Detections: {detection_count}")
-
-    finally:
-        cap.release()
-        out.release()
-
-    summary = f"""Video processing complete:
-- Total frames processed: {frame_count}
-- Detection runs: {detection_count} (every {detection_interval} frames)
-- Objects tracked: {prompt}
-- Processing speed: ~{detection_count/frame_count*100:.1f}% detection rate for optimization"""
-
-    return output_path, summary
-
-def create_point_annotated_image(image, point_result):
+def create_point_annotated_image(image: Image.Image, point_result: Dict) -> Image.Image:
+    """Draws points on an image based on detection results."""
     if not isinstance(point_result, dict) or "points" not in point_result:
         return image
 
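The removed video path is the bulk of this diff. It followed a common pattern for pairing a slow VLM detector with a fast tracker: query Moondream only every `detection_interval` frames and let supervision's ByteTrack carry stable IDs through the frames in between. A condensed sketch of that pattern, with the model call abstracted behind a hypothetical `detect_fn` returning pixel-space boxes:

```python
import cv2
import numpy as np
import supervision as sv

def track_sparse_detections(video_path: str, detect_fn, interval: int = 3):
    """Yield (frame_index, tracked detections), detecting only every `interval` frames."""
    cap = cv2.VideoCapture(video_path)
    tracker = sv.ByteTrack()
    frame_idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        detections = sv.Detections.empty()
        if frame_idx % interval == 0:
            boxes = detect_fn(frame)  # hypothetical: [[x1, y1, x2, y2], ...] in pixels
            if boxes:
                detections = sv.Detections(
                    xyxy=np.array(boxes, dtype=np.float32),
                    confidence=np.full(len(boxes), 0.8, dtype=np.float32),  # detect() reports no scores
                    class_id=np.zeros(len(boxes), dtype=int),
                )
        # ByteTrack matches boxes across frames and assigns persistent tracker_id values
        detections = tracker.update_with_detections(detections)
        yield frame_idx, detections
        frame_idx += 1
    cap.release()
```

The fixed 0.8 confidence mirrors the deleted code: Moondream's `detect()` returns no scores, but ByteTrack expects one per box.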
@@ -251,14 +153,13 @@ def create_point_annotated_image(image, point_result):
     points_array = np.array(points).reshape(1, -1, 2)
     key_points = sv.KeyPoints(xy=points_array)
     vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
-    annotated_image = vertex_annotator.annotate(
-        scene=annotated_image, key_points=key_points
-    )
+    annotated_image = vertex_annotator.annotate(scene=annotated_image, key_points=key_points)
 
     return Image.fromarray(annotated_image)
 
 @spaces.GPU()
-def detect_objects_md3(image, prompt, task_type, max_objects):
+def detect_objects_md3(image: Image.Image, prompt: str, task_type: str, max_objects: int):
+    """Handles all image-based tasks for the Moondream3 model."""
     STANDARD_SIZE = (1024, 1024)
     if image is None:
         raise gr.Error("Please upload an image.")
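One subtlety preserved through this hunk: `sv.KeyPoints` expects an array shaped `(num_instances, num_points, 2)`, which is why the flat point list is pushed through `reshape(1, -1, 2)`. A minimal self-contained sketch:

```python
import numpy as np
import supervision as sv

scene = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder image
points = [(120, 80), (340, 210)]                 # pixel-space (x, y) pairs

# Wrap the flat point list into a single instance: shape (1, N, 2)
key_points = sv.KeyPoints(xy=np.array(points, dtype=np.float32).reshape(1, -1, 2))
annotated = sv.VertexAnnotator(radius=8, color=sv.Color.RED).annotate(
    scene=scene, key_points=key_points
)
```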
@@ -276,12 +177,13 @@ def detect_objects_md3(image, prompt, task_type, max_objects):
     elif task_type == "Caption":
         result = model_md3.caption(image, length="normal")
         annotated_image = image
-    else:
+    else: # Visual Question Answering
         result = model_md3.query(image=image, question=prompt, reasoning=True)
         annotated_image = image
 
     elapsed_ms = (time.perf_counter() - t0) * 1_000
 
+    # Format the output text based on the result type
     if isinstance(result, dict):
         if "objects" in result:
             output_text = f"Found {len(result['objects'])} objects:\n"
@@ -304,13 +206,6 @@ def detect_objects_md3(image, prompt, task_type, max_objects):
 
     return annotated_image, output_text, timing_text
 
-def process_video_md3(video_file, prompt, detection_interval):
-    if video_file is None:
-        return None, "Please upload a video file"
-    output_path, summary = process_video_with_tracking(video_file, prompt, detection_interval)
-    return output_path, summary
-
-
 # --- Core Application Logic (for other models) ---
 @spaces.GPU
 def process_document_stream(
@@ -323,9 +218,7 @@ def process_document_stream(
     top_k: int,
     repetition_penalty: float
 ):
-    """
-    Main generator function for models other than Moondream3.
-    """
+    """Main generator function for models other than Moondream3."""
     if image is None:
         yield "Please upload an image."
         return
@@ -367,7 +260,6 @@ def process_document_stream(
     buffer = ""
     for new_text in streamer:
         buffer += new_text
-        # Clean up potential model-specific tokens
         buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
         time.sleep(0.01)
         yield buffer
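This loop drains a `transformers` `TextIteratorStreamer` while generation runs on a background thread, scrubbing terminator tokens from the buffer as chunks arrive. A minimal sketch of that arrangement, assuming `model` and `tokenizer` are already loaded:

```python
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
inputs = tokenizer("Transcribe the text in this document.", return_tensors="pt").to(model.device)

# generate() blocks, so it runs on a worker thread while the main thread consumes chunks
Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 512}).start()

buffer = ""
for new_text in streamer:
    buffer += new_text
    # Some checkpoints emit chat terminators verbatim; strip them as they stream in
    buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
```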
@@ -382,7 +274,7 @@ def create_gradio_interface():
     """
     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
         gr.Markdown("# Multimodal VLM v1.0 🚀")
-        gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, Object Detection, and Video Tracking.")
+        gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, and Object Detection.")
 
         with gr.Tabs():
             # --- TAB 1: Document and General VLMs ---
@@ -392,7 +284,7 @@ def create_gradio_interface():
                         gr.Markdown("### 1. Configure Inputs")
                         model_choice = gr.Dropdown(
                             choices=["Camel-Doc-OCR-062825 (OCR)", "MinerU2.5-2509 (General)", "Video-MTR (Video/Text)"],
-                            label="Select Model", value= "Camel-Doc-OCR-062825 (OCR)"
+                            label="Select Model", value="Camel-Doc-OCR-062825 (OCR)"
                         )
                         image_input_doc = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                         prompt_input_doc = gr.Textbox(label="Query Input", placeholder="e.g., 'Transcribe the text in this document.'")
@@ -422,59 +314,38 @@ def create_gradio_interface():
 
             # --- TAB 2: Moondream3 Lab ---
             with gr.TabItem("🌝 Moondream3 Lab"):
-                with gr.Tabs():
-                    with gr.TabItem("🖼️ Image Processing"):
-                        with gr.Row():
-                            with gr.Column(scale=1):
-                                md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
-                                md3_task_type = gr.Radio(
-                                    choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
-                                    label="Task Type", value="Object Detection"
-                                )
-                                md3_prompt_input = gr.Textbox(
-                                    label="Prompt (object to detect/question to ask)",
-                                    placeholder="e.g., 'car', 'person', 'What's in this image?'", value="objects"
-                                )
-                                md3_max_objects = gr.Number(
-                                    label="Max Objects (for Object Detection only)",
-                                    value=10, minimum=1, maximum=50, step=1, visible=True
-                                )
-                                md3_generate_btn = gr.Button(value="✨ Generate", variant="primary")
-                            with gr.Column(scale=1):
-                                md3_output_image = gr.Image(type="pil", label="Result", height=400)
-                                md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
-                                md3_output_time = gr.Markdown()
-
-                        gr.Examples(
-                            examples=[
-                                ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG", "Object Detection", "candy", 5],
-                                ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG", "Point Detection", "candy", 5],
-                                ["https://moondream.ai/images/blog/moondream-3-preview/benchmarks.jpg", "Caption", "", 5],
-                                ["https://moondream.ai/images/blog/moondream-3-preview/benchmarks.jpg", "Visual Question Answering", "how well does moondream 3 perform in chartvqa?", 5],
-                            ],
-                            inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
-                            label="Click an example to populate inputs"
-                        )
-
-                    with gr.TabItem("📹 Video Object Tracking"):
-                        with gr.Row():
-                            with gr.Column(scale=1):
-                                md3_video_input = gr.Video(label="Upload a video file", height=400)
-                                md3_video_prompt = gr.Textbox(label="Object to track", placeholder="e.g., 'person', 'car', 'ball'", value="person")
-                                md3_detection_interval = gr.Slider(
-                                    minimum=5, maximum=30, value=15, step=1, label="Detection Interval (frames)",
-                                    info="Run detection every N frames (lower is slower but more accurate)."
-                                )
-                                md3_process_video_btn = gr.Button(value="🎥 Process Video", variant="primary")
-                            with gr.Column(scale=1):
-                                md3_output_video = gr.Video(label="Tracked Video Result", height=400)
-                                md3_video_summary = gr.Textbox(label="Processing Summary", lines=8, show_copy_button=True)
-                        gr.Examples(
-                            examples=[["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4", "snowboarder", 15]],
-                            inputs=[md3_video_input, md3_video_prompt, md3_detection_interval],
-                            label="Click an example to populate inputs"
-                        )
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
+                        md3_task_type = gr.Radio(
+                            choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
+                            label="Task Type", value="Object Detection"
+                        )
+                        md3_prompt_input = gr.Textbox(
+                            label="Prompt (object to detect/question to ask)",
+                            placeholder="e.g., 'car', 'person', 'What's in this image?'", value="objects"
+                        )
+                        md3_max_objects = gr.Number(
+                            label="Max Objects (for Object Detection only)",
+                            value=10, minimum=1, maximum=50, step=1, visible=True
+                        )
+                        md3_generate_btn = gr.Button(value="✨ Generate", variant="primary")
+                    with gr.Column(scale=1):
+                        md3_output_image = gr.Image(type="pil", label="Result", height=400)
+                        md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
+                        md3_output_time = gr.Markdown()
 
+                gr.Examples(
+                    examples=[
+                        ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG", "Object Detection", "candy", 5],
+                        ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG", "Point Detection", "candy", 5],
+                        ["https://moondream.ai/images/blog/moondream-3-preview/benchmarks.jpg", "Caption", "", 5],
+                        ["https://moondream.ai/images/blog/moondream-3-preview/benchmarks.jpg", "Visual Question Answering", "how well does moondream 3 perform in chartvqa?", 5],
+                    ],
+                    inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
+                    label="Click an example to populate inputs"
+                )
+
         # --- Event Handlers ---
 
         # Document Tab
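The flattened layout keeps the usual Gradio Blocks pattern: declare components inside `Row`/`Column` scopes, then wire them to callbacks afterwards, as the next hunk does. A stripped-down equivalent of that structure:

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            inp = gr.Image(type="pil")
            btn = gr.Button("Generate")
        with gr.Column():
            out = gr.Textbox()
    # Handlers reference components declared above, outside their layout scopes
    btn.click(fn=lambda img: "ok" if img is not None else "no image", inputs=[inp], outputs=[out])
```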
@@ -496,11 +367,6 @@ def create_gradio_interface():
             inputs=[md3_image_input, md3_prompt_input, md3_task_type, md3_max_objects],
             outputs=[md3_output_image, md3_output_textbox, md3_output_time]
         )
-        md3_process_video_btn.click(
-            fn=process_video_md3,
-            inputs=[md3_video_input, md3_video_prompt, md3_detection_interval],
-            outputs=[md3_output_video, md3_video_summary]
-        )
 
     return demo
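With the video branch gone, the surviving Moondream3 path reduces to detect-then-annotate. A small usage sketch, assuming `model_md3` is the loaded moondream3-preview model and `create_annotated_image` is the helper updated above:

```python
from PIL import Image

image = Image.open("example.jpg").convert("RGB")

# detect() returns {"objects": [{"x_min": ..., "y_min": ..., "x_max": ..., "y_max": ...}, ...]}
# with coordinates normalized to [0, 1]
result = model_md3.detect(image, "person")

annotated = create_annotated_image(image, result, object_name="person")
annotated.save("annotated.jpg")
```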
 