Spaces:

prithivMLmods
/

Multimodal-VLM-v1.0

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 3

Commit

2f0a2ad

verified ·

1 Parent(s): 7480fb9

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -34

app.py CHANGED Viewed

@@ -1,12 +1,15 @@
 import spaces
 import json
 import os
 import traceback
 from io import BytesIO
-from typing import Dict
 import re
 import time
 from threading import Thread
 import tempfile
 import gradio as gr
@@ -65,18 +68,6 @@ model_t = Qwen2VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 print("MinerU2.5-2509 loaded.")
-# Load Video-MTR
-print("Loading Video-MTR...")
-MODEL_ID_S = "Phoebe13/Video-MTR"
-processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
-model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_S,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-print("Video-MTR loaded.")
 # Load moondream3
 print("Loading moondream3-preview...")
 MODEL_ID_MD3 = "moondream/moondream3-preview"
@@ -92,8 +83,7 @@ print("moondream3-preview loaded and compiled.")
 # --- Moondream3 Utility Functions ---
-def create_annotated_image(image: Image.Image, detection_result: Dict, object_name: str = "Object") -> Image.Image:
-    """Draws bounding boxes on an image based on detection results."""
     if not isinstance(detection_result, dict) or "objects" not in detection_result:
         return image
@@ -109,7 +99,6 @@ def create_annotated_image(image: Image.Image, detection_result: Dict, object_na
         x_max = int(obj["x_max"] * original_width)
         y_max = int(obj["y_max"] * original_height)
-        # Clamp coordinates to be within image dimensions
         x_min = max(0, min(x_min, original_width))
         y_min = max(0, min(y_min, original_height))
         x_max = max(0, min(x_max, original_width))
@@ -127,16 +116,26 @@ def create_annotated_image(image: Image.Image, detection_result: Dict, object_na
         class_id=np.arange(len(bboxes))
     )
-    bounding_box_annotator = sv.BoxAnnotator(thickness=3)
-    label_annotator = sv.LabelAnnotator(text_thickness=2, text_scale=0.6)
-    annotated_image = bounding_box_annotator.annotate(scene=annotated_image, detections=detections)
-    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
     return Image.fromarray(annotated_image)
-def create_point_annotated_image(image: Image.Image, point_result: Dict) -> Image.Image:
-    """Draws points on an image based on detection results."""
     if not isinstance(point_result, dict) or "points" not in point_result:
         return image
@@ -153,13 +152,14 @@ def create_point_annotated_image(image: Image.Image, point_result: Dict) -> Imag
         points_array = np.array(points).reshape(1, -1, 2)
         key_points = sv.KeyPoints(xy=points_array)
         vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
-        annotated_image = vertex_annotator.annotate(scene=annotated_image, key_points=key_points)
     return Image.fromarray(annotated_image)
 @spaces.GPU()
-def detect_objects_md3(image: Image.Image, prompt: str, task_type: str, max_objects: int):
-    """Handles all image-based tasks for the Moondream3 model."""
     STANDARD_SIZE = (1024, 1024)
     if image is None:
         raise gr.Error("Please upload an image.")
@@ -177,13 +177,12 @@ def detect_objects_md3(image: Image.Image, prompt: str, task_type: str, max_obje
     elif task_type == "Caption":
         result = model_md3.caption(image, length="normal")
         annotated_image = image
-    else:  # Visual Question Answering
         result = model_md3.query(image=image, question=prompt, reasoning=True)
         annotated_image = image
     elapsed_ms = (time.perf_counter() - t0) * 1_000
-    # Format the output text based on the result type
     if isinstance(result, dict):
         if "objects" in result:
           output_text = f"Found {len(result['objects'])} objects:\n"
@@ -206,6 +205,7 @@ def detect_objects_md3(image: Image.Image, prompt: str, task_type: str, max_obje
     return annotated_image, output_text, timing_text
 # --- Core Application Logic (for other models) ---
 @spaces.GPU
 def process_document_stream(
@@ -218,7 +218,9 @@ def process_document_stream(
     top_k: int,
     repetition_penalty: float
 ):
-    """Main generator function for models other than Moondream3."""
     if image is None:
         yield "Please upload an image."
         return
@@ -231,8 +233,6 @@ def process_document_stream(
         processor, model = processor_m, model_m
     elif model_name == "MinerU2.5-2509 (General)":
         processor, model = processor_t, model_t
-    elif model_name == "Video-MTR (Video/Text)":
-        processor, model = processor_s, model_s
     else:
         yield "Invalid model selected."
         return
@@ -260,6 +260,7 @@ def process_document_stream(
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
         time.sleep(0.01)
         yield buffer
@@ -283,8 +284,8 @@ def create_gradio_interface():
                     with gr.Column(scale=1):
                         gr.Markdown("### 1. Configure Inputs")
                         model_choice = gr.Dropdown(
-                            choices=["Camel-Doc-OCR-062825 (OCR)", "MinerU2.5-2509 (General)", "Video-MTR (Video/Text)"],
-                            label="Select Model", value="Camel-Doc-OCR-062825 (OCR)"
                         )
                         image_input_doc = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                         prompt_input_doc = gr.Textbox(label="Query Input", placeholder="e.g., 'Transcribe the text in this document.'")
@@ -313,7 +314,7 @@ def create_gradio_interface():
                 )
             # --- TAB 2: Moondream3 Lab ---
-            with gr.TabItem("🌝 Moondream3 Lab"):
                 with gr.Row():
                     with gr.Column(scale=1):
                         md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
@@ -345,7 +346,7 @@ def create_gradio_interface():
                     inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
                     label="Click an example to populate inputs"
                 )
         # --- Event Handlers ---
         # Document Tab

 import spaces
 import json
+import math
 import os
 import traceback
 from io import BytesIO
+from typing import Any, Dict, List, Optional, Tuple
 import re
 import time
 from threading import Thread
+from io import BytesIO
+import uuid
 import tempfile
 import gradio as gr
 ).to(device).eval()
 print("MinerU2.5-2509 loaded.")
 # Load moondream3
 print("Loading moondream3-preview...")
 MODEL_ID_MD3 = "moondream/moondream3-preview"
 # --- Moondream3 Utility Functions ---
+def create_annotated_image(image, detection_result, object_name="Object"):
     if not isinstance(detection_result, dict) or "objects" not in detection_result:
         return image
         x_max = int(obj["x_max"] * original_width)
         y_max = int(obj["y_max"] * original_height)
         x_min = max(0, min(x_min, original_width))
         y_min = max(0, min(y_min, original_height))
         x_max = max(0, min(x_max, original_width))
         class_id=np.arange(len(bboxes))
     )
+    bounding_box_annotator = sv.BoxAnnotator(
+        thickness=3,
+        color_lookup=sv.ColorLookup.INDEX
+    )
+    label_annotator = sv.LabelAnnotator(
+        text_thickness=2,
+        text_scale=0.6,
+        color_lookup=sv.ColorLookup.INDEX
+    )
+    annotated_image = bounding_box_annotator.annotate(
+        scene=annotated_image, detections=detections
+    )
+    annotated_image = label_annotator.annotate(
+        scene=annotated_image, detections=detections, labels=labels
+    )
     return Image.fromarray(annotated_image)
+def create_point_annotated_image(image, point_result):
     if not isinstance(point_result, dict) or "points" not in point_result:
         return image
         points_array = np.array(points).reshape(1, -1, 2)
         key_points = sv.KeyPoints(xy=points_array)
         vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
+        annotated_image = vertex_annotator.annotate(
+            scene=annotated_image, key_points=key_points
+        )
     return Image.fromarray(annotated_image)
 @spaces.GPU()
+def detect_objects_md3(image, prompt, task_type, max_objects):
     STANDARD_SIZE = (1024, 1024)
     if image is None:
         raise gr.Error("Please upload an image.")
     elif task_type == "Caption":
         result = model_md3.caption(image, length="normal")
         annotated_image = image
+    else:
         result = model_md3.query(image=image, question=prompt, reasoning=True)
         annotated_image = image
     elapsed_ms = (time.perf_counter() - t0) * 1_000
     if isinstance(result, dict):
         if "objects" in result:
           output_text = f"Found {len(result['objects'])} objects:\n"
     return annotated_image, output_text, timing_text
 # --- Core Application Logic (for other models) ---
 @spaces.GPU
 def process_document_stream(
     top_k: int,
     repetition_penalty: float
 ):
+    """
+    Main generator function for models other than Moondream3.
+    """
     if image is None:
         yield "Please upload an image."
         return
         processor, model = processor_m, model_m
     elif model_name == "MinerU2.5-2509 (General)":
         processor, model = processor_t, model_t
     else:
         yield "Invalid model selected."
         return
     buffer = ""
     for new_text in streamer:
         buffer += new_text
+        # Clean up potential model-specific tokens
         buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
         time.sleep(0.01)
         yield buffer
                     with gr.Column(scale=1):
                         gr.Markdown("### 1. Configure Inputs")
                         model_choice = gr.Dropdown(
+                            choices=["Camel-Doc-OCR-062825 (OCR)", "MinerU2.5-2509 (General)"],
+                            label="Select Model", value= "Camel-Doc-OCR-062825 (OCR)"
                         )
                         image_input_doc = gr.Image(label="Upload Image", type="pil", sources=['upload'])
                         prompt_input_doc = gr.Textbox(label="Query Input", placeholder="e.g., 'Transcribe the text in this document.'")
                 )
             # --- TAB 2: Moondream3 Lab ---
+            with gr.TabItem("🌝 Moondream3 Lab (Image Processing)"):
                 with gr.Row():
                     with gr.Column(scale=1):
                         md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
                     inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
                     label="Click an example to populate inputs"
                 )
         # --- Event Handlers ---
         # Document Tab