prithivMLmods committed on
Commit
2f0a2ad
·
verified ·
1 Parent(s): 7480fb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -34
app.py CHANGED
@@ -1,12 +1,15 @@
1
  import spaces
2
  import json
 
3
  import os
4
  import traceback
5
  from io import BytesIO
6
- from typing import Dict
7
  import re
8
  import time
9
  from threading import Thread
 
 
10
  import tempfile
11
 
12
  import gradio as gr
@@ -65,18 +68,6 @@ model_t = Qwen2VLForConditionalGeneration.from_pretrained(
65
  ).to(device).eval()
66
  print("MinerU2.5-2509 loaded.")
67
 
68
-
69
- # Load Video-MTR
70
- print("Loading Video-MTR...")
71
- MODEL_ID_S = "Phoebe13/Video-MTR"
72
- processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
73
- model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
- MODEL_ID_S,
75
- trust_remote_code=True,
76
- torch_dtype=torch.float16
77
- ).to(device).eval()
78
- print("Video-MTR loaded.")
79
-
80
  # Load moondream3
81
  print("Loading moondream3-preview...")
82
  MODEL_ID_MD3 = "moondream/moondream3-preview"
@@ -92,8 +83,7 @@ print("moondream3-preview loaded and compiled.")
92
 
93
  # --- Moondream3 Utility Functions ---
94
 
95
- def create_annotated_image(image: Image.Image, detection_result: Dict, object_name: str = "Object") -> Image.Image:
96
- """Draws bounding boxes on an image based on detection results."""
97
  if not isinstance(detection_result, dict) or "objects" not in detection_result:
98
  return image
99
 
@@ -109,7 +99,6 @@ def create_annotated_image(image: Image.Image, detection_result: Dict, object_na
109
  x_max = int(obj["x_max"] * original_width)
110
  y_max = int(obj["y_max"] * original_height)
111
 
112
- # Clamp coordinates to be within image dimensions
113
  x_min = max(0, min(x_min, original_width))
114
  y_min = max(0, min(y_min, original_height))
115
  x_max = max(0, min(x_max, original_width))
@@ -127,16 +116,26 @@ def create_annotated_image(image: Image.Image, detection_result: Dict, object_na
127
  class_id=np.arange(len(bboxes))
128
  )
129
 
130
- bounding_box_annotator = sv.BoxAnnotator(thickness=3)
131
- label_annotator = sv.LabelAnnotator(text_thickness=2, text_scale=0.6)
 
 
 
 
 
 
 
132
 
133
- annotated_image = bounding_box_annotator.annotate(scene=annotated_image, detections=detections)
134
- annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
 
 
 
 
135
 
136
  return Image.fromarray(annotated_image)
137
 
138
- def create_point_annotated_image(image: Image.Image, point_result: Dict) -> Image.Image:
139
- """Draws points on an image based on detection results."""
140
  if not isinstance(point_result, dict) or "points" not in point_result:
141
  return image
142
 
@@ -153,13 +152,14 @@ def create_point_annotated_image(image: Image.Image, point_result: Dict) -> Imag
153
  points_array = np.array(points).reshape(1, -1, 2)
154
  key_points = sv.KeyPoints(xy=points_array)
155
  vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
156
- annotated_image = vertex_annotator.annotate(scene=annotated_image, key_points=key_points)
 
 
157
 
158
  return Image.fromarray(annotated_image)
159
 
160
  @spaces.GPU()
161
- def detect_objects_md3(image: Image.Image, prompt: str, task_type: str, max_objects: int):
162
- """Handles all image-based tasks for the Moondream3 model."""
163
  STANDARD_SIZE = (1024, 1024)
164
  if image is None:
165
  raise gr.Error("Please upload an image.")
@@ -177,13 +177,12 @@ def detect_objects_md3(image: Image.Image, prompt: str, task_type: str, max_obje
177
  elif task_type == "Caption":
178
  result = model_md3.caption(image, length="normal")
179
  annotated_image = image
180
- else: # Visual Question Answering
181
  result = model_md3.query(image=image, question=prompt, reasoning=True)
182
  annotated_image = image
183
 
184
  elapsed_ms = (time.perf_counter() - t0) * 1_000
185
 
186
- # Format the output text based on the result type
187
  if isinstance(result, dict):
188
  if "objects" in result:
189
  output_text = f"Found {len(result['objects'])} objects:\n"
@@ -206,6 +205,7 @@ def detect_objects_md3(image: Image.Image, prompt: str, task_type: str, max_obje
206
 
207
  return annotated_image, output_text, timing_text
208
 
 
209
  # --- Core Application Logic (for other models) ---
210
  @spaces.GPU
211
  def process_document_stream(
@@ -218,7 +218,9 @@ def process_document_stream(
218
  top_k: int,
219
  repetition_penalty: float
220
  ):
221
- """Main generator function for models other than Moondream3."""
 
 
222
  if image is None:
223
  yield "Please upload an image."
224
  return
@@ -231,8 +233,6 @@ def process_document_stream(
231
  processor, model = processor_m, model_m
232
  elif model_name == "MinerU2.5-2509 (General)":
233
  processor, model = processor_t, model_t
234
- elif model_name == "Video-MTR (Video/Text)":
235
- processor, model = processor_s, model_s
236
  else:
237
  yield "Invalid model selected."
238
  return
@@ -260,6 +260,7 @@ def process_document_stream(
260
  buffer = ""
261
  for new_text in streamer:
262
  buffer += new_text
 
263
  buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
264
  time.sleep(0.01)
265
  yield buffer
@@ -283,8 +284,8 @@ def create_gradio_interface():
283
  with gr.Column(scale=1):
284
  gr.Markdown("### 1. Configure Inputs")
285
  model_choice = gr.Dropdown(
286
- choices=["Camel-Doc-OCR-062825 (OCR)", "MinerU2.5-2509 (General)", "Video-MTR (Video/Text)"],
287
- label="Select Model", value="Camel-Doc-OCR-062825 (OCR)"
288
  )
289
  image_input_doc = gr.Image(label="Upload Image", type="pil", sources=['upload'])
290
  prompt_input_doc = gr.Textbox(label="Query Input", placeholder="e.g., 'Transcribe the text in this document.'")
@@ -313,7 +314,7 @@ def create_gradio_interface():
313
  )
314
 
315
  # --- TAB 2: Moondream3 Lab ---
316
- with gr.TabItem("🌝 Moondream3 Lab"):
317
  with gr.Row():
318
  with gr.Column(scale=1):
319
  md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
@@ -345,7 +346,7 @@ def create_gradio_interface():
345
  inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
346
  label="Click an example to populate inputs"
347
  )
348
-
349
  # --- Event Handlers ---
350
 
351
  # Document Tab
 
1
  import spaces
2
  import json
3
+ import math
4
  import os
5
  import traceback
6
  from io import BytesIO
7
+ from typing import Any, Dict, List, Optional, Tuple
8
  import re
9
  import time
10
  from threading import Thread
11
+ from io import BytesIO
12
+ import uuid
13
  import tempfile
14
 
15
  import gradio as gr
 
68
  ).to(device).eval()
69
  print("MinerU2.5-2509 loaded.")
70
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  # Load moondream3
72
  print("Loading moondream3-preview...")
73
  MODEL_ID_MD3 = "moondream/moondream3-preview"
 
83
 
84
  # --- Moondream3 Utility Functions ---
85
 
86
+ def create_annotated_image(image, detection_result, object_name="Object"):
 
87
  if not isinstance(detection_result, dict) or "objects" not in detection_result:
88
  return image
89
 
 
99
  x_max = int(obj["x_max"] * original_width)
100
  y_max = int(obj["y_max"] * original_height)
101
 
 
102
  x_min = max(0, min(x_min, original_width))
103
  y_min = max(0, min(y_min, original_height))
104
  x_max = max(0, min(x_max, original_width))
 
116
  class_id=np.arange(len(bboxes))
117
  )
118
 
119
+ bounding_box_annotator = sv.BoxAnnotator(
120
+ thickness=3,
121
+ color_lookup=sv.ColorLookup.INDEX
122
+ )
123
+ label_annotator = sv.LabelAnnotator(
124
+ text_thickness=2,
125
+ text_scale=0.6,
126
+ color_lookup=sv.ColorLookup.INDEX
127
+ )
128
 
129
+ annotated_image = bounding_box_annotator.annotate(
130
+ scene=annotated_image, detections=detections
131
+ )
132
+ annotated_image = label_annotator.annotate(
133
+ scene=annotated_image, detections=detections, labels=labels
134
+ )
135
 
136
  return Image.fromarray(annotated_image)
137
 
138
+ def create_point_annotated_image(image, point_result):
 
139
  if not isinstance(point_result, dict) or "points" not in point_result:
140
  return image
141
 
 
152
  points_array = np.array(points).reshape(1, -1, 2)
153
  key_points = sv.KeyPoints(xy=points_array)
154
  vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
155
+ annotated_image = vertex_annotator.annotate(
156
+ scene=annotated_image, key_points=key_points
157
+ )
158
 
159
  return Image.fromarray(annotated_image)
160
 
161
  @spaces.GPU()
162
+ def detect_objects_md3(image, prompt, task_type, max_objects):
 
163
  STANDARD_SIZE = (1024, 1024)
164
  if image is None:
165
  raise gr.Error("Please upload an image.")
 
177
  elif task_type == "Caption":
178
  result = model_md3.caption(image, length="normal")
179
  annotated_image = image
180
+ else:
181
  result = model_md3.query(image=image, question=prompt, reasoning=True)
182
  annotated_image = image
183
 
184
  elapsed_ms = (time.perf_counter() - t0) * 1_000
185
 
 
186
  if isinstance(result, dict):
187
  if "objects" in result:
188
  output_text = f"Found {len(result['objects'])} objects:\n"
 
205
 
206
  return annotated_image, output_text, timing_text
207
 
208
+
209
  # --- Core Application Logic (for other models) ---
210
  @spaces.GPU
211
  def process_document_stream(
 
218
  top_k: int,
219
  repetition_penalty: float
220
  ):
221
+ """
222
+ Main generator function for models other than Moondream3.
223
+ """
224
  if image is None:
225
  yield "Please upload an image."
226
  return
 
233
  processor, model = processor_m, model_m
234
  elif model_name == "MinerU2.5-2509 (General)":
235
  processor, model = processor_t, model_t
 
 
236
  else:
237
  yield "Invalid model selected."
238
  return
 
260
  buffer = ""
261
  for new_text in streamer:
262
  buffer += new_text
263
+ # Clean up potential model-specific tokens
264
  buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
265
  time.sleep(0.01)
266
  yield buffer
 
284
  with gr.Column(scale=1):
285
  gr.Markdown("### 1. Configure Inputs")
286
  model_choice = gr.Dropdown(
287
+ choices=["Camel-Doc-OCR-062825 (OCR)", "MinerU2.5-2509 (General)"],
288
+ label="Select Model", value= "Camel-Doc-OCR-062825 (OCR)"
289
  )
290
  image_input_doc = gr.Image(label="Upload Image", type="pil", sources=['upload'])
291
  prompt_input_doc = gr.Textbox(label="Query Input", placeholder="e.g., 'Transcribe the text in this document.'")
 
314
  )
315
 
316
  # --- TAB 2: Moondream3 Lab ---
317
+ with gr.TabItem("🌝 Moondream3 Lab (Image Processing)"):
318
  with gr.Row():
319
  with gr.Column(scale=1):
320
  md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
 
346
  inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
347
  label="Click an example to populate inputs"
348
  )
349
+
350
  # --- Event Handlers ---
351
 
352
  # Document Tab