Spaces:
Running
on
Zero
Running
on
Zero
update app
Browse files
app.py
CHANGED
|
@@ -355,7 +355,42 @@ def create_gradio_interface():
|
|
| 355 |
gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, and Object Detection.")
|
| 356 |
|
| 357 |
with gr.Tabs():
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
with gr.TabItem("📄 Document & General VLM"):
|
| 360 |
with gr.Row():
|
| 361 |
with gr.Column(scale=2):
|
|
@@ -390,39 +425,7 @@ def create_gradio_interface():
|
|
| 390 |
inputs=[image_input_doc, prompt_input_doc]
|
| 391 |
)
|
| 392 |
|
| 393 |
-
# --- TAB 2: Moondream3 Lab ---
|
| 394 |
-
with gr.TabItem("🌝 Moondream3"):
|
| 395 |
-
with gr.Row():
|
| 396 |
-
with gr.Column(scale=1):
|
| 397 |
-
md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
|
| 398 |
-
md3_task_type = gr.Radio(
|
| 399 |
-
choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
|
| 400 |
-
label="Task Type", value="Object Detection"
|
| 401 |
-
)
|
| 402 |
-
md3_prompt_input = gr.Textbox(
|
| 403 |
-
label="Prompt (object to detect/question to ask)",
|
| 404 |
-
placeholder="e.g., 'car', 'person', 'What's in this image?'"
|
| 405 |
-
)
|
| 406 |
-
md3_max_objects = gr.Number(
|
| 407 |
-
label="Max Objects (for Object Detection only)",
|
| 408 |
-
value=10, minimum=1, maximum=50, step=1, visible=True
|
| 409 |
-
)
|
| 410 |
-
md3_generate_btn = gr.Button(value="Submit", variant="primary")
|
| 411 |
-
with gr.Column(scale=1):
|
| 412 |
-
md3_output_image = gr.Image(type="pil", label="Result", height=400)
|
| 413 |
-
md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
|
| 414 |
-
md3_output_time = gr.Markdown()
|
| 415 |
|
| 416 |
-
gr.Examples(
|
| 417 |
-
examples=[
|
| 418 |
-
["md3/1.jpg", "Object Detection", "boats", 7],
|
| 419 |
-
["md3/2.jpg", "Point Detection", "children", 7],
|
| 420 |
-
["md3/3.png", "Caption", "", 5],
|
| 421 |
-
["md3/4.jpeg", "Visual Question Answering", "Analyze the GDP trend over the years.", 5],
|
| 422 |
-
],
|
| 423 |
-
inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
|
| 424 |
-
label="Click an example to populate inputs"
|
| 425 |
-
)
|
| 426 |
|
| 427 |
process_btn.click(
|
| 428 |
fn=process_document_stream,
|
|
|
|
| 355 |
gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, and Object Detection.")
|
| 356 |
|
| 357 |
with gr.Tabs():
|
| 358 |
+
|
| 359 |
+
# --- TAB 1: Moondream3 Lab ---
|
| 360 |
+
with gr.TabItem("🌝 Moondream3"):
|
| 361 |
+
with gr.Row():
|
| 362 |
+
with gr.Column(scale=1):
|
| 363 |
+
md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
|
| 364 |
+
md3_task_type = gr.Radio(
|
| 365 |
+
choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
|
| 366 |
+
label="Task Type", value="Object Detection"
|
| 367 |
+
)
|
| 368 |
+
md3_prompt_input = gr.Textbox(
|
| 369 |
+
label="Prompt (object to detect/question to ask)",
|
| 370 |
+
placeholder="e.g., 'car', 'person', 'What's in this image?'"
|
| 371 |
+
)
|
| 372 |
+
md3_max_objects = gr.Number(
|
| 373 |
+
label="Max Objects (for Object Detection only)",
|
| 374 |
+
value=10, minimum=1, maximum=50, step=1, visible=True
|
| 375 |
+
)
|
| 376 |
+
md3_generate_btn = gr.Button(value="Submit", variant="primary")
|
| 377 |
+
with gr.Column(scale=1):
|
| 378 |
+
md3_output_image = gr.Image(type="pil", label="Result", height=400)
|
| 379 |
+
md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
|
| 380 |
+
md3_output_time = gr.Markdown()
|
| 381 |
+
|
| 382 |
+
gr.Examples(
|
| 383 |
+
examples=[
|
| 384 |
+
["md3/1.jpg", "Object Detection", "boats", 7],
|
| 385 |
+
["md3/2.jpg", "Point Detection", "children", 7],
|
| 386 |
+
["md3/3.png", "Caption", "", 5],
|
| 387 |
+
["md3/4.jpeg", "Visual Question Answering", "Analyze the GDP trend over the years.", 5],
|
| 388 |
+
],
|
| 389 |
+
inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
|
| 390 |
+
label="Click an example to populate inputs"
|
| 391 |
+
)
|
| 392 |
+
|
| 393 |
+
# --- TAB 2: Document and General VLMs ---
|
| 394 |
with gr.TabItem("📄 Document & General VLM"):
|
| 395 |
with gr.Row():
|
| 396 |
with gr.Column(scale=2):
|
|
|
|
| 425 |
inputs=[image_input_doc, prompt_input_doc]
|
| 426 |
)
|
| 427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
process_btn.click(
|
| 431 |
fn=process_document_stream,
|