Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -257,13 +257,13 @@ video_examples = [
|
|
| 257 |
]
|
| 258 |
|
| 259 |
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
| 260 |
-
gr.Markdown("# **[Multimodal
|
| 261 |
gr.Markdown("A unified interface for state-of-the-art multimodal and document AI models. Select a model, upload an image or video, and enter a query to begin.")
|
| 262 |
|
| 263 |
with gr.Row():
|
| 264 |
# --- LEFT COLUMN (INPUTS) ---
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
-
model_choice = gr.
|
| 267 |
choices=[
|
| 268 |
"Nanonets-OCR-s",
|
| 269 |
"MonkeyOCR-Recognition",
|
|
@@ -271,20 +271,20 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 271 |
"Typhoon-OCR-7B",
|
| 272 |
"SmolDocling-256M-preview",
|
| 273 |
],
|
| 274 |
-
label="
|
| 275 |
value="Nanonets-OCR-s",
|
| 276 |
)
|
| 277 |
|
| 278 |
with gr.Tabs():
|
| 279 |
with gr.TabItem("🖼️ Image Inference"):
|
| 280 |
image_query = gr.Textbox(label="Query", placeholder="e.g., 'OCR the document'")
|
| 281 |
-
image_upload = gr.Image(type="pil", label="Upload Image")
|
| 282 |
image_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 283 |
gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
|
| 284 |
|
| 285 |
with gr.TabItem("🎬 Video Inference"):
|
| 286 |
video_query = gr.Textbox(label="Query", placeholder="e.g., 'What is happening in this video?'")
|
| 287 |
-
video_upload = gr.Video(label="Upload Video")
|
| 288 |
video_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 289 |
gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
|
| 290 |
|
|
@@ -313,18 +313,19 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 313 |
raw_output = gr.Textbox(
|
| 314 |
label="Raw Output Stream", interactive=False, lines=8
|
| 315 |
)
|
|
|
|
| 316 |
formatted_output = gr.Markdown(label="Formatted Result (Markdown)")
|
| 317 |
|
| 318 |
with gr.Accordion("💻 Model Information", open=True):
|
| 319 |
gr.Markdown(
|
| 320 |
"""
|
| 321 |
-
- **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**:
|
| 322 |
-
- **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**:
|
| 323 |
-
- **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**:
|
| 324 |
-
- **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual
|
| 325 |
-
- **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**:
|
| 326 |
-
|
| 327 |
-
|
| 328 |
|
| 329 |
> [Report a Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)
|
| 330 |
"""
|
|
|
|
| 257 |
]
|
| 258 |
|
| 259 |
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
| 260 |
+
gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
|
| 261 |
gr.Markdown("A unified interface for state-of-the-art multimodal and document AI models. Select a model, upload an image or video, and enter a query to begin.")
|
| 262 |
|
| 263 |
with gr.Row():
|
| 264 |
# --- LEFT COLUMN (INPUTS) ---
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
+
model_choice = gr.Dropdown(
|
| 267 |
choices=[
|
| 268 |
"Nanonets-OCR-s",
|
| 269 |
"MonkeyOCR-Recognition",
|
|
|
|
| 271 |
"Typhoon-OCR-7B",
|
| 272 |
"SmolDocling-256M-preview",
|
| 273 |
],
|
| 274 |
+
label="Select Model⚡",
|
| 275 |
value="Nanonets-OCR-s",
|
| 276 |
)
|
| 277 |
|
| 278 |
with gr.Tabs():
|
| 279 |
with gr.TabItem("🖼️ Image Inference"):
|
| 280 |
image_query = gr.Textbox(label="Query", placeholder="e.g., 'OCR the document'")
|
| 281 |
+
image_upload = gr.Image(type="pil", label="Upload Image", height="299")
|
| 282 |
image_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 283 |
gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
|
| 284 |
|
| 285 |
with gr.TabItem("🎬 Video Inference"):
|
| 286 |
video_query = gr.Textbox(label="Query", placeholder="e.g., 'What is happening in this video?'")
|
| 287 |
+
video_upload = gr.Video(label="Upload Video", height="299")
|
| 288 |
video_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 289 |
gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
|
| 290 |
|
|
|
|
| 313 |
raw_output = gr.Textbox(
|
| 314 |
label="Raw Output Stream", interactive=False, lines=8
|
| 315 |
)
|
| 316 |
+
with gr.Accordion("(Result.md)", open=False):
|
| 317 |
formatted_output = gr.Markdown(label="Formatted Result (Markdown)")
|
| 318 |
|
| 319 |
with gr.Accordion("💻 Model Information", open=True):
|
| 320 |
gr.Markdown(
|
| 321 |
"""
|
| 322 |
+
- **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction. It transforms documents into structured markdown with intelligent content recognition and semantic tagging.
|
| 323 |
+
- **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.
|
| 324 |
+
- **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.
|
| 325 |
+
- **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.
|
| 326 |
+
- **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Think Beyond Images. Thyme transcends traditional "thinking with images" paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.
|
| 327 |
+
|
| 328 |
+
- **⚠️Note**: Performance on video inference tasks is experimental and may vary between models.
|
| 329 |
|
| 330 |
> [Report a Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)
|
| 331 |
"""
|