Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -257,13 +257,13 @@ video_examples = [
|
|
| 257 |
]
|
| 258 |
|
| 259 |
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
| 260 |
-
gr.Markdown("# **[Multimodal
|
| 261 |
gr.Markdown("A unified interface for state-of-the-art multimodal and document AI models. Select a model, upload an image or video, and enter a query to begin.")
|
| 262 |
|
| 263 |
with gr.Row():
|
| 264 |
# --- LEFT COLUMN (INPUTS) ---
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
-
model_choice = gr.
|
| 267 |
choices=[
|
| 268 |
"Nanonets-OCR-s",
|
| 269 |
"MonkeyOCR-Recognition",
|
|
@@ -271,20 +271,20 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 271 |
"Typhoon-OCR-7B",
|
| 272 |
"SmolDocling-256M-preview",
|
| 273 |
],
|
| 274 |
-
label="
|
| 275 |
value="Nanonets-OCR-s",
|
| 276 |
)
|
| 277 |
|
| 278 |
with gr.Tabs():
|
| 279 |
with gr.TabItem("🖼️ Image Inference"):
|
| 280 |
image_query = gr.Textbox(label="Query", placeholder="e.g., 'OCR the document'")
|
| 281 |
-
image_upload = gr.Image(type="pil", label="Upload Image")
|
| 282 |
image_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 283 |
gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
|
| 284 |
|
| 285 |
with gr.TabItem("🎬 Video Inference"):
|
| 286 |
video_query = gr.Textbox(label="Query", placeholder="e.g., 'What is happening in this video?'")
|
| 287 |
-
video_upload = gr.Video(label="Upload Video")
|
| 288 |
video_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 289 |
gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
|
| 290 |
|
|
@@ -313,18 +313,19 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
|
| 313 |
raw_output = gr.Textbox(
|
| 314 |
label="Raw Output Stream", interactive=False, lines=8
|
| 315 |
)
|
|
|
|
| 316 |
formatted_output = gr.Markdown(label="Formatted Result (Markdown)")
|
| 317 |
|
| 318 |
with gr.Accordion("💻 Model Information", open=True):
|
| 319 |
gr.Markdown(
|
| 320 |
"""
|
| 321 |
-
- **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**:
|
| 322 |
-
- **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**:
|
| 323 |
-
- **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**:
|
| 324 |
-
- **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual
|
| 325 |
-
- **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**:
|
| 326 |
-
|
| 327 |
-
|
| 328 |
|
| 329 |
> [Report a Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)
|
| 330 |
"""
|
|
|
|
| 257 |
]
|
| 258 |
|
| 259 |
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
|
| 260 |
+
gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
|
| 261 |
gr.Markdown("A unified interface for state-of-the-art multimodal and document AI models. Select a model, upload an image or video, and enter a query to begin.")
|
| 262 |
|
| 263 |
with gr.Row():
|
| 264 |
# --- LEFT COLUMN (INPUTS) ---
|
| 265 |
with gr.Column(scale=1):
|
| 266 |
+
model_choice = gr.Dropdown(
|
| 267 |
choices=[
|
| 268 |
"Nanonets-OCR-s",
|
| 269 |
"MonkeyOCR-Recognition",
|
|
|
|
| 271 |
"Typhoon-OCR-7B",
|
| 272 |
"SmolDocling-256M-preview",
|
| 273 |
],
|
| 274 |
+
label="Select Model⚡",
|
| 275 |
value="Nanonets-OCR-s",
|
| 276 |
)
|
| 277 |
|
| 278 |
with gr.Tabs():
|
| 279 |
with gr.TabItem("🖼️ Image Inference"):
|
| 280 |
image_query = gr.Textbox(label="Query", placeholder="e.g., 'OCR the document'")
|
| 281 |
+
image_upload = gr.Image(type="pil", label="Upload Image", height="299")
|
| 282 |
image_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 283 |
gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
|
| 284 |
|
| 285 |
with gr.TabItem("🎬 Video Inference"):
|
| 286 |
video_query = gr.Textbox(label="Query", placeholder="e.g., 'What is happening in this video?'")
|
| 287 |
+
video_upload = gr.Video(label="Upload Video", height="299")
|
| 288 |
video_submit = gr.Button("Generate", elem_classes="submit-btn")
|
| 289 |
gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
|
| 290 |
|
|
|
|
| 313 |
raw_output = gr.Textbox(
|
| 314 |
label="Raw Output Stream", interactive=False, lines=8
|
| 315 |
)
|
| 316 |
+
with gr.Accordion("(Result.md)", open=False):
|
| 317 |
formatted_output = gr.Markdown(label="Formatted Result (Markdown)")
|
| 318 |
|
| 319 |
with gr.Accordion("💻 Model Information", open=True):
|
| 320 |
gr.Markdown(
|
| 321 |
"""
|
| 322 |
+
- **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction. It transforms documents into structured markdown with intelligent content recognition and semantic tagging.
|
| 323 |
+
- **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.
|
| 324 |
+
- **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.
|
| 325 |
+
- **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.
|
| 326 |
+
- **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Think Beyond Images. Thyme transcends traditional "thinking with images" paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.
|
| 327 |
+
|
| 328 |
+
- **⚠️Note**: Performance on video inference tasks is experimental and may vary between models.
|
| 329 |
|
| 330 |
> [Report a Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)
|
| 331 |
"""
|