prithivMLmods committed on
Commit cc7a352 · verified · 1 Parent(s): 2997107

Update app.py

Files changed (1): app.py (+13 -12)
app.py CHANGED
@@ -257,13 +257,13 @@ video_examples = [
  ]
 
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
- gr.Markdown("# **[Multimodal OCR²](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+ gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
  gr.Markdown("A unified interface for state-of-the-art multimodal and document AI models. Select a model, upload an image or video, and enter a query to begin.")
 
  with gr.Row():
  # --- LEFT COLUMN (INPUTS) ---
  with gr.Column(scale=1):
- model_choice = gr.Radio(
+ model_choice = gr.Dropdown(
  choices=[
  "Nanonets-OCR-s",
  "MonkeyOCR-Recognition",
@@ -271,20 +271,20 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
  "Typhoon-OCR-7B",
  "SmolDocling-256M-preview",
  ],
- label="🤖 Select Model",
+ label="Select Model",
  value="Nanonets-OCR-s",
  )
 
  with gr.Tabs():
  with gr.TabItem("🖼️ Image Inference"):
  image_query = gr.Textbox(label="Query", placeholder="e.g., 'OCR the document'")
- image_upload = gr.Image(type="pil", label="Upload Image")
+ image_upload = gr.Image(type="pil", label="Upload Image", height="299")
  image_submit = gr.Button("Generate", elem_classes="submit-btn")
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
 
  with gr.TabItem("🎬 Video Inference"):
  video_query = gr.Textbox(label="Query", placeholder="e.g., 'What is happening in this video?'")
- video_upload = gr.Video(label="Upload Video")
+ video_upload = gr.Video(label="Upload Video", height="299")
  video_submit = gr.Button("Generate", elem_classes="submit-btn")
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
 
@@ -313,18 +313,19 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
  raw_output = gr.Textbox(
  label="Raw Output Stream", interactive=False, lines=8
  )
+ with gr.Accordion("(Result.md)", open=False):
  formatted_output = gr.Markdown(label="Formatted Result (Markdown)")
 
  with gr.Accordion("💻 Model Information", open=True):
  gr.Markdown(
  """
- - **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: Transforms documents into structured markdown with intelligent content recognition.
- - **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: An efficient multimodal model for converting documents to structured formats.
- - **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: Adopts a Structure-Recognition-Relation paradigm for efficient document processing.
- - **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual (Thai/English) document parsing model for real-world documents.
- - **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Generates and executes code for image processing and complex reasoning tasks.
- ---
- > ⚠️ **Note**: Performance on video inference tasks is experimental and may vary between models.
+ - **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.
+ - **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.
+ - **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.
+ - **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.
+ - **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.
+
+ - **⚠️Note**: Performance on video inference tasks is experimental and may vary between models.
 
  > [Report a Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)
  """