Spaces:

prithivMLmods
/

VLM-Parsing

Running on Zero

App Files Files Community

prithivMLmods committed on Dec 24, 2025

Commit

f3894a5

verified ·

1 Parent(s): 406d1f1

update app

Browse files

Files changed (1) hide show

app.py +66 -73

app.py CHANGED Viewed

@@ -1,11 +1,8 @@
 import os
 import sys
 from typing import Iterable, Optional, Tuple, Dict, Any, List
-import hashlib
-import spaces
-import re
 import time
-import click
 import gradio as gr
 from io import BytesIO
 from PIL import Image
@@ -13,8 +10,7 @@ from loguru import logger
 from pathlib import Path
 import torch
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-from transformers.image_utils import load_image
-import fitz
 import html2text
 import markdown
 import tempfile
@@ -129,7 +125,9 @@ def parse_page(image: Image.Image, model_name: str) -> str:
     else:
         raise ValueError(f"Unknown model choice: {model_name}")
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
     prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
@@ -263,76 +261,71 @@ def get_page_outputs(state: Dict[str, Any]) -> Tuple[str, str, str]:
 def clear_all():
     return None, None, "<h3>Results will be displayed here after processing.</h3>", "", "", None, "", '<div class="page-info">No file loaded</div>', get_initial_state()
-@click.command()
-def main():
-    css = """
-    .main-container { max-width: 1400px; margin: 0 auto; }
-    .header-text { text-align: center; margin-bottom: 20px; }
-    .page-info { text-align: center; padding: 8px 16px; font-weight: bold; margin: 10px 0; }
-    """
-    with gr.Blocks(theme=steel_blue_theme, css=css, title="Logics-Parsing Demo") as demo:
-        app_state = gr.State(value=get_initial_state())
-        gr.HTML("""
-        <div class="header-text">
-            <h1>📄 Multimodal: VLM Parsing</h1>
-            <p style="font-size: 1.1em;">An advanced Vision Language Model to parse documents and images into clean Markdown (html)</p>
-            <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
-                <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
-                <a href="https://github.com/PRITHIVSAKTHIUR/VLM-Parsing" target="_blank" style="text-decoration: none; font-weight: 500;">💻 GitHub</a>
-                <a href="https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending" target="_blank" style="text-decoration: none; font-weight: 500;">📝 Multimodal VLMs</a>
-            </div>
         </div>
-        """)
-        with gr.Row(elem_classes=["main-container"]):
-            with gr.Column(scale=1):
-                model_choice = gr.Dropdown(choices=["Logics-Parsing", "Gliese-OCR-7B-Post1.0", "olmOCR-7B-0825"], label="Select Model", value="Logics-Parsing")
-                file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
-                image_preview = gr.Image(label="Preview", type="pil", interactive=False, height=320)
-                with gr.Row():
-                    prev_page_btn = gr.Button("◀ Previous")
-                    page_info = gr.HTML('<div class="page-info">No file loaded</div>')
-                    next_page_btn = gr.Button("Next ▶")
-                with gr.Accordion("Download & Details", open=False):
-                    output_file = gr.File(label='Download Markdown Result', interactive=False)
-                    cost_time = gr.Textbox(label='Time Cost', interactive=False)
-                example_root = "examples"
-                if os.path.exists(example_root) and os.path.isdir(example_root):
-                    example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
-                    if example_files:
-                        gr.Examples(examples=example_files, inputs=file_input, label="Examples")
-                process_btn = gr.Button("🚀 Process Document", variant="primary", size="lg")
-                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-            with gr.Column(scale=2):
-                with gr.Tabs():
-                    with gr.Tab("Markdown Source"):
-                        md_source_output = gr.Code(language="markdown", label="Markdown Source")
-                    with gr.Tab("Rendered Markdown"):
-                        md_render_output = gr.Markdown(label='Markdown Rendering')
-                    with gr.Tab("Generated HTML"):
-                        raw_html_output = gr.Code(language="html", label="Generated HTML")
-        file_input.change(fn=load_and_preview_file, inputs=file_input, outputs=[image_preview, page_info, app_state], show_progress="full")
-        process_btn.click(fn=process_all_pages, inputs=[app_state, model_choice], outputs=[md_render_output, md_source_output, raw_html_output, output_file, cost_time, app_state], show_progress="full")
-        prev_page_btn.click(fn=lambda s: navigate_page("prev", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
-        next_page_btn.click(fn=lambda s: navigate_page("next", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
-        clear_btn.click(fn=clear_all, outputs=[file_input, image_preview, md_render_output, md_source_output, raw_html_output, output_file, cost_time, page_info, app_state])
-    demo.queue().launch(debug=True, show_error=True)
-if __name__ == '__main__':
-    if not os.path.exists("examples"):
-        os.makedirs("examples")
-        logger.info("Created 'examples' directory. Please add some sample PDF/image files there.")
-    main()

 import os
 import sys
 from typing import Iterable, Optional, Tuple, Dict, Any, List
 import time
+import spaces
 import gradio as gr
 from io import BytesIO
 from PIL import Image
 from pathlib import Path
 import torch
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+import fitz  # PyMuPDF
 import html2text
 import markdown
 import tempfile
     else:
         raise ValueError(f"Unknown model choice: {model_name}")
+    # Standard Qwen2-VL format
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
     prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
 def clear_all():
     return None, None, "<h3>Results will be displayed here after processing.</h3>", "", "", None, "", '<div class="page-info">No file loaded</div>', get_initial_state()
+css = """
+.main-container { max-width: 1400px; margin: 0 auto; }
+.header-text { text-align: center; margin-bottom: 20px; }
+.page-info { text-align: center; padding: 8px 16px; font-weight: bold; margin: 10px 0; }
+"""
+with gr.Blocks() as demo:
+    app_state = gr.State(value=get_initial_state())
+    gr.HTML("""
+    <div class="header-text">
+        <h1>📄 Multimodal: VLM Parsing</h1>
+        <p style="font-size: 1.1em;">An advanced Vision Language Model to parse documents and images into clean Markdown (html)</p>
+        <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
+            <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
+            <a href="https://github.com/PRITHIVSAKTHIUR/VLM-Parsing" target="_blank" style="text-decoration: none; font-weight: 500;">💻 GitHub</a>
+            <a href="https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending" target="_blank" style="text-decoration: none; font-weight: 500;">📝 Multimodal VLMs</a>
         </div>
+    </div>
+    """)
+    with gr.Row(elem_classes=["main-container"]):
+        with gr.Column(scale=1):
+            model_choice = gr.Dropdown(choices=["Logics-Parsing", "Gliese-OCR-7B-Post1.0", "olmOCR-7B-0825"], label="Select Model", value="Logics-Parsing")
+            file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
+            image_preview = gr.Image(label="Preview", type="pil", interactive=False, height=320)
+            with gr.Row():
+                prev_page_btn = gr.Button("◀ Previous")
+                page_info = gr.HTML('<div class="page-info">No file loaded</div>')
+                next_page_btn = gr.Button("Next ▶")
+            with gr.Accordion("Download & Details", open=False):
+                output_file = gr.File(label='Download Markdown Result', interactive=False)
+                cost_time = gr.Textbox(label='Time Cost', interactive=False)
+            example_root = "examples"
+            if os.path.exists(example_root) and os.path.isdir(example_root):
+                example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
+                if example_files:
+                    gr.Examples(examples=example_files, inputs=file_input, label="Examples")
+            process_btn = gr.Button("🚀 Process Document", variant="primary", size="lg")
+            clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
+        with gr.Column(scale=2):
+            with gr.Tabs():
+                with gr.Tab("Markdown Source"):
+                    md_source_output = gr.Code(language="markdown", label="Markdown Source")
+                with gr.Tab("Rendered Markdown"):
+                    md_render_output = gr.Markdown(label='Markdown Rendering')
+                with gr.Tab("Generated HTML"):
+                    raw_html_output = gr.Code(language="html", label="Generated HTML")
+    file_input.change(fn=load_and_preview_file, inputs=file_input, outputs=[image_preview, page_info, app_state], show_progress="full")
+    process_btn.click(fn=process_all_pages, inputs=[app_state, model_choice], outputs=[md_render_output, md_source_output, raw_html_output, output_file, cost_time, app_state], show_progress="full")
+    prev_page_btn.click(fn=lambda s: navigate_page("prev", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
+    next_page_btn.click(fn=lambda s: navigate_page("next", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
+    clear_btn.click(fn=clear_all, outputs=[file_input, image_preview, md_render_output, md_source_output, raw_html_output, output_file, cost_time, page_info, app_state])
+if __name__ == '__main__':
+    demo.queue()
+    demo.launch(theme=steel_blue_theme, css=css, mcp_server=True, ssr_mode=False, show_error=True)