prithivMLmods committed on
Commit
f3894a5
·
verified ·
1 Parent(s): 406d1f1

update app

Browse files
Files changed (1) hide show
  1. app.py +66 -73
app.py CHANGED
@@ -1,11 +1,8 @@
1
  import os
2
  import sys
3
  from typing import Iterable, Optional, Tuple, Dict, Any, List
4
- import hashlib
5
- import spaces
6
- import re
7
  import time
8
- import click
9
  import gradio as gr
10
  from io import BytesIO
11
  from PIL import Image
@@ -13,8 +10,7 @@ from loguru import logger
13
  from pathlib import Path
14
  import torch
15
  from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
16
- from transformers.image_utils import load_image
17
- import fitz
18
  import html2text
19
  import markdown
20
  import tempfile
@@ -129,7 +125,9 @@ def parse_page(image: Image.Image, model_name: str) -> str:
129
  else:
130
  raise ValueError(f"Unknown model choice: {model_name}")
131
 
 
132
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
 
133
  prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
134
  inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
135
 
@@ -263,76 +261,71 @@ def get_page_outputs(state: Dict[str, Any]) -> Tuple[str, str, str]:
263
  def clear_all():
264
  return None, None, "<h3>Results will be displayed here after processing.</h3>", "", "", None, "", '<div class="page-info">No file loaded</div>', get_initial_state()
265
 
266
- @click.command()
267
- def main():
268
- css = """
269
- .main-container { max-width: 1400px; margin: 0 auto; }
270
- .header-text { text-align: center; margin-bottom: 20px; }
271
- .page-info { text-align: center; padding: 8px 16px; font-weight: bold; margin: 10px 0; }
272
- """
273
- with gr.Blocks(theme=steel_blue_theme, css=css, title="Logics-Parsing Demo") as demo:
274
- app_state = gr.State(value=get_initial_state())
275
-
276
- gr.HTML("""
277
- <div class="header-text">
278
- <h1>📄 Multimodal: VLM Parsing</h1>
279
- <p style="font-size: 1.1em;">An advanced Vision Language Model to parse documents and images into clean Markdown (html)</p>
280
- <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
281
- <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
282
- <a href="https://github.com/PRITHIVSAKTHIUR/VLM-Parsing" target="_blank" style="text-decoration: none; font-weight: 500;">💻 GitHub</a>
283
- <a href="https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending" target="_blank" style="text-decoration: none; font-weight: 500;">πŸ“ Multimodal VLMs</a>
284
- </div>
285
  </div>
286
- """)
287
-
288
- with gr.Row(elem_classes=["main-container"]):
289
- with gr.Column(scale=1):
290
- model_choice = gr.Dropdown(choices=["Logics-Parsing", "Gliese-OCR-7B-Post1.0", "olmOCR-7B-0825"], label="Select Model", value="Logics-Parsing")
291
- file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
292
-
293
- image_preview = gr.Image(label="Preview", type="pil", interactive=False, height=320)
294
-
295
- with gr.Row():
296
- prev_page_btn = gr.Button("◀ Previous")
297
- page_info = gr.HTML('<div class="page-info">No file loaded</div>')
298
- next_page_btn = gr.Button("Next ▶")
299
-
300
- with gr.Accordion("Download & Details", open=False):
301
- output_file = gr.File(label='Download Markdown Result', interactive=False)
302
- cost_time = gr.Textbox(label='Time Cost', interactive=False)
303
-
304
- example_root = "examples"
305
- if os.path.exists(example_root) and os.path.isdir(example_root):
306
- example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
307
- if example_files:
308
- gr.Examples(examples=example_files, inputs=file_input, label="Examples")
309
-
310
- process_btn = gr.Button("🚀 Process Document", variant="primary", size="lg")
311
- clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
312
 
313
- with gr.Column(scale=2):
314
- with gr.Tabs():
315
- with gr.Tab("Markdown Source"):
316
- md_source_output = gr.Code(language="markdown", label="Markdown Source")
317
- with gr.Tab("Rendered Markdown"):
318
- md_render_output = gr.Markdown(label='Markdown Rendering')
319
- with gr.Tab("Generated HTML"):
320
- raw_html_output = gr.Code(language="html", label="Generated HTML")
321
-
322
- file_input.change(fn=load_and_preview_file, inputs=file_input, outputs=[image_preview, page_info, app_state], show_progress="full")
 
 
 
 
 
 
 
323
 
324
- process_btn.click(fn=process_all_pages, inputs=[app_state, model_choice], outputs=[md_render_output, md_source_output, raw_html_output, output_file, cost_time, app_state], show_progress="full")
 
 
 
 
 
 
 
 
 
 
 
325
 
326
- prev_page_btn.click(fn=lambda s: navigate_page("prev", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
327
-
328
- next_page_btn.click(fn=lambda s: navigate_page("next", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
329
 
330
- clear_btn.click(fn=clear_all, outputs=[file_input, image_preview, md_render_output, md_source_output, raw_html_output, output_file, cost_time, page_info, app_state])
331
-
332
- demo.queue().launch(debug=True, show_error=True)
333
 
334
- if __name__ == '__main__':
335
- if not os.path.exists("examples"):
336
- os.makedirs("examples")
337
- logger.info("Created 'examples' directory. Please add some sample PDF/image files there.")
338
- main()
 
1
  import os
2
  import sys
3
  from typing import Iterable, Optional, Tuple, Dict, Any, List
 
 
 
4
  import time
5
+ import spaces
6
  import gradio as gr
7
  from io import BytesIO
8
  from PIL import Image
 
10
  from pathlib import Path
11
  import torch
12
  from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
13
+ import fitz # PyMuPDF
 
14
  import html2text
15
  import markdown
16
  import tempfile
 
125
  else:
126
  raise ValueError(f"Unknown model choice: {model_name}")
127
 
128
+ # Standard Qwen2-VL format
129
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
130
+
131
  prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
132
  inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
133
 
 
261
  def clear_all():
262
  return None, None, "<h3>Results will be displayed here after processing.</h3>", "", "", None, "", '<div class="page-info">No file loaded</div>', get_initial_state()
263
 
264
+ css = """
265
+ .main-container { max-width: 1400px; margin: 0 auto; }
266
+ .header-text { text-align: center; margin-bottom: 20px; }
267
+ .page-info { text-align: center; padding: 8px 16px; font-weight: bold; margin: 10px 0; }
268
+ """
269
+
270
+ with gr.Blocks() as demo:
271
+ app_state = gr.State(value=get_initial_state())
272
+
273
+ gr.HTML("""
274
+ <div class="header-text">
275
+ <h1>📄 Multimodal: VLM Parsing</h1>
276
+ <p style="font-size: 1.1em;">An advanced Vision Language Model to parse documents and images into clean Markdown (html)</p>
277
+ <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
278
+ <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
279
+ <a href="https://github.com/PRITHIVSAKTHIUR/VLM-Parsing" target="_blank" style="text-decoration: none; font-weight: 500;">💻 GitHub</a>
280
+ <a href="https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending" target="_blank" style="text-decoration: none; font-weight: 500;">πŸ“ Multimodal VLMs</a>
 
 
281
  </div>
282
+ </div>
283
+ """)
284
+
285
+ with gr.Row(elem_classes=["main-container"]):
286
+ with gr.Column(scale=1):
287
+ model_choice = gr.Dropdown(choices=["Logics-Parsing", "Gliese-OCR-7B-Post1.0", "olmOCR-7B-0825"], label="Select Model", value="Logics-Parsing")
288
+ file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
289
+
290
+ image_preview = gr.Image(label="Preview", type="pil", interactive=False, height=320)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
 
292
+ with gr.Row():
293
+ prev_page_btn = gr.Button("◀ Previous")
294
+ page_info = gr.HTML('<div class="page-info">No file loaded</div>')
295
+ next_page_btn = gr.Button("Next ▶")
296
+
297
+ with gr.Accordion("Download & Details", open=False):
298
+ output_file = gr.File(label='Download Markdown Result', interactive=False)
299
+ cost_time = gr.Textbox(label='Time Cost', interactive=False)
300
+
301
+ example_root = "examples"
302
+ if os.path.exists(example_root) and os.path.isdir(example_root):
303
+ example_files = [os.path.join(example_root, f) for f in os.listdir(example_root) if f.endswith(tuple(pdf_suffixes + image_suffixes))]
304
+ if example_files:
305
+ gr.Examples(examples=example_files, inputs=file_input, label="Examples")
306
+
307
+ process_btn = gr.Button("🚀 Process Document", variant="primary", size="lg")
308
+ clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
309
 
310
+ with gr.Column(scale=2):
311
+ with gr.Tabs():
312
+ with gr.Tab("Markdown Source"):
313
+ md_source_output = gr.Code(language="markdown", label="Markdown Source")
314
+ with gr.Tab("Rendered Markdown"):
315
+ md_render_output = gr.Markdown(label='Markdown Rendering')
316
+ with gr.Tab("Generated HTML"):
317
+ raw_html_output = gr.Code(language="html", label="Generated HTML")
318
+
319
+ file_input.change(fn=load_and_preview_file, inputs=file_input, outputs=[image_preview, page_info, app_state], show_progress="full")
320
+
321
+ process_btn.click(fn=process_all_pages, inputs=[app_state, model_choice], outputs=[md_render_output, md_source_output, raw_html_output, output_file, cost_time, app_state], show_progress="full")
322
 
323
+ prev_page_btn.click(fn=lambda s: navigate_page("prev", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
324
+
325
+ next_page_btn.click(fn=lambda s: navigate_page("next", s), inputs=app_state, outputs=[image_preview, page_info, md_render_output, md_source_output, raw_html_output, app_state])
326
 
327
+ clear_btn.click(fn=clear_all, outputs=[file_input, image_preview, md_render_output, md_source_output, raw_html_output, output_file, cost_time, page_info, app_state])
 
 
328
 
329
+ if __name__ == '__main__':
330
+ demo.queue()
331
+ demo.launch(theme=steel_blue_theme, css=css, mcp_server=True, ssr_mode=False, show_error=True)