Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import hashlib | |
| import spaces | |
| import re | |
| import time | |
| import click | |
| import gradio as gr | |
| from io import BytesIO | |
| from PIL import Image | |
| from loguru import logger | |
| from pathlib import Path | |
| import torch | |
| from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration | |
| from transformers.image_utils import load_image | |
| import fitz | |
| import html2text | |
| import markdown | |
| import tempfile | |
| from typing import Optional, Tuple | |
# --- Constants and Setup ---
# Lower-case file extensions treated as PDFs and as raster images.
pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg"]
# Use CUDA when torch reports it as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Model and Processor Initialization ---
logger.info(f"Using device: {device}")


def _load_processor_and_model(model_id: str, ordinal: int):
    """Load the processor and the Qwen2.5-VL model stored under ``model_id``.

    Logs before and after loading, moves the model to ``device`` and puts it
    in eval mode.

    Args:
        model_id: Repository id passed to ``from_pretrained``.
        ordinal: Number shown in the "Loading model N" log line.

    Returns:
        A ``(processor, model)`` pair.
    """
    logger.info(f"Loading model {ordinal}: {model_id}")
    # NOTE(review): trust_remote_code=True is passed for both loads; confirm
    # the referenced repositories are trusted.
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    # Half precision on CUDA, full precision on CPU.
    dtype = torch.float16 if device == "cuda" else torch.float32
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=dtype,
    )
    model = model.to(device).eval()
    logger.info(f"Model '{model_id}' loaded successfully.")
    return processor, model


# Model 1: Logics-Parsing
MODEL_ID_1 = "Logics-MLLM/Logics-Parsing"
processor_1, model_1 = _load_processor_and_model(MODEL_ID_1, 1)

# Model 2: Gliese-OCR-7B-Post1.0
MODEL_ID_2 = "prithivMLmods/Gliese-OCR-7B-Post1.0"
processor_2, model_2 = _load_processor_and_model(MODEL_ID_2, 2)
def parse_page(image: Image.Image, model_name: str) -> str:
    """
    Parses a single document page image with the selected model and returns
    the decoded model output.

    Args:
        image: The page to parse.
        model_name: Either "Logics-Parsing" or "Gliese-OCR-7B-Post1.0".

    Returns:
        The generated text for the page, with the prompt tokens removed.

    Raises:
        ValueError: If ``model_name`` is not one of the two known names.
    """
    # NOTE(review): `spaces` is imported by this file but no @spaces.GPU
    # decorator appears in the visible source - confirm whether one is
    # expected on this function.
    if model_name not in ("Logics-Parsing", "Gliese-OCR-7B-Post1.0"):
        raise ValueError(f"Unknown model choice: {model_name}")

    # Pick the processor/model pair that belongs to the requested name.
    use_first = model_name == "Logics-Parsing"
    processor = processor_1 if use_first else processor_2
    model = model_1 if use_first else model_2

    instruction = "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": instruction},
        ],
    }

    # Render the chat template to text, then tokenize it together with the image.
    prompt_text = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[prompt_text], images=[image], return_tensors="pt", padding=True
    )
    inputs = inputs.to(device)

    generation_kwargs = dict(
        max_new_tokens=2048,
        temperature=0.1,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.05,
    )
    with torch.no_grad():
        generated_ids = model.generate(**inputs, **generation_kwargs)

    # generate() returns prompt + completion; slice the prompt off each row.
    completions = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        completions.append(full_ids[len(prompt_ids):])

    decoded = processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]
def convert_pdf_to_images_fitz(pdf_path: str, dpi: int = 200) -> list:
    """
    Converts a PDF file to a list of PIL Images using PyMuPDF (fitz).

    Each page is rendered to a pixmap, encoded as PNG in memory, and opened
    as a PIL image.

    Args:
        pdf_path: Path of the document handed to ``fitz.open``.
        dpi: Target resolution; the render matrix scales by ``dpi / 72``.

    Returns:
        One PIL image per page, in page order.

    Raises:
        Exception: Any error raised while opening or rendering is logged and
            re-raised unchanged.
    """
    try:
        zoom = dpi / 72.0
        matrix = fitz.Matrix(zoom, zoom)
        images = []
        # The context manager closes the document even when rendering a page
        # raises, so the handle is not leaked on the error path.
        with fitz.open(pdf_path) as pdf_document:
            for page in pdf_document:
                png_bytes = page.get_pixmap(matrix=matrix).tobytes("png")
                images.append(Image.open(BytesIO(png_bytes)))
    except Exception as e:
        logger.error(f"Failed to convert PDF using PyMuPDF: {e}")
        raise
    return images
async def pdf_parse(file_path: str, model_choice: str):
    """
    Main parsing function that orchestrates the PDF processing pipeline.

    Renders every page of ``file_path`` to an image, parses each page with
    the model named by ``model_choice``, joins the per-page HTML, converts it
    to Markdown, and writes that Markdown to a temporary ``.md`` file.

    Args:
        file_path: Path of the uploaded document. A falsy value returns a
            "please upload" result without doing any work.
        model_choice: Model name forwarded unchanged to ``parse_page``.

    Returns:
        A 7-tuple: (HTML rendered from the Markdown, Markdown text,
        concatenated per-page model HTML, path of the written ``.md`` file,
        timing message, first page image, page-info HTML). When there is no
        file or processing fails, the tuple carries an error message, empty
        strings and ``None`` in the corresponding positions instead.
    """
    if not file_path:
        logger.warning("File path is None.")
        return "<h3>Please upload a file first.</h3>", "", "", None, "Error: No file provided", None, "No file loaded"

    logger.info(f'Processing file: {file_path} with model: {model_choice}')
    start_time = time.time()
    # NOTE(review): this coroutine awaits nothing; the rendering and model
    # calls below run synchronously inside it - confirm that is acceptable
    # for the server's event loop.
    try:
        pages = convert_pdf_to_images_fitz(file_path, dpi=200)
        if not pages:
            raise ValueError("Could not extract any pages from the PDF.")

        html_parts = []
        for page_number, page in enumerate(pages, start=1):
            logger.info(f"Parsing page {page_number}/{len(pages)}")
            # Pass the model choice to the parsing function
            html = parse_page(page, model_choice)
            html_parts.append(f'<!-- Page {page_number} -->\n{html}')
        full_html = '\n'.join(html_parts)

        # Measured here, so it covers page rendering and model parsing but
        # not the Markdown conversion below.
        parsing_time = time.time() - start_time

        mmd = html2text.html2text(full_html)
        mmd_html = markdown.markdown(mmd, extensions=['fenced_code', 'tables'])

        # delete=False keeps the file on disk after the handle closes so its
        # path can be returned.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False, encoding='utf-8') as f:
            f.write(mmd)
            md_path = f.name

        cost_time_str = f'Total processing time: {parsing_time:.2f}s'
        preview_image = pages[0]
        page_info_html = f'<div class="page-info">Page 1 / {len(pages)}</div>'
        return mmd_html, mmd, full_html, md_path, cost_time_str, preview_image, page_info_html
    except Exception as e:
        # loguru has no `exc_info` keyword: passing it records no traceback
        # and makes loguru str.format() the message, which fails when the
        # error text contains braces. logger.exception attaches the traceback
        # and the "{}" placeholder keeps the error text out of the template.
        logger.exception("Parsing failed: {}", e)
        error_html = f"<h3>An error occurred during processing:</h3><p>{str(e)}</p>"
        return error_html, "", "", None, f"Error: {str(e)}", None, "Error processing"
def show_pdf_preview_as_image(file_path: Optional[str]) -> Tuple[Optional[Image.Image], str]:
    """
    Generates a PIL Image preview of the first page of a PDF or image file
    and provides page count information.

    Args:
        file_path: Path of the uploaded file, or a falsy value when nothing
            is loaded.

    Returns:
        ``(image, page_info_html)``. ``image`` is ``None`` when no file is
        given, when the suffix is neither an image suffix nor ``.pdf``, when
        the PDF has no pages, or when loading raises (the error is logged).
    """
    if not file_path:
        return None, '<div class="page-info">No file loaded</div>'

    try:
        suffix = Path(file_path).suffix.lower()
        if suffix in image_suffixes:
            # convert() returns an independent copy, so the source file can
            # be closed as soon as the block exits.
            with Image.open(file_path) as source_image:
                return source_image.convert("RGB"), '<div class="page-info">Page 1 / 1</div>'
        if suffix == '.pdf':
            # The context manager closes the document on every path,
            # including when rendering the first page raises.
            with fitz.open(file_path) as doc:
                page_count = len(doc)
                if page_count > 0:
                    zoom = 200 / 72.0
                    mat = fitz.Matrix(zoom, zoom)
                    pix = doc.load_page(0).get_pixmap(matrix=mat)
                    img = Image.open(BytesIO(pix.tobytes("png")))
                    return img, f'<div class="page-info">Page 1 / {page_count}</div>'
    except Exception as e:
        logger.error(f"Failed to create file preview: {e}")
    return None, '<div class="page-info">Failed to load preview</div>'
def clear_all():
    """Return the reset values for the components wired to the clear button.

    The eight values line up with the ``outputs`` list of ``clear_btn.click``
    in ``main``: file input, image preview, rendered-markdown box, markdown
    source box, generated-HTML box, download file, time cost, page info.
    """
    results_placeholder = "<h3>Results will be displayed here after processing.</h3>"
    no_file_banner = '<div class="page-info">No file loaded</div>'
    return None, None, results_placeholder, "", "", None, "", no_file_banner
def main():
    """
    Builds the Gradio Blocks interface for the Logics-Parsing app, wires the
    upload / process / clear events, and launches it with queuing enabled.
    """
    page_css = """
    .main-container { max-width: 1400px; margin: 0 auto; }
    .header-text { text-align: center; color: #2c3e50; margin-bottom: 20px; }
    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
    .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
    .page-info { text-align: center; padding: 8px 16px; border-radius: 20px; font-weight: bold; margin: 10px 0; }
    """
    header_markup = """
    <div class="header-text">
    <h1>📄 Logics-Parsing: Structured Document Analysis</h1>
    <p style="font-size: 1.1em; color: #6b7280;">
    An advanced Vision Language Model to parse documents and images into clean HTML and Markdown.
    </p>
    <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
    <a href="https://huggingface.co/Logics-MLLM/Logics-Parsing" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
    🤗 Model Page
    </a>
    <a href="https://github.com/alibaba/Logics-Parsing" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
    💻 GitHub
    </a>
    <a href="https://arxiv.org/abs/2509.19760" target="_blank" style="text-decoration: none; color: #2563eb; font-weight: 500;">
    📝 Arxiv Paper
    </a>
    </div>
    </div>
    """

    with gr.Blocks(theme="bethecloud/storj_theme", css=page_css, title="Logics-Parsing Demo") as demo:
        # Header
        gr.HTML(header_markup)

        with gr.Row(elem_classes=["main-container"]):
            # Left column for inputs and controls
            with gr.Column(scale=1):
                model_choice = gr.Dropdown(
                    label="Select Model⚡️",
                    choices=["Logics-Parsing", "Gliese-OCR-7B-Post1.0"],
                    value="Logics-Parsing",
                )
                file_input = gr.File(
                    label="Upload PDF or Image",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"],
                    type="filepath",
                )
                image_preview = gr.Image(
                    label="Preview", type="pil", interactive=False, height=280
                )

                # NOTE(review): no click handler is registered for the two
                # paging buttons anywhere in this function - confirm whether
                # page navigation is still to be implemented.
                with gr.Row():
                    prev_page_btn = gr.Button("◀ Previous", size="md")
                    page_info = gr.HTML('<div class="page-info">No file loaded</div>')
                    next_page_btn = gr.Button("Next ▶", size="md")

                # Offer bundled sample files when the examples folder has any
                # file with a supported suffix.
                examples_dir = "examples"
                if os.path.isdir(examples_dir):
                    supported_suffixes = tuple(pdf_suffixes + image_suffixes)
                    example_files = []
                    for entry in os.listdir(examples_dir):
                        if entry.endswith(supported_suffixes):
                            example_files.append(os.path.join(examples_dir, entry))
                    if example_files:
                        with gr.Accordion("Open Examples⚙️", open=False):
                            gr.Examples(
                                examples=example_files,
                                inputs=file_input,
                                examples_per_page=10,
                            )

                with gr.Accordion("Other Details🕧", open=False):
                    output_file = gr.File(label='Download Markdown Result', interactive=False)
                    cost_time = gr.Text(label='Time Cost', interactive=False)

                process_btn = gr.Button(
                    "🚀 Process Document",
                    variant="primary",
                    elem_classes=["process-button"],
                    size="lg",
                )
                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

            # Right column for results
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.Tab("Markdown Source"):
                        markdown_source = gr.TextArea(
                            lines=27,
                            show_copy_button=True,
                            label="Markdown Source",
                            interactive=True,
                        )
                    with gr.Tab("Markdown Rendering"):
                        markdown_rendered = gr.TextArea(
                            lines=27,
                            label='Markdown Rendering',
                            show_copy_button=True,
                        )
                    with gr.Tab("Generated HTML"):
                        generated_html = gr.TextArea(
                            lines=27,
                            show_copy_button=True,
                            label="Generated HTML",
                        )

        # --- Event Handlers ---
        # Output order must match the tuples returned by pdf_parse / clear_all.
        parse_outputs = [
            markdown_rendered, markdown_source, generated_html, output_file,
            cost_time, image_preview, page_info,
        ]
        clear_outputs = [
            file_input, image_preview, markdown_rendered, markdown_source,
            generated_html, output_file, cost_time, page_info,
        ]

        file_input.change(
            fn=show_pdf_preview_as_image,
            inputs=[file_input],
            outputs=[image_preview, page_info],
            show_progress="full",
        )
        process_btn.click(
            fn=pdf_parse,
            inputs=[file_input, model_choice],
            outputs=parse_outputs,
            concurrency_limit=15,
            show_progress="full",
        )
        clear_btn.click(fn=clear_all, outputs=clear_outputs)

    queued_app = demo.queue()
    queued_app.launch(debug=True, show_error=True)
if __name__ == '__main__':
    # Create the folder that main() scans for example files when it is
    # missing, and tell the operator it is empty.
    examples_dir = "examples"
    if not os.path.exists(examples_dir):
        os.makedirs(examples_dir)
        logger.info("Created 'examples' directory. Please add some sample PDF/image files there.")
    main()