Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -12,7 +12,7 @@ from PIL import Image
 from loguru import logger
 from pathlib import Path
 import torch
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoModel
 from transformers.image_utils import load_image
 import fitz
 import html2text
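The import change is the hook for the rest of the diff: DeepSeek-OCR ships its modeling code with the checkpoint, so it is loaded through the generic `AutoModel`/`AutoTokenizer` classes with `trust_remote_code=True` rather than a dedicated architecture class. A minimal sketch of that loading pattern; the FlashAttention fallback is an illustrative assumption (the commit itself hard-codes `_attn_implementation="flash_attention_2"` with no fallback):

```python
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Illustrative fallback: FlashAttention 2 needs the flash-attn package and a
# supported GPU; if loading with it fails, retry with PyTorch's SDPA kernels.
try:
    model = AutoModel.from_pretrained(
        MODEL_NAME,
        attn_implementation="flash_attention_2",
        trust_remote_code=True,
    ).eval()
except (ImportError, ValueError):
    model = AutoModel.from_pretrained(
        MODEL_NAME,
        attn_implementation="sdpa",
        trust_remote_code=True,
    ).eval()
```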
@@ -93,50 +93,54 @@ model_1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 logger.info(f"Model '{MODEL_ID_1}' loaded successfully.")
 
-# Model 2:
-
-
-
-model_2 =
-
-
-
-).
-logger.info(
-
-# Model 3: olmOCR-7B-0825
-MODEL_ID_3 = "allenai/olmOCR-7B-0825"
-logger.info(f"Loading model 3: {MODEL_ID_3}")
-processor_3 = AutoProcessor.from_pretrained(MODEL_ID_3, trust_remote_code=True)
-model_3 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_3,
-    trust_remote_code=True,
-    torch_dtype=torch.float16 if device == "cuda" else torch.float32
-).to(device).eval()
-logger.info(f"Model '{MODEL_ID_3}' loaded successfully.")
+# Model 2: DeepSeek-OCR
+logger.info("Loading model and tokenizer for DeepSeek-OCR...")
+model_name_2 = "deepseek-ai/DeepSeek-OCR"
+tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2, trust_remote_code=True)
+model_2 = AutoModel.from_pretrained(
+    model_name_2,
+    _attn_implementation="flash_attention_2",
+    trust_remote_code=True
+).eval()
+logger.info("✅ DeepSeek-OCR model loaded successfully.")
+
 
 @spaces.GPU
 def parse_page(image: Image.Image, model_name: str) -> str:
     if model_name == "Logics-Parsing":
         current_processor, current_model = processor_1, model_1
-
-    current_processor,
-
-
+        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
+        prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            generated_ids = current_model.generate(**inputs, max_new_tokens=2048, do_sample=False)
+
+        generated_ids = generated_ids[:, inputs['input_ids'].shape[1]:]
+        output_text = current_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return output_text
+
+    elif model_name == "DeepSeek-OCR":
+        # Move model to the correct device for inference
+        model_2.to(device)
+
+        conversation = [
+            {"role": "user", "content": ["", image]},
+        ]
+
+        input_tensor = tokenizer_2.apply_chat_template(conversation, return_tensors="pt")
+
+        with torch.no_grad():
+            output_tensor = model_2.run(input_tensor.to(device))
+
+        # This model returns plain text, so we wrap it in basic HTML for consistency
+        ocr_text = output_tensor[0]
+        html_output = "".join(f"<p>{line}</p>" for line in ocr_text.split('\n'))
+        return html_output
+
     else:
         raise ValueError(f"Unknown model choice: {model_name}")
 
-    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
-    prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
-
-    with torch.no_grad():
-        generated_ids = current_model.generate(**inputs, max_new_tokens=2048, do_sample=False)
-
-    generated_ids = generated_ids[:, inputs['input_ids'].shape[1]:]
-    output_text = current_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return output_text
-
 def convert_file_to_images(file_path: str, dpi: int = 200) -> List[Image.Image]:
     images = []
     file_ext = Path(file_path).suffix.lower()
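The new DeepSeek-OCR branch drives the model through `tokenizer_2.apply_chat_template` and a `model_2.run(...)` call. For comparison, the checkpoint's model card documents an `infer` entry point on the remote-code model; the sketch below follows that card, and everything in it (the `infer` signature, the prompt string, the file paths) is an assumption taken from the card, not code from this commit:

```python
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
).eval().cuda().to(torch.bfloat16)

# `infer` is the entry point shown on the model card (assumed, not verified here):
# it preprocesses the image, runs generation, and writes results under output_path.
result = model.infer(
    tokenizer,
    prompt="<image>\nFree OCR.",  # prompt format from the model card (assumption)
    image_file="page.png",        # hypothetical input image path
    output_path="./ocr_output",   # hypothetical results directory
)
```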
@@ -272,7 +276,7 @@ def main():
 
     gr.HTML("""
     <div class="header-text">
-        <h1>📄 Multimodal: VLM Parsing</h1>
+        <h1>📄 Multimodal: VLM Parsing & OCR</h1>
        <p style="font-size: 1.1em;">An advanced Vision Language Model to parse documents and images into clean Markdown (html)</p>
        <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
            <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
@@ -284,7 +288,7 @@ def main():
 
     with gr.Row(elem_classes=["main-container"]):
         with gr.Column(scale=1):
-            model_choice = gr.Dropdown(choices=["Logics-Parsing", "
+            model_choice = gr.Dropdown(choices=["Logics-Parsing", "DeepSeek-OCR"], label="Select Model", value="Logics-Parsing")
             file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
 
             process_btn = gr.Button("🚀Process Document", variant="primary", size="lg")
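The updated dropdown is what routes each upload to one of the two `parse_page` branches. A minimal sketch of how such a dropdown, file input, and button are typically wired together in Gradio; `handle_document` and the output component are hypothetical stand-ins, since the actual handler in app.py is outside this diff:

```python
import gradio as gr

def handle_document(file_path: str, model_name: str) -> str:
    # Hypothetical stand-in: convert the upload to page images and parse each one.
    pages = convert_file_to_images(file_path)
    return "\n".join(parse_page(page, model_name) for page in pages)

with gr.Blocks() as demo:
    model_choice = gr.Dropdown(
        choices=["Logics-Parsing", "DeepSeek-OCR"],
        label="Select Model",
        value="Logics-Parsing",
    )
    file_input = gr.File(label="Upload PDF or Image", type="filepath")
    process_btn = gr.Button("🚀Process Document", variant="primary")
    output_html = gr.HTML()  # hypothetical output component

    # The click handler passes the selected model name through to parse_page.
    process_btn.click(fn=handle_document, inputs=[file_input, model_choice], outputs=output_html)
```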
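The unchanged `convert_file_to_images` helper, whose body lies outside this diff, is what turns uploads into the PIL images `parse_page` consumes. A sketch of the usual PyMuPDF (`fitz`) pattern for that conversion, written as an assumption about what such a helper does rather than a copy of the Space's code:

```python
from pathlib import Path
from typing import List

import fitz  # PyMuPDF
from PIL import Image

def convert_file_to_images(file_path: str, dpi: int = 200) -> List[Image.Image]:
    """Render a PDF page by page (or open a single image) at the given DPI."""
    images: List[Image.Image] = []
    if Path(file_path).suffix.lower() == ".pdf":
        zoom = dpi / 72  # PDF user space is defined at 72 DPI
        with fitz.open(file_path) as doc:
            for page in doc:
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    else:
        images.append(Image.open(file_path).convert("RGB"))
    return images
```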