prithivMLmods committed on
Commit
a6bb602
·
verified ·
1 Parent(s): 5f810c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -39
app.py CHANGED
@@ -12,7 +12,7 @@ from PIL import Image
12
  from loguru import logger
13
  from pathlib import Path
14
  import torch
15
- from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
16
  from transformers.image_utils import load_image
17
  import fitz
18
  import html2text
@@ -93,50 +93,54 @@ model_1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
93
  ).to(device).eval()
94
  logger.info(f"Model '{MODEL_ID_1}' loaded successfully.")
95
 
96
- # Model 2: Gliese-OCR-7B-Post1.0
97
- MODEL_ID_2 = "prithivMLmods/Gliese-OCR-7B-Post1.0"
98
- logger.info(f"Loading model 2: {MODEL_ID_2}")
99
- processor_2 = AutoProcessor.from_pretrained(MODEL_ID_2, trust_remote_code=True)
100
- model_2 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
101
- MODEL_ID_2,
102
- trust_remote_code=True,
103
- torch_dtype=torch.float16 if device == "cuda" else torch.float32
104
- ).to(device).eval()
105
- logger.info(f"Model '{MODEL_ID_2}' loaded successfully.")
106
-
107
- # Model 3: olmOCR-7B-0825
108
- MODEL_ID_3 = "allenai/olmOCR-7B-0825"
109
- logger.info(f"Loading model 3: {MODEL_ID_3}")
110
- processor_3 = AutoProcessor.from_pretrained(MODEL_ID_3, trust_remote_code=True)
111
- model_3 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
112
- MODEL_ID_3,
113
- trust_remote_code=True,
114
- torch_dtype=torch.float16 if device == "cuda" else torch.float32
115
- ).to(device).eval()
116
- logger.info(f"Model '{MODEL_ID_3}' loaded successfully.")
117
 
118
  @spaces.GPU
119
  def parse_page(image: Image.Image, model_name: str) -> str:
120
  if model_name == "Logics-Parsing":
121
  current_processor, current_model = processor_1, model_1
122
- elif model_name == "Gliese-OCR-7B-Post1.0":
123
- current_processor, current_model = processor_2, model_2
124
- elif model_name == "olmOCR-7B-0825":
125
- current_processor, current_model = processor_3, model_3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  else:
127
  raise ValueError(f"Unknown model choice: {model_name}")
128
 
129
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
130
- prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
131
- inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
132
-
133
- with torch.no_grad():
134
- generated_ids = current_model.generate(**inputs, max_new_tokens=2048, do_sample=False)
135
-
136
- generated_ids = generated_ids[:, inputs['input_ids'].shape[1]:]
137
- output_text = current_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
138
- return output_text
139
-
140
  def convert_file_to_images(file_path: str, dpi: int = 200) -> List[Image.Image]:
141
  images = []
142
  file_ext = Path(file_path).suffix.lower()
@@ -272,7 +276,7 @@ def main():
272
 
273
  gr.HTML("""
274
  <div class="header-text">
275
- <h1>📄 Multimodal: VLM Parsing</h1>
276
  <p style="font-size: 1.1em;">An advanced Vision Language Model to parse documents and images into clean Markdown (html)</p>
277
  <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
278
  <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
@@ -284,7 +288,7 @@ def main():
284
 
285
  with gr.Row(elem_classes=["main-container"]):
286
  with gr.Column(scale=1):
287
- model_choice = gr.Dropdown(choices=["Logics-Parsing", "Gliese-OCR-7B-Post1.0", "olmOCR-7B-0825"], label="Select Model", value="Logics-Parsing")
288
  file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
289
 
290
  process_btn = gr.Button("🚀Process Document", variant="primary", size="lg")
 
12
  from loguru import logger
13
  from pathlib import Path
14
  import torch
15
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoModel
16
  from transformers.image_utils import load_image
17
  import fitz
18
  import html2text
 
93
  ).to(device).eval()
94
  logger.info(f"Model '{MODEL_ID_1}' loaded successfully.")
95
 
96
+ # Model 2: DeepSeek-OCR
97
+ logger.info("Loading model and tokenizer for DeepSeek-OCR...")
98
+ model_name_2 = "deepseek-ai/DeepSeek-OCR"
99
+ tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2, trust_remote_code=True)
100
+ model_2 = AutoModel.from_pretrained(
101
+ model_name_2,
102
+ _attn_implementation="flash_attention_2",
103
+ trust_remote_code=True
104
+ ).eval()
105
+ logger.info(" DeepSeek-OCR model loaded successfully.")
106
+
 
 
 
 
 
 
 
 
 
 
107
 
108
  @spaces.GPU
109
  def parse_page(image: Image.Image, model_name: str) -> str:
110
  if model_name == "Logics-Parsing":
111
  current_processor, current_model = processor_1, model_1
112
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), figures (<figure>), formulas (<formula>), and others. Include category tags, and filter out irrelevant elements like headers and footers."}]}]
113
+ prompt_full = current_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
114
+ inputs = current_processor(text=prompt_full, images=[image.convert("RGB")], return_tensors="pt").to(device)
115
+
116
+ with torch.no_grad():
117
+ generated_ids = current_model.generate(**inputs, max_new_tokens=2048, do_sample=False)
118
+
119
+ generated_ids = generated_ids[:, inputs['input_ids'].shape[1]:]
120
+ output_text = current_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
121
+ return output_text
122
+
123
+ elif model_name == "DeepSeek-OCR":
124
+ # Move model to the correct device for inference
125
+ model_2.to(device)
126
+
127
+ conversation = [
128
+ {"role": "user", "content": ["", image]},
129
+ ]
130
+
131
+ input_tensor = tokenizer_2.apply_chat_template(conversation, return_tensors="pt")
132
+
133
+ with torch.no_grad():
134
+ output_tensor = model_2.run(input_tensor.to(device))
135
+
136
+ # This model returns plain text, so we wrap it in basic HTML for consistency
137
+ ocr_text = output_tensor[0]
138
+ html_output = "".join(f"<p>{line}</p>" for line in ocr_text.split('\n'))
139
+ return html_output
140
+
141
  else:
142
  raise ValueError(f"Unknown model choice: {model_name}")
143
 
 
 
 
 
 
 
 
 
 
 
 
144
  def convert_file_to_images(file_path: str, dpi: int = 200) -> List[Image.Image]:
145
  images = []
146
  file_ext = Path(file_path).suffix.lower()
 
276
 
277
  gr.HTML("""
278
  <div class="header-text">
279
+ <h1>📄 Multimodal: VLM Parsing & OCR</h1>
280
  <p style="font-size: 1.1em;">An advanced Vision Language Model to parse documents and images into clean Markdown (html)</p>
281
  <div style="display: flex; justify-content: center; gap: 20px; margin: 15px 0;">
282
  <a href="https://huggingface.co/collections/prithivMLmods/mm-vlm-parsing-68e33e52bfb9ae60b50602dc" target="_blank" style="text-decoration: none; font-weight: 500;">🤗 Model Info</a>
 
288
 
289
  with gr.Row(elem_classes=["main-container"]):
290
  with gr.Column(scale=1):
291
+ model_choice = gr.Dropdown(choices=["Logics-Parsing", "DeepSeek-OCR"], label="Select Model", value="Logics-Parsing")
292
  file_input = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"], type="filepath")
293
 
294
  process_btn = gr.Button("🚀Process Document", variant="primary", size="lg")