Chhagan005 committed on
Commit
641a587
·
verified ·
1 Parent(s): 6dc5ea6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -16
app.py CHANGED
@@ -16,6 +16,7 @@ import cv2
16
  from transformers import (
17
  Qwen2VLForConditionalGeneration,
18
  Qwen2_5_VLForConditionalGeneration,
 
19
  AutoModelForImageTextToText,
20
  AutoProcessor,
21
  TextIteratorStreamer,
@@ -159,7 +160,7 @@ class RadioAnimated(gr.HTML):
159
  uid = uuid.uuid4().hex[:8]
160
  group_name = f"ra-{uid}"
161
 
162
- inputs_html = "\n".join(
163
  f"""
164
  <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
165
  <label class="ra-label" for="{group_name}-{i}">{c}</label>
@@ -216,47 +217,52 @@ class RadioAnimated(gr.HTML):
216
  def apply_gpu_duration(val: str):
217
  return int(val)
218
 
 
219
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
220
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
221
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
222
  MODEL_ID_V,
223
- attn_implementation="kernels-community/flash-attn2",
224
  trust_remote_code=True,
225
  torch_dtype=torch.float16
226
  ).to(device).eval()
227
 
 
228
  MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
229
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
230
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
231
  MODEL_ID_X,
232
- attn_implementation="kernels-community/flash-attn2",
233
  trust_remote_code=True,
234
  torch_dtype=torch.float16
235
  ).to(device).eval()
236
 
237
- MODEL_ID_A = "CohereForAI/aya-vision-8b"
238
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
239
- model_a = AutoModelForImageTextToText.from_pretrained(
240
- MODEL_ID_A,
241
- attn_implementation="kernels-community/flash-attn2",
 
242
  trust_remote_code=True,
243
  torch_dtype=torch.float16
244
  ).to(device).eval()
245
 
 
246
  MODEL_ID_W = "allenai/olmOCR-7B-0725"
247
  processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
248
  model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
249
  MODEL_ID_W,
250
- attn_implementation="kernels-community/flash-attn2",
251
  trust_remote_code=True,
252
  torch_dtype=torch.float16
253
  ).to(device).eval()
254
 
 
255
  MODEL_ID_M = "reducto/RolmOCR"
256
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
257
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
258
  MODEL_ID_M,
259
- attn_implementation="kernels-community/flash-attn2",
260
  trust_remote_code=True,
261
  torch_dtype=torch.float16
262
  ).to(device).eval()
@@ -288,9 +294,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
288
  elif model_name == "Nanonets-OCR2-3B":
289
  processor = processor_v
290
  model = model_v
291
- elif model_name == "Aya-Vision-8B":
292
- processor = processor_a
293
- model = model_a
294
  elif model_name == "olmOCR-7B-0725":
295
  processor = processor_w
296
  model = model_w
@@ -346,7 +352,7 @@ image_examples = [
346
  ["Convert this page to docling", "examples/3.jpg"],
347
  ]
348
 
349
- with gr.Blocks() as demo:
350
  gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
351
  with gr.Row():
352
  with gr.Column(scale=2):
@@ -374,7 +380,7 @@ with gr.Blocks() as demo:
374
 
375
  model_choice = gr.Radio(
376
  choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
377
- "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],
378
  label="Select Model",
379
  value="Nanonets-OCR2-3B"
380
  )
@@ -405,4 +411,4 @@ with gr.Blocks() as demo:
405
  )
406
 
407
  if __name__ == "__main__":
408
- demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
 
16
  from transformers import (
17
  Qwen2VLForConditionalGeneration,
18
  Qwen2_5_VLForConditionalGeneration,
19
+ Qwen3VLForConditionalGeneration,
20
  AutoModelForImageTextToText,
21
  AutoProcessor,
22
  TextIteratorStreamer,
 
160
  uid = uuid.uuid4().hex[:8]
161
  group_name = f"ra-{uid}"
162
 
163
+ inputs_html = "\n".join(
164
  f"""
165
  <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
166
  <label class="ra-label" for="{group_name}-{i}">{c}</label>
 
217
  def apply_gpu_duration(val: str):
218
  return int(val)
219
 
220
+ # Model V: Nanonets-OCR2-3B
221
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
222
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
223
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
224
  MODEL_ID_V,
225
+ attn_implementation="flash_attention_2",
226
  trust_remote_code=True,
227
  torch_dtype=torch.float16
228
  ).to(device).eval()
229
 
230
+ # Model X: Qwen2-VL-OCR-2B
231
  MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
232
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
233
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
234
  MODEL_ID_X,
235
+ attn_implementation="flash_attention_2",
236
  trust_remote_code=True,
237
  torch_dtype=torch.float16
238
  ).to(device).eval()
239
 
240
+ # Model C: Chhagan-DocVL-Qwen3 (NEW)
241
+ MODEL_ID_C = "Chhagan005/Chhagan-DocVL-Qwen3"
242
+ processor_c = AutoProcessor.from_pretrained(MODEL_ID_C, trust_remote_code=True)
243
+ model_c = Qwen3VLForConditionalGeneration.from_pretrained(
244
+ MODEL_ID_C,
245
+ attn_implementation="flash_attention_2",
246
  trust_remote_code=True,
247
  torch_dtype=torch.float16
248
  ).to(device).eval()
249
 
250
+ # Model W: olmOCR-7B-0725
251
  MODEL_ID_W = "allenai/olmOCR-7B-0725"
252
  processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
253
  model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
254
  MODEL_ID_W,
255
+ attn_implementation="flash_attention_2",
256
  trust_remote_code=True,
257
  torch_dtype=torch.float16
258
  ).to(device).eval()
259
 
260
+ # Model M: RolmOCR
261
  MODEL_ID_M = "reducto/RolmOCR"
262
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
263
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
264
  MODEL_ID_M,
265
+ attn_implementation="flash_attention_2",
266
  trust_remote_code=True,
267
  torch_dtype=torch.float16
268
  ).to(device).eval()
 
294
  elif model_name == "Nanonets-OCR2-3B":
295
  processor = processor_v
296
  model = model_v
297
+ elif model_name == "Chhagan-DocVL-Qwen3":
298
+ processor = processor_c
299
+ model = model_c
300
  elif model_name == "olmOCR-7B-0725":
301
  processor = processor_w
302
  model = model_w
 
352
  ["Convert this page to docling", "examples/3.jpg"],
353
  ]
354
 
355
+ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
356
  gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
357
  with gr.Row():
358
  with gr.Column(scale=2):
 
380
 
381
  model_choice = gr.Radio(
382
  choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
383
+ "Chhagan-DocVL-Qwen3", "Qwen2-VL-OCR-2B"],
384
  label="Select Model",
385
  value="Nanonets-OCR2-3B"
386
  )
 
411
  )
412
 
413
  if __name__ == "__main__":
414
+ demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)