Chhagan005 committed on
Commit
93e9b9e
·
verified ·
1 Parent(s): 1a70a82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -49
app.py CHANGED
@@ -13,13 +13,17 @@ import numpy as np
13
  from PIL import Image
14
  import cv2
15
 
 
 
 
 
16
  from transformers import (
17
  Qwen2_5_VLForConditionalGeneration,
18
  AutoProcessor,
19
  TextIteratorStreamer,
20
  )
21
 
22
- # Try importing Qwen3VL if available
23
  try:
24
  from transformers import Qwen3VLForConditionalGeneration
25
  QWEN3_AVAILABLE = True
@@ -246,25 +250,43 @@ def apply_gpu_duration(val: str):
246
  return int(val)
247
 
248
  # Model V: Nanonets-OCR2-3B (Kept)
 
249
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
250
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
251
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
252
- MODEL_ID_V,
253
- attn_implementation="flash_attention_2",
254
- trust_remote_code=True,
255
- torch_dtype=torch.float16
256
- ).to(device).eval()
257
- print("✓ Nanonets-OCR2-3B loaded")
258
-
259
- # Model C1: Chhagan_ML-VL-OCR-v1 (NEW)
 
 
 
 
 
 
 
 
260
  MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
261
  try:
262
- processor_c1 = AutoProcessor.from_pretrained(MODEL_ID_C1, trust_remote_code=True)
 
 
 
 
 
 
263
  model_c1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
264
  MODEL_ID_C1,
265
  attn_implementation="flash_attention_2",
266
  trust_remote_code=True,
267
- torch_dtype=torch.float16
 
 
 
268
  ).to(device).eval()
269
  C1_AVAILABLE = True
270
  print("✓ Chhagan_ML-VL-OCR-v1 loaded")
@@ -274,29 +296,8 @@ except Exception as e:
274
  processor_c1 = None
275
  model_c1 = None
276
 
277
- # Model C2: Chhagan-DocVL-Qwen3 (NEW)
278
- MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
279
- C2_AVAILABLE = False
280
- if QWEN3_AVAILABLE:
281
- try:
282
- processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
283
- model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
284
- MODEL_ID_C2,
285
- attn_implementation="flash_attention_2",
286
- trust_remote_code=True,
287
- torch_dtype=torch.float16
288
- ).to(device).eval()
289
- C2_AVAILABLE = True
290
- print("✓ Chhagan-DocVL-Qwen3 loaded")
291
- except Exception as e:
292
- print(f"✗ Chhagan-DocVL-Qwen3 failed: {e}")
293
- processor_c2 = None
294
- model_c2 = None
295
- else:
296
- processor_c2 = None
297
- model_c2 = None
298
-
299
- # Model Q3: Qwen3-VL-2B-Instruct (NEW - Official)
300
  MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
301
  Q3_AVAILABLE = False
302
  if QWEN3_AVAILABLE:
@@ -317,6 +318,11 @@ if QWEN3_AVAILABLE:
317
  else:
318
  processor_q3 = None
319
  model_q3 = None
 
 
 
 
 
320
 
321
  def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
322
  max_new_tokens: int, temperature: float, top_p: float,
@@ -338,6 +344,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
338
  """
339
  # Select model and processor
340
  if model_name == "Nanonets-OCR2-3B":
 
 
 
341
  processor = processor_v
342
  model = model_v
343
  elif model_name == "Chhagan-ML-VL-OCR-v1":
@@ -346,12 +355,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
346
  return
347
  processor = processor_c1
348
  model = model_c1
349
- elif model_name == "Chhagan-DocVL-Qwen3":
350
- if not C2_AVAILABLE:
351
- yield "Chhagan-DocVL-Qwen3 model is not available. Requires transformers>=4.57", "Chhagan-DocVL-Qwen3 model is not available."
352
- return
353
- processor = processor_c2
354
- model = model_c2
355
  elif model_name == "Qwen3-VL-2B-Instruct":
356
  if not Q3_AVAILABLE:
357
  yield "Qwen3-VL-2B-Instruct model is not available. Requires transformers>=4.57", "Qwen3-VL-2B-Instruct model is not available."
@@ -367,7 +370,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
367
  return
368
 
369
  # Use multilingual prompt if user query is empty or simple
370
- if not text or text.strip().lower() in ["ocr", "extract", "read"]:
371
  text = MULTILINGUAL_OCR_PROMPT
372
 
373
  messages = [{
@@ -421,15 +424,19 @@ image_examples = [
421
  ]
422
 
423
  # Build model choices dynamically
424
- model_choices = ["Nanonets-OCR2-3B"]
 
 
425
  if C1_AVAILABLE:
426
  model_choices.append("Chhagan-ML-VL-OCR-v1")
427
- if C2_AVAILABLE:
428
- model_choices.append("Chhagan-DocVL-Qwen3")
429
  if Q3_AVAILABLE:
430
  model_choices.append("Qwen3-VL-2B-Instruct")
431
 
432
- with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
 
 
 
 
433
  gr.Markdown("# **Multimodal Multilingual OCR**", elem_id="main-title")
434
  gr.Markdown("*Supports multilingual text extraction with automatic English translation*")
435
 
@@ -464,7 +471,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
464
  model_choice = gr.Radio(
465
  choices=model_choices,
466
  label="Select Model",
467
- value=model_choices[0]
468
  )
469
 
470
  with gr.Row(elem_id="gpu-duration-container"):
@@ -494,4 +501,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
494
  )
495
 
496
  if __name__ == "__main__":
497
- demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
13
  from PIL import Image
14
  import cv2
15
 
16
+ # Clear any local cache conflicts
17
+ os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
18
+ os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
19
+
20
  from transformers import (
21
  Qwen2_5_VLForConditionalGeneration,
22
  AutoProcessor,
23
  TextIteratorStreamer,
24
  )
25
 
26
+ # Try importing Qwen3VL
27
  try:
28
  from transformers import Qwen3VLForConditionalGeneration
29
  QWEN3_AVAILABLE = True
 
250
  return int(val)
251
 
252
  # Model V: Nanonets-OCR2-3B (Kept)
253
+ print("Loading Nanonets-OCR2-3B...")
254
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
255
+ try:
256
+ processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
257
+ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
258
+ MODEL_ID_V,
259
+ attn_implementation="flash_attention_2",
260
+ trust_remote_code=True,
261
+ torch_dtype=torch.float16
262
+ ).to(device).eval()
263
+ print("✓ Nanonets-OCR2-3B loaded")
264
+ NANONETS_AVAILABLE = True
265
+ except Exception as e:
266
+ print(f"✗ Nanonets-OCR2-3B failed: {e}")
267
+ NANONETS_AVAILABLE = False
268
+ processor_v = None
269
+ model_v = None
270
+
271
+ # Model C1: Chhagan_ML-VL-OCR-v1 (NEW - with proper cache handling)
272
+ print("Loading Chhagan_ML-VL-OCR-v1...")
273
  MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
274
  try:
275
+ processor_c1 = AutoProcessor.from_pretrained(
276
+ MODEL_ID_C1,
277
+ trust_remote_code=True,
278
+ cache_dir="/tmp/transformers_cache",
279
+ force_download=False,
280
+ local_files_only=False
281
+ )
282
  model_c1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
283
  MODEL_ID_C1,
284
  attn_implementation="flash_attention_2",
285
  trust_remote_code=True,
286
+ torch_dtype=torch.float16,
287
+ cache_dir="/tmp/transformers_cache",
288
+ force_download=False,
289
+ local_files_only=False
290
  ).to(device).eval()
291
  C1_AVAILABLE = True
292
  print("✓ Chhagan_ML-VL-OCR-v1 loaded")
 
296
  processor_c1 = None
297
  model_c1 = None
298
 
299
+ # Model Q3: Qwen3-VL-2B-Instruct (Official)
300
+ print("Loading Qwen3-VL-2B-Instruct...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
302
  Q3_AVAILABLE = False
303
  if QWEN3_AVAILABLE:
 
318
  else:
319
  processor_q3 = None
320
  model_q3 = None
321
+ print("✗ Qwen3VL architecture not available")
322
+
323
+ # Note: Chhagan-DocVL-Qwen3 has tokenizer compatibility issues, skipping
324
+ print("\n⚠️ Note: Chhagan-DocVL-Qwen3 skipped due to tokenizer compatibility issues")
325
+ print("Available alternative: Using official Qwen3-VL-2B-Instruct instead\n")
326
 
327
  def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
328
  max_new_tokens: int, temperature: float, top_p: float,
 
344
  """
345
  # Select model and processor
346
  if model_name == "Nanonets-OCR2-3B":
347
+ if not NANONETS_AVAILABLE:
348
+ yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
349
+ return
350
  processor = processor_v
351
  model = model_v
352
  elif model_name == "Chhagan-ML-VL-OCR-v1":
 
355
  return
356
  processor = processor_c1
357
  model = model_c1
 
 
 
 
 
 
358
  elif model_name == "Qwen3-VL-2B-Instruct":
359
  if not Q3_AVAILABLE:
360
  yield "Qwen3-VL-2B-Instruct model is not available. Requires transformers>=4.57", "Qwen3-VL-2B-Instruct model is not available."
 
370
  return
371
 
372
  # Use multilingual prompt if user query is empty or simple
373
+ if not text or text.strip().lower() in ["ocr", "extract", "read", ""]:
374
  text = MULTILINGUAL_OCR_PROMPT
375
 
376
  messages = [{
 
424
  ]
425
 
426
  # Build model choices dynamically
427
+ model_choices = []
428
+ if NANONETS_AVAILABLE:
429
+ model_choices.append("Nanonets-OCR2-3B")
430
  if C1_AVAILABLE:
431
  model_choices.append("Chhagan-ML-VL-OCR-v1")
 
 
432
  if Q3_AVAILABLE:
433
  model_choices.append("Qwen3-VL-2B-Instruct")
434
 
435
+ if not model_choices:
436
+ model_choices = ["No models available"]
437
+
438
+ demo = gr.Blocks()
439
+ with demo:
440
  gr.Markdown("# **Multimodal Multilingual OCR**", elem_id="main-title")
441
  gr.Markdown("*Supports multilingual text extraction with automatic English translation*")
442
 
 
471
  model_choice = gr.Radio(
472
  choices=model_choices,
473
  label="Select Model",
474
+ value=model_choices[0] if model_choices else None
475
  )
476
 
477
  with gr.Row(elem_id="gpu-duration-container"):
 
501
  )
502
 
503
  if __name__ == "__main__":
504
+ demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)