Spaces:

Chhagan005
/

Multi_ML_OCR

Running on Zero

App Files Community

Chhagan005 committed 7 days ago

Commit

93e9b9e

verified ·

1 Parent(s): 1a70a82

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -49

app.py CHANGED Viewed

@@ -13,13 +13,17 @@ import numpy as np
 from PIL import Image
 import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
-# Try importing Qwen3VL if available
 try:
     from transformers import Qwen3VLForConditionalGeneration
     QWEN3_AVAILABLE = True
@@ -246,25 +250,43 @@ def apply_gpu_duration(val: str):
     return int(val)
 # Model V: Nanonets-OCR2-3B (Kept)
 MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
-processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_V,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-print("✓ Nanonets-OCR2-3B loaded")
-# Model C1: Chhagan_ML-VL-OCR-v1 (NEW)
 MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
 try:
-    processor_c1 = AutoProcessor.from_pretrained(MODEL_ID_C1, trust_remote_code=True)
     model_c1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID_C1,
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
-        torch_dtype=torch.float16
     ).to(device).eval()
     C1_AVAILABLE = True
     print("✓ Chhagan_ML-VL-OCR-v1 loaded")
@@ -274,29 +296,8 @@ except Exception as e:
     processor_c1 = None
     model_c1 = None
-# Model C2: Chhagan-DocVL-Qwen3 (NEW)
-MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
-C2_AVAILABLE = False
-if QWEN3_AVAILABLE:
-    try:
-        processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
-        model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
-            MODEL_ID_C2,
-            attn_implementation="flash_attention_2",
-            trust_remote_code=True,
-            torch_dtype=torch.float16
-        ).to(device).eval()
-        C2_AVAILABLE = True
-        print("✓ Chhagan-DocVL-Qwen3 loaded")
-    except Exception as e:
-        print(f"✗ Chhagan-DocVL-Qwen3 failed: {e}")
-        processor_c2 = None
-        model_c2 = None
-else:
-    processor_c2 = None
-    model_c2 = None
-# Model Q3: Qwen3-VL-2B-Instruct (NEW - Official)
 MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
 Q3_AVAILABLE = False
 if QWEN3_AVAILABLE:
@@ -317,6 +318,11 @@ if QWEN3_AVAILABLE:
 else:
     processor_q3 = None
     model_q3 = None
 def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
                           max_new_tokens: int, temperature: float, top_p: float,
@@ -338,6 +344,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     """
     # Select model and processor
     if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
     elif model_name == "Chhagan-ML-VL-OCR-v1":
@@ -346,12 +355,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             return
         processor = processor_c1
         model = model_c1
-    elif model_name == "Chhagan-DocVL-Qwen3":
-        if not C2_AVAILABLE:
-            yield "Chhagan-DocVL-Qwen3 model is not available. Requires transformers>=4.57", "Chhagan-DocVL-Qwen3 model is not available."
-            return
-        processor = processor_c2
-        model = model_c2
     elif model_name == "Qwen3-VL-2B-Instruct":
         if not Q3_AVAILABLE:
             yield "Qwen3-VL-2B-Instruct model is not available. Requires transformers>=4.57", "Qwen3-VL-2B-Instruct model is not available."
@@ -367,7 +370,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         return
     # Use multilingual prompt if user query is empty or simple
-    if not text or text.strip().lower() in ["ocr", "extract", "read"]:
         text = MULTILINGUAL_OCR_PROMPT
     messages = [{
@@ -421,15 +424,19 @@ image_examples = [
 ]
 # Build model choices dynamically
-model_choices = ["Nanonets-OCR2-3B"]
 if C1_AVAILABLE:
     model_choices.append("Chhagan-ML-VL-OCR-v1")
-if C2_AVAILABLE:
-    model_choices.append("Chhagan-DocVL-Qwen3")
 if Q3_AVAILABLE:
     model_choices.append("Qwen3-VL-2B-Instruct")
-with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     gr.Markdown("# **Multimodal Multilingual OCR**", elem_id="main-title")
     gr.Markdown("*Supports multilingual text extraction with automatic English translation*")
@@ -464,7 +471,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             model_choice = gr.Radio(
                 choices=model_choices,
                 label="Select Model",
-                value=model_choices[0]
             )
             with gr.Row(elem_id="gpu-duration-container"):
@@ -494,4 +501,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     )
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)

 from PIL import Image
 import cv2
+# Clear any local cache conflicts
+os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
+# Try importing Qwen3VL
 try:
     from transformers import Qwen3VLForConditionalGeneration
     QWEN3_AVAILABLE = True
     return int(val)
 # Model V: Nanonets-OCR2-3B (Kept)
+print("Loading Nanonets-OCR2-3B...")
 MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
+try:
+    processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+    model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID_V,
+        attn_implementation="flash_attention_2",
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    ).to(device).eval()
+    print("✓ Nanonets-OCR2-3B loaded")
+    NANONETS_AVAILABLE = True
+except Exception as e:
+    print(f"✗ Nanonets-OCR2-3B failed: {e}")
+    NANONETS_AVAILABLE = False
+    processor_v = None
+    model_v = None
+# Model C1: Chhagan_ML-VL-OCR-v1 (NEW - with proper cache handling)
+print("Loading Chhagan_ML-VL-OCR-v1...")
 MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
 try:
+    processor_c1 = AutoProcessor.from_pretrained(
+        MODEL_ID_C1,
+        trust_remote_code=True,
+        cache_dir="/tmp/transformers_cache",
+        force_download=False,
+        local_files_only=False
+    )
     model_c1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID_C1,
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
+        torch_dtype=torch.float16,
+        cache_dir="/tmp/transformers_cache",
+        force_download=False,
+        local_files_only=False
     ).to(device).eval()
     C1_AVAILABLE = True
     print("✓ Chhagan_ML-VL-OCR-v1 loaded")
     processor_c1 = None
     model_c1 = None
+# Model Q3: Qwen3-VL-2B-Instruct (Official)
+print("Loading Qwen3-VL-2B-Instruct...")
 MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
 Q3_AVAILABLE = False
 if QWEN3_AVAILABLE:
 else:
     processor_q3 = None
     model_q3 = None
+    print("✗ Qwen3VL architecture not available")
+# Note: Chhagan-DocVL-Qwen3 has tokenizer compatibility issues, skipping
+print("\n⚠️ Note: Chhagan-DocVL-Qwen3 skipped due to tokenizer compatibility issues")
+print("Available alternative: Using official Qwen3-VL-2B-Instruct instead\n")
 def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
                           max_new_tokens: int, temperature: float, top_p: float,
     """
     # Select model and processor
     if model_name == "Nanonets-OCR2-3B":
+        if not NANONETS_AVAILABLE:
+            yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
+            return
         processor = processor_v
         model = model_v
     elif model_name == "Chhagan-ML-VL-OCR-v1":
             return
         processor = processor_c1
         model = model_c1
     elif model_name == "Qwen3-VL-2B-Instruct":
         if not Q3_AVAILABLE:
             yield "Qwen3-VL-2B-Instruct model is not available. Requires transformers>=4.57", "Qwen3-VL-2B-Instruct model is not available."
         return
     # Use multilingual prompt if user query is empty or simple
+    if not text or text.strip().lower() in ["ocr", "extract", "read", ""]:
         text = MULTILINGUAL_OCR_PROMPT
     messages = [{
 ]
 # Build model choices dynamically
+model_choices = []
+if NANONETS_AVAILABLE:
+    model_choices.append("Nanonets-OCR2-3B")
 if C1_AVAILABLE:
     model_choices.append("Chhagan-ML-VL-OCR-v1")
 if Q3_AVAILABLE:
     model_choices.append("Qwen3-VL-2B-Instruct")
+if not model_choices:
+    model_choices = ["No models available"]
+demo = gr.Blocks()
+with demo:
     gr.Markdown("# **Multimodal Multilingual OCR**", elem_id="main-title")
     gr.Markdown("*Supports multilingual text extraction with automatic English translation*")
             model_choice = gr.Radio(
                 choices=model_choices,
                 label="Select Model",
+                value=model_choices[0] if model_choices else None
             )
             with gr.Row(elem_id="gpu-duration-container"):
     )
 if __name__ == "__main__":
+    demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)