Spaces:

Chhagan005
/

Multi_ML_OCR

Sleeping

App Files Files Community

Chhagan005 committed 18 days ago

Commit

1a70a82

verified ·

1 Parent(s): 5982d54

Update app.py

Browse files

Files changed (1) hide show

app.py +140 -70

app.py CHANGED Viewed

@@ -14,11 +14,19 @@ from PIL import Image
 import cv2
 from transformers import (
-    Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
 from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
@@ -148,6 +156,28 @@ if torch.cuda.is_available():
 print("Using device:", device)
 class RadioAnimated(gr.HTML):
     def __init__(self, choices, value=None, **kwargs):
         if not choices or len(choices) < 2:
@@ -215,7 +245,7 @@ class RadioAnimated(gr.HTML):
 def apply_gpu_duration(val: str):
     return int(val)
-# Model V: Nanonets-OCR2-3B
 MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -224,54 +254,69 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
-# Model X: Qwen2-VL-OCR-2B
-MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_X,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Model P: PaddleOCR-VL (NEW - More stable than Qwen3)
-MODEL_ID_P = "PaddlePaddle/PaddleOCR-VL"
 try:
-    processor_p = AutoProcessor.from_pretrained(MODEL_ID_P, trust_remote_code=True)
-    model_p = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        MODEL_ID_P,
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
         torch_dtype=torch.float16
     ).to(device).eval()
-    PADDLE_AVAILABLE = True
-    print("✓ PaddleOCR-VL model loaded successfully")
 except Exception as e:
-    print(f"✗ PaddleOCR-VL model not available: {e}")
-    PADDLE_AVAILABLE = False
-    processor_p = None
-    model_p = None
-# Model W: olmOCR-7B-0725
-MODEL_ID_W = "allenai/olmOCR-7B-0725"
-processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
-model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_W,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Model M: RolmOCR
-MODEL_ID_M = "reducto/RolmOCR"
-processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_M,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
 def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
                           max_new_tokens: int, temperature: float, top_p: float,
@@ -291,24 +336,28 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
-    if model_name == "RolmOCR-7B":
-        processor = processor_m
-        model = model_m
-    elif model_name == "Qwen2-VL-OCR-2B":
-        processor = processor_x
-        model = model_x
-    elif model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
-    elif model_name == "PaddleOCR-VL":
-        if not PADDLE_AVAILABLE:
-            yield "PaddleOCR-VL model is not available.", "PaddleOCR-VL model is not available."
             return
-        processor = processor_p
-        model = model_p
-    elif model_name == "olmOCR-7B-0725":
-        processor = processor_w
-        model = model_w
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -317,6 +366,10 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
     messages = [{
         "role": "user",
         "content": [
@@ -324,7 +377,13 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             {"type": "text", "text": text},
         ]
     }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
         text=[prompt_full],
@@ -354,23 +413,33 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 image_examples = [
-    ["Perform OCR on the image precisely.", "examples/5.jpg"],
-    ["Run OCR on the image and ensure high accuracy.", "examples/4.jpg"],
-    ["Conduct OCR on the image with exact text recognition.", "examples/2.jpg"],
-    ["Perform precise OCR extraction on the image.", "examples/1.jpg"],
-    ["Convert this page to docling", "examples/3.jpg"],
 ]
 # Build model choices dynamically
-model_choices = ["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B", "Qwen2-VL-OCR-2B"]
-if PADDLE_AVAILABLE:
-    model_choices.append("PaddleOCR-VL")
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
-            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Upload Image", height=290)
             image_submit = gr.Button("Submit", variant="primary")
@@ -395,7 +464,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
             model_choice = gr.Radio(
                 choices=model_choices,
                 label="Select Model",
-                value="Nanonets-OCR2-3B"
             )
             with gr.Row(elem_id="gpu-duration-container"):
@@ -409,6 +478,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                     gpu_duration_state = gr.Number(value=60, visible=False)
             gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
     radioanimated_gpu_duration.change(
         fn=apply_gpu_duration,

 import cv2
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
+# Try importing Qwen3VL if available
+try:
+    from transformers import Qwen3VLForConditionalGeneration
+    QWEN3_AVAILABLE = True
+except:
+    QWEN3_AVAILABLE = False
+    print("⚠️ Qwen3VL not available in current transformers version")
 from transformers.image_utils import load_image
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 print("Using device:", device)
+# Multilingual OCR prompt template
+MULTILINGUAL_OCR_PROMPT = """Perform comprehensive OCR extraction on this document. Follow these rules:
+1. Extract ALL text exactly as it appears in the original language
+2. If the text is NOT in English, provide an English translation after the original text
+3. Identify the document type and extract key fields
+4. Preserve formatting and layout structure
+Format your response as:
+**Original Text:** (in source language)
+[extracted text]
+**English Translation:** (if not already in English)
+[translated text]
+**Key Fields Extracted:**
+- Document type:
+- [other relevant fields based on document type]
+Be accurate and preserve all details."""
 class RadioAnimated(gr.HTML):
     def __init__(self, choices, value=None, **kwargs):
         if not choices or len(choices) < 2:
 def apply_gpu_duration(val: str):
     return int(val)
+# Model V: Nanonets-OCR2-3B (Kept)
 MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
+print("✓ Nanonets-OCR2-3B loaded")
+# Model C1: Chhagan_ML-VL-OCR-v1 (NEW)
+MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
 try:
+    processor_c1 = AutoProcessor.from_pretrained(MODEL_ID_C1, trust_remote_code=True)
+    model_c1 = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID_C1,
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
         torch_dtype=torch.float16
     ).to(device).eval()
+    C1_AVAILABLE = True
+    print("✓ Chhagan_ML-VL-OCR-v1 loaded")
 except Exception as e:
+    print(f"✗ Chhagan_ML-VL-OCR-v1 failed: {e}")
+    C1_AVAILABLE = False
+    processor_c1 = None
+    model_c1 = None
+# Model C2: Chhagan-DocVL-Qwen3 (NEW)
+MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
+C2_AVAILABLE = False
+if QWEN3_AVAILABLE:
+    try:
+        processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
+        model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
+            MODEL_ID_C2,
+            attn_implementation="flash_attention_2",
+            trust_remote_code=True,
+            torch_dtype=torch.float16
+        ).to(device).eval()
+        C2_AVAILABLE = True
+        print("✓ Chhagan-DocVL-Qwen3 loaded")
+    except Exception as e:
+        print(f"✗ Chhagan-DocVL-Qwen3 failed: {e}")
+        processor_c2 = None
+        model_c2 = None
+else:
+    processor_c2 = None
+    model_c2 = None
+# Model Q3: Qwen3-VL-2B-Instruct (NEW - Official)
+MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
+Q3_AVAILABLE = False
+if QWEN3_AVAILABLE:
+    try:
+        processor_q3 = AutoProcessor.from_pretrained(MODEL_ID_Q3, trust_remote_code=True)
+        model_q3 = Qwen3VLForConditionalGeneration.from_pretrained(
+            MODEL_ID_Q3,
+            attn_implementation="flash_attention_2",
+            trust_remote_code=True,
+            torch_dtype=torch.float16
+        ).to(device).eval()
+        Q3_AVAILABLE = True
+        print("✓ Qwen3-VL-2B-Instruct loaded")
+    except Exception as e:
+        print(f"✗ Qwen3-VL-2B-Instruct failed: {e}")
+        processor_q3 = None
+        model_q3 = None
+else:
+    processor_q3 = None
+    model_q3 = None
 def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
                           max_new_tokens: int, temperature: float, top_p: float,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
+    # Select model and processor
+    if model_name == "Nanonets-OCR2-3B":
         processor = processor_v
         model = model_v
+    elif model_name == "Chhagan-ML-VL-OCR-v1":
+        if not C1_AVAILABLE:
+            yield "Chhagan-ML-VL-OCR-v1 model is not available.", "Chhagan-ML-VL-OCR-v1 model is not available."
+            return
+        processor = processor_c1
+        model = model_c1
+    elif model_name == "Chhagan-DocVL-Qwen3":
+        if not C2_AVAILABLE:
+            yield "Chhagan-DocVL-Qwen3 model is not available. Requires transformers>=4.57", "Chhagan-DocVL-Qwen3 model is not available."
+            return
+        processor = processor_c2
+        model = model_c2
+    elif model_name == "Qwen3-VL-2B-Instruct":
+        if not Q3_AVAILABLE:
+            yield "Qwen3-VL-2B-Instruct model is not available. Requires transformers>=4.57", "Qwen3-VL-2B-Instruct model is not available."
             return
+        processor = processor_q3
+        model = model_q3
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
         yield "Please upload an image.", "Please upload an image."
         return
+    # Use multilingual prompt if user query is empty or simple
+    if not text or text.strip().lower() in ["ocr", "extract", "read"]:
+        text = MULTILINGUAL_OCR_PROMPT
     messages = [{
         "role": "user",
         "content": [
             {"type": "text", "text": text},
         ]
     }]
+    try:
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    except Exception as e:
+        print(f"Chat template error: {e}")
+        # Fallback to simple prompt
+        prompt_full = text
     inputs = processor(
         text=[prompt_full],
 image_examples = [
+    ["Perform comprehensive multilingual OCR with English translation", "examples/5.jpg"],
+    ["Extract all text in original language and translate to English", "examples/4.jpg"],
+    ["Perform OCR and provide structured key fields extraction", "examples/2.jpg"],
+    ["Extract document details with original text and English translation", "examples/1.jpg"],
+    ["Convert this page with multilingual support", "examples/3.jpg"],
 ]
 # Build model choices dynamically
+model_choices = ["Nanonets-OCR2-3B"]
+if C1_AVAILABLE:
+    model_choices.append("Chhagan-ML-VL-OCR-v1")
+if C2_AVAILABLE:
+    model_choices.append("Chhagan-DocVL-Qwen3")
+if Q3_AVAILABLE:
+    model_choices.append("Qwen3-VL-2B-Instruct")
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
+    gr.Markdown("# **Multimodal Multilingual OCR**", elem_id="main-title")
+    gr.Markdown("*Supports multilingual text extraction with automatic English translation*")
     with gr.Row():
         with gr.Column(scale=2):
+            image_query = gr.Textbox(
+                label="Query Input",
+                placeholder="Leave empty for automatic multilingual extraction with translation...",
+                value=""
+            )
             image_upload = gr.Image(type="pil", label="Upload Image", height=290)
             image_submit = gr.Button("Submit", variant="primary")
             model_choice = gr.Radio(
                 choices=model_choices,
                 label="Select Model",
+                value=model_choices[0]
             )
             with gr.Row(elem_id="gpu-duration-container"):
                     gpu_duration_state = gr.Number(value=60, visible=False)
             gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
+            gr.Markdown(f"**Models loaded:** {', '.join(model_choices)}")
     radioanimated_gpu_duration.change(
         fn=apply_gpu_duration,