Chhagan005 committed on
Commit
641a587
·
verified ·
1 Parent(s): 6dc5ea6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -16
app.py CHANGED
@@ -16,6 +16,7 @@ import cv2
16
  from transformers import (
17
  Qwen2VLForConditionalGeneration,
18
  Qwen2_5_VLForConditionalGeneration,
 
19
  AutoModelForImageTextToText,
20
  AutoProcessor,
21
  TextIteratorStreamer,
@@ -159,7 +160,7 @@ class RadioAnimated(gr.HTML):
159
  uid = uuid.uuid4().hex[:8]
160
  group_name = f"ra-{uid}"
161
 
162
- inputs_html = "\n".join(
163
  f"""
164
  <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
165
  <label class="ra-label" for="{group_name}-{i}">{c}</label>
@@ -216,47 +217,52 @@ class RadioAnimated(gr.HTML):
216
  def apply_gpu_duration(val: str):
217
  return int(val)
218
 
 
219
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
220
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
221
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
222
  MODEL_ID_V,
223
- attn_implementation="kernels-community/flash-attn2",
224
  trust_remote_code=True,
225
  torch_dtype=torch.float16
226
  ).to(device).eval()
227
 
 
228
  MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
229
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
230
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
231
  MODEL_ID_X,
232
- attn_implementation="kernels-community/flash-attn2",
233
  trust_remote_code=True,
234
  torch_dtype=torch.float16
235
  ).to(device).eval()
236
 
237
- MODEL_ID_A = "CohereForAI/aya-vision-8b"
238
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
239
- model_a = AutoModelForImageTextToText.from_pretrained(
240
- MODEL_ID_A,
241
- attn_implementation="kernels-community/flash-attn2",
 
242
  trust_remote_code=True,
243
  torch_dtype=torch.float16
244
  ).to(device).eval()
245
 
 
246
  MODEL_ID_W = "allenai/olmOCR-7B-0725"
247
  processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
248
  model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
249
  MODEL_ID_W,
250
- attn_implementation="kernels-community/flash-attn2",
251
  trust_remote_code=True,
252
  torch_dtype=torch.float16
253
  ).to(device).eval()
254
 
 
255
  MODEL_ID_M = "reducto/RolmOCR"
256
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
257
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
258
  MODEL_ID_M,
259
- attn_implementation="kernels-community/flash-attn2",
260
  trust_remote_code=True,
261
  torch_dtype=torch.float16
262
  ).to(device).eval()
@@ -288,9 +294,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
288
  elif model_name == "Nanonets-OCR2-3B":
289
  processor = processor_v
290
  model = model_v
291
- elif model_name == "Aya-Vision-8B":
292
- processor = processor_a
293
- model = model_a
294
  elif model_name == "olmOCR-7B-0725":
295
  processor = processor_w
296
  model = model_w
@@ -346,7 +352,7 @@ image_examples = [
346
  ["Convert this page to docling", "examples/3.jpg"],
347
  ]
348
 
349
- with gr.Blocks() as demo:
350
  gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
351
  with gr.Row():
352
  with gr.Column(scale=2):
@@ -374,7 +380,7 @@ with gr.Blocks() as demo:
374
 
375
  model_choice = gr.Radio(
376
  choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
377
- "Aya-Vision-8B", "Qwen2-VL-OCR-2B"],
378
  label="Select Model",
379
  value="Nanonets-OCR2-3B"
380
  )
@@ -405,4 +411,4 @@ with gr.Blocks() as demo:
405
  )
406
 
407
  if __name__ == "__main__":
408
- demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
 
16
  from transformers import (
17
  Qwen2VLForConditionalGeneration,
18
  Qwen2_5_VLForConditionalGeneration,
19
+ Qwen3VLForConditionalGeneration,
20
  AutoModelForImageTextToText,
21
  AutoProcessor,
22
  TextIteratorStreamer,
 
160
  uid = uuid.uuid4().hex[:8]
161
  group_name = f"ra-{uid}"
162
 
163
+ inputs_html = "\n".join(
164
  f"""
165
  <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
166
  <label class="ra-label" for="{group_name}-{i}">{c}</label>
 
217
  def apply_gpu_duration(val: str):
218
  return int(val)
219
 
220
+ # Model V: Nanonets-OCR2-3B
221
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
222
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
223
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
224
  MODEL_ID_V,
225
+ attn_implementation="flash_attention_2",
226
  trust_remote_code=True,
227
  torch_dtype=torch.float16
228
  ).to(device).eval()
229
 
230
+ # Model X: Qwen2-VL-OCR-2B
231
  MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
232
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
233
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
234
  MODEL_ID_X,
235
+ attn_implementation="flash_attention_2",
236
  trust_remote_code=True,
237
  torch_dtype=torch.float16
238
  ).to(device).eval()
239
 
240
+ # Model C: Chhagan-DocVL-Qwen3 (NEW)
241
+ MODEL_ID_C = "Chhagan005/Chhagan-DocVL-Qwen3"
242
+ processor_c = AutoProcessor.from_pretrained(MODEL_ID_C, trust_remote_code=True)
243
+ model_c = Qwen3VLForConditionalGeneration.from_pretrained(
244
+ MODEL_ID_C,
245
+ attn_implementation="flash_attention_2",
246
  trust_remote_code=True,
247
  torch_dtype=torch.float16
248
  ).to(device).eval()
249
 
250
+ # Model W: olmOCR-7B-0725
251
  MODEL_ID_W = "allenai/olmOCR-7B-0725"
252
  processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
253
  model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
254
  MODEL_ID_W,
255
+ attn_implementation="flash_attention_2",
256
  trust_remote_code=True,
257
  torch_dtype=torch.float16
258
  ).to(device).eval()
259
 
260
+ # Model M: RolmOCR
261
  MODEL_ID_M = "reducto/RolmOCR"
262
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
263
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
264
  MODEL_ID_M,
265
+ attn_implementation="flash_attention_2",
266
  trust_remote_code=True,
267
  torch_dtype=torch.float16
268
  ).to(device).eval()
 
294
  elif model_name == "Nanonets-OCR2-3B":
295
  processor = processor_v
296
  model = model_v
297
+ elif model_name == "Chhagan-DocVL-Qwen3":
298
+ processor = processor_c
299
+ model = model_c
300
  elif model_name == "olmOCR-7B-0725":
301
  processor = processor_w
302
  model = model_w
 
352
  ["Convert this page to docling", "examples/3.jpg"],
353
  ]
354
 
355
+ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
356
  gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
357
  with gr.Row():
358
  with gr.Column(scale=2):
 
380
 
381
  model_choice = gr.Radio(
382
  choices=["Nanonets-OCR2-3B", "olmOCR-7B-0725", "RolmOCR-7B",
383
+ "Chhagan-DocVL-Qwen3", "Qwen2-VL-OCR-2B"],
384
  label="Select Model",
385
  value="Nanonets-OCR2-3B"
386
  )
 
411
  )
412
 
413
  if __name__ == "__main__":
414
+ demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)