Geraldine committed on
Commit
9704588
·
verified ·
1 Parent(s): 28c56d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -2
app.py CHANGED
@@ -3,6 +3,7 @@ import gc
3
  import json
4
  import base64
5
  import time
 
6
  from io import BytesIO
7
  from threading import Thread
8
 
@@ -10,6 +11,7 @@ import gradio as gr
10
  import spaces
11
  import torch
12
  from PIL import Image
 
13
 
14
  from transformers import (
15
  Qwen2VLForConditionalGeneration,
@@ -41,6 +43,53 @@ if torch.cuda.is_available():
41
  print("Using device:", device)
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
45
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
46
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -51,9 +100,10 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
51
  ).to(device).eval()
52
 
53
  MODEL_ID_Y = "rednote-hilab/dots.ocr"
54
- processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
 
55
  model_y = AutoModelForCausalLM.from_pretrained(
56
- MODEL_ID_Y,
57
  attn_implementation="kernels-community/flash-attn2",
58
  trust_remote_code=True,
59
  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
3
  import json
4
  import base64
5
  import time
6
+ from pathlib import Path
7
  from io import BytesIO
8
  from threading import Thread
9
 
 
11
  import spaces
12
  import torch
13
  from PIL import Image
14
+ from huggingface_hub import snapshot_download
15
 
16
  from transformers import (
17
  Qwen2VLForConditionalGeneration,
 
43
  print("Using device:", device)
44
 
45
 
46
def patch_dots_ocr_configuration(repo_path: str) -> None:
    """Rewrite ``configuration_dots.py`` in place so ``DotsVLProcessor`` is
    compatible with transformers releases that pass a ``video_processor``.

    Three targeted string edits are applied (each at most once):
      * declare ``attributes = ["image_processor", "tokenizer"]`` under the class header,
      * add a ``video_processor=None`` parameter to ``__init__``,
      * forward ``video_processor`` to the ``super().__init__`` call.

    No-op when the file is missing or already patched.
    """
    config_file = Path(repo_path) / "configuration_dots.py"
    if not config_file.exists():
        return

    original_text = config_file.read_text(encoding="utf-8")
    text = original_text

    attr_line = 'attributes = ["image_processor", "tokenizer"]'
    # Insert the processor-attributes declaration directly below the class header.
    if attr_line not in text:
        text = text.replace(
            "class DotsVLProcessor(Qwen2_5_VLProcessor):\n",
            "class DotsVLProcessor(Qwen2_5_VLProcessor):\n    " + attr_line + "\n",
            1,
        )

    old_init = "def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):"
    # Widen the constructor signature to accept the new keyword.
    if old_init in text:
        text = text.replace(
            old_init,
            "def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):",
            1,
        )

    old_super = "super().__init__(image_processor, tokenizer, chat_template=chat_template)"
    # Thread video_processor through to the base-class constructor.
    if old_super in text:
        text = text.replace(
            old_super,
            "super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)",
            1,
        )

    # Write back only when something actually changed.
    if text != original_text:
        config_file.write_text(text, encoding="utf-8")
        print(f"Patched dots.OCR processor config: {config_file}")
78
+
79
+
80
def resolve_dots_ocr_model_path(repo_id: str) -> str:
    """Return a path/ID from which the dots.OCR processor can be loaded.

    Probes ``AutoProcessor.from_pretrained`` on the hub repo first. When that
    fails with the known ``video_processor`` TypeError (remote code predating
    newer transformers), the repo is snapshotted locally, patched via
    ``patch_dots_ocr_configuration``, and the local path is returned instead.
    Any other TypeError is re-raised untouched.
    """
    try:
        # Probe: loads fine on transformers versions the remote code supports.
        AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
    except TypeError as exc:
        if "video_processor" not in str(exc):
            raise
        print("dots.OCR processor compatibility issue detected, applying local patch...")
        # Materialize real files (no symlinks) so the config can be edited in place.
        snapshot = snapshot_download(
            repo_id=repo_id,
            local_dir="/tmp/dots_ocr_model",
            local_dir_use_symlinks=False,
        )
        patch_dots_ocr_configuration(snapshot)
        return snapshot
    else:
        return repo_id
91
+
92
+
93
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
94
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
95
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
100
  ).to(device).eval()
101
 
102
  MODEL_ID_Y = "rednote-hilab/dots.ocr"
103
+ MODEL_PATH_Y = resolve_dots_ocr_model_path(MODEL_ID_Y)
104
+ processor_y = AutoProcessor.from_pretrained(MODEL_PATH_Y, trust_remote_code=True)
105
  model_y = AutoModelForCausalLM.from_pretrained(
106
+ MODEL_PATH_Y,
107
  attn_implementation="kernels-community/flash-attn2",
108
  trust_remote_code=True,
109
  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32