Kamal-prog-code committed on
Commit
4073fa4
·
1 Parent(s): b99d870

Enhance OCR functionality by integrating new model and processor, and update requirements

Browse files
Files changed (2) hide show
  1. app.py +91 -10
  2. requirements.txt +8 -9
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import AutoModel, AutoTokenizer
3
  import torch
4
  import spaces
5
  import os
@@ -16,17 +16,87 @@ from io import StringIO, BytesIO
16
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
17
 
18
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
19
- model = AutoModel.from_pretrained(
20
- MODEL_NAME,
21
- _attn_implementation="flash_attention_2",
22
- torch_dtype=torch.bfloat16,
23
- trust_remote_code=True,
24
- use_safetensors=True,
25
- )
26
  model = model.eval()
27
  if torch.cuda.is_available():
28
  model = model.to("cuda")
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  BASE_SIZE = 1024
31
  IMAGE_SIZE = 768
32
  CROP_MODE = True
@@ -264,6 +334,11 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
264
  )
265
  input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
266
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
 
 
 
 
 
267
  btn = gr.Button("Extract", variant="primary", size="lg")
268
 
269
  with gr.Column(scale=2):
@@ -286,13 +361,19 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
286
  multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
287
  page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
288
 
289
- def run(multimodal_value, page_num):
290
  file_path = unpack_multimodal(multimodal_value)
291
  if file_path:
 
 
 
 
 
 
292
  return process_file(file_path, int(page_num))
293
  return "Error: Upload a file or image", "", "", None, []
294
 
295
- submit_event = btn.click(run, [multimodal_in, page_selector],
296
  [text_out, md_out, raw_out, img_out, gallery])
297
 
298
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoProcessor
3
  import torch
4
  import spaces
5
  import os
 
16
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
17
 
18
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
19
+ model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
 
 
 
 
 
 
20
  model = model.eval()
21
  if torch.cuda.is_available():
22
  model = model.to("cuda")
23
 
24
+ try:
25
+ from qwen_vl_utils import process_vision_info
26
+ except Exception:
27
+ process_vision_info = None
28
+
29
+ DOTS_OCR_PROMPT = "Extract all text from this image."
30
+
31
+ _DOTS_OCR_MODEL = None
32
+ _DOTS_OCR_PROCESSOR = None
33
+
34
def get_dots_ocr_model():
    """Lazily load and cache the dots.ocr model and processor.

    Returns:
        A ``(model, processor)`` tuple. The first call downloads and
        instantiates both from the ``rednote-hilab/dots.ocr`` repo and
        stores them in module-level globals; later calls return the
        cached pair.
    """
    global _DOTS_OCR_MODEL, _DOTS_OCR_PROCESSOR
    if _DOTS_OCR_MODEL is not None and _DOTS_OCR_PROCESSOR is not None:
        return _DOTS_OCR_MODEL, _DOTS_OCR_PROCESSOR

    cuda_ok = torch.cuda.is_available()
    load_kwargs = {
        # bf16 on GPU; fall back to fp32 on CPU where bf16 support varies.
        "torch_dtype": torch.bfloat16 if cuda_ok else torch.float32,
        "trust_remote_code": True,
    }
    if cuda_ok:
        # Flash attention + automatic device placement only make sense on GPU.
        load_kwargs["attn_implementation"] = "flash_attention_2"
        load_kwargs["device_map"] = "auto"

    _DOTS_OCR_MODEL = AutoModelForCausalLM.from_pretrained(
        "rednote-hilab/dots.ocr",
        **load_kwargs,
    )
    _DOTS_OCR_PROCESSOR = AutoProcessor.from_pretrained(
        "rednote-hilab/dots.ocr",
        trust_remote_code=True,
    )
    return _DOTS_OCR_MODEL, _DOTS_OCR_PROCESSOR
54
+
55
def dots_ocr_infer(image, prompt=DOTS_OCR_PROMPT, max_new_tokens=4096):
    """Run OCR on a single image with the dots.ocr vision-language model.

    Args:
        image: A PIL image (anything ``qwen_vl_utils.process_vision_info``
            accepts as an ``"image"`` content entry).
        prompt: Instruction text sent alongside the image.
        max_new_tokens: Generation budget for the decoded answer.

    Returns:
        The decoded model output as a string, or an error message when the
        optional ``qwen_vl_utils`` dependency failed to import.
    """
    if process_vision_info is None:
        # qwen_vl_utils was unavailable at module load; degrade gracefully
        # instead of crashing the Gradio handler.
        return "dots.ocr error: qwen_vl_utils is not available."
    model, processor = get_dots_ocr_model()
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement: with device_map="auto" the weights
    # may live on a different device than the default.
    device = next(model.parameters()).device
    inputs = inputs.to(device)
    with torch.no_grad():
        # Greedy decoding. Do NOT pass temperature here: with
        # do_sample=False transformers ignores it and emits an
        # "unused generation flags" warning, so it was dead config.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    # Strip the prompt tokens so only newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0] if output_text else ""
99
+
100
  BASE_SIZE = 1024
101
  IMAGE_SIZE = 768
102
  CROP_MODE = True
 
334
  )
335
  input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
336
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
337
+ model_choice = gr.Dropdown(
338
+ ["DeepSeek-OCR-2", "dots.ocr"],
339
+ value="DeepSeek-OCR-2",
340
+ label="Model",
341
+ )
342
  btn = gr.Button("Extract", variant="primary", size="lg")
343
 
344
  with gr.Column(scale=2):
 
361
  multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
362
  page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
363
 
364
def run(multimodal_value, page_num, model_name):
    """Dispatch an extraction request to the selected OCR backend.

    Returns the 5-tuple expected by the Gradio outputs:
    (text, markdown, raw, annotated image, gallery).
    """
    file_path = unpack_multimodal(multimodal_value)
    if not file_path:
        return "Error: Upload a file or image", "", "", None, []

    if model_name != "dots.ocr":
        # Default path: the DeepSeek-OCR-2 pipeline with layout outputs.
        return process_file(file_path, int(page_num))

    page_image = load_image(file_path, int(page_num))
    if page_image is None:
        return "Error: Upload a file or image", "", "", None, []
    extracted = dots_ocr_infer(page_image)
    # dots.ocr yields plain text only, so mirror it into every text
    # output and leave the image/gallery slots empty.
    return extracted, extracted, extracted, None, []
375
 
376
+ submit_event = btn.click(run, [multimodal_in, page_selector, model_choice],
377
  [text_out, md_out, raw_out, img_out, gallery])
378
 
379
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,11 +1,10 @@
1
- torch==2.6.0
2
- transformers==4.46.3
3
- tokenizers==0.20.3
4
- accelerate
5
- einops
6
- addict
7
- easydict
8
  torchvision
9
- flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 
10
  PyMuPDF
11
- hf_transfer
 
 
1
+ spaces
2
+ huggingface_hub
3
+ transformers==4.51.3
4
+ torch
 
 
 
5
  torchvision
6
+ qwen_vl_utils
7
+ Pillow
8
  PyMuPDF
9
+ accelerate
10
+ https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl