Kamal-prog-code committed
Commit ca7e05a · 1 Parent(s): 5ebb043

revert back to deepseek

Files changed (2):
  1. app.py +7 -184
  2. requirements.txt +9 -10
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoProcessor
+from transformers import AutoModel, AutoTokenizer
 import torch
 import spaces
 import os
@@ -12,178 +12,12 @@ import re
 import numpy as np
 import base64
 from io import StringIO, BytesIO
-from huggingface_hub import snapshot_download
 
-def ensure_llama_flash_attn2():
-    try:
-        from transformers.models.llama import modeling_llama as llama_mod
-    except Exception:
-        return
-    if not hasattr(llama_mod, "LlamaFlashAttention2"):
-        class LlamaFlashAttention2:  # fallback shim; not used when attn impl is SDPA
-            pass
-        llama_mod.LlamaFlashAttention2 = LlamaFlashAttention2
-
-ensure_llama_flash_attn2()
-
-def ensure_dynamiccache_max_length():
-    try:
-        from transformers.cache_utils import DynamicCache
-    except Exception:
-        return
-    if not hasattr(DynamicCache, "get_max_length"):
-        def get_max_length(self):
-            return self.get_seq_length()
-        DynamicCache.get_max_length = get_max_length
-
-ensure_dynamiccache_max_length()
-
-def allow_none_video_processor():
-    try:
-        import transformers.processing_utils as proc_utils
-    except Exception:
-        return
-    original = proc_utils.ProcessorMixin.check_argument_for_proper_class
-    def patched(self, attribute_name, arg):
-        if attribute_name == "video_processor" and arg is None:
-            return
-        return original(self, attribute_name, arg)
-    proc_utils.ProcessorMixin.check_argument_for_proper_class = patched
-
-allow_none_video_processor()
-
-MODEL_NAME = "deepseek-ai/DeepSeek-OCR-2"
-MODEL_REVISION = "e6322a289fe5b5218278d276d4e7c58e8103f46a"
-DOTS_OCR_MODEL = "rednote-hilab/dots.ocr"
-DOTS_OCR_REVISION = "c69eab6fac32ae66aaa8deea1f28a550ca8adec7"
-DOTS_OCR_LOCAL_DIR = "./models/dots-ocr"
-
-def resolve_attn_impl():
-    if os.environ.get("DISABLE_FLASH_ATTN") == "1":
-        return "eager"
-    try:
-        import flash_attn  # noqa: F401
-        return "flash_attention_2"
-    except Exception:
-        return "eager"
-
-
-ATTN_IMPL = resolve_attn_impl()
-
-def resolve_torch_dtype():
-    if torch.cuda.is_available():
-        if os.environ.get("FORCE_BF16") == "1" and torch.cuda.is_bf16_supported():
-            return torch.bfloat16
-        return torch.float16
-    return torch.float32
-
-TORCH_DTYPE = resolve_torch_dtype()
-
-tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_NAME,
-    trust_remote_code=True,
-    revision=MODEL_REVISION,
-)
-model = AutoModel.from_pretrained(
-    MODEL_NAME,
-    attn_implementation=ATTN_IMPL,
-    torch_dtype=TORCH_DTYPE,
-    trust_remote_code=True,
-    use_safetensors=True,
-    revision=MODEL_REVISION,
-)
-model = model.eval()
-if torch.cuda.is_available():
-    model = model.to("cuda")
-    if TORCH_DTYPE == torch.float16:
-        model = model.to(torch.float16)
-
-try:
-    from qwen_vl_utils import process_vision_info
-except Exception:
-    process_vision_info = None
-
-DOTS_OCR_PROMPT = "Extract all text from this image."
-
-_DOTS_OCR_MODEL = None
-_DOTS_OCR_PROCESSOR = None
-
-def get_dots_ocr_model():
-    global _DOTS_OCR_MODEL, _DOTS_OCR_PROCESSOR
-    if _DOTS_OCR_MODEL is None or _DOTS_OCR_PROCESSOR is None:
-        os.makedirs(DOTS_OCR_LOCAL_DIR, exist_ok=True)
-        snapshot_download(
-            repo_id=DOTS_OCR_MODEL,
-            revision=DOTS_OCR_REVISION,
-            local_dir=DOTS_OCR_LOCAL_DIR,
-            local_dir_use_symlinks=False,
-        )
-        dtype = TORCH_DTYPE
-        model_kwargs = {
-            "torch_dtype": dtype,
-            "trust_remote_code": True,
-            "revision": DOTS_OCR_REVISION,
-        }
-        if torch.cuda.is_available():
-            model_kwargs["attn_implementation"] = ATTN_IMPL
-            model_kwargs["device_map"] = "auto"
-        _DOTS_OCR_MODEL = AutoModelForCausalLM.from_pretrained(
-            DOTS_OCR_LOCAL_DIR,
-            **model_kwargs,
-        )
-        _DOTS_OCR_PROCESSOR = AutoProcessor.from_pretrained(
-            DOTS_OCR_LOCAL_DIR,
-            trust_remote_code=True,
-            revision=DOTS_OCR_REVISION,
-        )
-    return _DOTS_OCR_MODEL, _DOTS_OCR_PROCESSOR
-
-def dots_ocr_infer(image, prompt=DOTS_OCR_PROMPT, max_new_tokens=4096):
-    if process_vision_info is None:
-        return "dots.ocr error: qwen_vl_utils is not available."
-    model, processor = get_dots_ocr_model()
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": prompt},
-            ],
-        }
-    ]
-    text = processor.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True,
-    )
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    device = next(model.parameters()).device
-    inputs = inputs.to(device)
-    if TORCH_DTYPE in (torch.float16, torch.bfloat16) and "pixel_values" in inputs:
-        inputs["pixel_values"] = inputs["pixel_values"].to(TORCH_DTYPE)
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            do_sample=False,
-            temperature=0.1,
-        )
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False,
-    )
-    return output_text[0] if output_text else ""
+MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
+model = model.eval().cuda()
 
 BASE_SIZE = 1024
 IMAGE_SIZE = 768
@@ -422,11 +256,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
             )
             input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
             page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
-            model_choice = gr.Dropdown(
-                ["DeepSeek-OCR-2", "dots.ocr"],
-                value="DeepSeek-OCR-2",
-                label="Model",
-            )
             btn = gr.Button("Extract", variant="primary", size="lg")
 
         with gr.Column(scale=2):
@@ -449,19 +278,13 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
     multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
     page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
 
-    def run(multimodal_value, page_num, model_name):
+    def run(multimodal_value, page_num):
         file_path = unpack_multimodal(multimodal_value)
         if file_path:
-            if model_name == "dots.ocr":
-                image = load_image(file_path, int(page_num))
-                if image is None:
-                    return "Error: Upload a file or image", "", "", None, []
-                dots_text = dots_ocr_infer(image)
-                return dots_text, dots_text, dots_text, None, []
             return process_file(file_path, int(page_num))
         return "Error: Upload a file or image", "", "", None, []
 
-    submit_event = btn.click(run, [multimodal_in, page_selector, model_choice],
+    submit_event = btn.click(run, [multimodal_in, page_selector],
                              [text_out, md_out, raw_out, img_out, gallery])
 
 if __name__ == "__main__":
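
For reference, the revert leaves app.py loading DeepSeek-OCR-2 unconditionally (bfloat16, flash-attention, CUDA) and drops the dots.ocr path along with the transformers compatibility shims. Below is a minimal sketch of driving the reloaded model; the `infer` entry point and its keyword arguments come from the DeepSeek-OCR model card's remote code and are assumed here to carry over to DeepSeek-OCR-2, and the prompt string and image path are placeholders:

# Sketch: driving the reverted model through its custom remote code.
# `model.infer` and its arguments follow the DeepSeek-OCR model card and
# are assumptions for DeepSeek-OCR-2; "doc.png" is a hypothetical input.
import torch
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2',
                                  torch_dtype=torch.bfloat16, trust_remote_code=True,
                                  use_safetensors=True)
model = model.eval().cuda()

result = model.infer(
    tokenizer,
    prompt="<image>\nFree OCR.",  # assumed prompt format from the model card
    image_file="doc.png",         # hypothetical input image
    base_size=1024,               # mirrors BASE_SIZE in app.py
    image_size=768,               # mirrors IMAGE_SIZE in app.py
    crop_mode=True,
)
print(result)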
requirements.txt CHANGED
@@ -1,12 +1,11 @@
-spaces
-huggingface_hub
-transformers
-torch
-torchvision
-qwen_vl_utils
-Pillow
-PyMuPDF
+torch==2.6.0
+transformers==4.46.3
+tokenizers
 accelerate
-addict
-matplotlib
 einops
+addict
+easydict
+torchvision
+flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+PyMuPDF
+hf_transfer
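
The flash-attn pin points at a prebuilt wheel for CUDA 12, torch 2.6, and CPython 3.10, which is why torch is pinned to 2.6.0 beside it. A quick sanity check for that combination, as a sketch rather than part of the commit:

# Sketch: verify the environment matches the pinned requirements above.
import sys
import torch
import transformers

assert torch.__version__.startswith("2.6"), "flash-attn wheel targets torch 2.6"
assert transformers.__version__ == "4.46.3"
assert sys.version_info[:2] == (3, 10), "flash-attn wheel is a cp310 build"

try:
    import flash_attn  # importable only if the prebuilt wheel matched the env
    print("flash-attn", flash_attn.__version__)
except ImportError as exc:
    # app.py hard-codes _attn_implementation='flash_attention_2', so the
    # model load will fail without this package.
    print("flash-attn unavailable:", exc)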