Kamal-prog-code committed on
Commit
ce9e07d
·
1 Parent(s): 3f4b600

only deepseek

Browse files
Files changed (2) hide show
  1. app.py +4 -124
  2. requirements.txt +2 -14
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import AutoModel, AutoTokenizer, AutoProcessor, GenerationConfig
3
  import torch
4
  import spaces
5
  import os
@@ -12,12 +12,8 @@ import re
12
  import numpy as np
13
  import base64
14
  from io import StringIO, BytesIO
15
- from huggingface_hub import snapshot_download
16
 
17
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
18
- NEMOTRON_REPO = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
19
- NEMOTRON_LOCAL_DIR = "./models/nemotron-parse"
20
- NEMOTRON_REVISION = "e185ab4"
21
 
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
23
  model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
@@ -42,113 +38,6 @@ INFO_MD = """
42
  - `<image>` is the placeholder where visual tokens are inserted.
43
  """
44
 
45
- _NEMOTRON_MODEL = None
46
- _NEMOTRON_PROCESSOR = None
47
- _NEMOTRON_GENERATION_CONFIG = None
48
- _NEMOTRON_POST = None
49
- _NEMOTRON_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
50
-
51
- def get_nemotron_components():
52
- global _NEMOTRON_MODEL, _NEMOTRON_PROCESSOR, _NEMOTRON_GENERATION_CONFIG, _NEMOTRON_POST
53
- if _NEMOTRON_MODEL is None or _NEMOTRON_PROCESSOR is None:
54
- os.makedirs(NEMOTRON_LOCAL_DIR, exist_ok=True)
55
- model_dir = snapshot_download(
56
- repo_id=NEMOTRON_REPO,
57
- revision=NEMOTRON_REVISION,
58
- local_dir=NEMOTRON_LOCAL_DIR,
59
- local_dir_use_symlinks=False,
60
- )
61
- if model_dir not in sys.path:
62
- sys.path.append(model_dir)
63
- if _NEMOTRON_POST is None:
64
- from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
65
- _NEMOTRON_POST = (extract_classes_bboxes, transform_bbox_to_original, postprocess_text)
66
- _NEMOTRON_PROCESSOR = AutoProcessor.from_pretrained(
67
- model_dir,
68
- trust_remote_code=True,
69
- revision=NEMOTRON_REVISION,
70
- )
71
- _NEMOTRON_MODEL = AutoModel.from_pretrained(
72
- model_dir,
73
- trust_remote_code=True,
74
- revision=NEMOTRON_REVISION,
75
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
76
- ).to(_NEMOTRON_DEVICE).eval()
77
- try:
78
- _NEMOTRON_GENERATION_CONFIG = GenerationConfig.from_pretrained(
79
- model_dir,
80
- trust_remote_code=True,
81
- revision=NEMOTRON_REVISION,
82
- )
83
- except Exception:
84
- _NEMOTRON_GENERATION_CONFIG = GenerationConfig(max_new_tokens=4096)
85
- return _NEMOTRON_MODEL, _NEMOTRON_PROCESSOR, _NEMOTRON_GENERATION_CONFIG, _NEMOTRON_POST
86
-
87
- def process_nemotron_image(image):
88
- if image is None:
89
- return "Please upload an image first.", None, ""
90
- model_n, processor_n, generation_config, post_funcs = get_nemotron_components()
91
- extract_classes_bboxes, transform_bbox_to_original, postprocess_text = post_funcs
92
-
93
- task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
94
- inputs = processor_n(images=[image], text=task_prompt, return_tensors="pt").to(_NEMOTRON_DEVICE)
95
- if _NEMOTRON_DEVICE.type == "cuda":
96
- inputs = {k: v.to(torch.bfloat16) if v.dtype == torch.float32 else v for k, v in inputs.items()}
97
-
98
- with torch.no_grad():
99
- outputs = model_n.generate(
100
- **inputs,
101
- generation_config=generation_config,
102
- )
103
-
104
- generated_text = processor_n.batch_decode(outputs, skip_special_tokens=True)[0]
105
- try:
106
- classes, bboxes, texts = extract_classes_bboxes(generated_text)
107
- except Exception:
108
- return generated_text, image, generated_text
109
-
110
- bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
111
- processed_texts = [
112
- postprocess_text(
113
- text,
114
- cls=cls,
115
- table_format="latex",
116
- text_format="markdown",
117
- blank_text_in_figures=False,
118
- )
119
- for text, cls in zip(texts, classes)
120
- ]
121
-
122
- result_image = image.copy()
123
- draw = ImageDraw.Draw(result_image)
124
- color_map = {
125
- "Table": "red",
126
- "Figure": "blue",
127
- "Text": "green",
128
- "Title": "purple",
129
- }
130
-
131
- final_output_text = ""
132
- for cls, bbox, txt in zip(classes, bboxes, processed_texts):
133
- x1, y1, x2, y2 = bbox
134
- xmin = min(x1, x2)
135
- ymin = min(y1, y2)
136
- xmax = max(x1, x2)
137
- ymax = max(y1, y2)
138
- color = color_map.get(cls, "red")
139
- draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=3)
140
- if cls == "Table":
141
- final_output_text += f"\n\n--- [Table] ---\n{txt}\n-----------------\n"
142
- elif cls == "Figure":
143
- final_output_text += "\n\n--- [Figure] ---\n(Figure Detected)\n-----------------\n"
144
- else:
145
- final_output_text += f"{txt}\n"
146
-
147
- if not final_output_text.strip() and generated_text:
148
- final_output_text = generated_text
149
-
150
- return final_output_text, result_image, generated_text
151
-
152
  def extract_grounding_references(text):
153
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
154
  return re.findall(pattern, text, re.DOTALL)
@@ -367,11 +256,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
367
  )
368
  input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
369
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
370
- model_choice = gr.Dropdown(
371
- ["DeepSeek-OCR-2", "NVIDIA Nemotron Parse OCR"],
372
- value="DeepSeek-OCR-2",
373
- label="Model",
374
- )
375
  btn = gr.Button("Extract", variant="primary", size="lg")
376
 
377
  with gr.Column(scale=2):
@@ -394,18 +278,14 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
394
  multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
395
  page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
396
 
397
- def run(multimodal_value, page_num, model_name):
398
  file_path = unpack_multimodal(multimodal_value)
399
  if file_path:
400
- if model_name == "NVIDIA Nemotron Parse OCR":
401
- image = load_image(file_path, int(page_num))
402
- text_out_n, img_out_n, raw_out_n = process_nemotron_image(image)
403
- return text_out_n, text_out_n, raw_out_n, img_out_n, []
404
  return process_file(file_path, int(page_num))
405
  return "Error: Upload a file or image", "", "", None, []
406
 
407
- submit_event = btn.click(run, [multimodal_in, page_selector, model_choice],
408
  [text_out, md_out, raw_out, img_out, gallery])
409
 
410
  if __name__ == "__main__":
411
- demo.queue(max_size=20).launch(theme=gr.themes.Soft())
 
1
  import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer
3
  import torch
4
  import spaces
5
  import os
 
12
  import numpy as np
13
  import base64
14
  from io import StringIO, BytesIO
 
15
 
16
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
 
 
 
17
 
18
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
19
  model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
 
38
  - `<image>` is the placeholder where visual tokens are inserted.
39
  """
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def extract_grounding_references(text):
42
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
43
  return re.findall(pattern, text, re.DOTALL)
 
256
  )
257
  input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
258
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
 
 
 
 
 
259
  btn = gr.Button("Extract", variant="primary", size="lg")
260
 
261
  with gr.Column(scale=2):
 
278
  multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
279
  page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
280
 
281
+ def run(multimodal_value, page_num):
282
  file_path = unpack_multimodal(multimodal_value)
283
  if file_path:
 
 
 
 
284
  return process_file(file_path, int(page_num))
285
  return "Error: Upload a file or image", "", "", None, []
286
 
287
+ submit_event = btn.click(run, [multimodal_in, page_selector],
288
  [text_out, md_out, raw_out, img_out, gallery])
289
 
290
  if __name__ == "__main__":
291
+ demo.queue(max_size=20).launch(theme=gr.themes.Soft())
requirements.txt CHANGED
@@ -8,19 +8,7 @@ easydict
8
  torchvision
9
  flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
10
  PyMuPDF
11
- hf_transfer
12
- gradio
13
  spaces
14
- huggingface_hub
15
  Pillow
16
- sentencepiece
17
- numpy==1.26.4
18
- timm
19
- torchmetrics
20
- mdtex2html
21
- html2text
22
- albumentations
23
- beautifulsoup4
24
- open-clip-torch
25
- opencv_python_headless==4.9.0.80
26
- safetensors
 
8
  torchvision
9
  flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
10
  PyMuPDF
 
 
11
  spaces
12
+ gradio
13
  Pillow
14
+ hf_transfer