Spaces:
Running
on
Zero
Running
on
Zero
Kamal-prog-code
committed on
Commit
·
ce9e07d
1
Parent(s):
3f4b600
only deepseek
Browse files
- app.py +4 -124
- requirements.txt +2 -14
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from transformers import AutoModel, AutoTokenizer
|
| 3 |
import torch
|
| 4 |
import spaces
|
| 5 |
import os
|
|
@@ -12,12 +12,8 @@ import re
|
|
| 12 |
import numpy as np
|
| 13 |
import base64
|
| 14 |
from io import StringIO, BytesIO
|
| 15 |
-
from huggingface_hub import snapshot_download
|
| 16 |
|
| 17 |
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
|
| 18 |
-
NEMOTRON_REPO = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
|
| 19 |
-
NEMOTRON_LOCAL_DIR = "./models/nemotron-parse"
|
| 20 |
-
NEMOTRON_REVISION = "e185ab4"
|
| 21 |
|
| 22 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 23 |
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
|
|
@@ -42,113 +38,6 @@ INFO_MD = """
|
|
| 42 |
- `<image>` is the placeholder where visual tokens are inserted.
|
| 43 |
"""
|
| 44 |
|
| 45 |
-
_NEMOTRON_MODEL = None
|
| 46 |
-
_NEMOTRON_PROCESSOR = None
|
| 47 |
-
_NEMOTRON_GENERATION_CONFIG = None
|
| 48 |
-
_NEMOTRON_POST = None
|
| 49 |
-
_NEMOTRON_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 50 |
-
|
| 51 |
-
def get_nemotron_components():
|
| 52 |
-
global _NEMOTRON_MODEL, _NEMOTRON_PROCESSOR, _NEMOTRON_GENERATION_CONFIG, _NEMOTRON_POST
|
| 53 |
-
if _NEMOTRON_MODEL is None or _NEMOTRON_PROCESSOR is None:
|
| 54 |
-
os.makedirs(NEMOTRON_LOCAL_DIR, exist_ok=True)
|
| 55 |
-
model_dir = snapshot_download(
|
| 56 |
-
repo_id=NEMOTRON_REPO,
|
| 57 |
-
revision=NEMOTRON_REVISION,
|
| 58 |
-
local_dir=NEMOTRON_LOCAL_DIR,
|
| 59 |
-
local_dir_use_symlinks=False,
|
| 60 |
-
)
|
| 61 |
-
if model_dir not in sys.path:
|
| 62 |
-
sys.path.append(model_dir)
|
| 63 |
-
if _NEMOTRON_POST is None:
|
| 64 |
-
from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
|
| 65 |
-
_NEMOTRON_POST = (extract_classes_bboxes, transform_bbox_to_original, postprocess_text)
|
| 66 |
-
_NEMOTRON_PROCESSOR = AutoProcessor.from_pretrained(
|
| 67 |
-
model_dir,
|
| 68 |
-
trust_remote_code=True,
|
| 69 |
-
revision=NEMOTRON_REVISION,
|
| 70 |
-
)
|
| 71 |
-
_NEMOTRON_MODEL = AutoModel.from_pretrained(
|
| 72 |
-
model_dir,
|
| 73 |
-
trust_remote_code=True,
|
| 74 |
-
revision=NEMOTRON_REVISION,
|
| 75 |
-
torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
|
| 76 |
-
).to(_NEMOTRON_DEVICE).eval()
|
| 77 |
-
try:
|
| 78 |
-
_NEMOTRON_GENERATION_CONFIG = GenerationConfig.from_pretrained(
|
| 79 |
-
model_dir,
|
| 80 |
-
trust_remote_code=True,
|
| 81 |
-
revision=NEMOTRON_REVISION,
|
| 82 |
-
)
|
| 83 |
-
except Exception:
|
| 84 |
-
_NEMOTRON_GENERATION_CONFIG = GenerationConfig(max_new_tokens=4096)
|
| 85 |
-
return _NEMOTRON_MODEL, _NEMOTRON_PROCESSOR, _NEMOTRON_GENERATION_CONFIG, _NEMOTRON_POST
|
| 86 |
-
|
| 87 |
-
def process_nemotron_image(image):
|
| 88 |
-
if image is None:
|
| 89 |
-
return "Please upload an image first.", None, ""
|
| 90 |
-
model_n, processor_n, generation_config, post_funcs = get_nemotron_components()
|
| 91 |
-
extract_classes_bboxes, transform_bbox_to_original, postprocess_text = post_funcs
|
| 92 |
-
|
| 93 |
-
task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
|
| 94 |
-
inputs = processor_n(images=[image], text=task_prompt, return_tensors="pt").to(_NEMOTRON_DEVICE)
|
| 95 |
-
if _NEMOTRON_DEVICE.type == "cuda":
|
| 96 |
-
inputs = {k: v.to(torch.bfloat16) if v.dtype == torch.float32 else v for k, v in inputs.items()}
|
| 97 |
-
|
| 98 |
-
with torch.no_grad():
|
| 99 |
-
outputs = model_n.generate(
|
| 100 |
-
**inputs,
|
| 101 |
-
generation_config=generation_config,
|
| 102 |
-
)
|
| 103 |
-
|
| 104 |
-
generated_text = processor_n.batch_decode(outputs, skip_special_tokens=True)[0]
|
| 105 |
-
try:
|
| 106 |
-
classes, bboxes, texts = extract_classes_bboxes(generated_text)
|
| 107 |
-
except Exception:
|
| 108 |
-
return generated_text, image, generated_text
|
| 109 |
-
|
| 110 |
-
bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
|
| 111 |
-
processed_texts = [
|
| 112 |
-
postprocess_text(
|
| 113 |
-
text,
|
| 114 |
-
cls=cls,
|
| 115 |
-
table_format="latex",
|
| 116 |
-
text_format="markdown",
|
| 117 |
-
blank_text_in_figures=False,
|
| 118 |
-
)
|
| 119 |
-
for text, cls in zip(texts, classes)
|
| 120 |
-
]
|
| 121 |
-
|
| 122 |
-
result_image = image.copy()
|
| 123 |
-
draw = ImageDraw.Draw(result_image)
|
| 124 |
-
color_map = {
|
| 125 |
-
"Table": "red",
|
| 126 |
-
"Figure": "blue",
|
| 127 |
-
"Text": "green",
|
| 128 |
-
"Title": "purple",
|
| 129 |
-
}
|
| 130 |
-
|
| 131 |
-
final_output_text = ""
|
| 132 |
-
for cls, bbox, txt in zip(classes, bboxes, processed_texts):
|
| 133 |
-
x1, y1, x2, y2 = bbox
|
| 134 |
-
xmin = min(x1, x2)
|
| 135 |
-
ymin = min(y1, y2)
|
| 136 |
-
xmax = max(x1, x2)
|
| 137 |
-
ymax = max(y1, y2)
|
| 138 |
-
color = color_map.get(cls, "red")
|
| 139 |
-
draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=3)
|
| 140 |
-
if cls == "Table":
|
| 141 |
-
final_output_text += f"\n\n--- [Table] ---\n{txt}\n-----------------\n"
|
| 142 |
-
elif cls == "Figure":
|
| 143 |
-
final_output_text += "\n\n--- [Figure] ---\n(Figure Detected)\n-----------------\n"
|
| 144 |
-
else:
|
| 145 |
-
final_output_text += f"{txt}\n"
|
| 146 |
-
|
| 147 |
-
if not final_output_text.strip() and generated_text:
|
| 148 |
-
final_output_text = generated_text
|
| 149 |
-
|
| 150 |
-
return final_output_text, result_image, generated_text
|
| 151 |
-
|
| 152 |
def extract_grounding_references(text):
|
| 153 |
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
|
| 154 |
return re.findall(pattern, text, re.DOTALL)
|
|
@@ -367,11 +256,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 367 |
)
|
| 368 |
input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
|
| 369 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
| 370 |
-
model_choice = gr.Dropdown(
|
| 371 |
-
["DeepSeek-OCR-2", "NVIDIA Nemotron Parse OCR"],
|
| 372 |
-
value="DeepSeek-OCR-2",
|
| 373 |
-
label="Model",
|
| 374 |
-
)
|
| 375 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 376 |
|
| 377 |
with gr.Column(scale=2):
|
|
@@ -394,18 +278,14 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
|
|
| 394 |
multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
|
| 395 |
page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
|
| 396 |
|
| 397 |
-
def run(multimodal_value, page_num, model_name):
|
| 398 |
file_path = unpack_multimodal(multimodal_value)
|
| 399 |
if file_path:
|
| 400 |
-
if model_name == "NVIDIA Nemotron Parse OCR":
|
| 401 |
-
image = load_image(file_path, int(page_num))
|
| 402 |
-
text_out_n, img_out_n, raw_out_n = process_nemotron_image(image)
|
| 403 |
-
return text_out_n, text_out_n, raw_out_n, img_out_n, []
|
| 404 |
return process_file(file_path, int(page_num))
|
| 405 |
return "Error: Upload a file or image", "", "", None, []
|
| 406 |
|
| 407 |
-
submit_event = btn.click(run, [multimodal_in, page_selector, model_choice],
|
| 408 |
[text_out, md_out, raw_out, img_out, gallery])
|
| 409 |
|
| 410 |
if __name__ == "__main__":
|
| 411 |
-
demo.queue(max_size=20).launch(theme=gr.themes.Soft())
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from transformers import AutoModel, AutoTokenizer
|
| 3 |
import torch
|
| 4 |
import spaces
|
| 5 |
import os
|
|
|
|
| 12 |
import numpy as np
|
| 13 |
import base64
|
| 14 |
from io import StringIO, BytesIO
|
|
|
|
| 15 |
|
| 16 |
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
|
| 19 |
model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
|
|
|
|
| 38 |
- `<image>` is the placeholder where visual tokens are inserted.
|
| 39 |
"""
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
def extract_grounding_references(text):
|
| 42 |
pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
|
| 43 |
return re.findall(pattern, text, re.DOTALL)
|
|
|
|
| 256 |
)
|
| 257 |
input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
|
| 258 |
page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
btn = gr.Button("Extract", variant="primary", size="lg")
|
| 260 |
|
| 261 |
with gr.Column(scale=2):
|
|
|
|
| 278 |
multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
|
| 279 |
page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
|
| 280 |
|
| 281 |
+
def run(multimodal_value, page_num):
|
| 282 |
file_path = unpack_multimodal(multimodal_value)
|
| 283 |
if file_path:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
return process_file(file_path, int(page_num))
|
| 285 |
return "Error: Upload a file or image", "", "", None, []
|
| 286 |
|
| 287 |
+
submit_event = btn.click(run, [multimodal_in, page_selector],
|
| 288 |
[text_out, md_out, raw_out, img_out, gallery])
|
| 289 |
|
| 290 |
if __name__ == "__main__":
|
| 291 |
+
demo.queue(max_size=20).launch(theme=gr.themes.Soft())
|
requirements.txt
CHANGED
|
@@ -8,19 +8,7 @@ easydict
|
|
| 8 |
torchvision
|
| 9 |
flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
| 10 |
PyMuPDF
|
| 11 |
-
hf_transfer
|
| 12 |
-
gradio
|
| 13 |
spaces
|
| 14 |
-
|
| 15 |
Pillow
|
| 16 |
-
|
| 17 |
-
numpy==1.26.4
|
| 18 |
-
timm
|
| 19 |
-
torchmetrics
|
| 20 |
-
mdtex2html
|
| 21 |
-
html2text
|
| 22 |
-
albumentations
|
| 23 |
-
beautifulsoup4
|
| 24 |
-
open-clip-torch
|
| 25 |
-
opencv_python_headless==4.9.0.80
|
| 26 |
-
safetensors
|
|
|
|
| 8 |
torchvision
|
| 9 |
flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
| 10 |
PyMuPDF
|
|
|
|
|
|
|
| 11 |
spaces
|
| 12 |
+
gradio
|
| 13 |
Pillow
|
| 14 |
+
hf_transfer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|