Kamal-prog-code committed on
Commit
ce9e07d
·
1 Parent(s): 3f4b600

only deepseek

Browse files
Files changed (2) hide show
  1. app.py +4 -124
  2. requirements.txt +2 -14
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import AutoModel, AutoTokenizer, AutoProcessor, GenerationConfig
3
  import torch
4
  import spaces
5
  import os
@@ -12,12 +12,8 @@ import re
12
  import numpy as np
13
  import base64
14
  from io import StringIO, BytesIO
15
- from huggingface_hub import snapshot_download
16
 
17
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
18
- NEMOTRON_REPO = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
19
- NEMOTRON_LOCAL_DIR = "./models/nemotron-parse"
20
- NEMOTRON_REVISION = "e185ab4"
21
 
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
23
  model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
@@ -42,113 +38,6 @@ INFO_MD = """
42
  - `<image>` is the placeholder where visual tokens are inserted.
43
  """
44
 
45
- _NEMOTRON_MODEL = None
46
- _NEMOTRON_PROCESSOR = None
47
- _NEMOTRON_GENERATION_CONFIG = None
48
- _NEMOTRON_POST = None
49
- _NEMOTRON_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
50
-
51
- def get_nemotron_components():
52
- global _NEMOTRON_MODEL, _NEMOTRON_PROCESSOR, _NEMOTRON_GENERATION_CONFIG, _NEMOTRON_POST
53
- if _NEMOTRON_MODEL is None or _NEMOTRON_PROCESSOR is None:
54
- os.makedirs(NEMOTRON_LOCAL_DIR, exist_ok=True)
55
- model_dir = snapshot_download(
56
- repo_id=NEMOTRON_REPO,
57
- revision=NEMOTRON_REVISION,
58
- local_dir=NEMOTRON_LOCAL_DIR,
59
- local_dir_use_symlinks=False,
60
- )
61
- if model_dir not in sys.path:
62
- sys.path.append(model_dir)
63
- if _NEMOTRON_POST is None:
64
- from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
65
- _NEMOTRON_POST = (extract_classes_bboxes, transform_bbox_to_original, postprocess_text)
66
- _NEMOTRON_PROCESSOR = AutoProcessor.from_pretrained(
67
- model_dir,
68
- trust_remote_code=True,
69
- revision=NEMOTRON_REVISION,
70
- )
71
- _NEMOTRON_MODEL = AutoModel.from_pretrained(
72
- model_dir,
73
- trust_remote_code=True,
74
- revision=NEMOTRON_REVISION,
75
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
76
- ).to(_NEMOTRON_DEVICE).eval()
77
- try:
78
- _NEMOTRON_GENERATION_CONFIG = GenerationConfig.from_pretrained(
79
- model_dir,
80
- trust_remote_code=True,
81
- revision=NEMOTRON_REVISION,
82
- )
83
- except Exception:
84
- _NEMOTRON_GENERATION_CONFIG = GenerationConfig(max_new_tokens=4096)
85
- return _NEMOTRON_MODEL, _NEMOTRON_PROCESSOR, _NEMOTRON_GENERATION_CONFIG, _NEMOTRON_POST
86
-
87
- def process_nemotron_image(image):
88
- if image is None:
89
- return "Please upload an image first.", None, ""
90
- model_n, processor_n, generation_config, post_funcs = get_nemotron_components()
91
- extract_classes_bboxes, transform_bbox_to_original, postprocess_text = post_funcs
92
-
93
- task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
94
- inputs = processor_n(images=[image], text=task_prompt, return_tensors="pt").to(_NEMOTRON_DEVICE)
95
- if _NEMOTRON_DEVICE.type == "cuda":
96
- inputs = {k: v.to(torch.bfloat16) if v.dtype == torch.float32 else v for k, v in inputs.items()}
97
-
98
- with torch.no_grad():
99
- outputs = model_n.generate(
100
- **inputs,
101
- generation_config=generation_config,
102
- )
103
-
104
- generated_text = processor_n.batch_decode(outputs, skip_special_tokens=True)[0]
105
- try:
106
- classes, bboxes, texts = extract_classes_bboxes(generated_text)
107
- except Exception:
108
- return generated_text, image, generated_text
109
-
110
- bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
111
- processed_texts = [
112
- postprocess_text(
113
- text,
114
- cls=cls,
115
- table_format="latex",
116
- text_format="markdown",
117
- blank_text_in_figures=False,
118
- )
119
- for text, cls in zip(texts, classes)
120
- ]
121
-
122
- result_image = image.copy()
123
- draw = ImageDraw.Draw(result_image)
124
- color_map = {
125
- "Table": "red",
126
- "Figure": "blue",
127
- "Text": "green",
128
- "Title": "purple",
129
- }
130
-
131
- final_output_text = ""
132
- for cls, bbox, txt in zip(classes, bboxes, processed_texts):
133
- x1, y1, x2, y2 = bbox
134
- xmin = min(x1, x2)
135
- ymin = min(y1, y2)
136
- xmax = max(x1, x2)
137
- ymax = max(y1, y2)
138
- color = color_map.get(cls, "red")
139
- draw.rectangle([xmin, ymin, xmax, ymax], outline=color, width=3)
140
- if cls == "Table":
141
- final_output_text += f"\n\n--- [Table] ---\n{txt}\n-----------------\n"
142
- elif cls == "Figure":
143
- final_output_text += "\n\n--- [Figure] ---\n(Figure Detected)\n-----------------\n"
144
- else:
145
- final_output_text += f"{txt}\n"
146
-
147
- if not final_output_text.strip() and generated_text:
148
- final_output_text = generated_text
149
-
150
- return final_output_text, result_image, generated_text
151
-
152
  def extract_grounding_references(text):
153
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
154
  return re.findall(pattern, text, re.DOTALL)
@@ -367,11 +256,6 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
367
  )
368
  input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
369
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
370
- model_choice = gr.Dropdown(
371
- ["DeepSeek-OCR-2", "NVIDIA Nemotron Parse OCR"],
372
- value="DeepSeek-OCR-2",
373
- label="Model",
374
- )
375
  btn = gr.Button("Extract", variant="primary", size="lg")
376
 
377
  with gr.Column(scale=2):
@@ -394,18 +278,14 @@ with gr.Blocks(title="DeepSeek-OCR-2") as demo:
394
  multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
395
  page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
396
 
397
- def run(multimodal_value, page_num, model_name):
398
  file_path = unpack_multimodal(multimodal_value)
399
  if file_path:
400
- if model_name == "NVIDIA Nemotron Parse OCR":
401
- image = load_image(file_path, int(page_num))
402
- text_out_n, img_out_n, raw_out_n = process_nemotron_image(image)
403
- return text_out_n, text_out_n, raw_out_n, img_out_n, []
404
  return process_file(file_path, int(page_num))
405
  return "Error: Upload a file or image", "", "", None, []
406
 
407
- submit_event = btn.click(run, [multimodal_in, page_selector, model_choice],
408
  [text_out, md_out, raw_out, img_out, gallery])
409
 
410
  if __name__ == "__main__":
411
- demo.queue(max_size=20).launch(theme=gr.themes.Soft())
 
1
  import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer
3
  import torch
4
  import spaces
5
  import os
 
12
  import numpy as np
13
  import base64
14
  from io import StringIO, BytesIO
 
15
 
16
  MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'
 
 
 
17
 
18
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
19
  model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
 
38
  - `<image>` is the placeholder where visual tokens are inserted.
39
  """
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def extract_grounding_references(text):
42
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
43
  return re.findall(pattern, text, re.DOTALL)
 
256
  )
257
  input_img = gr.Image(label="Input Image", type="pil", height=300, interactive=False)
258
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
 
 
 
 
 
259
  btn = gr.Button("Extract", variant="primary", size="lg")
260
 
261
  with gr.Column(scale=2):
 
278
  multimodal_in.change(update_page_selector_from_multimodal, [multimodal_in], [page_selector])
279
  page_selector.change(load_image_from_multimodal, [multimodal_in, page_selector], [input_img])
280
 
281
+ def run(multimodal_value, page_num):
282
  file_path = unpack_multimodal(multimodal_value)
283
  if file_path:
 
 
 
 
284
  return process_file(file_path, int(page_num))
285
  return "Error: Upload a file or image", "", "", None, []
286
 
287
+ submit_event = btn.click(run, [multimodal_in, page_selector],
288
  [text_out, md_out, raw_out, img_out, gallery])
289
 
290
  if __name__ == "__main__":
291
+ demo.queue(max_size=20).launch(theme=gr.themes.Soft())
requirements.txt CHANGED
@@ -8,19 +8,7 @@ easydict
8
  torchvision
9
  flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
10
  PyMuPDF
11
- hf_transfer
12
- gradio
13
  spaces
14
- huggingface_hub
15
  Pillow
16
- sentencepiece
17
- numpy==1.26.4
18
- timm
19
- torchmetrics
20
- mdtex2html
21
- html2text
22
- albumentations
23
- beautifulsoup4
24
- open-clip-torch
25
- opencv_python_headless==4.9.0.80
26
- safetensors
 
8
  torchvision
9
  flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
10
  PyMuPDF
 
 
11
  spaces
12
+ gradio
13
  Pillow
14
+ hf_transfer