BinKhoaLe1812 committed on
Commit
2020627
·
verified ·
1 Parent(s): 30bc126

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +53 -8
  2. app.py +339 -0
  3. requirements.txt +11 -0
README.md CHANGED
@@ -1,14 +1,59 @@
1
  ---
2
- title: DeepseekOCR
3
- emoji: ๐Ÿ“Š
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.0.2
8
  app_file: app.py
9
- pinned: false
 
10
  license: mit
11
- short_description: Deepseek OCR Demo
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: DeepSeek OCR
3
+ emoji: ๐Ÿš€
4
+ colorFrom: red
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: 5.49.1
8
  app_file: app.py
9
+ pinned: true
10
+ short_description: DeepSeek-OCR demo on PDF and img
11
  license: mit
 
12
  ---
13
 
14
+ ## API Usage
15
+
16
+ ### Using Gradio Client
17
+
18
+ ```python
19
+ from gradio_client import Client
20
+
21
+ client = Client("BinKhoaLe1812/DeepseekOCR")
22
+
23
+ # Process a PDF or image file
24
+ result = client.predict(
25
+ file_path="path/to/document.pdf", # or image file
26
+ mode="Gundam", # Options: Gundam, Tiny, Small, Base, Large
27
+ task="๐Ÿ“‹ Markdown", # Options: ๐Ÿ“‹ Markdown, ๐Ÿ“ Free OCR, ๐Ÿ“ Locate, ๐Ÿ” Describe, โœ๏ธ Custom
28
+ custom_prompt="", # Required for Custom or Locate tasks
29
+ page_num=1, # Page number (all pages are processed for PDFs)
30
+ api_name="/run"
31
+ )
32
+
33
+ # Result contains: [text_out, markdown_out, raw_out, img_out, gallery]
34
+ text, markdown, raw, image, crops = result
35
+ print(text) # Extracted text
36
+ print(markdown) # Markdown formatted output
37
+ ```
38
+
39
+ ### Using HTTP API
40
+
41
+ ```python
42
+ import requests
43
+
44
+ # Upload file and process
45
+ with open("document.pdf", "rb") as f:
46
+ files = {"files": f}
47
+ data = {
48
+ "mode": "Gundam",
49
+ "task": "๐Ÿ“‹ Markdown",
50
+ "custom_prompt": "",
51
+ "page_num": 1
52
+ }
53
+ response = requests.post(
54
+ "https://YOUR_USERNAME-YOUR_SPACE_NAME.hf.space/api/predict",
55
+ files=files,
56
+ data=data
57
+ )
58
+ result = response.json()
59
+ ```
app.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import ast
import base64
import os
import re
import shutil
import sys
import tempfile
import warnings
from io import StringIO, BytesIO

import fitz
import gradio as gr
import numpy as np
import spaces
import torch
from PIL import Image, ImageDraw, ImageFont, ImageOps
from transformers import AutoModel, AutoTokenizer
16
+
17
+ MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
18
+
19
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
20
+ model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
21
+ model = model.eval().cuda()
22
+
23
+ MODEL_CONFIGS = {
24
+ "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
25
+ "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
26
+ "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
27
+ "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
28
+ "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False}
29
+ }
30
+
31
+ TASK_PROMPTS = {
32
+ "๐Ÿ“‹ Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
33
+ "๐Ÿ“ Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
34
+ "๐Ÿ“ Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
35
+ "๐Ÿ” Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
36
+ "โœ๏ธ Custom": {"prompt": "", "has_grounding": False}
37
+ }
38
+
39
def extract_grounding_references(text):
    """Find all grounding annotations in model output.

    Returns a list of (full_match, label, coords_literal) tuples, one per
    <|ref|>label<|/ref|><|det|>coords<|/det|> annotation found in *text*.
    """
    grounding = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)', re.DOTALL)
    return [m.groups() for m in grounding.finditer(text)]
42
+
43
def draw_bounding_boxes(image, refs, extract_images=False):
    """Draw labelled, semi-transparent boxes for grounding refs on *image*.

    Args:
        image: source PIL image.
        refs: tuples from extract_grounding_references():
            (full_match, label, coords_literal), where coords_literal is a
            Python list literal of [x1, y1, x2, y2] boxes on a 0-999 grid.
        extract_images: when True, also crop out regions labelled 'image'.

    Returns:
        (annotated_image, crops) - a copy of *image* with boxes drawn, and
        the list of cropped PIL images (empty unless extract_images).
    """
    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    # Fills go on a separate RGBA overlay so they can be alpha-blended.
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
    except OSError:
        # The DejaVu font may be absent outside the Space's Docker image;
        # fall back to PIL's built-in font instead of crashing.
        font = ImageFont.load_default()
    crops = []

    # Deterministic pseudo-random color per label.
    color_map = {}
    np.random.seed(42)

    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))

        color = color_map[label]
        # Model output is untrusted text: parse the coordinate literal with
        # ast.literal_eval instead of eval() to avoid code execution.
        coords = ast.literal_eval(ref[2])
        color_a = color + (60,)

        for box in coords:
            # Coordinates are normalized to a 0-999 grid; scale to pixels.
            x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)

            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))

            width = 5 if label == 'title' else 3
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            draw2.rectangle([x1, y1, x2, y2], fill=color_a)

            # Label tag drawn just above the box, clamped to the top edge.
            text_bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            ty = max(0, y1 - 20)
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops
82
+
83
def clean_output(text, include_images=False):
    """Strip grounding annotations from raw model output.

    Annotations labelled 'image' are either dropped or, when
    *include_images* is True, replaced by numbered '**[Figure N]**'
    placeholders (later swapped for inline images by embed_images()).
    Any other annotation removes the entire line it sits on.
    """
    if not text:
        return ""
    grounding = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)', re.DOTALL)
    figure_idx = 0

    for full, _label, _coords in grounding.findall(text):
        if '<|ref|>image<|/ref|>' in full:
            if include_images:
                figure_idx += 1
                text = text.replace(full, f'\n\n**[Figure {figure_idx}]**\n\n', 1)
            else:
                text = text.replace(full, '', 1)
        else:
            # Remove the whole line that carries the annotation.
            text = re.sub(rf'(?m)^[^\n]*{re.escape(full)}[^\n]*\n?', '', text)

    return text.strip()
101
+
102
def embed_images(markdown, crops):
    """Replace '**[Figure N]**' placeholders with inline base64 PNG images.

    Placeholders are numbered from 1 and matched positionally against
    *crops*; markdown is returned unchanged when there are no crops.
    """
    for idx, crop in enumerate(crops, start=1):
        buf = BytesIO()
        crop.save(buf, format="PNG")
        encoded = base64.b64encode(buf.getvalue()).decode()
        markdown = markdown.replace(
            f'**[Figure {idx}]**',
            f'\n\n![Figure {idx}](data:image/png;base64,{encoded})\n\n',
            1,
        )
    return markdown
111
+
112
@spaces.GPU(duration=60)
def process_image(image, mode, task, custom_prompt):
    """Run DeepSeek-OCR on a single PIL image.

    Args:
        image: PIL image or None.
        mode: key into MODEL_CONFIGS (resolution preset).
        task: key into TASK_PROMPTS (emoji-prefixed task name).
        custom_prompt: user text, required for the Custom and Locate tasks.

    Returns:
        (cleaned_text, markdown, raw_output, boxed_image_or_None, crops).
        The first element carries an error message on invalid input.
    """
    if image is None:
        return " Error Upload image", "", "", None, []
    if task in ["โœ๏ธ Custom", "๐Ÿ“ Locate"] and not custom_prompt.strip():
        return "Enter prompt", "", "", None, []

    # Normalize to RGB and honour EXIF orientation before saving as JPEG.
    if image.mode in ('RGBA', 'LA', 'P'):
        image = image.convert('RGB')
    image = ImageOps.exif_transpose(image)

    config = MODEL_CONFIGS[mode]

    if task == "โœ๏ธ Custom":
        prompt = f"<image>\n{custom_prompt.strip()}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "๐Ÿ“ Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    image.save(tmp.name, 'JPEG', quality=95)
    tmp.close()
    out_dir = tempfile.mkdtemp()

    # model.infer() prints its result to stdout, so capture it. Restore
    # stdout and remove temp artifacts even if inference raises; the
    # original left stdout hijacked and files behind on failure.
    stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        model.infer(tokenizer=tokenizer, prompt=prompt, image_file=tmp.name, output_path=out_dir,
                    base_size=config["base_size"], image_size=config["image_size"], crop_mode=config["crop_mode"])
        captured = sys.stdout.getvalue()
    finally:
        sys.stdout = stdout
        os.unlink(tmp.name)
        shutil.rmtree(out_dir, ignore_errors=True)

    # Drop progress/debug lines the model prints alongside the OCR text.
    result = '\n'.join([l for l in captured.split('\n')
                        if not any(s in l for s in ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size'])]).strip()

    if not result:
        return "No text", "", "", None, []

    cleaned = clean_output(result, False)
    markdown = clean_output(result, True)

    img_out = None
    crops = []

    # Draw boxes only when the prompt could have produced grounding output.
    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        if refs:
            img_out, crops = draw_bounding_boxes(image, refs, True)

    markdown = embed_images(markdown, crops)

    return cleaned, markdown, result, img_out, crops
170
+
171
@spaces.GPU(duration=60)
def process_pdf(path, mode, task, custom_prompt, page_num):
    """OCR every page of a PDF and concatenate the per-page results.

    *page_num* is accepted for interface compatibility but ignored: all
    pages are always processed. Returns the same 5-tuple as
    process_image(), with page texts joined by '--- Page Break ---'
    separators, the last page's boxes image, and all crops combined.
    """
    doc = fitz.open(path)
    try:
        total_pages = len(doc)

        all_cleaned = []
        all_markdown = []
        all_raw = []
        all_crops = []
        img_out = None

        for page_idx in range(total_pages):
            page = doc.load_page(page_idx)
            # Render at 300 DPI (PDF native resolution is 72 DPI).
            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
            img = Image.open(BytesIO(pix.tobytes("png")))

            cleaned, markdown, result, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)

            if page_idx == 0:
                # Validation errors are identical on every page; propagate
                # the first page's error immediately.
                if cleaned.startswith(" Error") or cleaned.startswith("Enter prompt") or cleaned == "No text":
                    return cleaned, "", "", None, []

            all_cleaned.append(cleaned)
            all_markdown.append(markdown)
            all_raw.append(result)
            all_crops.extend(page_crops)

            # Keep the most recent page that produced a boxes image.
            if page_img_out is not None:
                img_out = page_img_out
    finally:
        # Always release the document, even if a page fails mid-way
        # (the original leaked the fitz handle on exceptions).
        doc.close()

    combined_cleaned = "\n\n--- Page Break ---\n\n".join(all_cleaned)
    combined_markdown = "\n\n--- Page Break ---\n\n".join(all_markdown)
    combined_raw = "\n\n--- Page Break ---\n\n".join(all_raw)

    return combined_cleaned, combined_markdown, combined_raw, img_out, all_crops
213
+
214
def process_file(path, mode, task, custom_prompt, page_num):
    """Dispatch an uploaded file to the PDF or single-image pipeline.

    Returns the standard 5-tuple (text, markdown, raw, image, crops),
    or an error tuple when no path was supplied.
    """
    if not path:
        return "Error Upload file", "", "", None, []
    is_pdf = path.lower().endswith('.pdf')
    if is_pdf:
        return process_pdf(path, mode, task, custom_prompt, page_num)
    return process_image(Image.open(path), mode, task, custom_prompt)
221
+
222
def toggle_prompt(task):
    """Show the prompt textbox only for tasks that need user-entered text."""
    prompt_fields = {
        "โœ๏ธ Custom": ("Custom Prompt", "Add <|grounding|> for boxes"),
        "๐Ÿ“ Locate": ("Text to Locate", "Enter text"),
    }
    if task in prompt_fields:
        label, placeholder = prompt_fields[task]
        return gr.update(visible=True, label=label, placeholder=placeholder)
    return gr.update(visible=False)
228
+
229
def select_boxes(task):
    """Switch to the Boxes tab when the Locate task is active; else no-op."""
    if task != "๐Ÿ“ Locate":
        return gr.update()
    return gr.update(selected="tab_boxes")
233
+
234
def get_pdf_page_count(file_path):
    """Return the page count of a PDF, or 1 for images and missing paths."""
    if not file_path or not file_path.lower().endswith('.pdf'):
        return 1
    doc = fitz.open(file_path)
    try:
        return len(doc)
    finally:
        # Release the document even if len() raises (original leaked it).
        doc.close()
241
+
242
def load_image(file_path, page_num=1):
    """Load a preview image for the UI.

    For PDFs, renders the requested page at 300 DPI; *page_num* is 1-based
    and clamped to the document's page range. Returns a PIL image, or None
    when no path was given.
    """
    if not file_path:
        return None
    if file_path.lower().endswith('.pdf'):
        doc = fitz.open(file_path)
        try:
            page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
            page = doc.load_page(page_idx)
            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
            img = Image.open(BytesIO(pix.tobytes("png")))
        finally:
            # Close even if rendering raises (original leaked the handle).
            doc.close()
        return img
    return Image.open(file_path)
255
+
256
def update_page_selector(file_path):
    """Show a 1..N page selector for PDFs; hide it for anything else."""
    if file_path and file_path.lower().endswith('.pdf'):
        page_count = get_pdf_page_count(file_path)
        return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
                         label=f"Select Page (1-{page_count})")
    return gr.update(visible=False)
264
+
265
# ---- Gradio UI: inputs on the left, tabbed outputs on the right -------------
with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
    gr.Markdown("""
    # ๐Ÿš€ DeepSeek-OCR

    **Document parser with OCR capabilities. Process multi-page PDFs and images to extract text, convert to markdown, or locate specific content with bounding boxes.**
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Upload widget, page preview, and task configuration.
            file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
            input_img = gr.Image(label="Input Image", type="pil", height=300)
            # Hidden until a PDF is uploaded (see update_page_selector).
            page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
            mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="๐Ÿ“‹ Markdown", label="Task")
            # Hidden unless the task needs user text (see toggle_prompt).
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column(scale=2):
            # One tab per output representation of the same OCR run.
            with gr.Tabs() as tabs:
                with gr.Tab("Text", id="tab_text"):
                    text_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
                with gr.Tab("Markdown Preview", id="tab_markdown"):
                    md_out = gr.Markdown("")
                with gr.Tab("Boxes", id="tab_boxes"):
                    img_out = gr.Image(type="pil", height=500, show_label=False)
                with gr.Tab("Cropped Images", id="tab_crops"):
                    gallery = gr.Gallery(show_label=False, columns=3, height=400)
                with gr.Tab("Raw Text", id="tab_raw"):
                    raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)

    # Example inputs shipped with the Space (paths relative to the repo).
    gr.Examples(
        examples=[
            ["examples/ocr.jpg", "Gundam", "๐Ÿ“‹ Markdown", ""],
            ["examples/reachy-mini.jpg", "Gundam", "๐Ÿ“ Locate", "Robot"]
        ],
        inputs=[input_img, mode, task, prompt],
        cache_examples=False
    )

    with gr.Accordion("โ„น๏ธ Info", open=False):
        gr.Markdown("""
        ### Modes
        - **Gundam**: 1024 base + 640 tiles with cropping - Best balance
        - **Tiny**: 512ร—512, no crop - Fastest
        - **Small**: 640ร—640, no crop - Quick
        - **Base**: 1024ร—1024, no crop - Standard
        - **Large**: 1280ร—1280, no crop - Highest quality

        ### Tasks
        - **Markdown**: Convert document to structured markdown (grounding โœ…)
        - **Free OCR**: Simple text extraction
        - **Locate**: Find specific things in image (grounding โœ…)
        - **Describe**: General image description
        - **Custom**: Your own prompt (add `<|grounding|>` for boxes)
        """)

    # Event wiring: a file selection refreshes the preview and the page
    # selector; changing the page re-renders the preview; changing the task
    # toggles the prompt box and may pre-select the Boxes tab.
    file_in.change(load_image, [file_in, page_selector], [input_img])
    file_in.change(update_page_selector, [file_in], [page_selector])
    page_selector.change(load_image, [file_in, page_selector], [input_img])
    task.change(toggle_prompt, [task], [prompt])
    task.change(select_boxes, [task], [tabs])

    def run(image, file_path, mode, task, custom_prompt, page_num):
        # Prefer the uploaded file (handles PDFs); fall back to the image
        # widget; error out when neither is present.
        if file_path:
            return process_file(file_path, mode, task, custom_prompt, int(page_num))
        if image is not None:
            return process_image(image, mode, task, custom_prompt)
        return "Error uploading file or image", "", "", None, []

    submit_event = btn.click(run, [input_img, file_in, mode, task, prompt, page_selector],
                             [text_out, md_out, raw_out, img_out, gallery])
    # After a run finishes, jump to the Boxes tab for Locate tasks.
    submit_event.then(select_boxes, [task], [tabs])

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.6.0
2
+ transformers==4.46.3
3
+ tokenizers==0.20.3
4
+ accelerate
5
+ einops
6
+ addict
7
+ easydict
8
+ torchvision
9
+ flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
10
+ PyMuPDF
11
+ hf_transfer