Kamal-prog-code committed on
Commit
e8782b1
·
1 Parent(s): b2ed188

Add application file

Browse files
Files changed (1) hide show
  1. app.py +317 -0
app.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import ast
import base64
import os
import re
import shutil
import sys
import tempfile
from io import StringIO, BytesIO

import fitz
import gradio as gr
import numpy as np
import spaces
import torch
from PIL import Image, ImageDraw, ImageFont, ImageOps
from transformers import AutoModel, AutoTokenizer
15
+
16
# Hugging Face Hub checkpoint to serve.
MODEL_NAME = 'deepseek-ai/DeepSeek-OCR-2'

# Load tokenizer and model once at import time; inference runs on the GPU
# in bfloat16 with flash-attention.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_NAME,
    _attn_implementation='flash_attention_2',
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval().cuda()

# Inference resolution: 1024px base view plus 768px dynamic crops.
BASE_SIZE = 1024
IMAGE_SIZE = 768
CROP_MODE = True

# Prompt template per UI task. `has_grounding` marks prompts whose output
# carries <|ref|>/<|det|> bounding-box annotations.
TASK_PROMPTS = {
    "πŸ“‹ Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
    "πŸ“ Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
    "πŸ“ Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
    "πŸ” Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
    "✏️ Custom": {"prompt": "", "has_grounding": False}
}
33
+
34
def extract_grounding_references(text):
    """Return every <|ref|>/<|det|> grounding annotation found in *text*.

    Each item is a 3-tuple: (full annotation tag, label, coordinate string).
    """
    grounding_re = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)', re.DOTALL)
    return grounding_re.findall(text)
37
+
38
def draw_bounding_boxes(image, refs, extract_images=False):
    """Draw labeled grounding boxes from *refs* onto a copy of *image*.

    Args:
        image: source PIL image (left unmodified).
        refs: tuples from extract_grounding_references():
            (full annotation tag, label, coordinate-list string).
        extract_images: when True, also crop out boxes labeled 'image'.

    Returns:
        (annotated copy of the image, list of cropped PIL images).
    """
    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    # Translucent overlay so filled boxes do not hide the document beneath.
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 15)
    except OSError:
        # The DejaVu path is distro-specific; fall back to PIL's builtin font
        # instead of crashing the whole request.
        font = ImageFont.load_default()
    crops = []

    # One random -- but seeded, hence reproducible -- color per label.
    color_map = {}
    np.random.seed(42)

    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (np.random.randint(50, 255),
                                np.random.randint(50, 255),
                                np.random.randint(50, 255))

        color = color_map[label]
        # literal_eval, not eval: the coordinate string comes straight from
        # model output and must never be executed as arbitrary code.
        coords = ast.literal_eval(ref[2])
        color_a = color + (60,)

        for box in coords:
            # Model coordinates are normalized to a 0-999 grid.
            x1 = int(box[0] / 999 * img_w)
            y1 = int(box[1] / 999 * img_h)
            x2 = int(box[2] / 999 * img_w)
            y2 = int(box[3] / 999 * img_h)

            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))

            width = 5 if label == 'title' else 3
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            draw2.rectangle([x1, y1, x2, y2], fill=color_a)

            # Label tag just above the box, clamped to the top edge.
            text_bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            ty = max(0, y1 - 20)
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops
77
+
78
def clean_output(text, include_images=False):
    """Strip grounding annotations from raw model output.

    Image annotations become '**[Figure N]**' placeholders when
    *include_images* is True (otherwise they are dropped in place); any
    other annotation removes the entire line it appears on.
    """
    if not text:
        return ""
    annotations = re.findall(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)', text, re.DOTALL)

    figure_count = 0
    for full_tag, _label, _coords in annotations:
        if '<|ref|>image<|/ref|>' in full_tag:
            if include_images:
                figure_count += 1
                text = text.replace(full_tag, f'\n\n**[Figure {figure_count}]**\n\n', 1)
            else:
                text = text.replace(full_tag, '', 1)
        else:
            # Non-image annotation: delete the whole line containing it.
            text = re.sub(rf'(?m)^[^\n]*{re.escape(full_tag)}[^\n]*\n?', '', text)

    # Normalize LaTeX colon-equals macros the markdown renderer lacks.
    text = text.replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
    return text.strip()
98
+
99
def embed_images(markdown, crops):
    """Replace '**[Figure N]**' placeholders with base64-inlined PNG images."""
    if not crops:
        return markdown
    for idx, crop in enumerate(crops, start=1):
        buffer = BytesIO()
        crop.save(buffer, format="PNG")
        encoded = base64.b64encode(buffer.getvalue()).decode()
        markdown = markdown.replace(
            f'**[Figure {idx}]**',
            f'\n\n![Figure {idx}](data:image/png;base64,{encoded})\n\n',
            1,
        )
    return markdown
108
+
109
@spaces.GPU(duration=90)
def process_image(image, task, custom_prompt):
    """Run DeepSeek-OCR inference on a single PIL image.

    Args:
        image: input PIL image (may be None).
        task: one of the TASK_PROMPTS keys.
        custom_prompt: user text for the Custom / Locate tasks.

    Returns:
        5-tuple of (clean text, markdown with figures embedded, raw model
        output, annotated image or None, list of cropped figure images).
    """
    if image is None:
        return "Error: Upload an image", "", "", None, []
    if task in ["✏️ Custom", "πŸ“ Locate"] and not custom_prompt.strip():
        return "Please enter a prompt", "", "", None, []

    # Normalize to RGB and honor EXIF orientation before inference.
    if image.mode in ('RGBA', 'LA', 'P'):
        image = image.convert('RGB')
    image = ImageOps.exif_transpose(image)

    if task == "✏️ Custom":
        prompt = f"<image>\n{custom_prompt.strip()}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "πŸ“ Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    # model.infer() reads the image from disk and prints its result to
    # stdout, so write a temp JPEG and capture stdout around the call.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    image.save(tmp.name, 'JPEG', quality=95)
    tmp.close()
    out_dir = tempfile.mkdtemp()

    stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        model.infer(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=tmp.name,
            output_path=out_dir,
            base_size=BASE_SIZE,
            image_size=IMAGE_SIZE,
            crop_mode=CROP_MODE,
            save_results=False
        )

        # Keep only real output lines; drop the model's progress chatter.
        debug_filters = ['PATCHES', '====', 'BASE:', 'directly resize', 'NO PATCHES', 'torch.Size', '%|']
        result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
                            if l.strip() and not any(s in l for s in debug_filters)]).strip()
    finally:
        # BUGFIX: restore stdout and remove temp artifacts even when
        # model.infer() raises; previously an exception left sys.stdout
        # redirected and leaked the temp file and directory.
        sys.stdout = stdout
        os.unlink(tmp.name)
        shutil.rmtree(out_dir, ignore_errors=True)

    if not result:
        return "No text detected", "", "", None, []

    cleaned = clean_output(result, False)
    markdown = clean_output(result, True)

    img_out = None
    crops = []

    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        if refs:
            img_out, crops = draw_bounding_boxes(image, refs, True)

    markdown = embed_images(markdown, crops)

    return cleaned, markdown, result, img_out, crops
174
+
175
@spaces.GPU(duration=90)
def process_pdf(path, task, custom_prompt, page_num):
    """Rasterize one page of the PDF at *path* and OCR it via process_image().

    Pages are 1-indexed; an out-of-range *page_num* returns an error tuple
    shaped like the process_image() result.
    """
    doc = fitz.open(path)
    try:
        total_pages = len(doc)
        if page_num < 1 or page_num > total_pages:
            return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
        # Render at 300 DPI (PDF user space is 72 DPI).
        zoom = fitz.Matrix(300 / 72, 300 / 72)
        pix = doc.load_page(page_num - 1).get_pixmap(matrix=zoom, alpha=False)
        img = Image.open(BytesIO(pix.tobytes("png")))
    finally:
        # BUGFIX: previously the document handle leaked if rendering raised.
        doc.close()

    return process_image(img, task, custom_prompt)
188
+
189
def process_file(path, task, custom_prompt, page_num):
    """Dispatch an uploaded file to the PDF or plain-image OCR pipeline."""
    if not path:
        return "Error: Upload a file", "", "", None, []
    is_pdf = path.lower().endswith('.pdf')
    if is_pdf:
        return process_pdf(path, task, custom_prompt, page_num)
    return process_image(Image.open(path), task, custom_prompt)
196
+
197
def toggle_prompt(task):
    """Show and relabel the prompt textbox for tasks that need extra input."""
    prompt_tasks = {
        "✏️ Custom": ("Custom Prompt", "Add <|grounding|> for bounding boxes"),
        "πŸ“ Locate": ("Text to Locate", "Enter text to locate"),
    }
    if task in prompt_tasks:
        label, placeholder = prompt_tasks[task]
        return gr.update(visible=True, label=label, placeholder=placeholder)
    return gr.update(visible=False)
203
+
204
def select_boxes(task):
    """Auto-switch the results tabs to 'Boxes' when running the Locate task."""
    if task != "πŸ“ Locate":
        return gr.update()
    return gr.update(selected="tab_boxes")
208
+
209
def get_pdf_page_count(file_path):
    """Return the page count of a PDF; any non-PDF path counts as one page."""
    if not file_path or not file_path.lower().endswith('.pdf'):
        return 1
    with fitz.open(file_path) as doc:
        return len(doc)
216
+
217
def load_image(file_path, page_num=1):
    """Load an image file -- or rasterize the given PDF page -- for preview.

    Returns a PIL image, or None when no file is given.
    """
    if not file_path:
        return None
    if not file_path.lower().endswith('.pdf'):
        return Image.open(file_path)

    doc = fitz.open(file_path)
    # Clamp the 1-indexed page number into the document's valid range.
    page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
    pix = doc.load_page(page_idx).get_pixmap(
        matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
    preview = Image.open(BytesIO(pix.tobytes("png")))
    doc.close()
    return preview
230
+
231
def update_page_selector(file_path):
    """Reveal the page-number input only when the uploaded file is a PDF."""
    if file_path and file_path.lower().endswith('.pdf'):
        page_count = get_pdf_page_count(file_path)
        return gr.update(visible=True, minimum=1, maximum=page_count, value=1,
                         label=f"Select Page (1-{page_count})")
    return gr.update(visible=False)
239
+
240
# ---------------------------------------------------------------------------
# Gradio UI: upload controls on the left, tabbed OCR results on the right.
# The theme is passed to gr.Blocks here -- it is a Blocks constructor
# option, not a launch() option.
# ---------------------------------------------------------------------------
with gr.Blocks(title="DeepSeek-OCR-2", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# πŸš€ DeepSeek-OCR-2 Demo
**Convert documents to markdown, extract text, parse figures, and locate specific content with bounding boxes.**
**Model uses DeepEncoder v2 and achieves 91.09% on OmniDocBench (+3.73% over v1).**

**Hope this tool was helpful! If so, a quick like ❀️ would mean a lot :)**
""")

    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
            input_img = gr.Image(label="Input Image", type="pil", height=300)
            # Hidden until a PDF is uploaded (see update_page_selector).
            page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="πŸ“‹ Markdown", label="Task")
            # Hidden unless the task needs extra input (see toggle_prompt).
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column(scale=2):
            with gr.Tabs() as tabs:
                with gr.Tab("Text", id="tab_text"):
                    text_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)
                with gr.Tab("Markdown Preview", id="tab_markdown"):
                    md_out = gr.Markdown("")
                with gr.Tab("Boxes", id="tab_boxes"):
                    img_out = gr.Image(type="pil", height=500, show_label=False)
                with gr.Tab("Cropped Images", id="tab_crops"):
                    gallery = gr.Gallery(show_label=False, columns=3, height=400)
                with gr.Tab("Raw Text", id="tab_raw"):
                    raw_out = gr.Textbox(lines=20, buttons=["copy"], show_label=False)

    gr.Examples(
        examples=[
            ["examples/ocr.jpg", "πŸ“‹ Markdown", ""],
            ["examples/reachy-mini.jpg", "πŸ“ Locate", "Robot"]
        ],
        inputs=[input_img, task, prompt],
        cache_examples=False
    )

    with gr.Accordion("ℹ️ Info", open=False):
        gr.Markdown("""
### Configuration
1024 base + 768 patches with dynamic cropping (2-6 patches). 144 tokens per patch + 256 base tokens.

### Tasks
- **Markdown**: Convert document to structured markdown with layout detection (grounding βœ…)
- **Free OCR**: Simple text extraction without layout
- **Locate**: Find and highlight specific text/elements in image (grounding βœ…)
- **Describe**: General image description
- **Custom**: Your own prompt

### Special Tokens
- `<image>` - Placeholder where visual tokens are inserted
- `<|grounding|>` - Enables layout detection with bounding boxes
- `<|ref|>text<|/ref|>` - Reference text to locate in the image

""")

    # Event wiring: uploading a file previews it and reveals the page
    # selector for PDFs; changing the page re-renders the preview; the
    # task dropdown controls prompt visibility and tab selection.
    file_in.change(load_image, [file_in, page_selector], [input_img])
    file_in.change(update_page_selector, [file_in], [page_selector])
    page_selector.change(load_image, [file_in, page_selector], [input_img])
    task.change(toggle_prompt, [task], [prompt])
    task.change(select_boxes, [task], [tabs])

    def run(image, file_path, task, custom_prompt, page_num):
        """Prefer the uploaded file over the preview image, then OCR it."""
        if file_path:
            return process_file(file_path, task, custom_prompt, int(page_num))
        if image is not None:
            return process_image(image, task, custom_prompt)
        return "Error: Upload a file or image", "", "", None, []

    submit_event = btn.click(run, [input_img, file_in, task, prompt, page_selector],
                             [text_out, md_out, raw_out, img_out, gallery])
    submit_event.then(select_boxes, [task], [tabs])
315
+
316
if __name__ == "__main__":
    # BUGFIX: Blocks.launch() does not accept a `theme` keyword -- the
    # theme is a gr.Blocks constructor option -- so passing it here raised
    # a TypeError at startup.
    demo.queue(max_size=20).launch()