Nattapong Tapachoom commited on
Commit
55b2e3f
Β·
1 Parent(s): d771b51

Update with Ollama DeepSeek OCR app

Browse files
Files changed (4) hide show
  1. .env +4 -0
  2. README.md +40 -12
  3. app.py +315 -0
  4. requirements.txt +6 -0
.env ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Environment variables for Ollama-Deepseek-OCR
2
+
3
+ # For Ollama cloud usage, set your API key from https://ollama.com/settings/keys
4
+ # NOTE: never commit a real API key — the value previously committed here was exposed and must be revoked at https://ollama.com/settings/keys
+ OLLAMA_API_KEY=your_api_key_here
README.md CHANGED
@@ -1,12 +1,40 @@
1
- ---
2
- title: OCR
3
- emoji: πŸ“š
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 6.1.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ollama-Deepseek-OCR
2
+
3
+ A Gradio app for OCR using DeepSeek-OCR 3B model via Ollama.
4
+
5
+ ## Setup
6
+
7
+ 1. Install Ollama: https://ollama.ai/
8
+
9
+ 2. For local usage, pull the DeepSeek-VL model:
10
+ ```
11
+ ollama pull deepseek-vl
12
+ ```
13
+
14
+ 3. For cloud usage (if vision models become available), uncomment and set your API key in `.env`:
15
+ ```
16
+ OLLAMA_API_KEY=your_api_key_here
17
+ ```
18
+ Or set the environment variable directly. Note: Cloud models may not support vision tasks yet.
19
+
20
+ 4. Install Python dependencies:
21
+ ```
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ 5. Run the app:
26
+ ```
27
+ python app.py
28
+ ```
29
+
30
+ ## Features
31
+
32
+ - Convert documents to markdown
33
+ - Extract raw text
34
+ - Locate specific content with bounding boxes
35
+ - Support for images and PDFs
36
+
37
+ ## Note
38
+
39
+ This app sends images to Ollama for vision tasks. Note that `app.py` currently calls the `gpt-oss:120b-cloud` model — change the `model=` argument in `process_image` (e.g. to `deepseek-vl`) if you want the local DeepSeek vision model described above.
40
+
app.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import ollama
3
+ from ollama import Client
4
+ import spaces
5
+ import os
6
+ import sys
7
+ import tempfile
8
+ import shutil
9
+ from PIL import Image, ImageDraw, ImageFont, ImageOps
10
+ import fitz
11
+ import re
12
+ import warnings
13
+ import random
14
+ import base64
15
+ from io import StringIO, BytesIO
16
+ from dotenv import load_dotenv
17
+
18
# Load environment variables (e.g. OLLAMA_API_KEY) from a local .env file.
load_dotenv()

# Point Ollama at a models directory on the E: drive.  This path is
# machine-specific, so only apply it when the variable is not already set —
# the original assignment clobbered any existing OLLAMA_MODELS configuration.
os.environ.setdefault('OLLAMA_MODELS', 'E:\\ollama\\models')

# Initialize the Ollama client: use the hosted ollama.com endpoint when an
# API key is configured, otherwise talk to the local Ollama daemon.
_api_key = os.environ.get('OLLAMA_API_KEY')
if _api_key:
    ollama_client = Client(host="https://ollama.com",
                           headers={'Authorization': f'Bearer {_api_key}'})
else:
    ollama_client = Client()
28
+
29
# Inference size presets (base canvas size, tile image size, and whether the
# page is split into crops).  NOTE(review): these are selectable in the UI,
# but process_image() assigns MODEL_CONFIGS[mode] to a variable it never
# uses — confirm whether they should be forwarded to the model call.
MODEL_CONFIGS = {
    "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
    "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
    "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
    "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
    "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False}
}

# Prompt template per UI task.  "has_grounding" marks prompts whose output
# may carry <|ref|>/<|det|> bounding-box tags (drawn by draw_bounding_boxes).
TASK_PROMPTS = {
    "📋 Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
    "📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
    "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
    "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
    "✏️ Custom": {"prompt": "", "has_grounding": False}
}
44
+
45
def extract_grounding_references(text):
    """Return every <|ref|>...<|/ref|><|det|>...<|/det|> reference in *text*.

    Each match is a 3-tuple: (full tag, label text, coordinate literal).
    """
    return re.findall(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        text,
        re.DOTALL,
    )
48
+
49
def draw_bounding_boxes(image, refs, extract_images=False):
    """Draw labeled grounding boxes on a copy of *image*.

    refs: tuples from extract_grounding_references() — (full tag, label,
    coordinate-list literal).  Coordinates come on the model's 0-999 grid
    and are rescaled to pixel coordinates.  Returns (annotated image,
    crops), where crops holds regions labeled 'image' when extract_images
    is set.
    """
    import ast  # local: only needed here, to parse coordinate literals safely

    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    # Semi-transparent overlay for box fills, composited over the outlines.
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    try:
        font_path = "C:\\Windows\\Fonts\\arialbd.ttf" if os.name == 'nt' else "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
        font = ImageFont.truetype(font_path, 30)
    except OSError:  # font file missing — fall back instead of a bare except
        font = ImageFont.load_default()
    crops = []

    # Stable label -> color assignment (seeded so reruns look identical).
    color_map = {}
    random.seed(42)

    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (random.randint(50, 255), random.randint(50, 255), random.randint(50, 255))

        color = color_map[label]
        # ref[2] comes straight from model output: parse it as a Python
        # literal instead of eval() (arbitrary-code-execution risk).
        try:
            coords = ast.literal_eval(ref[2])
        except (ValueError, SyntaxError):
            continue  # malformed coordinates — skip this reference
        color_a = color + (60,)

        for box in coords:
            # Rescale from the model's 0-999 grid to pixel coordinates.
            x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)

            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))

            width = 5 if label == 'title' else 3
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            draw2.rectangle([x1, y1, x2, y2], fill=color_a)

            # Label tag above the box, clamped so it stays inside the image.
            text_bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            ty = max(0, y1 - 20)
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops
92
+
93
def clean_output(text, include_images=False):
    """Strip grounding tags from model output.

    Image references become "**[Figure N]**" placeholders when
    include_images is True and are dropped otherwise; every other grounding
    reference is removed together with the entire line containing it.
    """
    if not text:
        return ""
    tag_re = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    figure_count = 0
    for full_tag, _label, _coords in re.findall(tag_re, text, re.DOTALL):
        is_image = '<|ref|>image<|/ref|>' in full_tag
        if is_image and include_images:
            figure_count += 1
            text = text.replace(full_tag, f'\n\n**[Figure {figure_count}]**\n\n', 1)
        elif is_image:
            text = text.replace(full_tag, '', 1)
        else:
            # Drop the whole line that carried this grounding reference.
            line_re = rf'(?m)^[^\n]*{re.escape(full_tag)}[^\n]*\n?'
            text = re.sub(line_re, '', text)
    return text.strip()
111
+
112
def embed_images(markdown, crops):
    """Replace "**[Figure N]**" placeholders with base64-inlined PNG images."""
    if not crops:
        return markdown
    for idx, crop in enumerate(crops, start=1):
        buffer = BytesIO()
        crop.save(buffer, format="PNG")
        encoded = base64.b64encode(buffer.getvalue()).decode()
        markdown = markdown.replace(
            f'**[Figure {idx}]**',
            f'\n\n![Figure {idx}](data:image/png;base64,{encoded})\n\n',
            1,
        )
    return markdown
121
+
122
@spaces.GPU(duration=60)
def process_image(image, mode, task, custom_prompt):
    """Run one OCR/vision task on a PIL image via the Ollama client.

    Returns a 5-tuple: (cleaned text, markdown with embedded figures,
    raw model output, annotated image or None, list of cropped figures).
    Error states are reported through the first tuple element.
    """
    if image is None:
        return " Error Upload image", "", "", None, []
    if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
        return "Enter prompt", "", "", None, []

    # Normalize to RGB and honor EXIF orientation before OCR.
    if image.mode in ('RGBA', 'LA', 'P'):
        image = image.convert('RGB')
    image = ImageOps.exif_transpose(image)

    config = MODEL_CONFIGS[mode]  # NOTE(review): currently unused by the Ollama call path

    if task == "✏️ Custom":
        prompt = f"<image>\n{custom_prompt.strip()}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "📍 Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    # Ollama takes image file paths, so write the (possibly converted) image
    # to a temporary JPEG.  The finally block guarantees cleanup even when
    # the chat call raises — the original leaked the file on that path.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    try:
        image.save(tmp.name, 'JPEG', quality=95)
        tmp.close()
        # NOTE(review): README mentions deepseek-vl, but this calls
        # gpt-oss:120b-cloud — confirm which model is intended.
        response = ollama_client.chat(model='gpt-oss:120b-cloud', messages=[{'role': 'user', 'content': prompt, 'images': [tmp.name]}])
        result = response['message']['content']
    except Exception as e:
        return f"Error: {str(e)}", "", "", None, []
    finally:
        tmp.close()
        os.unlink(tmp.name)

    if not result:
        return "No text", "", "", None, []

    cleaned = clean_output(result, False)
    markdown = clean_output(result, True)

    img_out = None
    crops = []

    # Draw boxes only when the prompt could produce grounding tags and the
    # output actually contains them.
    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        if refs:
            img_out, crops = draw_bounding_boxes(image, refs, True)

    markdown = embed_images(markdown, crops)

    return cleaned, markdown, result, img_out, crops
174
+
175
@spaces.GPU(duration=60)
def process_pdf(path, mode, task, custom_prompt, page_num):
    """Render one PDF page at 300 DPI and forward it to process_image().

    page_num is 1-based; out-of-range pages return an error tuple.
    """
    doc = fitz.open(path)
    try:
        total_pages = len(doc)
        if page_num < 1 or page_num > total_pages:
            return f"Invalid page number. PDF has {total_pages} pages.", "", "", None, []
        page = doc.load_page(page_num - 1)
        # 300 DPI render (PDF user space is 72 DPI).
        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
        img = Image.open(BytesIO(pix.tobytes("png")))
    finally:
        # Guarantee the document handle is released even if rendering raises
        # (the original leaked it on exceptions).
        doc.close()

    return process_image(img, mode, task, custom_prompt)
188
+
189
def process_file(path, mode, task, custom_prompt, page_num):
    """Dispatch an uploaded file to the PDF or plain-image pipeline."""
    if not path:
        return "Error Upload file", "", "", None, []
    is_pdf = path.lower().endswith('.pdf')
    if is_pdf:
        return process_pdf(path, mode, task, custom_prompt, page_num)
    return process_image(Image.open(path), mode, task, custom_prompt)
196
+
197
def toggle_prompt(task):
    """Show/hide the prompt textbox depending on the selected task."""
    prompt_fields = {
        "✏️ Custom": ("Custom Prompt", "Add <|grounding|> for boxes"),
        "📍 Locate": ("Text to Locate", "Enter text"),
    }
    field = prompt_fields.get(task)
    if field is None:
        return gr.update(visible=False)
    label_text, hint = field
    return gr.update(visible=True, label=label_text, placeholder=hint)
203
+
204
def select_boxes(task):
    """Jump to the bounding-box tab when the Locate task is chosen."""
    return gr.update(selected="tab_boxes") if task == "📍 Locate" else gr.update()
208
+
209
def get_pdf_page_count(file_path):
    """Return the number of pages in a PDF, or 1 for any non-PDF path."""
    if not file_path or not file_path.lower().endswith('.pdf'):
        return 1
    doc = fitz.open(file_path)
    try:
        return len(doc)
    finally:
        # Release the document handle even when len() raises
        # (the original leaked it on exceptions).
        doc.close()
216
+
217
def load_image(file_path, page_num=1):
    """Load an upload as a PIL image; for PDFs, render the requested page.

    page_num is 1-based and clamped to the document's page range.
    Returns None when no file path is given.
    """
    if not file_path:
        return None
    if file_path.lower().endswith('.pdf'):
        doc = fitz.open(file_path)
        try:
            page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
            page = doc.load_page(page_idx)
            # 300 DPI render (PDF user space is 72 DPI).
            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
            return Image.open(BytesIO(pix.tobytes("png")))
        finally:
            # Close the document even if rendering raises
            # (the original leaked it on exceptions).
            doc.close()
    return Image.open(file_path)
230
+
231
def update_page_selector(file_path):
    """Show the page selector (sized to the page count) only for PDF uploads."""
    if not file_path:
        return gr.update(visible=False)
    if not file_path.lower().endswith('.pdf'):
        return gr.update(visible=False)
    page_count = get_pdf_page_count(file_path)
    return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
                     label=f"Select Page (1-{page_count})")
239
+
240
# Gradio UI: file/image input plus task controls on the left, tabbed
# outputs (text, markdown, boxes, crops, raw) on the right.
with gr.Blocks(title="DeepSeek-OCR") as demo:
    gr.Markdown("""
# 🚀 DeepSeek-OCR Demo
**Convert documents to markdown, extract raw text, and locate specific content with bounding boxes. It takes 20~ sec for markdown and 3~ sec for locate task examples. Check the info at the bottom of the page for more information.**

**Hope this tool was helpful! If so, a quick like ❤️ would mean a lot :)**
""")

    with gr.Row():
        with gr.Column(scale=1):
            file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
            input_img = gr.Image(label="Input Image", type="pil", height=300)
            # Hidden until a PDF is uploaded (see update_page_selector).
            page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
            mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
            # Shown only for the Custom / Locate tasks (see toggle_prompt).
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("Extract", variant="primary", size="lg")

        with gr.Column(scale=2):
            with gr.Tabs() as tabs:
                with gr.Tab("Text", id="tab_text"):
                    text_out = gr.Textbox(lines=20, show_label=False)
                with gr.Tab("Markdown Preview", id="tab_markdown"):
                    md_out = gr.Markdown("")
                with gr.Tab("Boxes", id="tab_boxes"):
                    img_out = gr.Image(type="pil", height=500, show_label=False)
                with gr.Tab("Cropped Images", id="tab_crops"):
                    gallery = gr.Gallery(show_label=False, columns=3, height=400)
                with gr.Tab("Raw Text", id="tab_raw"):
                    raw_out = gr.Textbox(lines=20, show_label=False)

    # gr.Examples(
    #     examples=[
    #         ["examples/ocr.jpg", "Gundam", "📋 Markdown", ""],
    #         ["examples/reachy-mini.jpg", "Gundam", "📍 Locate", "Robot"]
    #     ],
    #     inputs=[input_img, mode, task, prompt],
    #     cache_examples=False
    # )

    with gr.Accordion("ℹ️ Info", open=False):
        gr.Markdown("""
### Modes
- **Gundam**: 1024 base + 640 tiles with cropping - Best balance
- **Tiny**: 512×512, no crop - Fastest
- **Small**: 640×640, no crop - Quick
- **Base**: 1024×1024, no crop - Standard
- **Large**: 1280×1280, no crop - Highest quality

### Tasks
- **Markdown**: Convert document to structured markdown (grounding ✅)
- **Free OCR**: Simple text extraction
- **Locate**: Find specific things in image (grounding ✅)
- **Describe**: General image description
- **Custom**: Your own prompt (add `<|grounding|>` for boxes)
""")

    # Event wiring: uploading a file previews it and reveals the PDF page
    # selector; changing the task toggles the prompt box and jumps to the
    # Boxes tab for the Locate task.
    file_in.change(load_image, [file_in, page_selector], [input_img])
    file_in.change(update_page_selector, [file_in], [page_selector])
    page_selector.change(load_image, [file_in, page_selector], [input_img])
    task.change(toggle_prompt, [task], [prompt])
    task.change(select_boxes, [task], [tabs])

    def run(image, file_path, mode, task, custom_prompt, page_num):
        # Prefer the uploaded file (PDF-aware path); fall back to the image editor.
        if file_path:
            return process_file(file_path, mode, task, custom_prompt, int(page_num))
        if image is not None:
            return process_image(image, mode, task, custom_prompt)
        return "Error uploading file or image", "", "", None, []

    submit_event = btn.click(run, [input_img, file_in, mode, task, prompt, page_selector],
                             [text_out, md_out, raw_out, img_out, gallery])
    submit_event.then(select_boxes, [task], [tabs])

if __name__ == "__main__":
    # Queue caps concurrent jobs at 20; Soft theme for the UI.
    demo.queue(max_size=20).launch(theme=gr.themes.Soft())
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ ollama
3
+ spaces
4
+ pillow
5
+ pymupdf
6
+ python-dotenv