defatul committed on
Commit
896b892
·
verified ·
1 Parent(s): 6fd1f03
Files changed (1) hide show
  1. app.py +248 -149
app.py CHANGED
@@ -1,31 +1,56 @@
1
  import gradio as gr
2
  from transformers import AutoModel, AutoTokenizer
3
  import torch
4
- import spaces
5
  import os
6
  import sys
7
  import tempfile
8
  import shutil
9
  from PIL import Image, ImageDraw, ImageFont, ImageOps
10
- import fitz
11
  import re
12
- import warnings
13
  import numpy as np
14
  import base64
15
  from io import StringIO, BytesIO
16
 
17
- MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
20
- model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
21
- model = model.eval().cuda()
 
 
 
 
 
 
 
22
 
23
  MODEL_CONFIGS = {
24
- "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
25
  "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
26
  "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
27
  "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
28
- "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False}
 
29
  }
30
 
31
  TASK_PROMPTS = {
@@ -33,7 +58,7 @@ TASK_PROMPTS = {
33
  "📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
34
  "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
35
  "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
36
- "✏️ Custom": {"prompt": "", "has_grounding": False}
37
  }
38
 
39
  def extract_grounding_references(text):
@@ -44,39 +69,60 @@ def draw_bounding_boxes(image, refs, extract_images=False):
44
  img_w, img_h = image.size
45
  img_draw = image.copy()
46
  draw = ImageDraw.Draw(img_draw)
47
- overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
48
  draw2 = ImageDraw.Draw(overlay)
49
- font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 30)
 
 
 
 
 
 
50
  crops = []
51
-
52
  color_map = {}
53
  np.random.seed(42)
54
 
55
  for ref in refs:
56
  label = ref[1]
57
  if label not in color_map:
58
- color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))
 
 
 
 
59
 
60
  color = color_map[label]
61
- coords = eval(ref[2])
 
 
 
62
  color_a = color + (60,)
63
-
64
  for box in coords:
65
- x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)
66
-
67
- if extract_images and label == 'image':
 
 
 
 
 
68
  crops.append(image.crop((x1, y1, x2, y2)))
69
-
70
- width = 5 if label == 'title' else 3
71
  draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
72
  draw2.rectangle([x1, y1, x2, y2], fill=color_a)
73
-
74
- text_bbox = draw.textbbox((0, 0), label, font=font)
75
- tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
 
 
 
 
76
  ty = max(0, y1 - 20)
77
  draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
78
  draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
79
-
80
  img_draw.paste(overlay, (0, 0), overlay)
81
  return img_draw, crops
82
 
@@ -86,17 +132,17 @@ def clean_output(text, include_images=False):
86
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
87
  matches = re.findall(pattern, text, re.DOTALL)
88
  img_num = 0
89
-
90
  for match in matches:
91
- if '<|ref|>image<|/ref|>' in match[0]:
92
  if include_images:
93
- text = text.replace(match[0], f'\n\n**[Figure {img_num + 1}]**\n\n', 1)
94
  img_num += 1
95
  else:
96
- text = text.replace(match[0], '', 1)
97
  else:
98
- text = re.sub(rf'(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?', '', text)
99
-
100
  return text.strip()
101
 
102
  def embed_images(markdown, crops):
@@ -106,123 +152,160 @@ def embed_images(markdown, crops):
106
  buf = BytesIO()
107
  img.save(buf, format="PNG")
108
  b64 = base64.b64encode(buf.getvalue()).decode()
109
- markdown = markdown.replace(f'**[Figure {i + 1}]**', f'\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n', 1)
 
 
 
 
110
  return markdown
111
 
112
- @spaces.GPU(duration=60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def process_image(image, mode, task, custom_prompt):
114
  if image is None:
115
- return " Error Upload image", "", "", None, []
 
116
  if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
117
- return "Enter prompt", "", "", None, []
118
-
119
- if image.mode in ('RGBA', 'LA', 'P'):
120
- image = image.convert('RGB')
121
  image = ImageOps.exif_transpose(image)
122
-
123
  config = MODEL_CONFIGS[mode]
124
-
125
  if task == "✏️ Custom":
126
  prompt = f"<image>\n{custom_prompt.strip()}"
127
- has_grounding = '<|grounding|>' in custom_prompt
128
  elif task == "📍 Locate":
129
  prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
130
  has_grounding = True
131
  else:
132
  prompt = TASK_PROMPTS[task]["prompt"]
133
  has_grounding = TASK_PROMPTS[task]["has_grounding"]
134
-
135
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
136
- image.save(tmp.name, 'JPEG', quality=95)
137
  tmp.close()
138
  out_dir = tempfile.mkdtemp()
139
-
140
- stdout = sys.stdout
141
- sys.stdout = StringIO()
142
-
143
- model.infer(tokenizer=tokenizer, prompt=prompt, image_file=tmp.name, output_path=out_dir,
144
- base_size=config["base_size"], image_size=config["image_size"], crop_mode=config["crop_mode"])
145
-
146
- result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
147
- if not any(s in l for s in ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size'])]).strip()
148
- sys.stdout = stdout
149
-
150
- os.unlink(tmp.name)
151
- shutil.rmtree(out_dir, ignore_errors=True)
152
-
153
- if not result:
154
- return "No text", "", "", None, []
155
-
156
- cleaned = clean_output(result, False)
157
- markdown = clean_output(result, True)
158
-
159
- img_out = None
160
- crops = []
161
-
162
- if has_grounding and '<|ref|>' in result:
163
- refs = extract_grounding_references(result)
164
- if refs:
165
- img_out, crops = draw_bounding_boxes(image, refs, True)
166
-
167
- markdown = embed_images(markdown, crops)
168
-
169
- return cleaned, markdown, result, img_out, crops
170
-
171
- @spaces.GPU(duration=60)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  def process_pdf(path, mode, task, custom_prompt, page_num):
173
  doc = fitz.open(path)
174
  total_pages = len(doc)
175
-
176
- # Process all pages
177
- all_cleaned = []
178
- all_markdown = []
179
- all_raw = []
180
- all_crops = []
181
  img_out = None
182
-
183
- for page_idx in range(total_pages):
184
- page = doc.load_page(page_idx)
185
- pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
186
- img = Image.open(BytesIO(pix.tobytes("png")))
187
-
188
- cleaned, markdown, result, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
189
-
190
- if page_idx == 0:
191
- # Use first page's error message if there's an error
192
- if cleaned.startswith(" Error") or cleaned.startswith("Enter prompt") or cleaned == "No text":
193
- doc.close()
194
  return cleaned, "", "", None, []
195
-
196
- all_cleaned.append(cleaned)
197
- all_markdown.append(markdown)
198
- all_raw.append(result)
199
- all_crops.extend(page_crops)
200
-
201
- # Use the last page's bounding boxes image, or first if available
202
- if page_img_out is not None:
203
- img_out = page_img_out
204
-
205
- doc.close()
206
-
207
- # Combine results from all pages
208
- combined_cleaned = "\n\n--- Page Break ---\n\n".join(all_cleaned)
209
- combined_markdown = "\n\n--- Page Break ---\n\n".join(all_markdown)
210
- combined_raw = "\n\n--- Page Break ---\n\n".join(all_raw)
211
-
212
- return combined_cleaned, combined_markdown, combined_raw, img_out, all_crops
213
 
214
  def process_file(path, mode, task, custom_prompt, page_num):
215
  if not path:
216
- return "Error Upload file", "", "", None, []
217
- if path.lower().endswith('.pdf'):
218
  return process_pdf(path, mode, task, custom_prompt, page_num)
219
- else:
220
- return process_image(Image.open(path), mode, task, custom_prompt)
221
 
222
  def toggle_prompt(task):
223
  if task == "✏️ Custom":
224
  return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
225
- elif task == "📍 Locate":
226
  return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
227
  return gr.update(visible=False)
228
 
@@ -232,53 +315,65 @@ def select_boxes(task):
232
  return gr.update()
233
 
234
  def get_pdf_page_count(file_path):
235
- if not file_path or not file_path.lower().endswith('.pdf'):
236
  return 1
237
  doc = fitz.open(file_path)
238
- count = len(doc)
239
- doc.close()
240
- return count
 
241
 
242
  def load_image(file_path, page_num=1):
243
  if not file_path:
244
  return None
245
- if file_path.lower().endswith('.pdf'):
246
  doc = fitz.open(file_path)
247
- page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
248
- page = doc.load_page(page_idx)
249
- pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
250
- img = Image.open(BytesIO(pix.tobytes("png")))
251
- doc.close()
252
- return img
253
- else:
254
- return Image.open(file_path)
255
 
256
  def update_page_selector(file_path):
257
  if not file_path:
258
  return gr.update(visible=False)
259
- if file_path.lower().endswith('.pdf'):
260
  page_count = get_pdf_page_count(file_path)
261
- return gr.update(visible=True, maximum=page_count, value=1, minimum=1,
262
- label=f"Select Page (1-{page_count})")
 
 
 
 
 
263
  return gr.update(visible=False)
264
 
265
- with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
266
- gr.Markdown("""
267
- # 🚀 DeepSeek-OCR
268
-
269
- **Document parser with OCR capabilities. Process multi-page PDFs and images to extract text, convert to markdown, or locate specific content with bounding boxes.**
270
- """)
271
-
 
 
 
 
 
 
272
  with gr.Row():
273
  with gr.Column(scale=1):
274
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
275
  input_img = gr.Image(label="Input Image", type="pil", height=300)
276
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
277
- mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="Mode")
278
- task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📋 Markdown", label="Task")
279
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
280
  btn = gr.Button("Extract", variant="primary", size="lg")
281
-
282
  with gr.Column(scale=2):
283
  with gr.Tabs() as tabs:
284
  with gr.Tab("Text", id="tab_text"):
@@ -291,23 +386,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR") as demo:
291
  gallery = gr.Gallery(show_label=False, columns=3, height=400)
292
  with gr.Tab("Raw Text", id="tab_raw"):
293
  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
294
-
295
  file_in.change(load_image, [file_in, page_selector], [input_img])
296
  file_in.change(update_page_selector, [file_in], [page_selector])
297
  page_selector.change(load_image, [file_in, page_selector], [input_img])
298
  task.change(toggle_prompt, [task], [prompt])
299
  task.change(select_boxes, [task], [tabs])
300
-
301
  def run(image, file_path, mode, task, custom_prompt, page_num):
302
  if file_path:
303
  return process_file(file_path, mode, task, custom_prompt, int(page_num))
304
  if image is not None:
305
  return process_image(image, mode, task, custom_prompt)
306
- return "Error uploading file or image", "", "", None, []
307
 
308
- submit_event = btn.click(run, [input_img, file_in, mode, task, prompt, page_selector],
309
- [text_out, md_out, raw_out, img_out, gallery])
 
 
 
310
  submit_event.then(select_boxes, [task], [tabs])
311
 
312
  if __name__ == "__main__":
313
- demo.queue(max_size=20).launch()
 
 
1
  import gradio as gr
2
  from transformers import AutoModel, AutoTokenizer
3
  import torch
 
4
  import os
5
  import sys
6
  import tempfile
7
  import shutil
8
  from PIL import Image, ImageDraw, ImageFont, ImageOps
9
+ import fitz # PyMuPDF
10
  import re
 
11
  import numpy as np
12
  import base64
13
  from io import StringIO, BytesIO
14
 
15
+ """
16
+ CPU-friendly version of the DeepSeekOCR Space app.
17
+
18
+ Changes vs GPU version:
19
+ - Removed `spaces` and @spaces.GPU decorators.
20
+ - Removed FlashAttention2 forcing (`_attn_implementation='flash_attention_2'`).
21
+ - Removed `.cuda()`; model runs on CPU.
22
+ - Uses torch.float32 (CPU-safe). This will be SLOW and may use lots of RAM.
23
+ """
24
+
25
# Blank out the CUDA device list so nothing in this process can select a GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

MODEL_NAME = "deepseek-ai/DeepSeek-OCR"

# Best-effort cap on the number of threads torch uses: at most 8, at least 1.
# A failure here is deliberately ignored; the app still works without the cap.
try:
    torch.set_num_threads(max(1, min(os.cpu_count() or 1, 8)))
except Exception:
    pass

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Load the weights as float32 without requesting a FlashAttention
# implementation, then switch to inference mode. The model is never moved to
# a GPU.
model = AutoModel.from_pretrained(
    MODEL_NAME,
    use_safetensors=True,
    trust_remote_code=True,
    torch_dtype=torch.float32,
).eval()
46
 
47
  MODEL_CONFIGS = {
48
+ # On CPU, prefer smaller modes for speed/memory.
49
  "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
50
  "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
51
  "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
52
+ "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
53
+ "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
54
  }
55
 
56
  TASK_PROMPTS = {
 
58
  "📝 Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
59
  "📍 Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
60
  "🔍 Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
61
+ "✏️ Custom": {"prompt": "", "has_grounding": False},
62
  }
63
 
64
  def extract_grounding_references(text):
 
69
  img_w, img_h = image.size
70
  img_draw = image.copy()
71
  draw = ImageDraw.Draw(img_draw)
72
+ overlay = Image.new("RGBA", img_draw.size, (0, 0, 0, 0))
73
  draw2 = ImageDraw.Draw(overlay)
74
+ # Fallback font if path doesn't exist
75
+ font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
76
+ try:
77
+ font = ImageFont.truetype(font_path, 30)
78
+ except Exception:
79
+ font = ImageFont.load_default()
80
+
81
  crops = []
 
82
  color_map = {}
83
  np.random.seed(42)
84
 
85
  for ref in refs:
86
  label = ref[1]
87
  if label not in color_map:
88
+ color_map[label] = (
89
+ int(np.random.randint(50, 255)),
90
+ int(np.random.randint(50, 255)),
91
+ int(np.random.randint(50, 255)),
92
+ )
93
 
94
  color = color_map[label]
95
+ try:
96
+ coords = eval(ref[2])
97
+ except Exception:
98
+ continue
99
  color_a = color + (60,)
100
+
101
  for box in coords:
102
+ x1, y1, x2, y2 = (
103
+ int(box[0] / 999 * img_w),
104
+ int(box[1] / 999 * img_h),
105
+ int(box[2] / 999 * img_w),
106
+ int(box[3] / 999 * img_h),
107
+ )
108
+
109
+ if extract_images and label == "image":
110
  crops.append(image.crop((x1, y1, x2, y2)))
111
+
112
+ width = 5 if label == "title" else 3
113
  draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
114
  draw2.rectangle([x1, y1, x2, y2], fill=color_a)
115
+
116
+ try:
117
+ text_bbox = draw.textbbox((0, 0), label, font=font)
118
+ tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
119
+ except Exception:
120
+ tw, th = (len(label) * 10, 20)
121
+
122
  ty = max(0, y1 - 20)
123
  draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
124
  draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))
125
+
126
  img_draw.paste(overlay, (0, 0), overlay)
127
  return img_draw, crops
128
 
 
132
  pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
133
  matches = re.findall(pattern, text, re.DOTALL)
134
  img_num = 0
135
+
136
  for match in matches:
137
+ if "<|ref|>image<|/ref|>" in match[0]:
138
  if include_images:
139
+ text = text.replace(match[0], f"\n\n**[Figure {img_num + 1}]**\n\n", 1)
140
  img_num += 1
141
  else:
142
+ text = text.replace(match[0], "", 1)
143
  else:
144
+ text = re.sub(rf"(?m)^[^\n]*{re.escape(match[0])}[^\n]*\n?", "", text)
145
+
146
  return text.strip()
147
 
148
  def embed_images(markdown, crops):
 
152
  buf = BytesIO()
153
  img.save(buf, format="PNG")
154
  b64 = base64.b64encode(buf.getvalue()).decode()
155
+ markdown = markdown.replace(
156
+ f"**[Figure {i + 1}]**",
157
+ f"\n\n![Figure {i + 1}](data:image/png;base64,{b64})\n\n",
158
+ 1,
159
+ )
160
  return markdown
161
 
162
def _infer_with_model(prompt, jpg_path, out_dir, base_size, image_size, crop_mode):
    """Run ``model.infer`` and return everything it printed to stdout.

    The caller reads the OCR text from what the model prints, so stdout is
    redirected into an in-memory buffer for the duration of the call.

    Args:
        prompt: Full prompt string handed to the model.
        jpg_path: Path of the image file to read.
        out_dir: Directory passed to the model as ``output_path``.
        base_size: ``base_size`` setting forwarded to the model.
        image_size: ``image_size`` setting forwarded to the model.
        crop_mode: ``crop_mode`` setting forwarded to the model.

    Returns:
        The captured stdout text, unfiltered.

    Raises:
        Whatever ``model.infer`` raises; stdout is restored before the
        exception propagates.
    """
    # Function-scope import so this block does not depend on the file header.
    import contextlib

    captured = StringIO()
    # NOTE(review): the redirect replaces sys.stdout for the whole process, so
    # anything printed by another thread during the call is captured as well.
    with contextlib.redirect_stdout(captured):
        model.infer(
            tokenizer=tokenizer,
            prompt=prompt,
            image_file=jpg_path,
            output_path=out_dir,
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
        )
    return captured.getvalue()
180
+
181
  def process_image(image, mode, task, custom_prompt):
182
  if image is None:
183
+ return "Error: Upload image", "", "", None, []
184
+
185
  if task in ["✏️ Custom", "📍 Locate"] and not custom_prompt.strip():
186
+ return "Error: Enter prompt", "", "", None, []
187
+
188
+ if image.mode in ("RGBA", "LA", "P"):
189
+ image = image.convert("RGB")
190
  image = ImageOps.exif_transpose(image)
191
+
192
  config = MODEL_CONFIGS[mode]
193
+
194
  if task == "✏️ Custom":
195
  prompt = f"<image>\n{custom_prompt.strip()}"
196
+ has_grounding = "<|grounding|>" in custom_prompt
197
  elif task == "📍 Locate":
198
  prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
199
  has_grounding = True
200
  else:
201
  prompt = TASK_PROMPTS[task]["prompt"]
202
  has_grounding = TASK_PROMPTS[task]["has_grounding"]
203
+
204
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
205
+ image.save(tmp.name, "JPEG", quality=95)
206
  tmp.close()
207
  out_dir = tempfile.mkdtemp()
208
+
209
+ try:
210
+ raw_stdout = _infer_with_model(
211
+ prompt=prompt,
212
+ jpg_path=tmp.name,
213
+ out_dir=out_dir,
214
+ base_size=config["base_size"],
215
+ image_size=config["image_size"],
216
+ crop_mode=config["crop_mode"],
217
+ )
218
+
219
+ # Filter noisy lines
220
+ result = "\n".join(
221
+ [
222
+ l
223
+ for l in raw_stdout.split("\n")
224
+ if not any(
225
+ s in l
226
+ for s in [
227
+ "image:",
228
+ "other:",
229
+ "PATCHES",
230
+ "====",
231
+ "BASE:",
232
+ "%|",
233
+ "torch.Size",
234
+ ]
235
+ )
236
+ ]
237
+ ).strip()
238
+
239
+ if not result:
240
+ return "No text", "", "", None, []
241
+
242
+ cleaned = clean_output(result, False)
243
+ markdown = clean_output(result, True)
244
+
245
+ img_out = None
246
+ crops = []
247
+
248
+ if has_grounding and "<|ref|>" in result:
249
+ refs = extract_grounding_references(result)
250
+ if refs:
251
+ img_out, crops = draw_bounding_boxes(image, refs, True)
252
+
253
+ markdown = embed_images(markdown, crops)
254
+ return cleaned, markdown, result, img_out, crops
255
+
256
+ except Exception as e:
257
+ return f"Runtime error: {type(e).__name__}: {e}", "", "", None, []
258
+ finally:
259
+ try:
260
+ os.unlink(tmp.name)
261
+ except Exception:
262
+ pass
263
+ shutil.rmtree(out_dir, ignore_errors=True)
264
+
265
  def process_pdf(path, mode, task, custom_prompt, page_num):
266
  doc = fitz.open(path)
267
  total_pages = len(doc)
268
+
269
+ all_cleaned, all_markdown, all_raw, all_crops = [], [], [], []
 
 
 
 
270
  img_out = None
271
+
272
+ try:
273
+ for page_idx in range(total_pages):
274
+ page = doc.load_page(page_idx)
275
+ pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
276
+ img = Image.open(BytesIO(pix.tobytes("png")))
277
+
278
+ cleaned, markdown, result, page_img_out, page_crops = process_image(img, mode, task, custom_prompt)
279
+
280
+ if page_idx == 0 and (cleaned.startswith("Error") or cleaned == "No text"):
 
 
281
  return cleaned, "", "", None, []
282
+
283
+ all_cleaned.append(cleaned)
284
+ all_markdown.append(markdown)
285
+ all_raw.append(result)
286
+ all_crops.extend(page_crops)
287
+
288
+ if page_img_out is not None:
289
+ img_out = page_img_out
290
+
291
+ combined_cleaned = "\n\n--- Page Break ---\n\n".join(all_cleaned)
292
+ combined_markdown = "\n\n--- Page Break ---\n\n".join(all_markdown)
293
+ combined_raw = "\n\n--- Page Break ---\n\n".join(all_raw)
294
+ return combined_cleaned, combined_markdown, combined_raw, img_out, all_crops
295
+ finally:
296
+ doc.close()
 
 
 
297
 
298
  def process_file(path, mode, task, custom_prompt, page_num):
299
  if not path:
300
+ return "Error: Upload file", "", "", None, []
301
+ if path.lower().endswith(".pdf"):
302
  return process_pdf(path, mode, task, custom_prompt, page_num)
303
+ return process_image(Image.open(path), mode, task, custom_prompt)
 
304
 
305
  def toggle_prompt(task):
306
  if task == "✏️ Custom":
307
  return gr.update(visible=True, label="Custom Prompt", placeholder="Add <|grounding|> for boxes")
308
+ if task == "📍 Locate":
309
  return gr.update(visible=True, label="Text to Locate", placeholder="Enter text")
310
  return gr.update(visible=False)
311
 
 
315
  return gr.update()
316
 
317
  def get_pdf_page_count(file_path):
318
+ if not file_path or not file_path.lower().endswith(".pdf"):
319
  return 1
320
  doc = fitz.open(file_path)
321
+ try:
322
+ return len(doc)
323
+ finally:
324
+ doc.close()
325
 
326
  def load_image(file_path, page_num=1):
327
  if not file_path:
328
  return None
329
+ if file_path.lower().endswith(".pdf"):
330
  doc = fitz.open(file_path)
331
+ try:
332
+ page_idx = max(0, min(int(page_num) - 1, len(doc) - 1))
333
+ page = doc.load_page(page_idx)
334
+ pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72), alpha=False)
335
+ return Image.open(BytesIO(pix.tobytes("png")))
336
+ finally:
337
+ doc.close()
338
+ return Image.open(file_path)
339
 
340
  def update_page_selector(file_path):
341
  if not file_path:
342
  return gr.update(visible=False)
343
+ if file_path.lower().endswith(".pdf"):
344
  page_count = get_pdf_page_count(file_path)
345
+ return gr.update(
346
+ visible=True,
347
+ maximum=page_count,
348
+ value=1,
349
+ minimum=1,
350
+ label=f"Select Page (1-{page_count})",
351
+ )
352
  return gr.update(visible=False)
353
 
354
+ with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR (CPU)") as demo:
355
+ gr.Markdown(
356
+ """
357
+ # 🐢 DeepSeek-OCR (CPU)
358
+
359
+ ⚠️ **CPU mode is very slow** and may fail on large documents due to RAM/time limits.
360
+ - Prefer **Tiny/Small** modes on CPU.
361
+ - For best results/latency, use GPU.
362
+
363
+ This Space processes images and multi-page PDFs: extract text, convert to markdown, or locate content with bounding boxes.
364
+ """
365
+ )
366
+
367
  with gr.Row():
368
  with gr.Column(scale=1):
369
  file_in = gr.File(label="Upload Image or PDF", file_types=["image", ".pdf"], type="filepath")
370
  input_img = gr.Image(label="Input Image", type="pil", height=300)
371
  page_selector = gr.Number(label="Select Page", value=1, minimum=1, step=1, visible=False)
372
+ mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Tiny", label="Mode (CPU recommend: Tiny/Small)")
373
+ task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="📝 Free OCR", label="Task")
374
  prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
375
  btn = gr.Button("Extract", variant="primary", size="lg")
376
+
377
  with gr.Column(scale=2):
378
  with gr.Tabs() as tabs:
379
  with gr.Tab("Text", id="tab_text"):
 
386
  gallery = gr.Gallery(show_label=False, columns=3, height=400)
387
  with gr.Tab("Raw Text", id="tab_raw"):
388
  raw_out = gr.Textbox(lines=20, show_copy_button=True, show_label=False)
389
+
390
  file_in.change(load_image, [file_in, page_selector], [input_img])
391
  file_in.change(update_page_selector, [file_in], [page_selector])
392
  page_selector.change(load_image, [file_in, page_selector], [input_img])
393
  task.change(toggle_prompt, [task], [prompt])
394
  task.change(select_boxes, [task], [tabs])
395
+
396
  def run(image, file_path, mode, task, custom_prompt, page_num):
397
  if file_path:
398
  return process_file(file_path, mode, task, custom_prompt, int(page_num))
399
  if image is not None:
400
  return process_image(image, mode, task, custom_prompt)
401
+ return "Error: uploading file or image", "", "", None, []
402
 
403
+ submit_event = btn.click(
404
+ run,
405
+ [input_img, file_in, mode, task, prompt, page_selector],
406
+ [text_out, md_out, raw_out, img_out, gallery],
407
+ )
408
  submit_event.then(select_boxes, [task], [tabs])
409
 
410
  if __name__ == "__main__":
411
+ # Keep queue modest on CPU
412
+ demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)