akshayve3 committed on
Commit
6bca8e9
·
verified ·
1 Parent(s): addc274

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +472 -66
app.py CHANGED
@@ -1,70 +1,476 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
-
5
- def respond(
6
- message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
- ):
14
- """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
16
- """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
-
19
- messages = [{"role": "system", "content": system_message}]
20
-
21
- messages.extend(history)
22
-
23
- messages.append({"role": "user", "content": message})
24
-
25
- response = ""
26
-
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- type="messages",
49
- additional_inputs=[
50
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
51
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
52
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
53
- gr.Slider(
54
- minimum=0.1,
55
- maximum=1.0,
56
- value=0.95,
57
- step=0.05,
58
- label="Top-p (nucleus sampling)",
59
- ),
60
- ],
61
- )
62
-
63
- with gr.Blocks() as demo:
64
- with gr.Sidebar():
65
- gr.LoginButton()
66
- chatbot.render()
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  if __name__ == "__main__":
70
- demo.launch()
 
1
  import gradio as gr
2
+ from transformers import AutoModel, AutoTokenizer
3
+ import torch
4
+ import spaces
5
+ import os
6
+ import sys
7
+ import tempfile
8
+ import shutil
9
+ from PIL import Image, ImageDraw, ImageFont, ImageOps
10
+ import fitz
11
+ import re
12
+ import numpy as np
13
+ import base64
14
+ from io import StringIO, BytesIO
15
+ from pathlib import Path
16
+ import time
17
+ from docx import Document
18
+ from pptx import Presentation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ MODEL_NAME = 'deepseek-ai/DeepSeek-OCR'
21
+
22
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
23
+ model = AutoModel.from_pretrained(MODEL_NAME, _attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16, trust_remote_code=True, use_safetensors=True)
24
+ model = model.eval().cuda()
25
+
26
+ MODEL_CONFIGS = {
27
+ "Gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
28
+ "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
29
+ "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
30
+ "Base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
31
+ "Large": {"base_size": 1280, "image_size": 1280, "crop_mode": False}
32
+ }
33
+
34
+ TASK_PROMPTS = {
35
+ "πŸ“‹ Markdown": {"prompt": "<image>\n<|grounding|>Convert the document to markdown.", "has_grounding": True},
36
+ "πŸ“ Free OCR": {"prompt": "<image>\nFree OCR.", "has_grounding": False},
37
+ "πŸ“ Locate": {"prompt": "<image>\nLocate <|ref|>text<|/ref|> in the image.", "has_grounding": True},
38
+ "πŸ” Describe": {"prompt": "<image>\nDescribe this image in detail.", "has_grounding": False},
39
+ "✏️ Custom": {"prompt": "", "has_grounding": False}
40
+ }
41
+
42
def extract_grounding_references(text):
    """Return every grounding triple found in *text*.

    Each element is (full_tag, label, coords_literal) taken from a
    ``<|ref|>label<|/ref|><|det|>coords<|/det|>`` span.
    """
    grounding_tag = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        re.DOTALL,
    )
    return grounding_tag.findall(text)
45
+
46
def draw_bounding_boxes(image, refs, extract_images=False):
    """Draw labelled grounding boxes onto a copy of *image*.

    Args:
        image: source PIL image (coordinates are scaled to its size).
        refs: triples from extract_grounding_references().
        extract_images: when True, also crop every region labelled 'image'.

    Returns:
        (annotated_image, crops) where crops is a list of PIL crops.
    """
    import ast  # local import: only needed for parsing coordinate literals

    img_w, img_h = image.size
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)
    # Fall back to Pillow's built-in bitmap font so annotation does not
    # crash on hosts where the DejaVu font package is missing.
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", 25)
    except OSError:
        font = ImageFont.load_default()
    crops = []

    color_map = {}
    np.random.seed(42)  # deterministic label colors across runs

    for ref in refs:
        label = ref[1]
        if label not in color_map:
            color_map[label] = (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255))

        color = color_map[label]
        # SECURITY: coordinates come from model output -- parse them as a
        # Python literal instead of eval() to avoid arbitrary code execution.
        coords = ast.literal_eval(ref[2])
        color_a = color + (60,)  # translucent fill for the overlay

        for box in coords:
            # The model emits coordinates normalised to a 0-999 grid.
            x1, y1, x2, y2 = int(box[0]/999*img_w), int(box[1]/999*img_h), int(box[2]/999*img_w), int(box[3]/999*img_h)

            if extract_images and label == 'image':
                crops.append(image.crop((x1, y1, x2, y2)))

            width = 5 if label == 'title' else 3  # emphasise titles
            draw.rectangle([x1, y1, x2, y2], outline=color, width=width)
            draw2.rectangle([x1, y1, x2, y2], fill=color_a)

            # Label tag drawn just above the box (clamped to the top edge).
            text_bbox = draw.textbbox((0, 0), label, font=font)
            tw, th = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
            ty = max(0, y1 - 20)
            draw.rectangle([x1, ty, x1 + tw + 4, ty + th + 4], fill=color)
            draw.text((x1 + 2, ty + 2), label, font=font, fill=(255, 255, 255))

    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw, crops
85
+
86
def clean_output(text, include_images=False, remove_labels=False):
    """Strip grounding tags from raw model output.

    Image refs become numbered ``**[Figure N]**`` placeholders when
    *include_images* is True (dropped otherwise). Non-image refs keep
    their label text unless *remove_labels* is set.
    """
    if not text:
        return ""

    tag_re = re.compile(
        r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)',
        re.DOTALL,
    )
    figure_count = 0

    for full_tag, label, _coords in tag_re.findall(text):
        if '<|ref|>image<|/ref|>' in full_tag:
            if include_images:
                figure_count += 1
                replacement = f'\n\n**[Figure {figure_count}]**\n\n'
            else:
                replacement = ''
        elif remove_labels:
            replacement = ''
        else:
            replacement = label
        text = text.replace(full_tag, replacement, 1)

    return text.strip()
107
+
108
def embed_images(markdown, crops):
    """Inline each crop into *markdown* as a base64 data-URI image.

    The N-th crop replaces the first occurrence of ``**[Figure N]**``.
    """
    for index, crop in enumerate(crops, start=1):
        buffer = BytesIO()
        crop.save(buffer, format="PNG")
        encoded = base64.b64encode(buffer.getvalue()).decode()
        placeholder = f'**[Figure {index}]**'
        inline = f'\n\n![Figure {index}](data:image/png;base64,{encoded})\n\n'
        markdown = markdown.replace(placeholder, inline, 1)
    return markdown
117
+
118
@spaces.GPU(duration=60)
def process_image(image, mode, task, custom_prompt):
    """Run DeepSeek-OCR on a single PIL image.

    Args:
        image: PIL image (or None, which yields an error message).
        mode: key into MODEL_CONFIGS.
        task: key into TASK_PROMPTS.
        custom_prompt: user text for the Custom / Locate tasks.

    Returns:
        (clean_text, markdown, raw_output, boxed_image_or_None, crops)
    """
    if image is None:
        return "Error: Upload image", "", "", None, []
    if task in ["✏️ Custom", "πŸ“ Locate"] and not custom_prompt.strip():
        return "Enter prompt", "", "", None, []

    # Normalise to RGB and honour EXIF orientation before inference.
    if image.mode in ('RGBA', 'LA', 'P'):
        image = image.convert('RGB')
    image = ImageOps.exif_transpose(image)

    config = MODEL_CONFIGS[mode]

    # Build the prompt and decide whether grounding boxes are expected.
    if task == "✏️ Custom":
        prompt = f"<image>\n{custom_prompt.strip()}"
        has_grounding = '<|grounding|>' in custom_prompt
    elif task == "πŸ“ Locate":
        prompt = f"<image>\nLocate <|ref|>{custom_prompt.strip()}<|/ref|> in the image."
        has_grounding = True
    else:
        prompt = TASK_PROMPTS[task]["prompt"]
        has_grounding = TASK_PROMPTS[task]["has_grounding"]

    # model.infer() reads from a file path, so stage the image on disk.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
    image.save(tmp.name, 'JPEG', quality=95)
    tmp.close()
    out_dir = tempfile.mkdtemp()

    # model.infer() prints its result to stdout, so capture it. FIX: the
    # original code never restored stdout or removed the temp files when
    # inference raised -- the try/finally guarantees cleanup either way.
    stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        model.infer(tokenizer=tokenizer, prompt=prompt, image_file=tmp.name, output_path=out_dir,
                    base_size=config["base_size"], image_size=config["image_size"], crop_mode=config["crop_mode"])
        # Drop progress/debug lines, keeping only the recognised text.
        result = '\n'.join([l for l in sys.stdout.getvalue().split('\n')
                            if not any(s in l for s in ['image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size'])]).strip()
    finally:
        sys.stdout = stdout
        os.unlink(tmp.name)
        shutil.rmtree(out_dir, ignore_errors=True)

    if not result:
        return "No text", "", "", None, []

    cleaned = clean_output(result, False, False)
    markdown = clean_output(result, True, True)

    img_out = None
    crops = []

    # Only annotate when the prompt can actually produce grounding tags.
    if has_grounding and '<|ref|>' in result:
        refs = extract_grounding_references(result)
        if refs:
            img_out, crops = draw_bounding_boxes(image, refs, True)

    markdown = embed_images(markdown, crops)

    return cleaned, markdown, result, img_out, crops
176
+
177
def docx_to_images(path):
    """Render each non-empty paragraph of a .docx as one white page image.

    NOTE(review): this is a rough preview render (one 800x1100 page per
    paragraph, no word wrapping) rather than a faithful DOCX rasterisation.
    """
    doc = Document(path)
    # FIX: the font is identical for every page, so load it once instead
    # of once per paragraph as the original did.
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20)
    images = []

    for para in doc.paragraphs:
        if para.text.strip():
            img = Image.new('RGB', (800, 1100), color='white')
            draw = ImageDraw.Draw(img)
            draw.text((50, 50), para.text, fill='black', font=font)
            images.append(img)

    return images
190
+
191
def pptx_to_images(path):
    """Render the text frames of each .pptx slide onto a 960x720 image.

    NOTE(review): text-only preview render -- pictures, layout and styling
    on the slides are not reproduced.
    """
    prs = Presentation(path)
    # FIX: the font is identical for every slide, so load it once instead
    # of once per slide as the original did.
    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 20)
    images = []

    for slide in prs.slides:
        img = Image.new('RGB', (960, 720), color='white')
        draw = ImageDraw.Draw(img)
        y = 50
        for shape in slide.shapes:
            if hasattr(shape, "text") and shape.text.strip():
                draw.text((50, y), shape.text, fill='black', font=font)
                y += 100  # fixed vertical spacing between text frames
        images.append(img)

    return images
207
+
208
@spaces.GPU(duration=300)
def process_pdf(path, mode, task, custom_prompt):
    """OCR every page of a PDF and collect per-page results.

    Pages are rasterised at 300 DPI; pages that yield no text are skipped.
    Returns (text, markdown, raw, box_images, all_crops, total_pages).
    """
    doc = fitz.open(path)
    texts = []
    markdowns = []
    raws = []
    all_crops = []
    box_images = []

    for page_index in range(len(doc)):
        page = doc.load_page(page_index)
        # 300/72 scales the PDF's native 72 DPI coordinates up to 300 DPI.
        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72), alpha=False)
        page_image = Image.open(BytesIO(pix.tobytes("png")))

        text, md, raw, box_img, crops = process_image(page_image, mode, task, custom_prompt)

        if text and text != "No text":
            page_no = page_index + 1
            texts.append(f"### Page {page_no}\n\n{text}")
            markdowns.append(f"### Page {page_no}\n\n{md}")
            raws.append(f"=== Page {page_no} ===\n{raw}")
            all_crops.extend(crops)
            box_images.append(box_img)

    total_pages = len(doc)
    doc.close()

    joined_text = "\n\n---\n\n".join(texts) if texts else "No text in PDF"
    joined_md = "\n\n---\n\n".join(markdowns) if markdowns else "No text in PDF"
    return (joined_text, joined_md, "\n\n".join(raws), box_images, all_crops, total_pages)
234
+
235
def save_outputs(doc_name, text_content, md_content, raw_content, box_images, cropped_images):
    """Persist all outputs for one document under ``outputs/NN_<name>/``.

    Writes the three text artifacts plus ``boxes/`` and ``cropped/`` image
    subfolders, and returns the created folder path as a string.
    """
    base_dir = Path("outputs")
    base_dir.mkdir(exist_ok=True)

    # Folder number = count of existing sibling folders + 1.
    folder_num = sum(1 for entry in base_dir.iterdir() if entry.is_dir()) + 1
    doc_folder = base_dir / f"{folder_num:02d}_{doc_name}"
    doc_folder.mkdir(exist_ok=True)

    for filename, content in (
        ("text_output.txt", text_content),
        ("clean_output.md", md_content),
        ("raw_output.txt", raw_content),
    ):
        (doc_folder / filename).write_text(content, encoding='utf-8')

    boxes_dir = doc_folder / "boxes"
    boxes_dir.mkdir(exist_ok=True)
    for page_no, page_img in enumerate(box_images, start=1):
        if page_img is not None:
            page_img.save(boxes_dir / f"page_{page_no:02d}_box.jpg")

    cropped_dir = doc_folder / "cropped"
    cropped_dir.mkdir(exist_ok=True)
    for crop_no, crop_img in enumerate(cropped_images, start=1):
        if crop_img is not None:
            crop_img.save(cropped_dir / f"crop_{crop_no:02d}.jpg")

    return str(doc_folder)
262
+
263
def _ocr_image_sequence(images, unit, mode, task, custom_prompt):
    """OCR a list of page images; *unit* is the heading word (Page/Slide).

    Returns (text, md, raw, box_images, crops) with pages joined exactly
    as the PDF path does.
    """
    texts, mds, raws, box_images, crops = [], [], [], [], []
    for i, img in enumerate(images):
        text, md, raw, box_img, crp = process_image(img, mode, task, custom_prompt)
        texts.append(f"### {unit} {i+1}\n\n{text}")
        mds.append(f"### {unit} {i+1}\n\n{md}")
        raws.append(f"=== {unit} {i+1} ===\n{raw}")
        box_images.append(box_img)
        crops.extend(crp)
    return "\n\n---\n\n".join(texts), "\n\n---\n\n".join(mds), "\n\n".join(raws), box_images, crops


def process_single_file(file_path, mode, task, custom_prompt):
    """Dispatch one uploaded file by extension, OCR it and save outputs.

    FIX: the .docx and .pptx branches were copy-paste duplicates; both now
    share _ocr_image_sequence. Returns
    (text, markdown, raw, box_images, crops, summary).
    """
    start_time = time.time()

    file_name = Path(file_path).stem
    ext = Path(file_path).suffix.lower()

    if ext == '.pdf':
        text, md, raw, box_images, crops, total_pages = process_pdf(file_path, mode, task, custom_prompt)

    elif ext == '.docx':
        images = docx_to_images(file_path)
        text, md, raw, box_images, crops = _ocr_image_sequence(images, "Page", mode, task, custom_prompt)
        total_pages = len(images)

    elif ext == '.pptx':
        images = pptx_to_images(file_path)
        text, md, raw, box_images, crops = _ocr_image_sequence(images, "Slide", mode, task, custom_prompt)
        total_pages = len(images)

    else:
        # Any other extension is treated as a single image file.
        img = Image.open(file_path)
        text, md, raw, box_img, crops = process_image(img, mode, task, custom_prompt)
        box_images = [box_img] if box_img else []
        total_pages = 1

    elapsed_time = time.time() - start_time

    folder_path = save_outputs(file_name, text, md, raw, box_images, crops)

    summary = f"πŸ“„ File: {file_name}\nπŸ“Š Pages/Slides: {total_pages}\nπŸ–ΌοΈ Cropped Images: {len(crops)}\n⏱️ Processing Time: {elapsed_time:.2f}s\nπŸ“ Saved to: {folder_path}"

    return text, md, raw, box_images, crops, summary
315
+
316
def process_multiple_files(files, mode, task, custom_prompt):
    """Process every uploaded file and combine outputs and summaries.

    Returns the six values wired to the Gradio outputs: combined text,
    markdown, raw output, box images, crops, and a human-readable summary.
    """
    if not files:
        return "No files uploaded", "", "", [], [], "No files to process"

    all_texts = []
    all_mds = []
    all_raws = []
    all_boxes = []
    all_crops = []
    summaries = []
    total_start = time.time()

    for upload in files:
        text, md, raw, boxes, crops, summary = process_single_file(upload.name, mode, task, custom_prompt)
        all_texts.append(text)
        all_mds.append(md)
        all_raws.append(raw)
        all_boxes.extend(boxes)
        all_crops.extend(crops)
        summaries.append(summary)

    total_time = time.time() - total_start

    separator = "\n\n========================================\n\n"
    combined_text = separator.join(all_texts)
    combined_md = separator.join(all_mds)
    combined_raw = separator.join(all_raws)

    final_summary = f"βœ… Processed {len(files)} file(s)\n⏱️ Total Time: {total_time:.2f}s\n\n" + "\n\n".join(summaries)

    return combined_text, combined_md, combined_raw, all_boxes, all_crops, final_summary
342
+
343
def toggle_prompt(task):
    """Show the prompt textbox only for tasks that need extra user input."""
    # Tasks that need a text input, mapped to (label, placeholder).
    prompt_fields = {
        "✏️ Custom": ("Custom Prompt", "Add <|grounding|> for boxes"),
        "πŸ“ Locate": ("Text to Locate", "Enter text"),
    }
    if task in prompt_fields:
        label, placeholder = prompt_fields[task]
        return gr.update(visible=True, label=label, placeholder=placeholder)
    return gr.update(visible=False)
349
+
350
def show_view(view_type):
    """Toggle visibility of different output views.

    Returns one visibility update per container, in the fixed wiring order
    (text, markdown, raw, boxes, crops); only the selected view is shown.
    """
    view_order = ("text", "markdown", "raw", "boxes", "crops")
    return tuple(gr.update(visible=(view_type == name)) for name in view_order)
359
+
360
# --- Gradio UI: layout and event wiring -----------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title="DeepSeek-OCR Multi-file") as demo:
    gr.Markdown("""
    # πŸš€ DeepSeek-OCR Multi-file Processor
    Upload multiple files (PDF, DOCX, PPTX, Images) and process them with document-wise folder structure.
    """)

    with gr.Row():
        # Left column: upload, mode/task selection, and the run summary.
        with gr.Column(scale=1):
            files_in = gr.File(label="πŸ“ Upload Files", file_count="multiple", type="filepath")
            mode = gr.Dropdown(list(MODEL_CONFIGS.keys()), value="Gundam", label="βš™οΈ Mode")
            task = gr.Dropdown(list(TASK_PROMPTS.keys()), value="πŸ“‹ Markdown", label="πŸ“ Task")
            # Hidden until a task that needs extra input is selected
            # (see toggle_prompt).
            prompt = gr.Textbox(label="Prompt", lines=2, visible=False)
            btn = gr.Button("πŸ”„ Process All Files", variant="primary", size="lg")

            gr.Markdown("---")
            summary_out = gr.Textbox(label="πŸ“Š Processing Summary", lines=8)

        # Right column: five switchable result views.
        with gr.Column(scale=2):
            # View selection buttons in one row
            with gr.Row():
                text_btn = gr.Button("πŸ“„ Text", variant="secondary", size="sm")
                md_btn = gr.Button("πŸ“‹ Markdown", variant="secondary", size="sm")
                raw_btn = gr.Button("πŸ” Raw", variant="secondary", size="sm")
                boxes_btn = gr.Button("🎯 Boxes", variant="secondary", size="sm")
                crops_btn = gr.Button("βœ‚οΈ Crops", variant="secondary", size="sm")

            # Output containers (only one visible at a time)
            text_container = gr.Column(visible=True)
            with text_container:
                gr.Markdown("### πŸ“„ Text Output")
                text_out = gr.Textbox(lines=25, show_copy_button=True, show_label=False)

            md_container = gr.Column(visible=False)
            with md_container:
                gr.Markdown("### πŸ“‹ Markdown Output")
                md_out = gr.Markdown("")

            raw_container = gr.Column(visible=False)
            with raw_container:
                gr.Markdown("### πŸ” Raw Output")
                raw_out = gr.Textbox(lines=25, show_copy_button=True, show_label=False)

            boxes_container = gr.Column(visible=False)
            with boxes_container:
                gr.Markdown("### 🎯 Bounding Boxes")
                boxes_gallery = gr.Gallery(show_label=False, columns=3, height=600)

            crops_container = gr.Column(visible=False)
            with crops_container:
                gr.Markdown("### βœ‚οΈ Cropped Images")
                crops_gallery = gr.Gallery(show_label=False, columns=4, height=600)

    with gr.Accordion("ℹ️ Info", open=False):
        gr.Markdown("""
        ### Modes
        - **Gundam**: 1024 base + 640 tiles with cropping - Best balance
        - **Tiny**: 512Γ—512, no crop - Fastest
        - **Small**: 640Γ—640, no crop - Quick
        - **Base**: 1024Γ—1024, no crop - Standard
        - **Large**: 1280Γ—1280, no crop - Highest quality

        ### Tasks
        - **Markdown**: Convert document to structured markdown (grounding βœ…)
        - **Free OCR**: Simple text extraction
        - **Locate**: Find specific things in image (grounding βœ…)
        - **Describe**: General image description
        - **Custom**: Your own prompt (add `<|grounding|>` for boxes)

        ### Supported Formats
        - πŸ“„ PDF files
        - πŸ“ Word documents (.docx)
        - πŸ“Š PowerPoint presentations (.pptx)
        - πŸ–ΌοΈ Images (JPG, PNG, etc.)
        """)

    # Event handlers
    task.change(toggle_prompt, [task], [prompt])

    btn.click(
        process_multiple_files,
        [files_in, mode, task, prompt],
        [text_out, md_out, raw_out, boxes_gallery, crops_gallery, summary_out]
    )

    # View toggle buttons
    text_btn.click(
        lambda: show_view("text"),
        None,
        [text_container, md_container, raw_container, boxes_container, crops_container]
    )

    md_btn.click(
        lambda: show_view("markdown"),
        None,
        [text_container, md_container, raw_container, boxes_container, crops_container]
    )

    raw_btn.click(
        lambda: show_view("raw"),
        None,
        [text_container, md_container, raw_container, boxes_container, crops_container]
    )

    boxes_btn.click(
        lambda: show_view("boxes"),
        None,
        [text_container, md_container, raw_container, boxes_container, crops_container]
    )

    crops_btn.click(
        lambda: show_view("crops"),
        None,
        [text_container, md_container, raw_container, boxes_container, crops_container]
    )
474
 
475
if __name__ == "__main__":
    # Queue bounds concurrent requests; share=True exposes a public link.
    demo.queue(max_size=20).launch(share=True)