nicolet8 committed on
Commit
a192b42
·
verified ·
1 Parent(s): 357ce5b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -118
app.py CHANGED
@@ -1,15 +1,21 @@
1
- import gradio as gr
2
- import torch
3
- import spaces
4
  import os
5
  import tempfile
 
 
 
 
 
 
6
  from PIL import Image, ImageOps
7
- from typing import Iterable
8
 
9
  from transformers import AutoProcessor, AutoModelForImageTextToText
10
  from gradio.themes import Soft
11
  from gradio.themes.utils import colors, fonts, sizes
12
 
 
 
 
 
13
  colors.hot_pink = colors.Color(
14
  name="hot_pink",
15
  c50="#FFF0F5",
@@ -166,22 +172,6 @@ body, .gradio-container {
166
  background: rgba(255, 105, 180, 0.02) !important;
167
  }
168
 
169
- /* Radio buttons */
170
- .gradio-radio label {
171
- border-radius: 6px !important;
172
- transition: all 0.2s ease !important;
173
- border: 1px solid transparent !important;
174
- }
175
-
176
- .gradio-radio label:hover {
177
- background: rgba(255, 105, 180, 0.05) !important;
178
- }
179
-
180
- .gradio-radio label.selected {
181
- background: rgba(255, 105, 180, 0.1) !important;
182
- border-color: #FF69B4 !important;
183
- }
184
-
185
  /* Primary button */
186
  .primary {
187
  border-radius: 8px !important;
@@ -215,72 +205,13 @@ body, .gradio-container {
215
  line-height: 1.7 !important;
216
  }
217
 
218
- .gradio-markdown code {
219
- font-family: 'IBM Plex Mono', monospace !important;
220
- background: rgba(255, 105, 180, 0.08) !important;
221
- padding: 2px 6px !important;
222
- border-radius: 4px !important;
223
- color: #CC4C8C !important;
224
- }
225
-
226
- .gradio-markdown pre {
227
- background: rgba(255, 105, 180, 0.05) !important;
228
- border: 1px solid #FFC0D9 !important;
229
- border-radius: 8px !important;
230
- padding: 1rem !important;
231
- }
232
-
233
- /* Examples */
234
- .gradio-examples .gallery-item {
235
- border: 2px solid #FFC0D9 !important;
236
- border-radius: 8px !important;
237
- transition: all 0.2s ease !important;
238
- }
239
-
240
- .gradio-examples .gallery-item:hover {
241
- border-color: #FF69B4 !important;
242
- transform: translateY(-2px) !important;
243
- box-shadow: 0 4px 12px rgba(255, 105, 180, 0.15) !important;
244
- }
245
-
246
- /* Scrollbar */
247
- ::-webkit-scrollbar { width: 8px; height: 8px; }
248
- ::-webkit-scrollbar-track { background: rgba(255,105,180,0.05); border-radius: 4px; }
249
- ::-webkit-scrollbar-thumb { background: linear-gradient(135deg, #FF69B4, #FF99C4); border-radius: 4px; }
250
- ::-webkit-scrollbar-thumb:hover { background: linear-gradient(135deg, #E55AA0, #FF69B4); }
251
-
252
- /* Accordion */
253
- .gradio-accordion {
254
- border-radius: 10px !important;
255
- border: 1px solid #FFC0D9 !important;
256
- }
257
-
258
- .gradio-accordion > .label-wrap {
259
- background: rgba(255, 105, 180, 0.03) !important;
260
- border-radius: 10px !important;
261
- }
262
-
263
- /* Animations */
264
- @keyframes fadeIn {
265
- from { opacity: 0; transform: translateY(10px); }
266
- to { opacity: 1; transform: translateY(0); }
267
- }
268
-
269
- .gradio-row { animation: fadeIn 0.4s ease-out; }
270
-
271
- label { font-weight: 600 !important; color: #333 !important; }
272
- .dark label { color: #eee !important; }
273
-
274
  footer { display: none !important; }
275
-
276
-
277
- /* Wider sidebar */
278
- .sidebar {
279
- min-width: 420px !important;
280
- max-width: 480px !important;
281
- }
282
  """
283
 
 
 
 
 
284
  MODEL_PATH = "zai-org/GLM-OCR"
285
 
286
  processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
@@ -297,22 +228,56 @@ TASK_PROMPTS = {
297
  "Table": "Table Recognition:",
298
  }
299
 
300
- @spaces.GPU
301
- def process_image(image, task):
302
- """Run OCR on the uploaded image with the selected recognition type."""
303
- if image is None:
304
- return "Please upload an image first.", "Please upload an image first."
305
 
306
- if image.mode in ("RGBA", "LA", "P"):
307
- image = image.convert("RGB")
308
- image = ImageOps.exif_transpose(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
311
  image.save(tmp.name, "PNG")
312
  tmp.close()
313
 
314
- prompt = TASK_PROMPTS.get(task, "Text Recognition:")
315
-
316
  messages = [
317
  {
318
  "role": "user",
@@ -340,21 +305,45 @@ def process_image(image, task):
340
  )
341
 
342
  os.unlink(tmp.name)
 
343
 
344
- result = output_text.strip()
345
- return result, result
346
-
347
- with gr.Blocks(fill_height=True) as demo:
348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  with gr.Sidebar(width=450):
350
-
351
  gr.Markdown("# **GLM-OCR**", elem_id="main-title")
352
 
353
- image_input = gr.Image(
354
- type="pil",
355
- label="Upload Image",
356
- sources=["upload", "clipboard"],
357
- height=300,
358
  )
359
 
360
  task = gr.Radio(
@@ -363,18 +352,19 @@ with gr.Blocks(fill_height=True) as demo:
363
  label="Recognition Type",
364
  )
365
 
 
 
 
 
 
 
 
 
366
  btn = gr.Button("Perform OCR", variant="primary")
367
 
368
- gr.Examples(
369
- examples=[
370
- "examples/1.jpg",
371
- "examples/4.jpg",
372
- "examples/5.webp",
373
- "examples/2.jpg",
374
- "examples/3.jpg",
375
- ],
376
- inputs=image_input,
377
- label="Examples",
378
  )
379
 
380
  gr.Markdown("## Output", elem_id="output-title")
@@ -389,12 +379,12 @@ with gr.Blocks(fill_height=True) as demo:
389
  output_md = gr.Markdown(label="Rendered Markdown")
390
 
391
  btn.click(
392
- fn=process_image,
393
- inputs=[image_input, task],
394
  outputs=[output_text, output_md],
395
  )
396
 
397
- image_input.change(
398
  fn=lambda: ("", ""),
399
  inputs=None,
400
  outputs=[output_text, output_md],
@@ -402,8 +392,6 @@ with gr.Blocks(fill_height=True) as demo:
402
 
403
  if __name__ == "__main__":
404
  demo.queue(max_size=50).launch(
405
- css=css,
406
- theme=hot_pink_theme,
407
  mcp_server=True,
408
  ssr_mode=False,
409
  show_error=True,
 
 
 
 
1
  import os
2
  import tempfile
3
+ from typing import Iterable, List, Tuple
4
+
5
+ import fitz # pymupdf
6
+ import gradio as gr
7
+ import spaces
8
+ import torch
9
  from PIL import Image, ImageOps
 
10
 
11
  from transformers import AutoProcessor, AutoModelForImageTextToText
12
  from gradio.themes import Soft
13
  from gradio.themes.utils import colors, fonts, sizes
14
 
15
+
16
+ # -------------------------
17
+ # Theme + CSS (unchanged)
18
+ # -------------------------
19
  colors.hot_pink = colors.Color(
20
  name="hot_pink",
21
  c50="#FFF0F5",
 
172
  background: rgba(255, 105, 180, 0.02) !important;
173
  }
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  /* Primary button */
176
  .primary {
177
  border-radius: 8px !important;
 
205
  line-height: 1.7 !important;
206
  }
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  footer { display: none !important; }
 
 
 
 
 
 
 
209
  """
210
 
211
+
212
+ # -------------------------
213
+ # Model
214
+ # -------------------------
215
  MODEL_PATH = "zai-org/GLM-OCR"
216
 
217
  processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
 
228
  "Table": "Table Recognition:",
229
  }
230
 
 
 
 
 
 
231
 
232
+ # -------------------------
233
+ # Helpers
234
+ # -------------------------
235
def _normalize_pil(img: Image.Image) -> Image.Image:
    """Return an upright version of *img* in a mode that can be saved as PNG.

    The OCR step in this file writes each image to a temporary PNG, so the
    image must be in a mode the PNG writer accepts.

    Args:
        img: Image as decoded from an upload or rendered from a PDF page.

    Returns:
        The image with its EXIF orientation applied. Images in RGBA, LA, P
        or CMYK mode are additionally converted to RGB; other modes are
        returned in their original mode.
    """
    # Apply the orientation first, while the image is still the object that
    # carries the decoder's metadata. convert() returns a new image, and
    # metadata kept outside `info` (e.g. TIFF tags) may not follow it.
    img = ImageOps.exif_transpose(img)

    # CMYK is included because Pillow cannot write CMYK data as PNG; a CMYK
    # JPEG would otherwise fail later when the temporary PNG is saved.
    if img.mode in ("RGBA", "LA", "P", "CMYK"):
        img = img.convert("RGB")
    return img
240
+
241
+
242
def file_to_images(file_path: str, max_pages: int = 20, dpi: int = 200) -> List[Image.Image]:
    """Convert an input file (PDF or image) into a list of PIL images.

    Args:
        file_path: Path to the file; the type is decided by its extension
            (case-insensitive).
        max_pages: Safety limit for PDFs; at most this many leading pages
            are rendered.
        dpi: Rendering resolution used for PDF pages.

    Returns:
        A one-element list for an image file, or one image per rendered
        page for a PDF. Every image is passed through ``_normalize_pil``.

    Raises:
        ValueError: If the extension is neither a supported image type nor
            ``.pdf``.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext in (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"):
        # Image.open() is lazy. Decode the pixels while the file is open so
        # the handle can be released as soon as the `with` block exits.
        with Image.open(file_path) as img:
            img.load()
            return [_normalize_pil(img)]

    if ext == ".pdf":
        # PDF coordinates are 72 units per inch, so dpi / 72 is the zoom
        # factor needed to render at the requested resolution.
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)
        images: List[Image.Image] = []

        doc = fitz.open(file_path)
        try:
            for i in range(min(len(doc), max_pages)):
                pix = doc.load_page(i).get_pixmap(matrix=mat, alpha=False)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                images.append(_normalize_pil(img))
        finally:
            # Close the document even when rendering a page fails.
            doc.close()
        return images

    raise ValueError(f"Unsupported file type: {ext}")
269
+
270
+
271
+ def ocr_one_image(image: Image.Image, task: str) -> str:
272
+ """
273
+ OCR one PIL image. Returns markdown-like text (model output).
274
+ """
275
+ prompt = TASK_PROMPTS.get(task, "Text Recognition:")
276
 
277
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
278
  image.save(tmp.name, "PNG")
279
  tmp.close()
280
 
 
 
281
  messages = [
282
  {
283
  "role": "user",
 
305
  )
306
 
307
  os.unlink(tmp.name)
308
+ return output_text.strip()
309
 
 
 
 
 
310
 
311
@spaces.GPU
def process_file(file_obj, task: str, max_pages: int) -> Tuple[str, str]:
    """Run OCR over every page of an uploaded PDF or image.

    Args:
        file_obj: The upload; either an object exposing the file path as
            ``.name`` or something whose ``str()`` is the path. ``None``
            means nothing was uploaded.
        task: Recognition type, forwarded unchanged to ``ocr_one_image``.
        max_pages: Page limit forwarded to ``file_to_images`` after being
            coerced with ``int()``.

    Returns:
        A pair holding the same string twice (raw text, rendered markdown):
        the per-page results joined by horizontal rules, or a short message
        when no file was given or the file could not be read.
    """
    if file_obj is None:
        missing = "Please upload a PDF or an image first."
        return missing, missing

    # The upload may be a file-like wrapper or already a plain path.
    if hasattr(file_obj, "name"):
        source_path = file_obj.name
    else:
        source_path = str(file_obj)

    try:
        page_images = file_to_images(source_path, max_pages=int(max_pages), dpi=200)
    except Exception as exc:
        failure = f"Failed to read file: {exc}"
        return failure, failure

    # Tag each page with an HTML comment so page boundaries survive in the
    # raw text without showing up in the rendered markdown.
    sections = [
        f"<!-- Page {number} -->\n\n{ocr_one_image(page, task)}"
        for number, page in enumerate(page_images, start=1)
    ]
    combined = "\n\n---\n\n".join(sections)
    return combined, combined
335
+
336
+
337
+ # -------------------------
338
+ # UI
339
+ # -------------------------
340
+ with gr.Blocks(fill_height=True, css=css, theme=hot_pink_theme) as demo:
341
  with gr.Sidebar(width=450):
 
342
  gr.Markdown("# **GLM-OCR**", elem_id="main-title")
343
 
344
+ file_input = gr.File(
345
+ label="Upload PDF or Image",
346
+ file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"],
 
 
347
  )
348
 
349
  task = gr.Radio(
 
352
  label="Recognition Type",
353
  )
354
 
355
+ max_pages = gr.Slider(
356
+ minimum=1,
357
+ maximum=50,
358
+ value=20,
359
+ step=1,
360
+ label="Max PDF Pages (safety limit)",
361
+ )
362
+
363
  btn = gr.Button("Perform OCR", variant="primary")
364
 
365
+ gr.Markdown(
366
+ "Tip: If you upload a PDF, it will OCR pages in order and join results with separators.\n"
367
+ "For very large PDFs, increase the page limit carefully."
 
 
 
 
 
 
 
368
  )
369
 
370
  gr.Markdown("## Output", elem_id="output-title")
 
379
  output_md = gr.Markdown(label="Rendered Markdown")
380
 
381
  btn.click(
382
+ fn=process_file,
383
+ inputs=[file_input, task, max_pages],
384
  outputs=[output_text, output_md],
385
  )
386
 
387
+ file_input.change(
388
  fn=lambda: ("", ""),
389
  inputs=None,
390
  outputs=[output_text, output_md],
 
392
 
393
  if __name__ == "__main__":
394
  demo.queue(max_size=50).launch(
 
 
395
  mcp_server=True,
396
  ssr_mode=False,
397
  show_error=True,