prithivMLmods committed on
Commit
ef119b4
·
verified ·
1 Parent(s): b25885d

update app

Browse files
Files changed (1) hide show
  1. app.py +157 -222
app.py CHANGED
@@ -4,11 +4,9 @@ import spaces
4
  import os
5
  import tempfile
6
  from PIL import Image, ImageOps
7
- from threading import Thread
8
  from typing import Iterable
9
- from transformers import AutoProcessor, AutoModelForImageTextToText
10
 
11
- from transformers.image_utils import load_image
12
  from gradio.themes import Soft
13
  from gradio.themes.utils import colors, fonts, sizes
14
 
@@ -27,6 +25,7 @@ colors.hot_pink = colors.Color(
27
  c950="#802050",
28
  )
29
 
 
30
  class HotPinkTheme(Soft):
31
  def __init__(
32
  self,
@@ -36,10 +35,14 @@ class HotPinkTheme(Soft):
36
  neutral_hue: colors.Color | str = colors.slate,
37
  text_size: sizes.Size | str = sizes.text_lg,
38
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
39
- fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
 
 
40
  ),
41
  font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
42
- fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
 
 
43
  ),
44
  ):
45
  super().__init__(
@@ -78,105 +81,39 @@ class HotPinkTheme(Soft):
78
  block_label_background_fill="*primary_200",
79
  )
80
 
81
- hot_pink_theme = HotPinkTheme()
82
-
83
- MODEL_PATH = "zai-org/GLM-OCR"
84
-
85
- processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
86
- model = AutoModelForImageTextToText.from_pretrained(
87
- pretrained_model_name_or_path=MODEL_PATH,
88
- torch_dtype=torch.bfloat16,
89
- device_map="auto",
90
- trust_remote_code=True
91
- )
92
-
93
- TASK_PROMPTS = {
94
- "Text": "Text Recognition:",
95
- "Formula": "Formula Recognition:",
96
- "Table": "Table Recognition:",
97
- }
98
-
99
- @spaces.GPU
100
- def process_image(image, task):
101
- if image is None:
102
- return "Please upload an image first"
103
-
104
- if image.mode in ('RGBA', 'LA', 'P'):
105
- image = image.convert('RGB')
106
- image = ImageOps.exif_transpose(image)
107
-
108
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
109
- image.save(tmp.name, 'PNG')
110
- tmp.close()
111
-
112
- prompt = TASK_PROMPTS.get(task, "Text Recognition:")
113
-
114
- messages = [
115
- {
116
- "role": "user",
117
- "content": [
118
- {"type": "image", "url": tmp.name},
119
- {"type": "text", "text": prompt}
120
- ],
121
- }
122
- ]
123
-
124
- inputs = processor.apply_chat_template(
125
- messages,
126
- tokenize=True,
127
- add_generation_prompt=True,
128
- return_dict=True,
129
- return_tensors="pt"
130
- ).to(model.device)
131
-
132
- inputs.pop("token_type_ids", None)
133
-
134
- generated_ids = model.generate(**inputs, max_new_tokens=8192)
135
- output_text = processor.decode(
136
- generated_ids[0][inputs["input_ids"].shape[1]:],
137
- skip_special_tokens=True
138
- )
139
-
140
- os.unlink(tmp.name)
141
-
142
- return output_text.strip()
143
 
 
144
 
145
  css = """
146
  @import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;500;600;700&family=IBM+Plex+Mono:wght@400;500;600&display=swap');
147
 
148
- /* Background grid pattern - Hot Pink theme */
149
  body, .gradio-container {
150
  background-color: #FFF0F5 !important;
151
- background-image:
152
- linear-gradient(#FFC0D9 1px, transparent 1px),
153
  linear-gradient(90deg, #FFC0D9 1px, transparent 1px) !important;
154
  background-size: 40px 40px !important;
155
  font-family: 'Outfit', sans-serif !important;
156
  }
157
 
158
- /* Dark mode grid */
159
  .dark body, .dark .gradio-container {
160
  background-color: #1a1a1a !important;
161
- background-image:
162
- linear-gradient(rgba(255, 105, 180, 0.1) 1px, transparent 1px),
163
  linear-gradient(90deg, rgba(255, 105, 180, 0.1) 1px, transparent 1px) !important;
164
  background-size: 40px 40px !important;
165
  }
166
 
167
- #col-container {
168
- margin: 0 auto;
169
- max-width: 1000px;
170
- }
171
-
172
- /* Main title styling */
173
- #main-title {
174
- text-align: center !important;
175
- padding: 1rem 0 0.5rem 0;
176
  }
177
 
 
178
  #main-title h1 {
179
- font-size: 2.7em !important;
180
  font-weight: 700 !important;
181
  background: linear-gradient(135deg, #FF69B4 0%, #FF99C4 50%, #E55AA0 100%);
182
  background-size: 200% 200%;
@@ -187,33 +124,15 @@ body, .gradio-container {
187
  letter-spacing: -0.02em;
188
  }
189
 
 
 
 
 
190
  @keyframes gradient-shift {
191
  0%, 100% { background-position: 0% 50%; }
192
  50% { background-position: 100% 50%; }
193
  }
194
 
195
- /* Subtitle styling */
196
- #subtitle {
197
- text-align: center !important;
198
- margin-bottom: 1.5rem;
199
- }
200
-
201
- #subtitle p {
202
- margin: 0 auto;
203
- color: #666666;
204
- font-size: 1rem;
205
- }
206
-
207
- #subtitle a {
208
- color: #FF69B4 !important;
209
- text-decoration: none;
210
- font-weight: 500;
211
- }
212
-
213
- #subtitle a:hover {
214
- text-decoration: underline;
215
- }
216
-
217
  /* Card styling */
218
  .gradio-group {
219
  background: rgba(255, 255, 255, 0.9) !important;
@@ -234,7 +153,7 @@ body, .gradio-container {
234
  border-color: rgba(255, 105, 180, 0.3) !important;
235
  }
236
 
237
- /* Image upload area */
238
  .gradio-image {
239
  border-radius: 10px !important;
240
  overflow: hidden;
@@ -248,10 +167,6 @@ body, .gradio-container {
248
  }
249
 
250
  /* Radio buttons */
251
- .gradio-radio {
252
- border-radius: 8px !important;
253
- }
254
-
255
  .gradio-radio label {
256
  border-radius: 6px !important;
257
  transition: all 0.2s ease !important;
@@ -279,23 +194,6 @@ body, .gradio-container {
279
  transform: translateY(-2px) !important;
280
  }
281
 
282
- /* Tabs styling */
283
- .tab-nav {
284
- border-bottom: 2px solid #FFC0D9 !important;
285
- }
286
-
287
- .tab-nav button {
288
- font-weight: 500 !important;
289
- padding: 10px 18px !important;
290
- border-radius: 8px 8px 0 0 !important;
291
- transition: all 0.2s ease !important;
292
- }
293
-
294
- .tab-nav button.selected {
295
- background: rgba(255, 105, 180, 0.1) !important;
296
- border-bottom: 2px solid #FF69B4 !important;
297
- }
298
-
299
  /* Output textbox */
300
  .gradio-textbox textarea {
301
  font-family: 'IBM Plex Mono', monospace !important;
@@ -332,11 +230,7 @@ body, .gradio-container {
332
  padding: 1rem !important;
333
  }
334
 
335
- /* Examples section */
336
- .gradio-examples {
337
- border-radius: 10px !important;
338
- }
339
-
340
  .gradio-examples .gallery-item {
341
  border: 2px solid #FFC0D9 !important;
342
  border-radius: 8px !important;
@@ -349,27 +243,13 @@ body, .gradio-container {
349
  box-shadow: 0 4px 12px rgba(255, 105, 180, 0.15) !important;
350
  }
351
 
352
- /* Scrollbar styling */
353
- ::-webkit-scrollbar {
354
- width: 8px;
355
- height: 8px;
356
- }
357
-
358
- ::-webkit-scrollbar-track {
359
- background: rgba(255, 105, 180, 0.05);
360
- border-radius: 4px;
361
- }
362
 
363
- ::-webkit-scrollbar-thumb {
364
- background: linear-gradient(135deg, #FF69B4, #FF99C4);
365
- border-radius: 4px;
366
- }
367
-
368
- ::-webkit-scrollbar-thumb:hover {
369
- background: linear-gradient(135deg, #E55AA0, #FF69B4);
370
- }
371
-
372
- /* Accordion styling */
373
  .gradio-accordion {
374
  border-radius: 10px !important;
375
  border: 1px solid #FFC0D9 !important;
@@ -380,89 +260,144 @@ body, .gradio-container {
380
  border-radius: 10px !important;
381
  }
382
 
383
- /* Hide footer */
384
- footer {
385
- display: none !important;
386
- }
387
-
388
  /* Animations */
389
  @keyframes fadeIn {
390
  from { opacity: 0; transform: translateY(10px); }
391
  to { opacity: 1; transform: translateY(0); }
392
  }
393
 
394
- .gradio-row {
395
- animation: fadeIn 0.4s ease-out;
396
- }
397
 
398
- /* Label styling */
399
- label {
400
- font-weight: 600 !important;
401
- color: #333 !important;
402
- }
403
 
404
- .dark label {
405
- color: #eee !important;
406
- }
407
  """
408
 
409
- with gr.Blocks() as demo:
410
-
411
- gr.Markdown("# **GLM-OCR**", elem_id="main-title")
412
- gr.Markdown("*A multimodal [OCR model](https://huggingface.co/zai-org/GLM-OCR) for complex document understanding.*", elem_id="subtitle")
413
-
414
- with gr.Row():
415
-
416
- with gr.Column(scale=1):
417
- image_input = gr.Image(
418
- type="pil",
419
- label="Upload Image",
420
- sources=["upload", "clipboard"],
421
- height=300
422
- )
423
- with gr.Row():
424
- task = gr.Radio(
425
- choices=list(TASK_PROMPTS.keys()),
426
- value="Text",
427
- label="Recognition Type"
428
- )
429
-
430
- with gr.Row():
431
- btn = gr.Button("Perform OCR", variant="primary")
432
-
433
- gr.Examples(
434
- examples=["examples/1.jpg", "examples/4.jpg", "examples/5.webp", "examples/2.jpg", "examples/3.jpg"],
435
- inputs=image_input,
436
- label="Examples"
437
- )
438
-
439
- with gr.Column(scale=1):
440
- with gr.Tabs():
441
- with gr.Tab("Text"):
442
- output_text = gr.Textbox(
443
- label="Output",
444
- lines=18,
445
- interactive=True,
446
- )
447
-
448
- with gr.Tab("Markdown"):
449
- output_md = gr.Markdown(value="")
450
-
451
- def run_ocr(image, task):
452
- result = process_image(image, task)
453
- return result, result
454
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  btn.click(
456
- run_ocr,
457
- [image_input, task],
458
- [output_text, output_md]
459
  )
460
-
461
  image_input.change(
462
- lambda: ("", ""),
463
- None,
464
- [output_text, output_md]
465
  )
466
 
467
  if __name__ == "__main__":
468
- demo.queue(max_size=50).launch(css=css, theme=hot_pink_theme, mcp_server=True, ssr_mode=False, show_error=True)
 
 
 
 
 
 
 
4
  import os
5
  import tempfile
6
  from PIL import Image, ImageOps
 
7
  from typing import Iterable
 
8
 
9
+ from transformers import AutoProcessor, AutoModelForImageTextToText
10
  from gradio.themes import Soft
11
  from gradio.themes.utils import colors, fonts, sizes
12
 
 
25
  c950="#802050",
26
  )
27
 
28
+
29
  class HotPinkTheme(Soft):
30
  def __init__(
31
  self,
 
35
  neutral_hue: colors.Color | str = colors.slate,
36
  text_size: sizes.Size | str = sizes.text_lg,
37
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
38
+ fonts.GoogleFont("Outfit"),
39
+ "Arial",
40
+ "sans-serif",
41
  ),
42
  font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
43
+ fonts.GoogleFont("IBM Plex Mono"),
44
+ "ui-monospace",
45
+ "monospace",
46
  ),
47
  ):
48
  super().__init__(
 
81
  block_label_background_fill="*primary_200",
82
  )
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ hot_pink_theme = HotPinkTheme()
86
 
87
  css = """
88
  @import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;500;600;700&family=IBM+Plex+Mono:wght@400;500;600&display=swap');
89
 
90
+ /* Grid background */
91
  body, .gradio-container {
92
  background-color: #FFF0F5 !important;
93
+ background-image:
94
+ linear-gradient(#FFC0D9 1px, transparent 1px),
95
  linear-gradient(90deg, #FFC0D9 1px, transparent 1px) !important;
96
  background-size: 40px 40px !important;
97
  font-family: 'Outfit', sans-serif !important;
98
  }
99
 
 
100
  .dark body, .dark .gradio-container {
101
  background-color: #1a1a1a !important;
102
+ background-image:
103
+ linear-gradient(rgba(255, 105, 180, 0.1) 1px, transparent 1px),
104
  linear-gradient(90deg, rgba(255, 105, 180, 0.1) 1px, transparent 1px) !important;
105
  background-size: 40px 40px !important;
106
  }
107
 
108
+ /* Sidebar width */
109
+ .gradio-sidebar {
110
+ min-width: 420px !important;
111
+ max-width: 480px !important;
 
 
 
 
 
112
  }
113
 
114
+ /* Titles */
115
  #main-title h1 {
116
+ font-size: 2.5em !important;
117
  font-weight: 700 !important;
118
  background: linear-gradient(135deg, #FF69B4 0%, #FF99C4 50%, #E55AA0 100%);
119
  background-size: 200% 200%;
 
124
  letter-spacing: -0.02em;
125
  }
126
 
127
+ #output-title h2 {
128
+ font-size: 2.2em !important;
129
+ }
130
+
131
  @keyframes gradient-shift {
132
  0%, 100% { background-position: 0% 50%; }
133
  50% { background-position: 100% 50%; }
134
  }
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  /* Card styling */
137
  .gradio-group {
138
  background: rgba(255, 255, 255, 0.9) !important;
 
153
  border-color: rgba(255, 105, 180, 0.3) !important;
154
  }
155
 
156
+ /* Image upload */
157
  .gradio-image {
158
  border-radius: 10px !important;
159
  overflow: hidden;
 
167
  }
168
 
169
  /* Radio buttons */
 
 
 
 
170
  .gradio-radio label {
171
  border-radius: 6px !important;
172
  transition: all 0.2s ease !important;
 
194
  transform: translateY(-2px) !important;
195
  }
196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  /* Output textbox */
198
  .gradio-textbox textarea {
199
  font-family: 'IBM Plex Mono', monospace !important;
 
230
  padding: 1rem !important;
231
  }
232
 
233
+ /* Examples */
 
 
 
 
234
  .gradio-examples .gallery-item {
235
  border: 2px solid #FFC0D9 !important;
236
  border-radius: 8px !important;
 
243
  box-shadow: 0 4px 12px rgba(255, 105, 180, 0.15) !important;
244
  }
245
 
246
+ /* Scrollbar */
247
+ ::-webkit-scrollbar { width: 8px; height: 8px; }
248
+ ::-webkit-scrollbar-track { background: rgba(255,105,180,0.05); border-radius: 4px; }
249
+ ::-webkit-scrollbar-thumb { background: linear-gradient(135deg, #FF69B4, #FF99C4); border-radius: 4px; }
250
+ ::-webkit-scrollbar-thumb:hover { background: linear-gradient(135deg, #E55AA0, #FF69B4); }
 
 
 
 
 
251
 
252
+ /* Accordion */
 
 
 
 
 
 
 
 
 
253
  .gradio-accordion {
254
  border-radius: 10px !important;
255
  border: 1px solid #FFC0D9 !important;
 
260
  border-radius: 10px !important;
261
  }
262
 
 
 
 
 
 
263
  /* Animations */
264
  @keyframes fadeIn {
265
  from { opacity: 0; transform: translateY(10px); }
266
  to { opacity: 1; transform: translateY(0); }
267
  }
268
 
269
+ .gradio-row { animation: fadeIn 0.4s ease-out; }
 
 
270
 
271
+ label { font-weight: 600 !important; color: #333 !important; }
272
+ .dark label { color: #eee !important; }
 
 
 
273
 
274
+ footer { display: none !important; }
 
 
275
  """
276
 
277
+ MODEL_PATH = "zai-org/GLM-OCR"
278
+
279
+ processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
280
+ model = AutoModelForImageTextToText.from_pretrained(
281
+ pretrained_model_name_or_path=MODEL_PATH,
282
+ torch_dtype=torch.bfloat16,
283
+ device_map="auto",
284
+ trust_remote_code=True,
285
+ )
286
+
287
+ TASK_PROMPTS = {
288
+ "Text": "Text Recognition:",
289
+ "Formula": "Formula Recognition:",
290
+ "Table": "Table Recognition:",
291
+ }
292
+
293
@spaces.GPU
def process_image(image, task):
    """Run OCR on an uploaded image using the selected recognition type.

    Args:
        image: PIL image to recognise, or ``None`` when nothing has been
            uploaded yet.
        task: Key into ``TASK_PROMPTS``. Values that are not a key fall
            back to the ``"Text Recognition:"`` prompt.

    Returns:
        A 2-tuple containing the same stripped model output twice, so one
        handler can feed two output components. When ``image`` is ``None``
        both entries hold a message asking the user to upload an image.
    """
    if image is None:
        message = "Please upload an image first."
        return message, message

    if image.mode in ("RGBA", "LA", "P"):
        image = image.convert("RGB")
    image = ImageOps.exif_transpose(image)

    prompt = TASK_PROMPTS.get(task, "Text Recognition:")

    # Reserve a temporary path and release the handle immediately: the image
    # is written by path below and the chat template is given the path, so
    # the open handle is never needed.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    tmp.close()
    try:
        image.save(tmp.name, "PNG")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": tmp.name},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # Drop token_type_ids (if present) before forwarding the inputs;
        # presumably generate() does not accept them -- confirm.
        inputs.pop("token_type_ids", None)

        generated_ids = model.generate(**inputs, max_new_tokens=8192)

        # Skip the first input_ids.shape[1] positions (the prompt) so that
        # only the tokens after the prompt are decoded.
        prompt_length = inputs["input_ids"].shape[1]
        output_text = processor.decode(
            generated_ids[0][prompt_length:],
            skip_special_tokens=True,
        )
    finally:
        # delete=False means nothing else removes the file, so clean it up
        # even when saving, templating or generation raised.
        os.unlink(tmp.name)

    result = output_text.strip()
    return result, result
339
+
340
+ with gr.Blocks(fill_height=True) as demo:
341
+
342
+ with gr.Sidebar():
343
+
344
+ gr.Markdown("# **GLM-OCR**", elem_id="main-title")
345
+
346
+ image_input = gr.Image(
347
+ type="pil",
348
+ label="Upload Image",
349
+ sources=["upload", "clipboard"],
350
+ height=300,
351
+ )
352
+
353
+ task = gr.Radio(
354
+ choices=list(TASK_PROMPTS.keys()),
355
+ value="Text",
356
+ label="Recognition Type",
357
+ )
358
+
359
+ btn = gr.Button("Perform OCR", variant="primary")
360
+
361
+ gr.Examples(
362
+ examples=[
363
+ "examples/1.jpg",
364
+ "examples/4.jpg",
365
+ "examples/5.webp",
366
+ "examples/2.jpg",
367
+ "examples/3.jpg",
368
+ ],
369
+ inputs=image_input,
370
+ label="Examples",
371
+ )
372
+
373
+ gr.Markdown("## Output", elem_id="output-title")
374
+
375
+ output_text = gr.Textbox(
376
+ label="Raw Output Stream",
377
+ interactive=True,
378
+ lines=22,
379
+ )
380
+
381
+ with gr.Accordion("(Result.md)", open=False):
382
+ output_md = gr.Markdown(label="Rendered Markdown")
383
+
384
  btn.click(
385
+ fn=process_image,
386
+ inputs=[image_input, task],
387
+ outputs=[output_text, output_md],
388
  )
389
+
390
  image_input.change(
391
+ fn=lambda: ("", ""),
392
+ inputs=None,
393
+ outputs=[output_text, output_md],
394
  )
395
 
396
  if __name__ == "__main__":
397
+ demo.queue(max_size=50).launch(
398
+ css=css,
399
+ theme=hot_pink_theme,
400
+ mcp_server=True,
401
+ ssr_mode=False,
402
+ show_error=True,
403
+ )