prithivMLmods commited on
Commit
fb8008a
·
verified ·
1 Parent(s): 0282214

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -399
app.py CHANGED
@@ -1,443 +1,408 @@
1
  import gradio as gr
2
  import torch
3
  import spaces
4
- import base64
5
- import io
6
- from PIL import Image
7
  from transformers import AutoProcessor, AutoModelForImageTextToText
8
 
9
- # -----------------------------------------------------------------------------
10
- # Model Initialization
11
- # -----------------------------------------------------------------------------
12
-
13
  MODEL_PATH = "zai-org/GLM-OCR"
14
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
15
-
16
- print(f"Loading model on {DEVICE}...")
17
-
18
- # Load Processor
19
- try:
20
- processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
21
- # Load Model
22
- model = AutoModelForImageTextToText.from_pretrained(
23
- pretrained_model_name_or_path=MODEL_PATH,
24
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
25
- trust_remote_code=True,
26
- device_map="auto" if torch.cuda.is_available() else None,
27
- )
28
- if DEVICE == "cpu":
29
- model = model.to("cpu") # explicit fallback if no gpu
30
-
31
- print("Model loaded successfully.")
32
- except Exception as e:
33
- print(f"Error loading model: {e}")
34
- # Fallback for building UI without model (for debugging/building phase)
35
- processor = None
36
- model = None
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # -----------------------------------------------------------------------------
40
- # Inference Logic
41
- # -----------------------------------------------------------------------------
42
 
43
- @spaces.GPU
44
- def run_inference(image_b64, task_prompt):
45
- if not image_b64:
46
- return "Please upload an image first."
47
-
48
- if model is None:
49
- return "Model not loaded correctly. Check logs."
50
 
51
- try:
52
- # 1. Decode Base64 to PIL Image
53
- if "base64," in image_b64:
54
- image_b64 = image_b64.split("base64,")[1]
55
-
56
- image_data = base64.b64decode(image_b64)
57
- image = Image.open(io.BytesIO(image_data)).convert("RGB")
58
 
59
- # 2. Prepare Messages
60
- # The prompt is selected via the radio buttons
61
- messages = [
62
- {
63
- "role": "user",
64
- "content": [
65
- {
66
- "type": "image",
67
- "image": image,
68
- },
69
- {
70
- "type": "text",
71
- "text": task_prompt
72
- }
73
- ],
74
- }
75
- ]
76
 
77
- # 3. Process Inputs
78
- inputs = processor.apply_chat_template(
79
- messages,
80
- tokenize=True,
81
- add_generation_prompt=True,
82
- return_dict=True,
83
- return_tensors="pt"
84
- ).to(model.device)
85
 
86
- # Remove token_type_ids if present (transformers fix)
87
- inputs.pop("token_type_ids", None)
 
 
 
88
 
89
- # 4. Generate
90
- with torch.no_grad():
91
- generated_ids = model.generate(
92
- **inputs,
93
- max_new_tokens=2048,
94
- do_sample=False, # Deterministic for OCR usually better
95
- temperature=0.01
96
- )
97
 
98
- # 5. Decode
99
- output_text = processor.decode(
100
- generated_ids[0][inputs["input_ids"].shape[1]:],
101
- skip_special_tokens=False
102
- )
103
-
104
- # Clean up tags usually returned by VLM
105
- output_text = output_text.replace("<|endoftext|>", "").strip()
106
-
107
- return output_text
108
 
109
- except Exception as e:
110
- return f"Error during inference: {str(e)}"
111
-
112
- # -----------------------------------------------------------------------------
113
- # Custom Component & UI Assets
114
- # -----------------------------------------------------------------------------
115
-
116
- # CSS from your snippet + additions for image preview and layout
117
- CUSTOM_CSS = """
118
- /* Reset & Layout */
119
- .container {
120
- position: relative;
121
- max-width: 600px;
122
- width: 100%;
123
- background: #FCEDDA;
124
- padding: 25px;
125
- border-radius: 8px;
126
- box-shadow: 0 0 15px rgba(0, 0, 0, 0.1);
127
- margin: 0 auto;
128
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
129
- }
130
-
131
- .container header {
132
- font-size: 1.5rem;
133
- color: #000;
134
- font-weight: 600;
135
- text-align: center;
136
- margin-bottom: 20px;
137
- }
138
-
139
- .form {
140
- margin-top: 15px;
141
- }
142
-
143
- .input-box {
144
- width: 100%;
145
- margin-top: 15px;
146
- }
147
-
148
- .input-box label {
149
- color: #000;
150
- font-weight: 500;
151
- margin-bottom: 5px;
152
- display: block;
153
- }
154
-
155
- /* Custom Upload Area */
156
- .upload-area {
157
- width: 100%;
158
- min-height: 150px;
159
- background: #fff8f0;
160
- border: 2px dashed #EE4E34;
161
- border-radius: 6px;
162
  display: flex;
163
- flex-direction: column;
164
  align-items: center;
165
- justify-content: center;
166
- cursor: pointer;
167
- transition: background 0.2s;
168
- padding: 10px;
169
- }
170
- .upload-area:hover {
171
- background: #fff0e0;
172
- }
173
- .upload-text {
174
- color: #808080;
175
- margin-top: 10px;
176
- }
177
- #preview-img {
178
- max-width: 100%;
179
- max-height: 300px;
180
- border-radius: 4px;
181
- display: none;
182
- box-shadow: 0 2px 5px rgba(0,0,0,0.1);
183
- }
184
-
185
- /* Radio Buttons */
186
- .gender-box {
187
- margin-top: 20px;
188
- }
189
- .gender-option {
190
- display: flex;
191
- align-items: center;
192
- column-gap: 20px;
193
- flex-wrap: wrap;
194
- margin-top: 10px;
195
- background: #fff8f0;
196
- padding: 10px;
197
- border-radius: 6px;
198
- border: 1px solid #EE4E34;
199
- }
200
- .gender {
201
- display: flex;
202
- align-items: center;
203
- column-gap: 5px;
204
- }
205
- .gender input {
206
- accent-color: #EE4E34;
207
- width: 18px;
208
- height: 18px;
209
- cursor: pointer;
210
- }
211
- .gender label {
212
- cursor: pointer;
213
- margin: 0; /* Reset margin from input-box label */
214
- }
215
-
216
- /* Textarea Output */
217
- textarea.result-field {
218
- width: 100%;
219
- height: 200px;
220
- padding: 15px;
221
- outline: none;
222
- font-size: 0.95rem;
223
- color: #333;
224
- margin-top: 5px;
225
- border: 1px solid #EE4E34;
226
- border-radius: 6px;
227
- background: #fff;
228
- resize: vertical;
229
- font-family: monospace;
230
  }
231
 
232
- /* Submit Button */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  .submit-btn {
234
- height: 45px;
235
- width: 100%;
236
- color: #fff;
237
- font-size: 1.1rem;
238
- font-weight: 500;
239
- margin-top: 25px;
240
- border: none;
241
- border-radius: 6px;
242
- cursor: pointer;
243
- transition: all 0.2s ease;
244
- background: #EE4E34;
 
245
  }
 
246
  .submit-btn:hover {
247
- background: #d63d24;
 
 
 
 
 
 
 
 
 
 
 
248
  }
249
- .submit-btn:disabled {
250
- background: #fabab5;
251
- cursor: not-allowed;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  }
253
- .status-msg {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  text-align: center;
255
- margin-top: 10px;
 
256
  font-size: 0.9rem;
257
- min-height: 20px;
258
  }
259
- """
260
 
261
- # JavaScript to handle interactions and bridge with Gradio
262
- CUSTOM_JS = """
263
- <script>
264
- function initOcrUI() {
265
- const fileInput = document.getElementById('hidden-file-input');
266
- const uploadArea = document.getElementById('upload-area');
267
- const previewImg = document.getElementById('preview-img');
268
- const uploadText = document.getElementById('upload-text');
269
- const submitBtn = document.getElementById('custom-submit');
270
- const resultArea = document.getElementById('result-area');
271
- const statusMsg = document.getElementById('status-msg');
272
-
273
- // Trigger file input
274
- uploadArea.onclick = () => fileInput.click();
275
-
276
- // Handle File Selection
277
- fileInput.onchange = (e) => {
278
- const file = e.target.files[0];
279
- if (file) {
280
- const reader = new FileReader();
281
- reader.onload = (evt) => {
282
- const b64 = evt.target.result;
283
- // Show Preview
284
- previewImg.src = b64;
285
- previewImg.style.display = 'block';
286
- uploadText.style.display = 'none';
287
-
288
- // Update Hidden Gradio Component
289
- updateGradioImage(b64);
290
- }
291
- reader.readAsDataURL(file);
292
- }
293
- };
294
 
295
- // Handle Submit
296
- submitBtn.onclick = (e) => {
297
- e.preventDefault();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
- // Get selected Task
300
- const task = document.querySelector('input[name="task"]:checked').value;
 
 
 
 
 
 
 
 
301
 
302
- // Update Hidden Gradio Task Input
303
- updateGradioTask(task);
 
 
 
 
 
 
 
304
 
305
- // Visual Feedback
306
- submitBtn.innerText = "Processing...";
307
- submitBtn.disabled = true;
308
- statusMsg.innerText = "Model is running. Please wait...";
309
- resultArea.value = ""; // Clear previous
 
310
 
311
- // Trigger Hidden Gradio Button
312
- const gradioBtn = document.getElementById('bridge-btn');
313
- if (gradioBtn) gradioBtn.click();
314
- };
315
-
316
- // --- Bridge Functions ---
317
 
318
- function updateGradioImage(b64Data) {
319
- const ta = document.querySelector('#bridge-img-input textarea');
320
- if (ta) {
321
- ta.value = b64Data;
322
- ta.dispatchEvent(new Event('input', { bubbles: true }));
323
- }
324
- }
325
-
326
- function updateGradioTask(taskVal) {
327
- const ta = document.querySelector('#bridge-task-input textarea');
328
- if (ta) {
329
- ta.value = taskVal;
330
- ta.dispatchEvent(new Event('input', { bubbles: true }));
331
- }
332
- }
333
- }
334
-
335
- // Function called by Gradio when output changes
336
- function updateResultUI(text) {
337
- const resultArea = document.getElementById('result-area');
338
- const submitBtn = document.getElementById('custom-submit');
339
- const statusMsg = document.getElementById('status-msg');
340
 
341
- if(resultArea) resultArea.value = text;
342
- if(submitBtn) {
343
- submitBtn.innerText = "Submit";
344
- submitBtn.disabled = false;
345
- }
346
- if(statusMsg) statusMsg.innerText = "Extraction complete.";
347
- }
348
-
349
- // Initialize after a slight delay to ensure DOM is ready
350
- setTimeout(initOcrUI, 1000);
351
- </script>
352
- """
353
 
354
- HTML_TEMPLATE = """
355
- <div class="container">
356
- <header>GLM-OCR Interface</header>
357
-
358
- <div class="form">
359
-
360
- <!-- Image Input Section -->
361
- <div class="input-box">
362
- <label>Document Image</label>
363
- <div class="upload-area" id="upload-area">
364
- <span class="upload-text" id="upload-text">Click to Upload Image</span>
365
- <img id="preview-img" alt="Preview"/>
366
- </div>
367
- <input type="file" id="hidden-file-input" style="display:none" accept="image/*">
368
- </div>
369
-
370
- <!-- Task Selection -->
371
- <div class="gender-box">
372
- <label>Extraction Mode</label>
373
- <div class="gender-option">
374
- <div class="gender">
375
- <input type="radio" id="check-text" name="task" value="Text Recognition:" checked>
376
- <label for="check-text">Text</label>
377
- </div>
378
- <div class="gender">
379
- <input type="radio" id="check-formula" name="task" value="Formula Recognition:">
380
- <label for="check-formula">Formula</label>
381
- </div>
382
- <div class="gender">
383
- <input type="radio" id="check-table" name="task" value="Table Recognition:">
384
- <label for="check-table">Table</label>
385
- </div>
386
- </div>
387
- </div>
388
-
389
- <!-- Submit Action -->
390
- <button class="submit-btn" id="custom-submit">Submit</button>
391
- <div class="status-msg" id="status-msg"></div>
392
-
393
- <!-- Result Output -->
394
- <div class="input-box">
395
- <label>Extraction Result</label>
396
- <textarea id="result-area" class="result-field" readonly placeholder="Output will appear here..."></textarea>
397
- </div>
398
-
399
- </div>
400
- </div>
401
- """
402
-
403
- class GlmOcr(gr.HTML):
404
- """Custom component wrapper to render the specific UI"""
405
- def __init__(self):
406
- super().__init__(value=HTML_TEMPLATE + CUSTOM_JS)
407
-
408
- # -----------------------------------------------------------------------------
409
- # Gradio App Structure
410
- # -----------------------------------------------------------------------------
411
 
 
412
  with gr.Blocks(title="GLM-OCR") as demo:
413
 
414
- # 1. The Custom UI
415
- GlmOcr()
416
-
417
- # 2. Hidden Bridge Components (To transfer data between Custom HTML and Python)
418
- with gr.Row(visible=False):
419
- # Stores Base64 string of the image
420
- bridge_img_input = gr.Textbox(elem_id="bridge-img-input", label="Hidden Img")
421
- # Stores the selected task string
422
- bridge_task_input = gr.Textbox(elem_id="bridge-task-input", value="Text Recognition:", label="Hidden Task")
423
- # The trigger button clicked by JS
424
- bridge_btn = gr.Button("Run", elem_id="bridge-btn")
425
- # The output storage, watched by JS
426
- bridge_output = gr.Textbox(elem_id="bridge-output", label="Hidden Output")
427
-
428
- # 3. Python Logic Connections
429
- bridge_btn.click(
430
- fn=run_inference,
431
- inputs=[bridge_img_input, bridge_task_input],
432
- outputs=[bridge_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  )
434
-
435
- # 4. Feedback Loop: When python output changes, update HTML via JS
436
- bridge_output.change(
437
- fn=None,
438
- inputs=[bridge_output],
439
- js="(v) => updateResultUI(v)"
440
  )
441
 
 
442
  if __name__ == "__main__":
443
- demo.launch(css=CUSTOM_CSS, ssr_mode=False)
 
 
 
 
 
1
import gradio as gr
import torch
import spaces
import os
import tempfile
from PIL import Image, ImageOps
from transformers import AutoProcessor, AutoModelForImageTextToText

# Model configuration: Hugging Face Hub id of the OCR checkpoint.
MODEL_PATH = "zai-org/GLM-OCR"

# Load model and processor once at import time (module-level side effect:
# downloads/loads weights when the app starts). trust_remote_code lets the
# repo's own modeling code run — presumably required by this checkpoint;
# device_map="auto" places the model on GPU when available, CPU otherwise.
processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)

# Task prompts for document parsing: UI radio label -> prompt prefix
# sent to the model alongside the image.
TASK_PROMPTS = {
    "Text": "Text Recognition:",
    "Formula": "Formula Recognition:",
    "Table": "Table Recognition:"
}
 
28
# Custom CSS based on the provided theme (orange #EE4E34 on cream #FCEDDA).
# Passed to demo.launch(css=...) at the bottom of the file; the class names
# used here match the elem_classes set on the Gradio components below.
css = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;500;600;700&display=swap');

* {
    font-family: 'Outfit', sans-serif !important;
}

body, .gradio-container {
    background: linear-gradient(135deg, #FCEDDA, #FFF5EB) !important;
    min-height: 100vh;
}

.main-header {
    text-align: center;
    padding: 20px 0 30px 0;
}

.main-header h1 {
    font-size: 2.8rem;
    color: #EE4E34;
    font-weight: 700;
    margin: 0;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.1);
}

.main-header p {
    color: #555;
    font-size: 1.1rem;
    margin-top: 8px;
}

.form-section {
    background: #FCEDDA;
    padding: 25px;
    border-radius: 12px;
    box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
    border: 2px solid #EE4E34;
}

.form-section label {
    color: #000 !important;
    font-weight: 600 !important;
    font-size: 1rem !important;
}

.output-section {
    background: #FCEDDA;
    padding: 25px;
    border-radius: 12px;
    box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
    border: 2px solid #EE4E34;
}

.output-header {
    color: #EE4E34;
    font-size: 1.2rem;
    font-weight: 600;
    margin-bottom: 15px;
    display: flex;
    align-items: center;
    gap: 8px;
}

/* Image upload styling */
.image-upload-area {
    border: 2px dashed #EE4E34 !important;
    border-radius: 10px !important;
    background: rgba(255, 255, 255, 0.6) !important;
    transition: all 0.3s ease !important;
}

.image-upload-area:hover {
    background: rgba(255, 255, 255, 0.9) !important;
    border-color: #D43E2A !important;
}

/* Radio buttons styling */
.task-radio-group {
    margin: 15px 0;
}

.task-radio-group .wrap {
    gap: 15px !important;
}

.task-radio-group label {
    background: #fff !important;
    border: 2px solid #EE4E34 !important;
    border-radius: 8px !important;
    padding: 12px 24px !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    font-weight: 500 !important;
    color: #000 !important;
}

.task-radio-group label:hover {
    background: #FFF0E5 !important;
}

.task-radio-group input:checked + label {
    background: #EE4E34 !important;
    color: #fff !important;
}

/* Submit button */
.submit-btn {
    width: 100% !important;
    height: 48px !important;
    background: linear-gradient(90deg, #EE4E34, #FF6B4E) !important;
    color: #fff !important;
    font-size: 1.1rem !important;
    font-weight: 600 !important;
    border: none !important;
    border-radius: 8px !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    margin-top: 15px !important;
    box-shadow: 0 4px 15px rgba(238, 78, 52, 0.3) !important;
}

.submit-btn:hover {
    background: linear-gradient(90deg, #D43E2A, #EE4E34) !important;
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 20px rgba(238, 78, 52, 0.4) !important;
}

/* Output textarea */
.output-textbox textarea {
    background: #fff !important;
    border: 1px solid #EE4E34 !important;
    border-radius: 8px !important;
    font-size: 0.95rem !important;
    line-height: 1.6 !important;
}

.output-textbox textarea:focus {
    border-color: #D43E2A !important;
    box-shadow: 0 0 0 3px rgba(238, 78, 52, 0.1) !important;
}

/* Tabs styling */
.output-tabs .tab-nav {
    background: transparent !important;
    border-bottom: 2px solid #EE4E34 !important;
    gap: 5px !important;
}

.output-tabs .tab-nav button {
    background: transparent !important;
    color: #555 !important;
    font-weight: 500 !important;
    border: none !important;
    padding: 10px 20px !important;
    border-radius: 8px 8px 0 0 !important;
    transition: all 0.2s ease !important;
}

.output-tabs .tab-nav button:hover {
    color: #EE4E34 !important;
    background: rgba(238, 78, 52, 0.1) !important;
}

.output-tabs .tab-nav button.selected {
    color: #EE4E34 !important;
    background: #fff !important;
    border: 2px solid #EE4E34 !important;
    border-bottom: 2px solid #fff !important;
    margin-bottom: -2px !important;
}

/* Markdown preview */
.markdown-preview {
    background: #fff;
    padding: 20px;
    border-radius: 8px;
    border: 1px solid #EE4E34;
    min-height: 300px;
}

/* Accordion */
.examples-accordion {
    border: 1px solid #EE4E34 !important;
    border-radius: 8px !important;
    background: rgba(255, 255, 255, 0.5) !important;
    margin-top: 15px !important;
}

.examples-accordion .label-wrap {
    color: #EE4E34 !important;
    font-weight: 600 !important;
}

/* Footer */
.footer-section {
    text-align: center;
    padding: 25px 0;
    color: #666;
    font-size: 0.9rem;
}

.footer-section a {
    color: #EE4E34;
    text-decoration: none;
    font-weight: 500;
}

.footer-section a:hover {
    text-decoration: underline;
}

/* Copy button */
.copy-btn {
    background: #EE4E34 !important;
    color: #fff !important;
}

/* Loading animation */
.generating {
    border-color: #EE4E34 !important;
}
"""
251
+
252
@spaces.GPU
def process_image(image, task):
    """Run GLM-OCR on an uploaded image.

    Args:
        image: PIL.Image.Image from the Gradio image component, or None.
        task: Key into TASK_PROMPTS ("Text", "Formula" or "Table").

    Returns:
        A (text, markdown) tuple: the recognized text for the plain-text
        box and the same text for the Markdown preview tab. On error the
        first element carries the message and the second is empty.
    """
    if image is None:
        return "⚠️ Please upload an image first.", ""

    # Track the staged file path so `finally` can clean up even when the
    # failure happens before/while writing it (the previous version created
    # the temp file outside the try block and leaked it on save errors).
    tmp_path = None
    try:
        # Respect EXIF orientation first, then normalize the color mode
        # so the staged PNG is plain RGB.
        image = ImageOps.exif_transpose(image)
        if image.mode in ('RGBA', 'LA', 'P'):
            image = image.convert('RGB')

        # The chat template consumes an image path/URL, so stage the image
        # in a temp file; removed in `finally`.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
        tmp_path = tmp.name
        tmp.close()
        image.save(tmp_path, 'PNG')

        # Unknown task labels fall back to plain text recognition instead
        # of raising KeyError.
        prompt_text = TASK_PROMPTS.get(task, TASK_PROMPTS["Text"])

        # Prepare the multimodal chat message.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": tmp_path},
                    {"type": "text", "text": prompt_text}
                ],
            }
        ]

        # Tokenize prompt + image features and move them to the model device.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)
        # Some processor versions emit token_type_ids the model rejects.
        inputs.pop("token_type_ids", None)

        # Inference only — no gradients needed.
        with torch.inference_mode():
            generated_ids = model.generate(**inputs, max_new_tokens=8192)

        # Decode only the newly generated tokens (skip the prompt prefix).
        output_text = processor.decode(
            generated_ids[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        ).strip()

        return output_text, output_text

    except Exception as e:
        # Surface the error in the UI rather than crashing the worker.
        return f"❌ Error: {str(e)}", ""

    finally:
        # Remove the staged temp file whether inference succeeded or not.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
 
 
 
 
 
 
 
 
309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
# Build the Gradio interface: input column (image + task + submit) on the
# left, tabbed output (plain text / Markdown preview) on the right.
with gr.Blocks(title="GLM-OCR") as demo:

    # Header banner (styled by .main-header in the custom CSS).
    gr.HTML("""
    <div class="main-header">
        <h1>📄 GLM-OCR</h1>
        <p>Extract text, formulas, and tables from documents with AI</p>
    </div>
    """)

    with gr.Row(equal_height=True):
        # Left Column - Input
        with gr.Column(scale=1):
            with gr.Group(elem_classes=["form-section"]):
                image_input = gr.Image(
                    type="pil",
                    label="Upload Image",
                    sources=["upload", "clipboard"],
                    elem_classes=["image-upload-area"],
                    height=280
                )

                # Radio choices mirror the TASK_PROMPTS keys exactly.
                task = gr.Radio(
                    choices=list(TASK_PROMPTS.keys()),
                    value="Text",
                    label="Recognition Type",
                    elem_classes=["task-radio-group"]
                )

                submit_btn = gr.Button(
                    "🔍 Recognize",
                    variant="primary",
                    elem_classes=["submit-btn"]
                )

                # NOTE(review): these example paths must exist in the Space
                # repository ("examples/1.jpg" etc.) — verify on deploy.
                with gr.Accordion("📁 Examples", open=False, elem_classes=["examples-accordion"]):
                    examples = gr.Examples(
                        examples=[
                            ["examples/1.jpg"],
                            ["examples/2.jpg"],
                            ["examples/3.jpg"]
                        ],
                        inputs=[image_input],
                        label=""
                    )

        # Right Column - Output
        with gr.Column(scale=1):
            with gr.Group(elem_classes=["output-section"]):
                gr.HTML('<div class="output-header">📋 Recognition Result</div>')

                with gr.Tabs(elem_classes=["output-tabs"]):
                    with gr.Tab("Text"):
                        text_output = gr.Textbox(
                            lines=14,
                            show_label=False,
                            elem_classes=["output-textbox"],
                            show_copy_button=True,
                            placeholder="Recognition result will appear here..."
                        )

                    with gr.Tab("Markdown"):
                        md_output = gr.Markdown(
                            value="",
                            elem_classes=["markdown-preview"]
                        )

    # Footer
    gr.HTML("""
    <div class="footer-section">
        <p>
            Powered by <a href="https://huggingface.co/zai-org/GLM-OCR" target="_blank">GLM-OCR</a> ·
            Built with <a href="https://gradio.app" target="_blank">Gradio</a>
        </p>
    </div>
    """)

    # Event handlers: submit runs the model and fills both output tabs.
    submit_btn.click(
        fn=process_image,
        inputs=[image_input, task],
        outputs=[text_output, md_output]
    )

    # Clear stale results whenever the image changes (upload, clear, or
    # example selection) — it does NOT re-run recognition.
    image_input.change(
        fn=lambda: ("", ""),
        outputs=[text_output, md_output]
    )


if __name__ == "__main__":
    # Queue bounds concurrent requests; the theme CSS is injected here.
    demo.queue(max_size=50).launch(
        css=css,
        show_error=True,
        share=False
    )