prithivMLmods committed on
Commit
0282214
·
verified ·
1 Parent(s): 8ecd775

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +406 -227
app.py CHANGED
@@ -1,264 +1,443 @@
1
  import gradio as gr
2
  import torch
3
  import spaces
 
 
4
  from PIL import Image
5
  from transformers import AutoProcessor, AutoModelForImageTextToText
6
- from gradio.themes import Soft
7
- from gradio.themes.utils import colors, fonts, sizes
8
- from typing import Iterable
9
-
10
- colors.orange_red = colors.Color(
11
- name="orange_red",
12
- c50="#FFF0E5",
13
- c100="#FFE0CC",
14
- c200="#FFC299",
15
- c300="#FFA366",
16
- c400="#FF8533",
17
- c500="#FF4500",
18
- c600="#E63E00",
19
- c700="#CC3700",
20
- c800="#B33000",
21
- c900="#992900",
22
- c950="#802200",
23
- )
24
-
25
class OrangeRedTheme(Soft):
    """Soft-based theme that uses the custom orange-red palette as its secondary hue.

    All constructor arguments are keyword-only and are forwarded unchanged to
    ``Soft.__init__``; a fixed set of style overrides is applied afterwards.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.orange_red,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        # Base palette and typography, handed straight to the parent theme.
        base_options = {
            "primary_hue": primary_hue,
            "secondary_hue": secondary_hue,
            "neutral_hue": neutral_hue,
            "text_size": text_size,
            "font": font,
            "font_mono": font_mono,
        }
        super().__init__(**base_options)

        # Style overrides; "*name" values refer to theme variables.
        style_overrides = {
            "background_fill_primary": "*primary_50",
            "background_fill_primary_dark": "*primary_900",
            "body_background_fill": "linear-gradient(135deg, *primary_200, *primary_100)",
            "body_background_fill_dark": "linear-gradient(135deg, *primary_900, *primary_800)",
            "button_primary_text_color": "white",
            "button_primary_text_color_hover": "white",
            "button_primary_background_fill": "linear-gradient(90deg, *secondary_500, *secondary_600)",
            "button_primary_background_fill_hover": "linear-gradient(90deg, *secondary_600, *secondary_700)",
            "button_primary_background_fill_dark": "linear-gradient(90deg, *secondary_600, *secondary_700)",
            "button_primary_background_fill_hover_dark": "linear-gradient(90deg, *secondary_500, *secondary_600)",
            "slider_color": "*secondary_500",
            "block_title_text_weight": "600",
            "block_border_width": "0px",
            "block_shadow": "*shadow_drop_lg",
            "button_large_padding": "12px 24px",
            "color_accent_soft": "*primary_100",
        }
        super().set(**style_overrides)
66
 
67
- orange_red_theme = OrangeRedTheme()
 
 
68
 
69
  MODEL_PATH = "zai-org/GLM-OCR"
 
70
 
71
- device = "cuda" if torch.cuda.is_available() else "cpu"
72
-
73
- print(f"Loading {MODEL_PATH} on {device}...")
74
 
 
75
  try:
76
  processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
 
77
  model = AutoModelForImageTextToText.from_pretrained(
78
  pretrained_model_name_or_path=MODEL_PATH,
79
  torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
80
- device_map="auto",
81
  trust_remote_code=True,
82
- attn_implementation="flash_attention_2" if torch.cuda.is_available() else "eager"
83
  )
 
 
 
 
84
  except Exception as e:
85
  print(f"Error loading model: {e}")
86
- # Fallback for CPU/No-Flash-Attn environments if necessary
87
- model = AutoModelForImageTextToText.from_pretrained(
88
- pretrained_model_name_or_path=MODEL_PATH,
89
- torch_dtype="auto",
90
- device_map="auto",
91
- trust_remote_code=True
92
- )
93
 
94
class GlmOcr(gr.HTML):
    """
    Static page header: gradient "GLM-OCR" title, a tagline, and three capability badges.
    """

    def __init__(self):
        # The markup is fixed; it is passed to gr.HTML as the component value.
        header_markup = """
        <div style="text-align: center; margin-bottom: 2rem; padding: 2rem 1rem;">
            <h1 style="font-size: 3rem; font-weight: 800; margin: 0;
                       background: linear-gradient(90deg, #FF4500, #E63E00);
                       -webkit-background-clip: text; -webkit-text-fill-color: transparent;">
                GLM-OCR
            </h1>
            <p style="font-size: 1.2rem; margin-top: 0.5rem; opacity: 0.8; font-weight: 300;">
                High-precision Document, Formula, and Table Recognition
            </p>
            <div style="display: flex; justify-content: center; gap: 10px; margin-top: 15px;">
                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">Text</span>
                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">LaTeX Formulas</span>
                <span style="background: rgba(255, 69, 0, 0.1); color: #E63E00; padding: 4px 12px; border-radius: 20px; font-size: 0.9rem; font-weight: 600;">Tables</span>
            </div>
        </div>
        """
        super().__init__(value=header_markup)
117
 
118
- TASK_MAPPING = {
119
- "Text Parsing": "Text Recognition:",
120
- "Formula/LaTeX": "Formula Recognition:",
121
- "Table Extraction": "Table Recognition:"
122
- }
123
 
124
@spaces.GPU
def run_ocr(image, task_key):
    """Run the OCR model on one image and return the recognised text twice.

    Args:
        image: The uploaded image (the UI supplies a PIL image), or ``None``
            when nothing was uploaded.
        task_key: A key of ``TASK_MAPPING``; unknown keys fall back to the
            ``"Text Recognition:"`` prompt.

    Returns:
        A ``(rendered, raw)`` tuple holding the same decoded string for the
        Markdown view and the raw textbox, or ``(None, message)`` when no
        image was provided.
    """
    if image is None:
        return None, "Please upload an image."

    prompt_text = TASK_MAPPING.get(task_key, "Text Recognition:")

    # Single-turn chat: the image followed by the task prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt_text},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Dropped before generate(); the original code notes this key causes
    # problems with some models.
    inputs.pop("token_type_ids", None)

    with torch.no_grad():
        # Greedy decoding. No temperature is passed: it only applies to
        # sampling, and combining it with do_sample=False is flagged by
        # transformers as an inconsistent generation config.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=8192,
            do_sample=False,
        )

    # Skip the prompt tokens so only the newly generated text is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    output_text = processor.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True,
    )

    return output_text, output_text
178
 
179
- css = """
180
- .gradio-container {
181
- max-width: 1200px !important;
182
- margin: 0 auto;
 
 
 
183
  }
184
- .image-container {
185
- border-radius: 12px;
186
- overflow: hidden;
187
- box-shadow: 0 4px 6px rgba(0,0,0,0.1);
 
 
 
 
 
 
 
 
 
188
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  """
190
 
 
 
 
 
 
 
 
 
 
191
  with gr.Blocks(title="GLM-OCR") as demo:
192
 
193
- # Custom Header
194
  GlmOcr()
195
-
196
- with gr.Row():
197
- # Left Column: Inputs
198
- with gr.Column(scale=1):
199
- with gr.Group():
200
- image_input = gr.Image(
201
- type="pil",
202
- label="Document Image",
203
- elem_classes="image-container",
204
- height=400
205
- )
206
-
207
- with gr.Row():
208
- task_select = gr.Dropdown(
209
- choices=list(TASK_MAPPING.keys()),
210
- value="Text Parsing",
211
- label="Extraction Mode",
212
- interactive=True,
213
- scale=2
214
- )
215
- submit_btn = gr.Button(
216
- "Process",
217
- variant="primary",
218
- scale=1,
219
- size="lg"
220
- )
221
-
222
- with gr.Accordion("Tips", open=True):
223
- gr.Markdown("""
224
- - **Text Parsing**: Extracts all text and layout structure.
225
- - **Formula/LaTeX**: Optimized for scientific papers and math.
226
- - **Table Extraction**: Converts tables directly to Markdown/Structure.
227
- """)
228
-
229
- # Right Column: Outputs
230
- with gr.Column(scale=1):
231
- with gr.Tabs():
232
- with gr.Tab("Rendered Output"):
233
- md_output = gr.Markdown(
234
- label="Result",
235
- value="_Output will appear here..._",
236
- latex_delimiters=[
237
- {"left": "$$", "right": "$$", "display": True},
238
- {"left": "$", "right": "$", "display": False},
239
- {"left": "\\(", "right": "\\)", "display": False},
240
- {"left": "\\[", "right": "\\]", "display": True}
241
- ]
242
- )
243
- with gr.Tab("Raw Source"):
244
- raw_output = gr.Textbox(
245
- label="Raw Text/LaTeX",
246
- lines=20,
247
- #show_copy_button=True,
248
- interactive=True
249
- )
250
-
251
- # Event Wiring
252
- submit_btn.click(
253
- fn=run_ocr,
254
- inputs=[image_input, task_select],
255
- outputs=[md_output, raw_output]
256
  )
257
 
258
  if __name__ == "__main__":
259
- demo.queue().launch(
260
- theme=orange_red_theme,
261
- css=css,
262
- ssr_mode=False,
263
- show_error=True
264
- )
 
1
  import gradio as gr
2
  import torch
3
  import spaces
4
+ import base64
5
+ import io
6
  from PIL import Image
7
  from transformers import AutoProcessor, AutoModelForImageTextToText
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ # -----------------------------------------------------------------------------
10
+ # Model Initialization
11
+ # -----------------------------------------------------------------------------
12
 
13
MODEL_PATH = "zai-org/GLM-OCR"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model on {DEVICE}...")

# Load the processor and the model together. Any failure is reported and
# leaves both names as None so the UI can still be built without a model.
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

    # bfloat16 with automatic device placement on GPU; float32 with no
    # device map otherwise.
    if torch.cuda.is_available():
        _load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
    else:
        _load_options = {"torch_dtype": torch.float32, "device_map": None}

    model = AutoModelForImageTextToText.from_pretrained(
        pretrained_model_name_or_path=MODEL_PATH,
        trust_remote_code=True,
        **_load_options,
    )
    if DEVICE == "cpu":
        # Explicit placement when no GPU is available.
        model = model.to("cpu")

    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    processor = None
    model = None
 
 
 
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ # -----------------------------------------------------------------------------
40
+ # Inference Logic
41
+ # -----------------------------------------------------------------------------
 
 
42
 
43
def _decode_base64_image(image_b64):
    """Decode a raw or data-URL base64 string into an RGB PIL image."""
    # The page sends the FileReader data URL ("...base64,<payload>");
    # keep only the payload when that marker is present.
    _, marker, payload = image_b64.partition("base64,")
    raw_bytes = base64.b64decode(payload if marker else image_b64)
    return Image.open(io.BytesIO(raw_bytes)).convert("RGB")


@spaces.GPU
def run_inference(image_b64, task_prompt):
    """Run the OCR model on a base64-encoded image and return the decoded text.

    Args:
        image_b64: Base64 image data, optionally prefixed as a data URL.
            An empty value short-circuits with a user-facing message.
        task_prompt: Prompt text sent to the model together with the image.

    Returns:
        The generated text with ``<|endoftext|>`` removed and surrounding
        whitespace stripped, or a human-readable message when the input is
        missing, the model/processor is unavailable, or inference fails.
    """
    if not image_b64:
        return "Please upload an image first."

    # Both objects are dereferenced below, so guard both.
    if model is None or processor is None:
        return "Model not loaded correctly. Check logs."

    try:
        image = _decode_base64_image(image_b64)

        # Single-turn chat: the image followed by the selected task prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": task_prompt},
                ],
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)

        # Not passed on to generate().
        inputs.pop("token_type_ids", None)

        with torch.no_grad():
            # Greedy decoding. No temperature is passed: it only applies to
            # sampling, and combining it with do_sample=False is flagged by
            # transformers as an inconsistent generation config.
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False,
            )

        # Decode only the tokens produced after the prompt.
        prompt_length = inputs["input_ids"].shape[1]
        output_text = processor.decode(
            generated_ids[0][prompt_length:],
            skip_special_tokens=False,
        )

        # Special tokens are kept by the decode call above, so the
        # end-of-text marker is removed by hand.
        return output_text.replace("<|endoftext|>", "").strip()

    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error during inference: {str(e)}"
111
+
112
+ # -----------------------------------------------------------------------------
113
+ # Custom Component & UI Assets
114
+ # -----------------------------------------------------------------------------
115
+
116
+ # Stylesheet for the custom HTML form, plus rules for the image preview and layout
117
+ CUSTOM_CSS = """
118
+ /* Reset & Layout */
119
+ .container {
120
+ position: relative;
121
+ max-width: 600px;
122
+ width: 100%;
123
+ background: #FCEDDA;
124
+ padding: 25px;
125
+ border-radius: 8px;
126
+ box-shadow: 0 0 15px rgba(0, 0, 0, 0.1);
127
+ margin: 0 auto;
128
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
129
+ }
130
+
131
+ .container header {
132
+ font-size: 1.5rem;
133
+ color: #000;
134
+ font-weight: 600;
135
+ text-align: center;
136
+ margin-bottom: 20px;
137
+ }
138
+
139
+ .form {
140
+ margin-top: 15px;
141
+ }
142
+
143
+ .input-box {
144
+ width: 100%;
145
+ margin-top: 15px;
146
+ }
147
+
148
+ .input-box label {
149
+ color: #000;
150
+ font-weight: 500;
151
+ margin-bottom: 5px;
152
+ display: block;
153
+ }
154
+
155
+ /* Custom Upload Area */
156
+ .upload-area {
157
+ width: 100%;
158
+ min-height: 150px;
159
+ background: #fff8f0;
160
+ border: 2px dashed #EE4E34;
161
+ border-radius: 6px;
162
+ display: flex;
163
+ flex-direction: column;
164
+ align-items: center;
165
+ justify-content: center;
166
+ cursor: pointer;
167
+ transition: background 0.2s;
168
+ padding: 10px;
169
+ }
170
+ .upload-area:hover {
171
+ background: #fff0e0;
172
+ }
173
+ .upload-text {
174
+ color: #808080;
175
+ margin-top: 10px;
176
+ }
177
+ #preview-img {
178
+ max-width: 100%;
179
+ max-height: 300px;
180
+ border-radius: 4px;
181
+ display: none;
182
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
183
+ }
184
+
185
+ /* Radio Buttons */
186
+ .gender-box {
187
+ margin-top: 20px;
188
+ }
189
+ .gender-option {
190
+ display: flex;
191
+ align-items: center;
192
+ column-gap: 20px;
193
+ flex-wrap: wrap;
194
+ margin-top: 10px;
195
+ background: #fff8f0;
196
+ padding: 10px;
197
+ border-radius: 6px;
198
+ border: 1px solid #EE4E34;
199
+ }
200
+ .gender {
201
+ display: flex;
202
+ align-items: center;
203
+ column-gap: 5px;
204
+ }
205
+ .gender input {
206
+ accent-color: #EE4E34;
207
+ width: 18px;
208
+ height: 18px;
209
+ cursor: pointer;
210
+ }
211
+ .gender label {
212
+ cursor: pointer;
213
+ margin: 0; /* Reset margin from input-box label */
214
+ }
215
+
216
+ /* Textarea Output */
217
+ textarea.result-field {
218
+ width: 100%;
219
+ height: 200px;
220
+ padding: 15px;
221
+ outline: none;
222
+ font-size: 0.95rem;
223
+ color: #333;
224
+ margin-top: 5px;
225
+ border: 1px solid #EE4E34;
226
+ border-radius: 6px;
227
+ background: #fff;
228
+ resize: vertical;
229
+ font-family: monospace;
230
+ }
231
+
232
+ /* Submit Button */
233
+ .submit-btn {
234
+ height: 45px;
235
+ width: 100%;
236
+ color: #fff;
237
+ font-size: 1.1rem;
238
+ font-weight: 500;
239
+ margin-top: 25px;
240
+ border: none;
241
+ border-radius: 6px;
242
+ cursor: pointer;
243
+ transition: all 0.2s ease;
244
+ background: #EE4E34;
245
+ }
246
+ .submit-btn:hover {
247
+ background: #d63d24;
248
+ }
249
+ .submit-btn:disabled {
250
+ background: #fabab5;
251
+ cursor: not-allowed;
252
+ }
253
+ .status-msg {
254
+ text-align: center;
255
+ margin-top: 10px;
256
+ font-size: 0.9rem;
257
+ min-height: 20px;
258
+ }
259
+ """
260
+
261
+ # JavaScript to handle interactions and bridge with Gradio
262
+ CUSTOM_JS = """
263
+ <script>
264
+ function initOcrUI() {
265
+ const fileInput = document.getElementById('hidden-file-input');
266
+ const uploadArea = document.getElementById('upload-area');
267
+ const previewImg = document.getElementById('preview-img');
268
+ const uploadText = document.getElementById('upload-text');
269
+ const submitBtn = document.getElementById('custom-submit');
270
+ const resultArea = document.getElementById('result-area');
271
+ const statusMsg = document.getElementById('status-msg');
272
+
273
+ // Trigger file input
274
+ uploadArea.onclick = () => fileInput.click();
275
+
276
+ // Handle File Selection
277
+ fileInput.onchange = (e) => {
278
+ const file = e.target.files[0];
279
+ if (file) {
280
+ const reader = new FileReader();
281
+ reader.onload = (evt) => {
282
+ const b64 = evt.target.result;
283
+ // Show Preview
284
+ previewImg.src = b64;
285
+ previewImg.style.display = 'block';
286
+ uploadText.style.display = 'none';
287
+
288
+ // Update Hidden Gradio Component
289
+ updateGradioImage(b64);
290
+ }
291
+ reader.readAsDataURL(file);
292
+ }
293
+ };
294
+
295
+ // Handle Submit
296
+ submitBtn.onclick = (e) => {
297
+ e.preventDefault();
298
+
299
+ // Get selected Task
300
+ const task = document.querySelector('input[name="task"]:checked').value;
301
+
302
+ // Update Hidden Gradio Task Input
303
+ updateGradioTask(task);
304
+
305
+ // Visual Feedback
306
+ submitBtn.innerText = "Processing...";
307
+ submitBtn.disabled = true;
308
+ statusMsg.innerText = "Model is running. Please wait...";
309
+ resultArea.value = ""; // Clear previous
310
+
311
+ // Trigger Hidden Gradio Button
312
+ const gradioBtn = document.getElementById('bridge-btn');
313
+ if (gradioBtn) gradioBtn.click();
314
+ };
315
+
316
+ // --- Bridge Functions ---
317
 
318
+ function updateGradioImage(b64Data) {
319
+ const ta = document.querySelector('#bridge-img-input textarea');
320
+ if (ta) {
321
+ ta.value = b64Data;
322
+ ta.dispatchEvent(new Event('input', { bubbles: true }));
323
+ }
324
+ }
 
325
 
326
+ function updateGradioTask(taskVal) {
327
+ const ta = document.querySelector('#bridge-task-input textarea');
328
+ if (ta) {
329
+ ta.value = taskVal;
330
+ ta.dispatchEvent(new Event('input', { bubbles: true }));
331
+ }
332
+ }
333
  }
334
+
335
+ // Function called by Gradio when output changes
336
+ function updateResultUI(text) {
337
+ const resultArea = document.getElementById('result-area');
338
+ const submitBtn = document.getElementById('custom-submit');
339
+ const statusMsg = document.getElementById('status-msg');
340
+
341
+ if(resultArea) resultArea.value = text;
342
+ if(submitBtn) {
343
+ submitBtn.innerText = "Submit";
344
+ submitBtn.disabled = false;
345
+ }
346
+ if(statusMsg) statusMsg.innerText = "Extraction complete.";
347
  }
348
+
349
+ // Initialize after a slight delay to ensure DOM is ready
350
+ setTimeout(initOcrUI, 1000);
351
+ </script>
352
+ """
353
+
354
+ HTML_TEMPLATE = """
355
+ <div class="container">
356
+ <header>GLM-OCR Interface</header>
357
+
358
+ <div class="form">
359
+
360
+ <!-- Image Input Section -->
361
+ <div class="input-box">
362
+ <label>Document Image</label>
363
+ <div class="upload-area" id="upload-area">
364
+ <span class="upload-text" id="upload-text">Click to Upload Image</span>
365
+ <img id="preview-img" alt="Preview"/>
366
+ </div>
367
+ <input type="file" id="hidden-file-input" style="display:none" accept="image/*">
368
+ </div>
369
+
370
+ <!-- Task Selection -->
371
+ <div class="gender-box">
372
+ <label>Extraction Mode</label>
373
+ <div class="gender-option">
374
+ <div class="gender">
375
+ <input type="radio" id="check-text" name="task" value="Text Recognition:" checked>
376
+ <label for="check-text">Text</label>
377
+ </div>
378
+ <div class="gender">
379
+ <input type="radio" id="check-formula" name="task" value="Formula Recognition:">
380
+ <label for="check-formula">Formula</label>
381
+ </div>
382
+ <div class="gender">
383
+ <input type="radio" id="check-table" name="task" value="Table Recognition:">
384
+ <label for="check-table">Table</label>
385
+ </div>
386
+ </div>
387
+ </div>
388
+
389
+ <!-- Submit Action -->
390
+ <button class="submit-btn" id="custom-submit">Submit</button>
391
+ <div class="status-msg" id="status-msg"></div>
392
+
393
+ <!-- Result Output -->
394
+ <div class="input-box">
395
+ <label>Extraction Result</label>
396
+ <textarea id="result-area" class="result-field" readonly placeholder="Output will appear here..."></textarea>
397
+ </div>
398
+
399
+ </div>
400
+ </div>
401
  """
402
 
403
class GlmOcr(gr.HTML):
    """HTML component whose value is HTML_TEMPLATE followed by CUSTOM_JS."""

    def __init__(self):
        markup_with_script = HTML_TEMPLATE + CUSTOM_JS
        super().__init__(value=markup_with_script)
407
+
408
+ # -----------------------------------------------------------------------------
409
+ # Gradio App Structure
410
+ # -----------------------------------------------------------------------------
411
+
412
with gr.Blocks(title="GLM-OCR") as demo:

    # 1. The custom HTML/JS front end.
    GlmOcr()

    # 2. Hidden bridge components that carry data between the custom HTML
    #    and Python. The page script looks them up by elem_id.
    # NOTE(review): depending on the Gradio version, visible=False may keep
    # these components out of the DOM entirely, in which case the script's
    # selectors find nothing — confirm against the deployed version.
    with gr.Row(visible=False):
        # Base64 string of the uploaded image.
        bridge_img_input = gr.Textbox(elem_id="bridge-img-input", label="Hidden Img")
        # Selected task prompt.
        bridge_task_input = gr.Textbox(
            elem_id="bridge-task-input",
            value="Text Recognition:",
            label="Hidden Task",
        )
        # Trigger button clicked by the page script.
        bridge_btn = gr.Button("Run", elem_id="bridge-btn")
        # Holds the inference result.
        bridge_output = gr.Textbox(elem_id="bridge-output", label="Hidden Output")

    # 3. Run inference, then push the result back into the custom HTML.
    #    The UI update is chained with .then() rather than hung off
    #    bridge_output.change: a run that yields the same text as the
    #    current value is not expected to raise a change event, which would
    #    leave the custom button stuck on "Processing...".
    bridge_btn.click(
        fn=run_inference,
        inputs=[bridge_img_input, bridge_task_input],
        outputs=[bridge_output],
    ).then(
        fn=None,
        inputs=[bridge_output],
        js="(v) => updateResultUI(v)",
    )

if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS, ssr_mode=False)