Files changed (6) hide show
  1. .gitattributes +0 -1
  2. README.md +3 -3
  3. app.py +320 -1253
  4. examples/4.jpg +0 -3
  5. pre-requirements.txt +1 -1
  6. requirements.txt +1 -1
.gitattributes CHANGED
@@ -35,4 +35,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  examples/1.jpg filter=lfs diff=lfs merge=lfs -text
37
  examples/2.jpg filter=lfs diff=lfs merge=lfs -text
38
- examples/4.jpg filter=lfs diff=lfs merge=lfs -text
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  examples/1.jpg filter=lfs diff=lfs merge=lfs -text
37
  examples/2.jpg filter=lfs diff=lfs merge=lfs -text
 
README.md CHANGED
@@ -4,11 +4,11 @@ emoji: 🌖
4
  colorFrom: gray
5
  colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
  app_file: app.py
9
- pinned: true
10
  license: apache-2.0
11
- short_description: Chandra-OCR / Nanonets-OCR2 / olmOCR-2 / Dots.OCR
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: gray
5
  colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 6.3.0
8
  app_file: app.py
9
+ pinned: false
10
  license: apache-2.0
11
+ short_description: demo of a collection of impressive ocr models on the hub
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,24 +1,143 @@
1
  import os
2
- import gc
 
 
3
  import json
4
- import base64
5
  import time
6
- from io import BytesIO
7
  from threading import Thread
 
 
8
 
9
  import gradio as gr
10
  import spaces
11
  import torch
 
12
  from PIL import Image
 
13
 
14
  from transformers import (
15
  Qwen2_5_VLForConditionalGeneration,
16
  Qwen3VLForConditionalGeneration,
 
17
  AutoModelForCausalLM,
18
  AutoProcessor,
19
  TextIteratorStreamer,
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  MAX_MAX_NEW_TOKENS = 4096
24
  DEFAULT_MAX_NEW_TOKENS = 2048
@@ -34,8 +153,82 @@ print("cuda device count:", torch.cuda.device_count())
34
  if torch.cuda.is_available():
35
  print("current device:", torch.cuda.current_device())
36
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
 
37
  print("Using device:", device)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  MODEL_ID_V = "datalab-to/chandra"
41
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
@@ -55,7 +248,7 @@ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
55
  torch_dtype=torch.bfloat16,
56
  ).to(device).eval()
57
 
58
- MODEL_PATH_D = "prithivMLmods/Dots.OCR-Latest-BF16"
59
  processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
60
  model_d = AutoModelForCausalLM.from_pretrained(
61
  MODEL_PATH_D,
@@ -74,1266 +267,140 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
  torch_dtype=torch.float16
75
  ).to(device).eval()
76
 
77
- MODEL_MAP = {
78
- "Nanonets-OCR2-3B": (processor_x, model_x),
79
- "Chandra-OCR": (processor_v, model_v),
80
- "Dots.OCR": (processor_d, model_d),
81
- "olmOCR-2-7B-1025": (processor_m, model_m),
82
- }
83
-
84
- MODEL_CHOICES = list(MODEL_MAP.keys())
85
-
86
- image_examples = [
87
- {"query": "Convert to Markdown.", "image": "examples/3.jpg", "model": "Nanonets-OCR2-3B"},
88
- {"query": "Perform OCR on the image. [Markdown]", "image": "examples/1.jpg", "model": "Nanonets-OCR2-3B"},
89
- {"query": "Extract the contents. [Markdown].", "image": "examples/2.jpg", "model": "olmOCR-2-7B-1025"},
90
- {"query": "OCR the Image", "image": "examples/4.jpg", "model": "Chandra-OCR"},
91
- ]
92
-
93
-
94
- def select_model(model_name: str):
95
- if model_name not in MODEL_MAP:
96
- raise ValueError("Invalid model selected.")
97
- return MODEL_MAP[model_name]
98
-
99
-
100
- def pil_to_data_url(img: Image.Image, fmt="PNG"):
101
- buf = BytesIO()
102
- img.save(buf, format=fmt)
103
- data = base64.b64encode(buf.getvalue()).decode()
104
- mime = "image/png" if fmt.upper() == "PNG" else "image/jpeg"
105
- return f"data:{mime};base64,{data}"
106
-
107
-
108
- def file_to_data_url(path):
109
- if not os.path.exists(path):
110
- return ""
111
- ext = path.rsplit(".", 1)[-1].lower()
112
- mime = {
113
- "jpg": "image/jpeg",
114
- "jpeg": "image/jpeg",
115
- "png": "image/png",
116
- "webp": "image/webp",
117
- }.get(ext, "image/jpeg")
118
- with open(path, "rb") as f:
119
- data = base64.b64encode(f.read()).decode()
120
- return f"data:{mime};base64,{data}"
121
-
122
-
123
- def make_thumb_b64(path, max_dim=240):
124
- try:
125
- img = Image.open(path).convert("RGB")
126
- img.thumbnail((max_dim, max_dim))
127
- return pil_to_data_url(img, "JPEG")
128
- except Exception as e:
129
- print("Thumbnail error:", e)
130
- return ""
131
-
132
-
133
- def build_example_cards_html():
134
- cards = ""
135
- for i, ex in enumerate(image_examples):
136
- thumb = make_thumb_b64(ex["image"])
137
- prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
138
- cards += f"""
139
- <div class="example-card" data-idx="{i}">
140
- <div class="example-thumb-wrap">
141
- {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
142
- </div>
143
- <div class="example-meta-row">
144
- <span class="example-badge">{ex["model"]}</span>
145
- </div>
146
- <div class="example-prompt-text">{prompt_short}</div>
147
- </div>
148
- """
149
- return cards
150
-
151
-
152
- EXAMPLE_CARDS_HTML = build_example_cards_html()
153
-
154
-
155
- def load_example_data(idx_str):
156
- try:
157
- idx = int(str(idx_str).strip())
158
- except Exception:
159
- return gr.update(value=json.dumps({"status": "error", "message": "Invalid example index"}))
160
-
161
- if idx < 0 or idx >= len(image_examples):
162
- return gr.update(value=json.dumps({"status": "error", "message": "Example index out of range"}))
163
-
164
- ex = image_examples[idx]
165
- img_b64 = file_to_data_url(ex["image"])
166
- if not img_b64:
167
- return gr.update(value=json.dumps({"status": "error", "message": "Could not load example image"}))
168
-
169
- return gr.update(value=json.dumps({
170
- "status": "ok",
171
- "query": ex["query"],
172
- "image": img_b64,
173
- "model": ex["model"],
174
- "name": os.path.basename(ex["image"]),
175
- }))
176
-
177
-
178
- def b64_to_pil(b64_str):
179
- if not b64_str:
180
- return None
181
- try:
182
- if b64_str.startswith("data:"):
183
- _, data = b64_str.split(",", 1)
184
- else:
185
- data = b64_str
186
- image_data = base64.b64decode(data)
187
- return Image.open(BytesIO(image_data)).convert("RGB")
188
- except Exception:
189
- return None
190
-
191
-
192
- def calc_timeout_image(*args, **kwargs):
193
- gpu_timeout = kwargs.get("gpu_timeout", None)
194
- if gpu_timeout is None and args:
195
- gpu_timeout = args[-1]
196
  try:
197
  return int(gpu_timeout)
198
- except Exception:
199
  return 60
200
 
201
-
202
  @spaces.GPU(duration=calc_timeout_image)
203
- def generate_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout=60):
204
- try:
205
- if not model_name or model_name not in MODEL_MAP:
206
- yield "[ERROR] Please select a valid model."
207
- return
208
- if image is None:
209
- yield "[ERROR] Please upload an image."
210
- return
211
- if not text or not str(text).strip():
212
- yield "[ERROR] Please enter your OCR/query instruction."
213
- return
214
- if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
215
- yield "[ERROR] Query is too long. Please shorten your input."
216
- return
217
-
218
- processor, model = select_model(model_name)
219
-
220
- messages = [{
221
- "role": "user",
222
- "content": [
223
- {"type": "image"},
224
- {"type": "text", "text": text},
225
- ]
226
- }]
227
-
228
- prompt_full = processor.apply_chat_template(
229
- messages,
230
- tokenize=False,
231
- add_generation_prompt=True
232
- )
233
-
234
- inputs = processor(
235
- text=[prompt_full],
236
- images=[image],
237
- return_tensors="pt",
238
- padding=True,
239
- truncation=True,
240
- max_length=MAX_INPUT_TOKEN_LENGTH
241
- )
242
-
243
- model_device = getattr(model, "device", None)
244
- if model_device is None:
245
- try:
246
- model_device = next(model.parameters()).device
247
- except Exception:
248
- model_device = device
249
-
250
- inputs = inputs.to(model_device)
251
-
252
- streamer = TextIteratorStreamer(
253
- processor.tokenizer if hasattr(processor, "tokenizer") else processor,
254
- skip_prompt=True,
255
- skip_special_tokens=True
256
- )
257
-
258
- generation_error = {"error": None}
259
-
260
- generation_kwargs = {
261
- **inputs,
262
- "streamer": streamer,
263
- "max_new_tokens": int(max_new_tokens),
264
- "do_sample": True,
265
- "temperature": float(temperature),
266
- "top_p": float(top_p),
267
- "top_k": int(top_k),
268
- "repetition_penalty": float(repetition_penalty),
269
- }
270
-
271
- def _run_generation():
272
- try:
273
- model.generate(**generation_kwargs)
274
- except Exception as e:
275
- generation_error["error"] = e
276
- try:
277
- streamer.end()
278
- except Exception:
279
- pass
280
-
281
- thread = Thread(target=_run_generation, daemon=True)
282
- thread.start()
283
-
284
- buffer = ""
285
- for new_text in streamer:
286
- buffer += new_text.replace("<|im_end|>", "")
287
- time.sleep(0.01)
288
- yield buffer
289
-
290
- thread.join(timeout=1.0)
291
-
292
- if generation_error["error"] is not None:
293
- err_msg = f"[ERROR] Inference failed: {str(generation_error['error'])}"
294
- if buffer.strip():
295
- yield buffer + "\n\n" + err_msg
296
- else:
297
- yield err_msg
298
- return
299
-
300
- if not buffer.strip():
301
- yield "[ERROR] No output was generated."
302
-
303
- except Exception as e:
304
- yield f"[ERROR] {str(e)}"
305
- finally:
306
- gc.collect()
307
- if torch.cuda.is_available():
308
- torch.cuda.empty_cache()
309
-
310
-
311
- def run_ocr(model_name, text, image_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
312
- try:
313
- image = b64_to_pil(image_b64)
314
- yield from generate_image(
315
- model_name=model_name,
316
- text=text,
317
- image=image,
318
- max_new_tokens=max_new_tokens_v,
319
- temperature=temperature_v,
320
- top_p=top_p_v,
321
- top_k=top_k_v,
322
- repetition_penalty=repetition_penalty_v,
323
- gpu_timeout=gpu_timeout_v,
324
- )
325
- except Exception as e:
326
- yield f"[ERROR] {str(e)}"
327
-
328
-
329
- def noop():
330
- return None
331
-
332
-
333
- css = r"""
334
- @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
335
- *{box-sizing:border-box;margin:0;padding:0}
336
- html,body{height:100%;overflow-x:hidden}
337
- body,.gradio-container{
338
- background:#0f0f13!important;
339
- font-family:'Inter',system-ui,-apple-system,sans-serif!important;
340
- font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
341
- }
342
- .dark body,.dark .gradio-container{background:#0f0f13!important;color:#e4e4e7!important}
343
- footer{display:none!important}
344
- .hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}
345
-
346
- #gradio-run-btn,#example-load-btn{
347
- position:absolute!important;left:-9999px!important;top:-9999px!important;
348
- width:1px!important;height:1px!important;opacity:0.01!important;
349
- pointer-events:none!important;overflow:hidden!important;
350
- }
351
-
352
- .app-shell{
353
- background:#18181b;border:1px solid #27272a;border-radius:16px;
354
- margin:12px auto;max-width:1400px;overflow:hidden;
355
- box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
356
- }
357
- .app-header{
358
- background:linear-gradient(135deg,#18181b,#1e1e24);border-bottom:1px solid #27272a;
359
- padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
360
- }
361
- .app-header-left{display:flex;align-items:center;gap:12px}
362
- .app-logo{
363
- width:38px;height:38px;background:linear-gradient(135deg,#00FFFF,#4DFFFF,#99FFFF);
364
- border-radius:10px;display:flex;align-items:center;justify-content:center;
365
- box-shadow:0 4px 12px rgba(0,255,255,.30);
366
- }
367
- .app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0}
368
-
369
- .app-title{
370
- font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#bdbdbd);
371
- -webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
372
- }
373
- .app-badge{
374
- font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
375
- background:rgba(0,255,255,.10);color:#9EFFFF;border:1px solid rgba(0,255,255,.24);letter-spacing:.3px;
376
- }
377
- .app-badge.fast{background:rgba(0,255,255,.08);color:#6AFFFF;border:1px solid rgba(0,255,255,.20)}
378
-
379
- .model-tabs-bar{
380
- background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px;
381
- display:flex;gap:8px;align-items:center;flex-wrap:wrap;
382
- }
383
- .model-tab{
384
- display:inline-flex;align-items:center;justify-content:center;gap:6px;
385
- min-width:32px;height:34px;background:transparent;border:1px solid #27272a;
386
- border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
387
- color:#ffffff!important;transition:all .15s ease;
388
- }
389
- .model-tab:hover{background:rgba(0,255,255,.10);border-color:rgba(0,255,255,.35)}
390
- .model-tab.active{background:rgba(0,255,255,.16);border-color:#00FFFF;color:#fff!important;box-shadow:0 0 0 2px rgba(0,255,255,.08)}
391
- .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
392
-
393
- .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
394
- .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
395
- .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
396
-
397
- #image-drop-zone{
398
- position:relative;background:#09090b;height:440px;min-height:440px;max-height:440px;
399
- overflow:hidden;
400
- }
401
- #image-drop-zone.drag-over{outline:2px solid #00FFFF;outline-offset:-2px;background:rgba(0,255,255,.04)}
402
- .upload-prompt-modern{
403
- position:absolute;inset:0;display:flex;align-items:center;justify-content:center;
404
- padding:20px;z-index:20;overflow:hidden;
405
- }
406
- .upload-click-area{
407
- display:flex;flex-direction:column;align-items:center;justify-content:center;
408
- cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
409
- border:2px dashed #3f3f46;border-radius:16px;
410
- background:rgba(0,255,255,.03);transition:all .2s ease;gap:8px;text-align:center;
411
- overflow:hidden;
412
- }
413
- .upload-click-area:hover{background:rgba(0,255,255,.08);border-color:#00FFFF;transform:scale(1.02)}
414
- .upload-click-area:active{background:rgba(0,255,255,.12);transform:scale(.99)}
415
- .upload-click-area svg{width:86px;height:86px;max-width:100%;flex-shrink:0}
416
- .upload-main-text{color:#a1a1aa;font-size:14px;font-weight:600;margin-top:4px}
417
- .upload-sub-text{color:#71717a;font-size:12px}
418
-
419
- .single-preview-wrap{
420
- width:100%;height:100%;display:none;align-items:center;justify-content:center;padding:16px;
421
- overflow:hidden;
422
- }
423
- .single-preview-card{
424
- width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
425
- overflow:hidden;border:1px solid #27272a;background:#111114;
426
- display:flex;align-items:center;justify-content:center;position:relative;
427
- }
428
- .single-preview-card img{
429
- width:100%;height:100%;max-width:100%;max-height:100%;
430
- object-fit:contain;display:block;
431
- }
432
- .preview-overlay-actions{
433
- position:absolute;top:12px;right:12px;display:flex;gap:8px;z-index:5;
434
- }
435
- .preview-action-btn{
436
- display:inline-flex;align-items:center;justify-content:center;
437
- min-width:34px;height:34px;padding:0 12px;background:rgba(0,0,0,.65);
438
- border:1px solid rgba(255,255,255,.14);border-radius:10px;cursor:pointer;
439
- color:#fff!important;font-size:12px;font-weight:600;transition:all .15s ease;
440
- }
441
- .preview-action-btn:hover{background:#00FFFF;border-color:#00FFFF;color:#0b0f12!important}
442
-
443
- .hint-bar{
444
- background:rgba(0,255,255,.05);border-top:1px solid #27272a;border-bottom:1px solid #27272a;
445
- padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
446
- }
447
- .hint-bar b{color:#8EFFFF;font-weight:600}
448
- .hint-bar kbd{
449
- display:inline-block;padding:1px 6px;background:#27272a;border:1px solid #3f3f46;
450
- border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
451
- }
452
-
453
- .examples-section{border-top:1px solid #27272a;padding:12px 16px}
454
- .examples-title{
455
- font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
456
- letter-spacing:.8px;margin-bottom:10px;
457
- }
458
- .examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}
459
- .examples-scroll::-webkit-scrollbar{height:6px}
460
- .examples-scroll::-webkit-scrollbar-track{background:#09090b;border-radius:3px}
461
- .examples-scroll::-webkit-scrollbar-thumb{background:#27272a;border-radius:3px}
462
- .examples-scroll::-webkit-scrollbar-thumb:hover{background:#3f3f46}
463
- .example-card{
464
- flex-shrink:0;width:220px;background:#09090b;border:1px solid #27272a;
465
- border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
466
- }
467
- .example-card:hover{border-color:#00FFFF;transform:translateY(-2px);box-shadow:0 4px 12px rgba(0,255,255,.14)}
468
- .example-card.loading{opacity:.5;pointer-events:none}
469
- .example-thumb-wrap{height:120px;overflow:hidden;background:#18181b}
470
- .example-thumb-wrap img{width:100%;height:100%;object-fit:cover}
471
- .example-thumb-placeholder{
472
- width:100%;height:100%;display:flex;align-items:center;justify-content:center;
473
- background:#18181b;color:#3f3f46;font-size:11px;
474
- }
475
- .example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px}
476
- .example-badge{
477
- display:inline-flex;padding:2px 7px;background:rgba(0,255,255,.12);border-radius:4px;
478
- font-size:10px;font-weight:600;color:#8EFFFF;font-family:'JetBrains Mono',monospace;white-space:nowrap;
479
- }
480
- .example-prompt-text{
481
- padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
482
- display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
483
- }
484
-
485
- .panel-card{border-bottom:1px solid #27272a}
486
- .panel-card-title{
487
- padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
488
- text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
489
- }
490
- .panel-card-body{padding:16px 20px;display:flex;flex-direction:column;gap:8px}
491
- .modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}
492
- .modern-textarea{
493
- width:100%;background:#09090b;border:1px solid #27272a;border-radius:8px;
494
- padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
495
- resize:none;outline:none;min-height:100px;transition:border-color .2s;
496
- }
497
- .modern-textarea:focus{border-color:#00FFFF;box-shadow:0 0 0 3px rgba(0,255,255,.14)}
498
- .modern-textarea::placeholder{color:#3f3f46}
499
- .modern-textarea.error-flash{
500
- border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
501
- }
502
- @keyframes shake{0%,100%{transform:translateX(0)}20%,60%{transform:translateX(-4px)}40%,80%{transform:translateX(4px)}}
503
-
504
- .toast-notification{
505
- position:fixed;top:24px;left:50%;transform:translateX(-50%) translateY(-120%);
506
- z-index:9999;padding:10px 24px;border-radius:10px;font-family:'Inter',sans-serif;
507
- font-size:14px;font-weight:600;display:flex;align-items:center;gap:8px;
508
- box-shadow:0 8px 24px rgba(0,0,0,.5);
509
- transition:transform .35s cubic-bezier(.34,1.56,.64,1),opacity .35s ease;opacity:0;pointer-events:none;
510
- }
511
- .toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}
512
- .toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}
513
- .toast-notification.warning{background:linear-gradient(135deg,#0891b2,#0e7490);color:#fff;border:1px solid rgba(255,255,255,.15)}
514
- .toast-notification.info{background:linear-gradient(135deg,#06b6d4,#0891b2);color:#fff;border:1px solid rgba(255,255,255,.15)}
515
- .toast-notification .toast-icon{font-size:16px;line-height:1}
516
- .toast-notification .toast-text{line-height:1.3}
517
-
518
- .btn-run{
519
- display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
520
- background:linear-gradient(135deg,#00FFFF,#00D9D9);border:none;border-radius:10px;
521
- padding:12px 24px;cursor:pointer;font-size:15px;font-weight:600;font-family:'Inter',sans-serif;
522
- color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
523
- transition:all .2s ease;letter-spacing:-.2px;
524
- box-shadow:0 4px 16px rgba(0,255,255,.25),inset 0 1px 0 rgba(255,255,255,.18);
525
- }
526
- .btn-run:hover{
527
- background:linear-gradient(135deg,#5EFFFF,#00FFFF);transform:translateY(-1px);
528
- box-shadow:0 6px 24px rgba(0,255,255,.35),inset 0 1px 0 rgba(255,255,255,.22);
529
- }
530
- .btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(0,255,255,.25)}
531
- #custom-run-btn,#custom-run-btn *,#run-btn-label,.btn-run,.btn-run *{
532
- color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
533
- }
534
- body:not(.dark) .btn-run,body:not(.dark) .btn-run *,
535
- .dark .btn-run,.dark .btn-run *,
536
- .gradio-container .btn-run,.gradio-container .btn-run *,
537
- .gradio-container #custom-run-btn,.gradio-container #custom-run-btn *{
538
- color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
539
- }
540
-
541
- .output-frame{border-bottom:1px solid #27272a;display:flex;flex-direction:column;position:relative}
542
- .output-frame .out-title,
543
- .output-frame .out-title *,
544
- #output-title-label{
545
- color:#ffffff!important;
546
- -webkit-text-fill-color:#ffffff!important;
547
- }
548
- .output-frame .out-title{
549
- padding:10px 20px;font-size:13px;font-weight:700;
550
- text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
551
- display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
552
- }
553
- .out-title-right{display:flex;gap:8px;align-items:center}
554
- .out-action-btn{
555
- display:inline-flex;align-items:center;justify-content:center;background:rgba(0,255,255,.10);
556
- border:1px solid rgba(0,255,255,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
557
- font-size:11px;font-weight:500;color:#8EFFFF!important;gap:4px;height:24px;transition:all .15s;
558
- }
559
- .out-action-btn:hover{background:rgba(0,255,255,.2);border-color:rgba(0,255,255,.35);color:#ffffff!important}
560
- .out-action-btn svg{width:12px;height:12px;fill:#8EFFFF}
561
- .output-frame .out-body{
562
- flex:1;background:#09090b;display:flex;align-items:stretch;justify-content:stretch;
563
- overflow:hidden;min-height:320px;position:relative;
564
- }
565
- .output-scroll-wrap{
566
- width:100%;height:100%;padding:0;overflow:hidden;
567
- }
568
- .output-textarea{
569
- width:100%;height:320px;min-height:320px;max-height:320px;background:#09090b;color:#e4e4e7;
570
- border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
571
- font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
572
- }
573
- .output-textarea::placeholder{color:#52525b}
574
- .output-textarea.error-flash{
575
- box-shadow:inset 0 0 0 2px rgba(239,68,68,.6);
576
- }
577
- .modern-loader{
578
- display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(9,9,11,.92);
579
- z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
580
- }
581
- .modern-loader.active{display:flex}
582
- .modern-loader .loader-spinner{
583
- width:36px;height:36px;border:3px solid #27272a;border-top-color:#00FFFF;
584
- border-radius:50%;animation:spin .8s linear infinite;
585
- }
586
- @keyframes spin{to{transform:rotate(360deg)}}
587
- .modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500}
588
- .loader-bar-track{width:200px;height:4px;background:#27272a;border-radius:2px;overflow:hidden}
589
- .loader-bar-fill{
590
- height:100%;background:linear-gradient(90deg,#00FFFF,#6AFFFF,#00FFFF);
591
- background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
592
- }
593
- @keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}}
594
-
595
- .settings-group{border:1px solid #27272a;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}
596
- .settings-group-title{
597
- font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
598
- padding:10px 16px;border-bottom:1px solid #27272a;background:rgba(24,24,27,.5);
599
- }
600
- .settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px}
601
- .slider-row{display:flex;align-items:center;gap:10px;min-height:28px}
602
- .slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}
603
- .slider-row input[type="range"]{
604
- flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#27272a;
605
- border-radius:3px;outline:none;min-width:0;
606
- }
607
- .slider-row input[type="range"]::-webkit-slider-thumb{
608
- -webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#00FFFF,#00D9D9);
609
- border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(0,255,255,.35);transition:transform .15s;
610
- }
611
- .slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)}
612
- .slider-row input[type="range"]::-moz-range-thumb{
613
- width:16px;height:16px;background:linear-gradient(135deg,#00FFFF,#00D9D9);
614
- border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(0,255,255,.35);
615
- }
616
- .slider-row .slider-val{
617
- min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
618
- font-weight:500;padding:3px 8px;background:#09090b;border:1px solid #27272a;
619
- border-radius:6px;color:#a1a1aa;flex-shrink:0;
620
- }
621
-
622
- .app-statusbar{
623
- background:#18181b;border-top:1px solid #27272a;padding:6px 20px;
624
- display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
625
- }
626
- .app-statusbar .sb-section{
627
- padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
628
- font-size:12px;color:#52525b;overflow:hidden;white-space:nowrap;
629
- }
630
- .app-statusbar .sb-section.sb-fixed{
631
- flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
632
- padding:3px 12px;background:rgba(0,255,255,.08);border-radius:6px;color:#8EFFFF;font-weight:500;
633
- }
634
-
635
- .exp-note{padding:10px 20px;font-size:12px;color:#52525b;border-top:1px solid #27272a;text-align:center}
636
- .exp-note a{color:#8EFFFF;text-decoration:none}
637
- .exp-note a:hover{text-decoration:underline}
638
-
639
- ::-webkit-scrollbar{width:8px;height:8px}
640
- ::-webkit-scrollbar-track{background:#09090b}
641
- ::-webkit-scrollbar-thumb{background:#27272a;border-radius:4px}
642
- ::-webkit-scrollbar-thumb:hover{background:#3f3f46}
643
-
644
- @media(max-width:980px){
645
- .app-main-row{flex-direction:column}
646
- .app-main-right{width:100%}
647
- .app-main-left{border-right:none;border-bottom:1px solid #27272a}
648
- }
649
- """
650
-
651
- gallery_js = r"""
652
- () => {
653
- function init() {
654
- if (window.__ocr3InitDone) return;
655
-
656
- const dropZone = document.getElementById('image-drop-zone');
657
- const uploadPrompt = document.getElementById('upload-prompt');
658
- const uploadClick = document.getElementById('upload-click-area');
659
- const fileInput = document.getElementById('custom-file-input');
660
- const previewWrap = document.getElementById('single-preview-wrap');
661
- const previewImg = document.getElementById('single-preview-img');
662
- const btnUpload = document.getElementById('preview-upload-btn');
663
- const btnClear = document.getElementById('preview-clear-btn');
664
- const promptInput = document.getElementById('custom-query-input');
665
- const runBtnEl = document.getElementById('custom-run-btn');
666
- const outputArea = document.getElementById('custom-output-textarea');
667
- const imgStatus = document.getElementById('sb-image-status');
668
-
669
- if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg) {
670
- setTimeout(init, 250);
671
- return;
672
- }
673
-
674
- window.__ocr3InitDone = true;
675
- let imageState = null;
676
- let toastTimer = null;
677
- let examplePoller = null;
678
- let lastSeenExamplePayload = null;
679
-
680
- function showToast(message, type) {
681
- let toast = document.getElementById('app-toast');
682
- if (!toast) {
683
- toast = document.createElement('div');
684
- toast.id = 'app-toast';
685
- toast.className = 'toast-notification';
686
- toast.innerHTML = '<span class="toast-icon"></span><span class="toast-text"></span>';
687
- document.body.appendChild(toast);
688
- }
689
- const icon = toast.querySelector('.toast-icon');
690
- const text = toast.querySelector('.toast-text');
691
- toast.className = 'toast-notification ' + (type || 'error');
692
- if (type === 'warning') icon.textContent = '\u26A0';
693
- else if (type === 'info') icon.textContent = '\u2139';
694
- else icon.textContent = '\u2717';
695
- text.textContent = message;
696
- if (toastTimer) clearTimeout(toastTimer);
697
- void toast.offsetWidth;
698
- toast.classList.add('visible');
699
- toastTimer = setTimeout(() => toast.classList.remove('visible'), 3500);
700
- }
701
-
702
- function showLoader() {
703
- const l = document.getElementById('output-loader');
704
- if (l) l.classList.add('active');
705
- const sb = document.getElementById('sb-run-state');
706
- if (sb) sb.textContent = 'Processing...';
707
- }
708
- function hideLoader() {
709
- const l = document.getElementById('output-loader');
710
- if (l) l.classList.remove('active');
711
- const sb = document.getElementById('sb-run-state');
712
- if (sb) sb.textContent = 'Done';
713
  }
714
- function setRunErrorState() {
715
- const l = document.getElementById('output-loader');
716
- if (l) l.classList.remove('active');
717
- const sb = document.getElementById('sb-run-state');
718
- if (sb) sb.textContent = 'Error';
719
- }
720
-
721
- window.__showToast = showToast;
722
- window.__showLoader = showLoader;
723
- window.__hideLoader = hideLoader;
724
- window.__setRunErrorState = setRunErrorState;
725
-
726
- function flashPromptError() {
727
- promptInput.classList.add('error-flash');
728
- promptInput.focus();
729
- setTimeout(() => promptInput.classList.remove('error-flash'), 800);
730
- }
731
-
732
- function flashOutputError() {
733
- if (!outputArea) return;
734
- outputArea.classList.add('error-flash');
735
- setTimeout(() => outputArea.classList.remove('error-flash'), 800);
736
- }
737
-
738
- function getValueFromContainer(containerId) {
739
- const container = document.getElementById(containerId);
740
- if (!container) return '';
741
- const el = container.querySelector('textarea, input');
742
- return el ? (el.value || '') : '';
743
- }
744
-
745
- function setGradioValue(containerId, value) {
746
- const container = document.getElementById(containerId);
747
- if (!container) return false;
748
- const el = container.querySelector('textarea, input');
749
- if (!el) return false;
750
- const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
751
- const ns = Object.getOwnPropertyDescriptor(proto, 'value');
752
- if (ns && ns.set) {
753
- ns.set.call(el, value);
754
- el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
755
- el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
756
- return true;
757
- }
758
- return false;
759
- }
760
-
761
- function syncImageToGradio() {
762
- setGradioValue('hidden-image-b64', imageState ? imageState.b64 : '');
763
- const txt = imageState ? '1 image uploaded' : 'No image uploaded';
764
- if (imgStatus) imgStatus.textContent = txt;
765
- }
766
-
767
- function syncPromptToGradio() {
768
- setGradioValue('prompt-gradio-input', promptInput.value);
769
- }
770
-
771
- function syncModelToGradio(name) {
772
- setGradioValue('hidden-model-name', name);
773
- }
774
-
775
- function setPreview(b64, name) {
776
- imageState = {b64, name: name || 'image'};
777
- previewImg.src = b64;
778
- previewWrap.style.display = 'flex';
779
- if (uploadPrompt) uploadPrompt.style.display = 'none';
780
- syncImageToGradio();
781
- }
782
- window.__setPreview = setPreview;
783
-
784
- function clearPreview() {
785
- imageState = null;
786
- previewImg.src = '';
787
- previewWrap.style.display = 'none';
788
- if (uploadPrompt) uploadPrompt.style.display = 'flex';
789
- syncImageToGradio();
790
- }
791
- window.__clearPreview = clearPreview;
792
-
793
- function processFile(file) {
794
- if (!file) return;
795
- if (!file.type.startsWith('image/')) {
796
- showToast('Only image files are supported', 'error');
797
- return;
798
- }
799
- const reader = new FileReader();
800
- reader.onload = (e) => setPreview(e.target.result, file.name);
801
- reader.readAsDataURL(file);
802
- }
803
-
804
- fileInput.addEventListener('change', (e) => {
805
- const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
806
- if (file) processFile(file);
807
- e.target.value = '';
808
- });
809
-
810
- if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
811
- if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
812
- if (btnClear) btnClear.addEventListener('click', clearPreview);
813
-
814
- dropZone.addEventListener('dragover', (e) => {
815
- e.preventDefault();
816
- dropZone.classList.add('drag-over');
817
- });
818
- dropZone.addEventListener('dragleave', (e) => {
819
- e.preventDefault();
820
- dropZone.classList.remove('drag-over');
821
- });
822
- dropZone.addEventListener('drop', (e) => {
823
- e.preventDefault();
824
- dropZone.classList.remove('drag-over');
825
- if (e.dataTransfer.files && e.dataTransfer.files.length) processFile(e.dataTransfer.files[0]);
826
- });
827
-
828
- promptInput.addEventListener('input', syncPromptToGradio);
829
-
830
- function activateModelTab(name) {
831
- document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
832
- btn.classList.toggle('active', btn.getAttribute('data-model') === name);
833
- });
834
- syncModelToGradio(name);
835
- }
836
- window.__activateModelTab = activateModelTab;
837
-
838
- document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
839
- btn.addEventListener('click', () => {
840
- const model = btn.getAttribute('data-model');
841
- activateModelTab(model);
842
- });
843
- });
844
-
845
- activateModelTab('Nanonets-OCR2-3B');
846
-
847
- function syncSlider(customId, gradioId) {
848
- const slider = document.getElementById(customId);
849
- const valSpan = document.getElementById(customId + '-val');
850
- if (!slider) return;
851
- slider.addEventListener('input', () => {
852
- if (valSpan) valSpan.textContent = slider.value;
853
- const container = document.getElementById(gradioId);
854
- if (!container) return;
855
- container.querySelectorAll('input[type="range"],input[type="number"]').forEach(el => {
856
- const ns = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
857
- if (ns && ns.set) {
858
- ns.set.call(el, slider.value);
859
- el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
860
- el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
861
- }
862
- });
863
- });
864
- }
865
-
866
- syncSlider('custom-max-new-tokens', 'gradio-max-new-tokens');
867
- syncSlider('custom-temperature', 'gradio-temperature');
868
- syncSlider('custom-top-p', 'gradio-top-p');
869
- syncSlider('custom-top-k', 'gradio-top-k');
870
- syncSlider('custom-repetition-penalty', 'gradio-repetition-penalty');
871
- syncSlider('custom-gpu-duration', 'gradio-gpu-duration');
872
 
873
- function validateBeforeRun() {
874
- const promptVal = promptInput.value.trim();
875
- if (!imageState && !promptVal) {
876
- showToast('Please upload an image and enter your OCR instruction', 'error');
877
- flashPromptError();
878
- return false;
879
- }
880
- if (!imageState) {
881
- showToast('Please upload an image', 'error');
882
- return false;
883
- }
884
- if (!promptVal) {
885
- showToast('Please enter your OCR/query instruction', 'warning');
886
- flashPromptError();
887
- return false;
888
- }
889
- const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
890
- if (!currentModel) {
891
- showToast('Please select a model', 'error');
892
- return false;
893
- }
894
- return true;
895
- }
896
-
897
- window.__clickGradioRunBtn = function() {
898
- if (!validateBeforeRun()) return;
899
- syncPromptToGradio();
900
- syncImageToGradio();
901
- const active = document.querySelector('.model-tab.active');
902
- if (active) syncModelToGradio(active.getAttribute('data-model'));
903
- if (outputArea) outputArea.value = '';
904
- showLoader();
905
- setTimeout(() => {
906
- const gradioBtn = document.getElementById('gradio-run-btn');
907
- if (!gradioBtn) {
908
- setRunErrorState();
909
- if (outputArea) outputArea.value = '[ERROR] Run button not found.';
910
- showToast('Run button not found', 'error');
911
- return;
912
- }
913
- const btn = gradioBtn.querySelector('button');
914
- if (btn) btn.click(); else gradioBtn.click();
915
- }, 180);
916
- };
917
-
918
- if (runBtnEl) runBtnEl.addEventListener('click', () => window.__clickGradioRunBtn());
919
-
920
- const copyBtn = document.getElementById('copy-output-btn');
921
- if (copyBtn) {
922
- copyBtn.addEventListener('click', async () => {
923
- try {
924
- const text = outputArea ? outputArea.value : '';
925
- if (!text.trim()) {
926
- showToast('No output to copy', 'warning');
927
- flashOutputError();
928
- return;
929
- }
930
- await navigator.clipboard.writeText(text);
931
- showToast('Output copied to clipboard', 'info');
932
- } catch(e) {
933
- showToast('Copy failed', 'error');
934
- }
935
- });
936
- }
937
-
938
- const saveBtn = document.getElementById('save-output-btn');
939
- if (saveBtn) {
940
- saveBtn.addEventListener('click', () => {
941
- const text = outputArea ? outputArea.value : '';
942
- if (!text.trim()) {
943
- showToast('No output to save', 'warning');
944
- flashOutputError();
945
- return;
946
- }
947
- const blob = new Blob([text], {type: 'text/plain;charset=utf-8'});
948
- const a = document.createElement('a');
949
- a.href = URL.createObjectURL(blob);
950
- a.download = 'multimodal_ocr3_output.txt';
951
- document.body.appendChild(a);
952
- a.click();
953
- setTimeout(() => {
954
- URL.revokeObjectURL(a.href);
955
- document.body.removeChild(a);
956
- }, 200);
957
- showToast('Output saved', 'info');
958
- });
959
- }
960
-
961
- function applyExamplePayload(raw) {
962
- try {
963
- const data = JSON.parse(raw);
964
- if (data.status === 'ok') {
965
- if (data.image) setPreview(data.image, data.name || 'example.jpg');
966
- if (data.query) {
967
- promptInput.value = data.query;
968
- syncPromptToGradio();
969
- }
970
- if (data.model) activateModelTab(data.model);
971
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
972
- showToast('Example loaded', 'info');
973
- } else if (data.status === 'error') {
974
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
975
- showToast(data.message || 'Failed to load example', 'error');
976
- }
977
- } catch (e) {
978
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
979
- }
980
- }
981
-
982
- function startExamplePolling() {
983
- if (examplePoller) clearInterval(examplePoller);
984
- let attempts = 0;
985
- examplePoller = setInterval(() => {
986
- attempts += 1;
987
- const current = getValueFromContainer('example-result-data');
988
- if (current && current !== lastSeenExamplePayload) {
989
- lastSeenExamplePayload = current;
990
- clearInterval(examplePoller);
991
- examplePoller = null;
992
- applyExamplePayload(current);
993
- return;
994
- }
995
- if (attempts >= 100) {
996
- clearInterval(examplePoller);
997
- examplePoller = null;
998
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
999
- showToast('Example load timed out', 'error');
1000
- }
1001
- }, 120);
1002
- }
1003
-
1004
- function triggerExampleLoad(idx) {
1005
- const btnWrap = document.getElementById('example-load-btn');
1006
- const btn = btnWrap ? (btnWrap.querySelector('button') || btnWrap) : null;
1007
- if (!btn) return;
1008
-
1009
- let attempts = 0;
1010
-
1011
- function writeIdxAndClick() {
1012
- attempts += 1;
1013
- const ok1 = setGradioValue('example-idx-input', String(idx));
1014
- setGradioValue('example-result-data', '');
1015
- const currentVal = getValueFromContainer('example-idx-input');
1016
-
1017
- if (ok1 && currentVal === String(idx)) {
1018
- btn.click();
1019
- startExamplePolling();
1020
- return;
1021
- }
1022
-
1023
- if (attempts < 30) {
1024
- setTimeout(writeIdxAndClick, 100);
1025
- } else {
1026
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1027
- showToast('Failed to initialize example loader', 'error');
1028
- }
1029
- }
1030
-
1031
- writeIdxAndClick();
1032
- }
1033
-
1034
- document.querySelectorAll('.example-card[data-idx]').forEach(card => {
1035
- card.addEventListener('click', () => {
1036
- const idx = card.getAttribute('data-idx');
1037
- if (idx === null || idx === undefined || idx === '') return;
1038
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1039
- card.classList.add('loading');
1040
- showToast('Loading example...', 'info');
1041
- triggerExampleLoad(idx);
1042
- });
1043
- });
1044
-
1045
- const observerTarget = document.getElementById('example-result-data');
1046
- if (observerTarget) {
1047
- const obs = new MutationObserver(() => {
1048
- const current = getValueFromContainer('example-result-data');
1049
- if (!current || current === lastSeenExamplePayload) return;
1050
- lastSeenExamplePayload = current;
1051
- if (examplePoller) {
1052
- clearInterval(examplePoller);
1053
- examplePoller = null;
1054
- }
1055
- applyExamplePayload(current);
1056
- });
1057
- obs.observe(observerTarget, {childList:true, subtree:true, characterData:true, attributes:true});
1058
- }
1059
-
1060
- if (outputArea) outputArea.value = '';
1061
- const sb = document.getElementById('sb-run-state');
1062
- if (sb) sb.textContent = 'Ready';
1063
- if (imgStatus) imgStatus.textContent = 'No image uploaded';
1064
- }
1065
- init();
1066
- }
1067
- """
1068
-
1069
- wire_outputs_js = r"""
1070
- () => {
1071
- function watchOutputs() {
1072
- const resultContainer = document.getElementById('gradio-result');
1073
- const outArea = document.getElementById('custom-output-textarea');
1074
- if (!resultContainer || !outArea) { setTimeout(watchOutputs, 500); return; }
1075
-
1076
- let lastText = '';
1077
-
1078
- function isErrorText(val) {
1079
- return typeof val === 'string' && val.trim().startsWith('[ERROR]');
1080
- }
1081
-
1082
- function syncOutput() {
1083
- const el = resultContainer.querySelector('textarea') || resultContainer.querySelector('input');
1084
- if (!el) return;
1085
- const val = el.value || '';
1086
- if (val !== lastText) {
1087
- lastText = val;
1088
- outArea.value = val;
1089
- outArea.scrollTop = outArea.scrollHeight;
1090
-
1091
- if (val.trim()) {
1092
- if (isErrorText(val)) {
1093
- if (window.__setRunErrorState) window.__setRunErrorState();
1094
- if (window.__showToast) window.__showToast('Inference failed', 'error');
1095
- } else {
1096
- if (window.__hideLoader) window.__hideLoader();
1097
- }
1098
- }
1099
- }
1100
- }
1101
-
1102
- const observer = new MutationObserver(syncOutput);
1103
- observer.observe(resultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
1104
- setInterval(syncOutput, 500);
1105
- }
1106
- watchOutputs();
1107
- }
1108
- """
1109
-
1110
- OCR_LOGO_SVG = """
1111
- <svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
1112
- <path d="M4 5.5A2.5 2.5 0 0 1 6.5 3H11v2H6.5a.5.5 0 0 0-.5.5V10H4V5.5Z"/>
1113
- <path d="M20 10h-2V5.5a.5.5 0 0 0-.5-.5H13V3h4.5A2.5 2.5 0 0 1 20 5.5V10Z"/>
1114
- <path d="M4 14h2v4.5a.5.5 0 0 0 .5.5H11v2H6.5A2.5 2.5 0 0 1 4 18.5V14Z"/>
1115
- <path d="M20 14v4.5A2.5 2.5 0 0 1 17.5 21H13v-2h4.5a.5.5 0 0 0 .5-.5V14h2Z"/>
1116
- <path d="M8 8h8v2H8V8Zm0 3h8v2H8v-2Zm0 3h5v2H8v-2Z"/>
1117
- </svg>
1118
- """
1119
-
1120
- UPLOAD_PREVIEW_SVG = """
1121
- <svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
1122
- <rect x="8" y="14" width="64" height="52" rx="6" fill="none" stroke="#00FFFF" stroke-width="2" stroke-dasharray="4 3"/>
1123
- <polygon points="12,62 30,40 42,50 54,34 68,62" fill="rgba(0,255,255,0.14)" stroke="#00FFFF" stroke-width="1.5"/>
1124
- <circle cx="28" cy="30" r="6" fill="rgba(0,255,255,0.2)" stroke="#00FFFF" stroke-width="1.5"/>
1125
- </svg>
1126
- """
1127
-
1128
- COPY_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16 1H4C2.9 1 2 1.9 2 3v12h2V3h12V1zm3 4H8C6.9 5 6 5.9 6 7v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"/></svg>"""
1129
- SAVE_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V7l-4-4zM7 5h8v4H7V5zm12 14H5v-6h14v6z"/></svg>"""
1130
-
1131
- MODEL_TABS_HTML = "".join([
1132
- f'<button class="model-tab{" active" if m == "Nanonets-OCR2-3B" else ""}" data-model="{m}"><span class="model-tab-label">{m}</span></button>'
1133
- for m in MODEL_CHOICES
1134
- ])
1135
 
1136
  with gr.Blocks() as demo:
1137
- hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
1138
- prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1139
- hidden_model_name = gr.Textbox(value="Nanonets-OCR2-3B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1140
-
1141
- max_new_tokens = gr.Slider(minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, elem_id="gradio-max-new-tokens", elem_classes="hidden-input", container=False)
1142
- temperature = gr.Slider(minimum=0.1, maximum=4.0, step=0.1, value=0.7, elem_id="gradio-temperature", elem_classes="hidden-input", container=False)
1143
- top_p = gr.Slider(minimum=0.05, maximum=1.0, step=0.05, value=0.9, elem_id="gradio-top-p", elem_classes="hidden-input", container=False)
1144
- top_k = gr.Slider(minimum=1, maximum=1000, step=1, value=50, elem_id="gradio-top-k", elem_classes="hidden-input", container=False)
1145
- repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.1, elem_id="gradio-repetition-penalty", elem_classes="hidden-input", container=False)
1146
- gpu_duration_state = gr.Number(value=60, elem_id="gradio-gpu-duration", elem_classes="hidden-input", container=False)
1147
-
1148
- result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)
1149
-
1150
- example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
1151
- example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
1152
- example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")
1153
-
1154
- gr.HTML(f"""
1155
- <div class="app-shell">
1156
- <div class="app-header">
1157
- <div class="app-header-left">
1158
- <div class="app-logo">{OCR_LOGO_SVG}</div>
1159
- <span class="app-title">Multimodal OCR3</span>
1160
- <span class="app-badge">vision enabled</span>
1161
- <span class="app-badge fast">OCR Suite</span>
1162
- </div>
1163
- </div>
1164
-
1165
- <div class="model-tabs-bar">
1166
- {MODEL_TABS_HTML}
1167
- </div>
1168
-
1169
- <div class="app-main-row">
1170
- <div class="app-main-left">
1171
- <div id="image-drop-zone">
1172
- <div id="upload-prompt" class="upload-prompt-modern">
1173
- <div id="upload-click-area" class="upload-click-area">
1174
- {UPLOAD_PREVIEW_SVG}
1175
- <span class="upload-main-text">Click or drag an image here</span>
1176
- <span class="upload-sub-text">Upload one document, page, receipt, screenshot, or scene image for OCR and multimodal understanding</span>
1177
- </div>
1178
- </div>
1179
-
1180
- <input id="custom-file-input" type="file" accept="image/*" style="display:none;" />
1181
-
1182
- <div id="single-preview-wrap" class="single-preview-wrap">
1183
- <div class="single-preview-card">
1184
- <img id="single-preview-img" src="" alt="Preview">
1185
- <div class="preview-overlay-actions">
1186
- <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1187
- <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
1188
- </div>
1189
- </div>
1190
- </div>
1191
- </div>
1192
-
1193
- <div class="hint-bar">
1194
- <b>Upload:</b> Click or drag to add an image &nbsp;&middot;&nbsp;
1195
- <b>Model:</b> Switch model tabs from the header &nbsp;&middot;&nbsp;
1196
- <kbd>Clear</kbd> removes the current image
1197
- </div>
1198
-
1199
- <div class="examples-section">
1200
- <div class="examples-title">Quick Examples</div>
1201
- <div class="examples-scroll">
1202
- {EXAMPLE_CARDS_HTML}
1203
- </div>
1204
- </div>
1205
- </div>
1206
-
1207
- <div class="app-main-right">
1208
- <div class="panel-card">
1209
- <div class="panel-card-title">OCR / Vision Instruction</div>
1210
- <div class="panel-card-body">
1211
- <label class="modern-label" for="custom-query-input">Query Input</label>
1212
- <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., convert to markdown, extract the contents, OCR the image, read all visible text, preserve layout..."></textarea>
1213
- </div>
1214
- </div>
1215
-
1216
- <div style="padding:12px 20px;">
1217
- <button id="custom-run-btn" class="btn-run">
1218
- <span id="run-btn-label">Run OCR</span>
1219
- </button>
1220
- </div>
1221
-
1222
- <div class="output-frame">
1223
- <div class="out-title">
1224
- <span id="output-title-label">Raw Output Stream</span>
1225
- <div class="out-title-right">
1226
- <button id="copy-output-btn" class="out-action-btn" title="Copy">{COPY_SVG} Copy</button>
1227
- <button id="save-output-btn" class="out-action-btn" title="Save">{SAVE_SVG} Save File</button>
1228
- </div>
1229
- </div>
1230
- <div class="out-body">
1231
- <div class="modern-loader" id="output-loader">
1232
- <div class="loader-spinner"></div>
1233
- <div class="loader-text">Running OCR...</div>
1234
- <div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
1235
- </div>
1236
- <div class="output-scroll-wrap">
1237
- <textarea id="custom-output-textarea" class="output-textarea" placeholder="Raw output will appear here..." readonly></textarea>
1238
- </div>
1239
- </div>
1240
- </div>
1241
-
1242
- <div class="settings-group">
1243
- <div class="settings-group-title">Advanced Settings</div>
1244
- <div class="settings-group-body">
1245
- <div class="slider-row">
1246
- <label>Max new tokens</label>
1247
- <input type="range" id="custom-max-new-tokens" min="1" max="{MAX_MAX_NEW_TOKENS}" step="1" value="{DEFAULT_MAX_NEW_TOKENS}">
1248
- <span class="slider-val" id="custom-max-new-tokens-val">{DEFAULT_MAX_NEW_TOKENS}</span>
1249
- </div>
1250
- <div class="slider-row">
1251
- <label>Temperature</label>
1252
- <input type="range" id="custom-temperature" min="0.1" max="4.0" step="0.1" value="0.7">
1253
- <span class="slider-val" id="custom-temperature-val">0.7</span>
1254
- </div>
1255
- <div class="slider-row">
1256
- <label>Top-p</label>
1257
- <input type="range" id="custom-top-p" min="0.05" max="1.0" step="0.05" value="0.9">
1258
- <span class="slider-val" id="custom-top-p-val">0.9</span>
1259
- </div>
1260
- <div class="slider-row">
1261
- <label>Top-k</label>
1262
- <input type="range" id="custom-top-k" min="1" max="1000" step="1" value="50">
1263
- <span class="slider-val" id="custom-top-k-val">50</span>
1264
- </div>
1265
- <div class="slider-row">
1266
- <label>Repetition penalty</label>
1267
- <input type="range" id="custom-repetition-penalty" min="1.0" max="2.0" step="0.05" value="1.1">
1268
- <span class="slider-val" id="custom-repetition-penalty-val">1.1</span>
1269
- </div>
1270
- <div class="slider-row">
1271
- <label>GPU Duration (seconds)</label>
1272
- <input type="range" id="custom-gpu-duration" min="60" max="300" step="30" value="60">
1273
- <span class="slider-val" id="custom-gpu-duration-val">60</span>
1274
- </div>
1275
- </div>
1276
- </div>
1277
- </div>
1278
- </div>
1279
-
1280
- <div class="exp-note">
1281
- Experimental OCR Suite &middot; Open on <a href="https://github.com/PRITHIVSAKTHIUR/Multimodal-OCR3" target="_blank">GitHub</a>
1282
- </div>
1283
-
1284
- <div class="app-statusbar">
1285
- <div class="sb-section" id="sb-image-status">No image uploaded</div>
1286
- <div class="sb-section sb-fixed" id="sb-run-state">Ready</div>
1287
- </div>
1288
- </div>
1289
- """)
1290
-
1291
- run_btn = gr.Button("Run", elem_id="gradio-run-btn")
1292
-
1293
- demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
1294
- demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)
1295
-
1296
- run_btn.click(
1297
- fn=run_ocr,
1298
- inputs=[
1299
- hidden_model_name,
1300
- prompt,
1301
- hidden_image_b64,
1302
- max_new_tokens,
1303
- temperature,
1304
- top_p,
1305
- top_k,
1306
- repetition_penalty,
1307
- gpu_duration_state,
1308
- ],
1309
- outputs=[result],
1310
- js=r"""(m, p, img, mnt, t, tp, tk, rp, gd) => {
1311
- const modelEl = document.querySelector('.model-tab.active');
1312
- const model = modelEl ? modelEl.getAttribute('data-model') : m;
1313
- const promptEl = document.getElementById('custom-query-input');
1314
- const promptVal = promptEl ? promptEl.value : p;
1315
- const imgContainer = document.getElementById('hidden-image-b64');
1316
- let imgVal = img;
1317
- if (imgContainer) {
1318
- const inner = imgContainer.querySelector('textarea, input');
1319
- if (inner) imgVal = inner.value;
1320
- }
1321
- return [model, promptVal, imgVal, mnt, t, tp, tk, rp, gd];
1322
- }""",
1323
  )
1324
 
1325
- example_load_btn.click(
1326
- fn=load_example_data,
1327
- inputs=[example_idx],
1328
- outputs=[example_result],
1329
- queue=False,
1330
  )
1331
 
1332
  if __name__ == "__main__":
1333
- demo.queue(max_size=50).launch(
1334
- css=css,
1335
- mcp_server=True,
1336
- ssr_mode=False,
1337
- show_error=True,
1338
- allowed_paths=["examples"],
1339
- )
 
1
  import os
2
+ import sys
3
+ import random
4
+ import uuid
5
  import json
 
6
  import time
 
7
  from threading import Thread
8
+ from typing import Iterable
9
+ from huggingface_hub import snapshot_download
10
 
11
  import gradio as gr
12
  import spaces
13
  import torch
14
+ import numpy as np
15
  from PIL import Image
16
+ import cv2
17
 
18
  from transformers import (
19
  Qwen2_5_VLForConditionalGeneration,
20
  Qwen3VLForConditionalGeneration,
21
+ AutoModelForImageTextToText,
22
  AutoModelForCausalLM,
23
  AutoProcessor,
24
  TextIteratorStreamer,
25
  )
26
 
27
+ from transformers.image_utils import load_image
28
+ from gradio.themes import Soft
29
+ from gradio.themes.utils import colors, fonts, sizes
30
+
31
# Attach a custom "steel_blue" palette to Gradio's color namespace so the theme
# below can use it as a hue the same way it uses the built-in ones.
colors.steel_blue = colors.Color(
    name="steel_blue",
    c50="#EBF3F8",
    c100="#D3E5F0",
    c200="#A8CCE1",
    c300="#7DB3D2",
    c400="#529AC3",
    c500="#4682B4",
    c600="#3E72A0",
    c700="#36638C",
    c800="#2E5378",
    c900="#264364",
    c950="#1E3450",
)


class SteelBlueTheme(Soft):
    """Variant of Gradio's ``Soft`` theme with a steel-blue secondary hue.

    By default the primary hue is gray, the secondary hue is the custom
    steel-blue palette and the neutral hue is slate. All constructor
    arguments are keyword-only and are forwarded unchanged to ``Soft``;
    a fixed set of styling overrides is then applied on top.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.steel_blue,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )

        # Styling overrides layered on top of Soft. "*name" values are theme
        # variable references, passed through exactly as written.
        overrides = {
            # Page and block backgrounds.
            "background_fill_primary": "*primary_50",
            "background_fill_primary_dark": "*primary_900",
            "body_background_fill": "linear-gradient(135deg, *primary_200, *primary_100)",
            "body_background_fill_dark": "linear-gradient(135deg, *primary_900, *primary_800)",
            # Primary buttons.
            "button_primary_text_color": "white",
            "button_primary_text_color_hover": "white",
            "button_primary_background_fill": "linear-gradient(90deg, *secondary_500, *secondary_600)",
            "button_primary_background_fill_hover": "linear-gradient(90deg, *secondary_600, *secondary_700)",
            "button_primary_background_fill_dark": "linear-gradient(90deg, *secondary_600, *secondary_800)",
            "button_primary_background_fill_hover_dark": "linear-gradient(90deg, *secondary_500, *secondary_500)",
            # Secondary buttons.
            "button_secondary_text_color": "black",
            "button_secondary_text_color_hover": "white",
            "button_secondary_background_fill": "linear-gradient(90deg, *primary_300, *primary_300)",
            "button_secondary_background_fill_hover": "linear-gradient(90deg, *primary_400, *primary_400)",
            "button_secondary_background_fill_dark": "linear-gradient(90deg, *primary_500, *primary_600)",
            "button_secondary_background_fill_hover_dark": "linear-gradient(90deg, *primary_500, *primary_500)",
            # Sliders, blocks and miscellaneous accents.
            "slider_color": "*secondary_500",
            "slider_color_dark": "*secondary_600",
            "block_title_text_weight": "600",
            "block_border_width": "3px",
            "block_shadow": "*shadow_drop_lg",
            "button_primary_shadow": "*shadow_drop_lg",
            "button_large_padding": "11px",
            "color_accent_soft": "*primary_100",
            "block_label_background_fill": "*primary_200",
        }
        super().set(**overrides)


# Module-level theme instance.
steel_blue_theme = SteelBlueTheme()
98
+
99
+ css = """
100
+ #main-title h1 {
101
+ font-size: 2.3em !important;
102
+ }
103
+ #output-title h2 {
104
+ font-size: 2.2em !important;
105
+ }
106
+
107
+ /* RadioAnimated Styles */
108
+ .ra-wrap{ width: fit-content; }
109
+ .ra-inner{
110
+ position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
111
+ background: var(--neutral-200); border-radius: 9999px; overflow: hidden;
112
+ }
113
+ .ra-input{ display: none; }
114
+ .ra-label{
115
+ position: relative; z-index: 2; padding: 8px 16px;
116
+ font-family: inherit; font-size: 14px; font-weight: 600;
117
+ color: var(--neutral-500); cursor: pointer; transition: color 0.2s; white-space: nowrap;
118
+ }
119
+ .ra-highlight{
120
+ position: absolute; z-index: 1; top: 6px; left: 6px;
121
+ height: calc(100% - 12px); border-radius: 9999px;
122
+ background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);
123
+ transition: transform 0.2s, width 0.2s;
124
+ }
125
+ .ra-input:checked + .ra-label{ color: black; }
126
+
127
+ /* Dark mode adjustments for Radio */
128
+ .dark .ra-inner { background: var(--neutral-800); }
129
+ .dark .ra-label { color: var(--neutral-400); }
130
+ .dark .ra-highlight { background: var(--neutral-600); }
131
+ .dark .ra-input:checked + .ra-label { color: white; }
132
+
133
+ #gpu-duration-container {
134
+ padding: 10px;
135
+ border-radius: 8px;
136
+ background: var(--background-fill-secondary);
137
+ border: 1px solid var(--border-color-primary);
138
+ margin-top: 10px;
139
+ }
140
+ """
141
 
142
  MAX_MAX_NEW_TOKENS = 4096
143
  DEFAULT_MAX_NEW_TOKENS = 2048
 
153
  if torch.cuda.is_available():
154
  print("current device:", torch.cuda.current_device())
155
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
156
+
157
  print("Using device:", device)
158
 
159
class RadioAnimated(gr.HTML):
    """Pill-shaped radio selector built on top of ``gr.HTML``.

    Renders one hidden ``<input type="radio">`` plus a ``<label>`` per choice
    inside a ``.ra-wrap`` / ``.ra-inner`` container, together with a
    ``.ra-highlight`` element that the on-load script resizes and translates
    to sit behind the selected option.

    Args:
        choices: Sequence of option values; at least two are required.
        value: Initially selected option. Defaults to the first choice.
        **kwargs: Forwarded unchanged to ``gr.HTML``.

    Raises:
        ValueError: If fewer than two choices are supplied.
    """

    def __init__(self, choices, value=None, **kwargs):
        # Local import so this class does not depend on a file-level import.
        import html

        if not choices or len(choices) < 2:
            raise ValueError("RadioAnimated requires at least 2 choices.")
        if value is None:
            value = choices[0]

        # A short random id keeps radio group names unique per instance.
        uid = uuid.uuid4().hex[:8]
        group_name = f"ra-{uid}"

        def render_option(index, choice):
            # Escape the choice text so characters such as quotes, '<' or '&'
            # cannot break out of the value="..." attribute or the label body.
            # The browser decodes the entities again, so the script still
            # reads the original text from `input.value`.
            safe = html.escape(str(choice), quote=True)
            return f"""
            <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{index}" value="{safe}">
            <label class="ra-label" for="{group_name}-{index}">{safe}</label>
            """

        inputs_html = "\n".join(
            render_option(i, c) for i, c in enumerate(choices)
        )

        html_template = f"""
        <div class="ra-wrap" data-ra="{uid}">
          <div class="ra-inner">
            <div class="ra-highlight"></div>
            {inputs_html}
          </div>
        </div>
        """

        # Script run when the component loads. It relies on `element`, `props`
        # and `trigger` being in scope (presumably supplied by gr.HTML's
        # js_on_load hook -- confirm against the Gradio version in use).
        # An unknown value falls back to the first option (index 0).
        js_on_load = r"""
        (() => {
          const wrap = element.querySelector('.ra-wrap');
          const inner = element.querySelector('.ra-inner');
          const highlight = element.querySelector('.ra-highlight');
          const inputs = Array.from(element.querySelectorAll('.ra-input'));

          if (!inputs.length) return;

          const choices = inputs.map(i => i.value);

          function setHighlightByIndex(idx) {
            const n = choices.length;
            const pct = 100 / n;
            highlight.style.width = `calc(${pct}% - 6px)`;
            highlight.style.transform = `translateX(${idx * 100}%)`;
          }

          function setCheckedByValue(val, shouldTrigger=false) {
            const idx = Math.max(0, choices.indexOf(val));
            inputs.forEach((inp, i) => { inp.checked = (i === idx); });
            setHighlightByIndex(idx);

            props.value = choices[idx];
            if (shouldTrigger) trigger('change', props.value);
          }

          setCheckedByValue(props.value ?? choices[0], false);

          inputs.forEach((inp) => {
            inp.addEventListener('change', () => {
              setCheckedByValue(inp.value, true);
            });
          });
        })();
        """

        super().__init__(
            value=value,
            html_template=html_template,
            js_on_load=js_on_load,
            **kwargs
        )
229
+
230
def apply_gpu_duration(val: str):
    """Convert the given GPU-duration value (a string) to an ``int``.

    Raises whatever ``int()`` raises when ``val`` is not a valid integer
    literal.
    """
    seconds = int(val)
    return seconds
232
 
233
  MODEL_ID_V = "datalab-to/chandra"
234
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 
248
  torch_dtype=torch.bfloat16,
249
  ).to(device).eval()
250
 
251
+ MODEL_PATH_D = "prithivMLmods/Dots.OCR-Latest-BF16" # -> alt of [rednote-hilab/dots.ocr]
252
  processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
253
  model_d = AutoModelForCausalLM.from_pretrained(
254
  MODEL_PATH_D,
 
267
  torch_dtype=torch.float16
268
  ).to(device).eval()
269
 
270
def calc_timeout_image(model_name: str, text: str, image: Image.Image,
                       max_new_tokens: int, temperature: float, top_p: float,
                       top_k: int, repetition_penalty: float, gpu_timeout: int):
    """Calculate GPU timeout duration for image inference.

    Passed as the ``duration`` callable to ``@spaces.GPU`` on
    ``generate_image``, so it takes the same parameters as that function.
    Only ``gpu_timeout`` is used.

    Args:
        gpu_timeout: Requested duration; anything ``int()`` accepts.
        (all other parameters are accepted for signature parity and ignored)

    Returns:
        ``int(gpu_timeout)``, or ``60`` when the value cannot be converted.
    """
    try:
        return int(gpu_timeout)
    # Catch only what int() raises on bad input (None, non-numeric text,
    # infinity) so KeyboardInterrupt / SystemExit are no longer swallowed.
    except (TypeError, ValueError, OverflowError):
        return 60
278
 
 
279
@spaces.GPU(duration=calc_timeout_image)
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int, temperature: float, top_p: float,
                   top_k: int, repetition_penalty: float, gpu_timeout: int = 60):
    """
    Stream an OCR / vision-language response for a single image.

    Args:
        model_name: Display name of the model to run; must be one of
            "olmOCR-2-7B-1025", "Nanonets-OCR2-3B", "Chandra-OCR" or "Dots.OCR".
        text: The user's query, placed after the image in the chat message.
        image: The uploaded image. When it is None a hint message is yielded.
        max_new_tokens: Generation cap forwarded to ``model.generate``.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.
        gpu_timeout: Not read in the body; it is consumed by
            ``calc_timeout_image`` through the ``spaces.GPU`` decorator.

    Yields:
        Pairs ``(raw_text, markdown_text)``. Both items are the same
        accumulated output string, updated after every streamed chunk.
    """
    # Display name -> (processor, model). Matching uses ``==`` like an
    # if/elif chain would, in the same order.
    registry = (
        ("olmOCR-2-7B-1025", processor_m, model_m),
        ("Nanonets-OCR2-3B", processor_x, model_x),
        ("Chandra-OCR", processor_v, model_v),
        ("Dots.OCR", processor_d, model_d),
    )
    selected = next(
        ((proc, mdl) for name, proc, mdl in registry if model_name == name),
        None,
    )
    if selected is None:
        yield "Invalid model selected.", "Invalid model selected."
        return
    processor, model = selected

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    # Single-turn chat: one image placeholder followed by the text query.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": text},
        ],
    }
    messages = [user_turn]
    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    encoded = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
    )
    inputs = encoded.to(device)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    generation_kwargs = {**inputs, "streamer": streamer, **sampling_options}

    # Generation runs in a background thread so this generator can relay
    # chunks from the streamer as they arrive.
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()

    accumulated = ""
    for piece in streamer:
        # Remove the end-of-turn marker if it shows up in the decoded text.
        accumulated = (accumulated + piece).replace("<|im_end|>", "")
        time.sleep(0.01)
        yield accumulated, accumulated
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
# Example rows for the UI: each entry is [query text, image path].
image_examples = [
    list(example)
    for example in (
        ("Convert to Markdown.", "examples/3.jpg"),
        ("Perform OCR on the image. [Markdown]", "examples/1.jpg"),
        ("Extract the contents. [Markdown].", "examples/2.jpg"),
    )
]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  with gr.Blocks() as demo:
349
+ gr.Markdown("# **Multimodal OCR3**", elem_id="main-title")
350
+ with gr.Row():
351
+ with gr.Column(scale=2):
352
+ image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
353
+ image_upload = gr.Image(type="pil", label="Upload Image", height=290)
354
+
355
+ image_submit = gr.Button("Submit", variant="primary")
356
+ gr.Examples(
357
+ examples=image_examples,
358
+ inputs=[image_query, image_upload]
359
+ )
360
+
361
+ with gr.Accordion("Advanced options", open=False):
362
+ max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
363
+ temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
364
+ top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
365
+ top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
366
+ repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
367
+
368
+ with gr.Column(scale=3):
369
+ gr.Markdown("## Output", elem_id="output-title")
370
+ output = gr.Textbox(label="Raw Output Stream", interactive=True, lines=15)
371
+ with gr.Accordion("(Result.md)", open=False):
372
+ markdown_output = gr.Markdown(label="(Result.Md)")
373
+
374
+ model_choice = gr.Radio(
375
+ choices=["Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR", "olmOCR-2-7B-1025"],
376
+ label="Select Model",
377
+ value="Nanonets-OCR2-3B"
378
+ )
379
+
380
+ with gr.Row(elem_id="gpu-duration-container"):
381
+ with gr.Column():
382
+ gr.Markdown("**GPU Duration (seconds)**")
383
+ radioanimated_gpu_duration = RadioAnimated(
384
+ choices=["60", "90", "120", "180", "240", "300"],
385
+ value="60",
386
+ elem_id="radioanimated_gpu_duration"
387
+ )
388
+ gpu_duration_state = gr.Number(value=60, visible=False)
389
+
390
+ gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
391
+
392
+ radioanimated_gpu_duration.change(
393
+ fn=apply_gpu_duration,
394
+ inputs=radioanimated_gpu_duration,
395
+ outputs=[gpu_duration_state],
396
+ api_visibility="private"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  )
398
 
399
+ image_submit.click(
400
+ fn=generate_image,
401
+ inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
402
+ outputs=[output, markdown_output]
 
403
  )
404
 
405
if __name__ == "__main__":
    # Enable the request queue (at most 50 pending jobs), then start the app.
    queued_demo = demo.queue(max_size=50)
    queued_demo.launch(
        css=css,
        theme=steel_blue_theme,
        mcp_server=True,
        ssr_mode=False,
        show_error=True,
    )
 
 
 
 
 
 
examples/4.jpg DELETED

Git LFS Details

  • SHA256: 7bbd962476f020c656479c6aa48189c9b6a03eb7a5c2f9645e7ce62b2b8063c8
  • Pointer size: 131 Bytes
  • Size of remote file: 160 kB
pre-requirements.txt CHANGED
@@ -1 +1 @@
1
- pip>=26.1.1
 
1
+ pip>=23.0.0
requirements.txt CHANGED
@@ -14,5 +14,5 @@ kernels
14
  hf_xet
15
  spaces
16
  pillow
17
- gradio
18
  av
 
14
  hf_xet
15
  spaces
16
  pillow
17
+ gradio # - gradio@6.3.0
18
  av