.gitattributes CHANGED
@@ -38,11 +38,3 @@ images/2.jpg filter=lfs diff=lfs merge=lfs -text
38
  images/3.png filter=lfs diff=lfs merge=lfs -text
39
  videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
40
  videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
41
- images/0.png filter=lfs diff=lfs merge=lfs -text
42
- images/8.png filter=lfs diff=lfs merge=lfs -text
43
- examples/1.jpg filter=lfs diff=lfs merge=lfs -text
44
- examples/2.jpg filter=lfs diff=lfs merge=lfs -text
45
- examples/3.jpg filter=lfs diff=lfs merge=lfs -text
46
- examples/4.jpg filter=lfs diff=lfs merge=lfs -text
47
- examples/5.jpg filter=lfs diff=lfs merge=lfs -text
48
- examples/6.jpg filter=lfs diff=lfs merge=lfs -text
 
38
  images/3.png filter=lfs diff=lfs merge=lfs -text
39
  videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
40
  videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: Multimodal OCR2
3
  emoji: 💻
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
- short_description: FireRed / Nanonets / Monkey / Thyme / Typhoon / SmolDocling
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: OCR2
3
  emoji: 💻
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 5.34.2
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
11
+ short_description: nanonets ocr / typhoon ocr / smoldocling / monkey ocr
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,54 +1,50 @@
1
  import os
2
- import gc
3
- import re
4
- import ast
5
- import json
6
- import base64
7
  import random
8
- from io import BytesIO
 
 
 
9
  from threading import Thread
10
 
11
  import gradio as gr
12
  import spaces
13
  import torch
 
14
  from PIL import Image, ImageOps
 
15
 
16
  from transformers import (
 
17
  Qwen2_5_VLForConditionalGeneration,
18
- Qwen3VLForConditionalGeneration,
19
  AutoModelForVision2Seq,
20
  AutoProcessor,
21
  TextIteratorStreamer,
22
  )
 
23
 
24
  from docling_core.types.doc import DoclingDocument, DocTagsDocument
25
 
 
 
 
26
 
27
- MAX_MAX_NEW_TOKENS = 4096
28
- DEFAULT_MAX_NEW_TOKENS = 2048
 
29
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
30
 
31
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
32
- print("Using device:", device)
33
-
34
- MODEL_ID_F = "FireRedTeam/FireRed-OCR"
35
- processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
36
- model_f = Qwen3VLForConditionalGeneration.from_pretrained(
37
- MODEL_ID_F,
38
- attn_implementation="kernels-community/flash-attn2",
39
- trust_remote_code=True,
40
- torch_dtype=torch.float16
41
- ).to(device).eval()
42
 
 
43
  MODEL_ID_M = "nanonets/Nanonets-OCR-s"
44
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
45
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
46
  MODEL_ID_M,
47
- attn_implementation="kernels-community/flash-attn2",
48
  trust_remote_code=True,
49
  torch_dtype=torch.float16
50
  ).to(device).eval()
51
 
 
52
  MODEL_ID_G = "echo840/MonkeyOCR"
53
  SUBFOLDER = "Recognition"
54
  processor_g = AutoProcessor.from_pretrained(
@@ -58,21 +54,22 @@ processor_g = AutoProcessor.from_pretrained(
58
  )
59
  model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
60
  MODEL_ID_G,
61
- attn_implementation="kernels-community/flash-attn2",
62
  trust_remote_code=True,
63
  subfolder=SUBFOLDER,
64
  torch_dtype=torch.float16
65
  ).to(device).eval()
66
 
 
67
  MODEL_ID_L = "scb10x/typhoon-ocr-7b"
68
  processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
69
  model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
70
  MODEL_ID_L,
71
- attn_implementation="kernels-community/flash-attn2",
72
  trust_remote_code=True,
73
  torch_dtype=torch.float16
74
  ).to(device).eval()
75
 
 
 
76
  MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
77
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
78
  model_x = AutoModelForVision2Seq.from_pretrained(
@@ -80,125 +77,23 @@ model_x = AutoModelForVision2Seq.from_pretrained(
80
  trust_remote_code=True,
81
  torch_dtype=torch.float16
82
  ).to(device).eval()
 
83
 
84
- MODEL_ID_N = "Kwai-Keye/Thyme-RL"
85
- processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
86
- model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
87
- MODEL_ID_N,
88
- attn_implementation="kernels-community/flash-attn2",
89
- trust_remote_code=True,
90
- torch_dtype=torch.float16
91
- ).to(device).eval()
92
-
93
- MODEL_MAP = {
94
- "FireRed-OCR": (processor_f, model_f),
95
- "Nanonets-OCR-s": (processor_m, model_m),
96
- "MonkeyOCR-Recognition": (processor_g, model_g),
97
- "Thyme-RL": (processor_n, model_n),
98
- "Typhoon-OCR-7B": (processor_l, model_l),
99
- "SmolDocling-256M-preview": (processor_x, model_x),
100
- }
101
-
102
- MODEL_CHOICES = list(MODEL_MAP.keys())
103
-
104
- image_examples = [
105
- {"query": "Run OCR on the image and ensure high accuracy.", "image": "examples/4.jpg", "model": "Nanonets-OCR-s"},
106
- {"query": "Conduct OCR on the image with exact text recognition.", "image": "examples/2.jpg", "model": "Typhoon-OCR-7B"},
107
- {"query": "Generate Markdown", "image": "examples/6.jpg", "model": "SmolDocling-256M-preview"},
108
- {"query": "Perform OCR on the image precisely.", "image": "examples/5.jpg", "model": "FireRed-OCR"},
109
- {"query": "Perform precise OCR extraction on the image.", "image": "examples/1.jpg", "model": "Thyme-RL"},
110
- {"query": "Convert this page to docling", "image": "examples/3.jpg", "model": "SmolDocling-256M-preview"},
111
- ]
112
-
113
-
114
- def pil_to_data_url(img: Image.Image, fmt="PNG"):
115
- buf = BytesIO()
116
- img.save(buf, format=fmt)
117
- data = base64.b64encode(buf.getvalue()).decode()
118
- mime = "image/png" if fmt.upper() == "PNG" else "image/jpeg"
119
- return f"data:{mime};base64,{data}"
120
-
121
-
122
- def file_to_data_url(path):
123
- if not os.path.exists(path):
124
- return ""
125
- ext = path.rsplit(".", 1)[-1].lower()
126
- mime = {
127
- "jpg": "image/jpeg",
128
- "jpeg": "image/jpeg",
129
- "png": "image/png",
130
- "webp": "image/webp",
131
- }.get(ext, "image/jpeg")
132
- with open(path, "rb") as f:
133
- data = base64.b64encode(f.read()).decode()
134
- return f"data:{mime};base64,{data}"
135
-
136
-
137
- def make_thumb_b64(path, max_dim=240):
138
- try:
139
- img = Image.open(path).convert("RGB")
140
- img.thumbnail((max_dim, max_dim))
141
- return pil_to_data_url(img, "JPEG")
142
- except Exception as e:
143
- print("Thumbnail error:", e)
144
- return ""
145
-
146
-
147
- def build_example_cards_html():
148
- cards = ""
149
- for i, ex in enumerate(image_examples):
150
- thumb = make_thumb_b64(ex["image"])
151
- prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
152
- cards += f"""
153
- <div class="example-card" data-idx="{i}">
154
- <div class="example-thumb-wrap">
155
- {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
156
- </div>
157
- <div class="example-meta-row">
158
- <span class="example-badge">{ex["model"]}</span>
159
- </div>
160
- <div class="example-prompt-text">{prompt_short}</div>
161
- </div>
162
- """
163
- return cards
164
-
165
-
166
- EXAMPLE_CARDS_HTML = build_example_cards_html()
167
-
168
-
169
- def load_example_data(idx_str):
170
- try:
171
- idx = int(float(idx_str))
172
- except Exception:
173
- return json.dumps({"status": "error", "message": "Invalid example index"})
174
- if idx < 0 or idx >= len(image_examples):
175
- return json.dumps({"status": "error", "message": "Example index out of range"})
176
- ex = image_examples[idx]
177
- img_b64 = file_to_data_url(ex["image"])
178
- if not img_b64:
179
- return json.dumps({"status": "error", "message": "Could not load example image"})
180
- return json.dumps({
181
- "status": "ok",
182
- "query": ex["query"],
183
- "image": img_b64,
184
- "model": ex["model"],
185
- "name": os.path.basename(ex["image"]),
186
- })
187
-
188
-
189
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
 
190
  image = image.convert("RGB")
191
  width, height = image.size
192
  pad_w_percent = random.uniform(min_percent, max_percent)
193
  pad_h_percent = random.uniform(min_percent, max_percent)
194
  pad_w = int(width * pad_w_percent)
195
  pad_h = int(height * pad_h_percent)
196
- corner_pixel = image.getpixel((0, 0))
197
  padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
198
  return padded_image
199
 
200
-
201
  def normalize_values(text, target_max=500):
 
202
  def normalize_list(values):
203
  max_value = max(values) if values else 1
204
  return [round((v / max_value) * target_max) for v in values]
@@ -212,1102 +107,278 @@ def normalize_values(text, target_max=500):
212
  normalized_text = re.sub(pattern, process_match, text)
213
  return normalized_text
214
 
215
-
216
- def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
217
- try:
218
- return int(gpu_timeout)
219
- except Exception:
220
- return 60
221
-
222
-
223
- @spaces.GPU(duration=calc_timeout_image)
224
- def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
225
- buffer = ""
226
- try:
227
- if not model_name or model_name not in MODEL_MAP:
228
- yield "[ERROR] Please select a valid model."
229
- return
230
- if image is None:
231
- yield "[ERROR] Please upload an image."
232
- return
233
- if not text or not str(text).strip():
234
- yield "[ERROR] Please enter your OCR/query instruction."
235
- return
236
- if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
237
- yield "[ERROR] Query is too long. Please shorten your input."
238
- return
239
-
240
- processor, model = MODEL_MAP[model_name]
241
- images = [image]
242
-
243
- if model_name == "SmolDocling-256M-preview":
244
- if "OTSL" in text or "code" in text:
245
- images = [add_random_padding(img) for img in images]
246
- if "OCR at text at" in text or "Identify element" in text or "formula" in text:
247
- text = normalize_values(text, target_max=500)
248
-
249
- messages = [{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  "role": "user",
251
- "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": text}]
252
- }]
253
-
254
- prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
255
- inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
256
-
257
- streamer = TextIteratorStreamer(
258
- processor.tokenizer if hasattr(processor, "tokenizer") else processor,
259
- skip_prompt=True,
260
- skip_special_tokens=True
261
- )
262
-
263
- generation_error = {"error": None}
264
-
265
- generation_kwargs = {
266
- **inputs,
267
- "streamer": streamer,
268
- "max_new_tokens": int(max_new_tokens),
269
- "temperature": float(temperature),
270
- "top_p": float(top_p),
271
- "top_k": int(top_k),
272
- "repetition_penalty": float(repetition_penalty),
273
  }
274
-
275
- def _run_generation():
276
- try:
277
- model.generate(**generation_kwargs)
278
- except Exception as e:
279
- generation_error["error"] = e
280
- try:
281
- streamer.end()
282
- except Exception:
283
- pass
284
-
285
- thread = Thread(target=_run_generation, daemon=True)
286
- thread.start()
287
-
288
- for new_text in streamer:
289
- buffer += new_text.replace("<|im_end|>", "")
290
- yield buffer
291
-
292
- thread.join(timeout=1.0)
293
-
294
- if generation_error["error"] is not None:
295
- err_msg = f"[ERROR] Inference failed: {str(generation_error['error'])}"
296
- if buffer.strip():
297
- yield buffer + "\n\n" + err_msg
298
- else:
299
- yield err_msg
300
- return
301
-
302
- if model_name == "SmolDocling-256M-preview":
303
- cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
304
- if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
305
- try:
306
- if "<chart>" in cleaned_output:
307
- cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
308
- cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
309
- doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
310
- doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
311
- markdown_output = doc.export_to_markdown()
312
- yield markdown_output
313
- except Exception as e:
314
- yield f"[ERROR] Post-processing failed: {str(e)}"
315
- return
316
- else:
317
- if cleaned_output.strip():
318
- yield cleaned_output
319
- else:
320
- yield "[ERROR] No output was generated."
321
- else:
322
- if not buffer.strip():
323
- yield "[ERROR] No output was generated."
324
-
325
- except Exception as e:
326
- yield f"[ERROR] {str(e)}"
327
- finally:
328
- gc.collect()
329
- if torch.cuda.is_available():
330
- torch.cuda.empty_cache()
331
-
332
-
333
- def noop():
334
- return None
335
-
336
-
337
- css = r"""
338
- @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
339
- *{box-sizing:border-box;margin:0;padding:0}
340
- html,body{height:100%;overflow-x:hidden}
341
- body,.gradio-container{
342
- background:#0f0f13!important;
343
- font-family:'Inter',system-ui,-apple-system,sans-serif!important;
344
- font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
345
- }
346
- .dark body,.dark .gradio-container{background:#0f0f13!important;color:#e4e4e7!important}
347
- footer{display:none!important}
348
- .hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}
349
-
350
- #gradio-run-btn,#example-load-btn{
351
- position:absolute!important;left:-9999px!important;top:-9999px!important;
352
- width:1px!important;height:1px!important;opacity:0.01!important;
353
- pointer-events:none!important;overflow:hidden!important;
354
- }
355
-
356
- .app-shell{
357
- background:#18181b;border:1px solid #27272a;border-radius:16px;
358
- margin:12px auto;max-width:1400px;overflow:hidden;
359
- box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
360
- }
361
- .app-header{
362
- background:linear-gradient(135deg,#18181b,#1e1e24);border-bottom:1px solid #27272a;
363
- padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
364
- }
365
- .app-header-left{display:flex;align-items:center;gap:12px}
366
- .app-logo{
367
- width:38px;height:38px;background:linear-gradient(135deg,#FF4500,#FF6A33,#FF8C66);
368
- border-radius:10px;display:flex;align-items:center;justify-content:center;
369
- box-shadow:0 4px 12px rgba(255,69,0,.35);
370
- }
371
- .app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0}
372
- .app-title{
373
- font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#bdbdbd);
374
- -webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
375
- }
376
- .app-badge{
377
- font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
378
- background:rgba(255,69,0,.12);color:#FF8A66;border:1px solid rgba(255,69,0,.25);letter-spacing:.3px;
379
- }
380
- .app-badge.fast{background:rgba(255,99,71,.10);color:#FF7043;border:1px solid rgba(255,99,71,.22)}
381
-
382
- .model-tabs-bar{
383
- background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px;
384
- display:flex;gap:8px;align-items:center;flex-wrap:wrap;
385
- }
386
- .model-tab{
387
- display:inline-flex;align-items:center;justify-content:center;gap:6px;
388
- min-width:32px;height:34px;background:transparent;border:1px solid #27272a;
389
- border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
390
- color:#ffffff!important;transition:all .15s ease;
391
- }
392
- .model-tab:hover{background:rgba(255,69,0,.12);border-color:rgba(255,69,0,.35)}
393
- .model-tab.active{background:rgba(255,69,0,.22);border-color:#FF4500;color:#fff!important;box-shadow:0 0 0 2px rgba(255,69,0,.10)}
394
- .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
395
-
396
- .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
397
- .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
398
- .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
399
-
400
- #image-drop-zone{
401
- position:relative;background:#09090b;height:440px;min-height:440px;max-height:440px;
402
- overflow:hidden;
403
- }
404
- #image-drop-zone.drag-over{outline:2px solid #FF4500;outline-offset:-2px;background:rgba(255,69,0,.04)}
405
- .upload-prompt-modern{
406
- position:absolute;inset:0;display:flex;align-items:center;justify-content:center;
407
- padding:20px;z-index:20;overflow:hidden;
408
- }
409
- .upload-click-area{
410
- display:flex;flex-direction:column;align-items:center;justify-content:center;
411
- cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
412
- border:2px dashed #3f3f46;border-radius:16px;
413
- background:rgba(255,69,0,.03);transition:all .2s ease;gap:8px;text-align:center;
414
- overflow:hidden;
415
- }
416
- .upload-click-area:hover{background:rgba(255,69,0,.08);border-color:#FF4500;transform:scale(1.02)}
417
- .upload-click-area:active{background:rgba(255,69,0,.12);transform:scale(.99)}
418
- .upload-click-area svg{width:86px;height:86px;max-width:100%;flex-shrink:0}
419
- .upload-main-text{color:#a1a1aa;font-size:14px;font-weight:600;margin-top:4px}
420
- .upload-sub-text{color:#71717a;font-size:12px}
421
-
422
- .single-preview-wrap{
423
- width:100%;height:100%;display:none;align-items:center;justify-content:center;padding:16px;
424
- overflow:hidden;
425
- }
426
- .single-preview-card{
427
- width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
428
- overflow:hidden;border:1px solid #27272a;background:#111114;
429
- display:flex;align-items:center;justify-content:center;position:relative;
430
- }
431
- .single-preview-card img{
432
- width:100%;height:100%;max-width:100%;max-height:100%;
433
- object-fit:contain;display:block;
434
- }
435
- .preview-overlay-actions{
436
- position:absolute;top:12px;right:12px;display:flex;gap:8px;z-index:5;
437
- }
438
- .preview-action-btn{
439
- display:inline-flex;align-items:center;justify-content:center;
440
- min-width:34px;height:34px;padding:0 12px;background:rgba(0,0,0,.65);
441
- border:1px solid rgba(255,255,255,.14);border-radius:10px;cursor:pointer;
442
- color:#fff!important;font-size:12px;font-weight:600;transition:all .15s ease;
443
- }
444
- .preview-action-btn:hover{background:#FF4500;border-color:#FF4500}
445
-
446
- .hint-bar{
447
- background:rgba(255,69,0,.06);border-top:1px solid #27272a;border-bottom:1px solid #27272a;
448
- padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
449
- }
450
- .hint-bar b{color:#FF8A66;font-weight:600}
451
- .hint-bar kbd{
452
- display:inline-block;padding:1px 6px;background:#27272a;border:1px solid #3f3f46;
453
- border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
454
- }
455
-
456
- .examples-section{border-top:1px solid #27272a;padding:12px 16px}
457
- .examples-title{
458
- font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
459
- letter-spacing:.8px;margin-bottom:10px;
460
- }
461
- .examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}
462
- .examples-scroll::-webkit-scrollbar{height:6px}
463
- .examples-scroll::-webkit-scrollbar-track{background:#09090b;border-radius:3px}
464
- .examples-scroll::-webkit-scrollbar-thumb{background:#27272a;border-radius:3px}
465
- .examples-scroll::-webkit-scrollbar-thumb:hover{background:#3f3f46}
466
- .example-card{
467
- flex-shrink:0;width:220px;background:#09090b;border:1px solid #27272a;
468
- border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
469
- }
470
- .example-card:hover{border-color:#FF4500;transform:translateY(-2px);box-shadow:0 4px 12px rgba(255,69,0,.15)}
471
- .example-card.loading{opacity:.5;pointer-events:none}
472
- .example-thumb-wrap{height:120px;overflow:hidden;background:#18181b}
473
- .example-thumb-wrap img{width:100%;height:100%;object-fit:cover}
474
- .example-thumb-placeholder{
475
- width:100%;height:100%;display:flex;align-items:center;justify-content:center;
476
- background:#18181b;color:#3f3f46;font-size:11px;
477
- }
478
- .example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px}
479
- .example-badge{
480
- display:inline-flex;padding:2px 7px;background:rgba(255,69,0,.12);border-radius:4px;
481
- font-size:10px;font-weight:600;color:#FF8A66;font-family:'JetBrains Mono',monospace;white-space:nowrap;
482
- }
483
- .example-prompt-text{
484
- padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
485
- display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
486
- }
487
-
488
- .panel-card{border-bottom:1px solid #27272a}
489
- .panel-card-title{
490
- padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
491
- text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
492
- }
493
- .panel-card-body{padding:16px 20px;display:flex;flex-direction:column;gap:8px}
494
- .modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}
495
- .modern-textarea{
496
- width:100%;background:#09090b;border:1px solid #27272a;border-radius:8px;
497
- padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
498
- resize:none;outline:none;min-height:100px;transition:border-color .2s;
499
- }
500
- .modern-textarea:focus{border-color:#FF4500;box-shadow:0 0 0 3px rgba(255,69,0,.15)}
501
- .modern-textarea::placeholder{color:#3f3f46}
502
- .modern-textarea.error-flash{
503
- border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
504
- }
505
- @keyframes shake{0%,100%{transform:translateX(0)}20%,60%{transform:translateX(-4px)}40%,80%{transform:translateX(4px)}}
506
-
507
- .toast-notification{
508
- position:fixed;top:24px;left:50%;transform:translateX(-50%) translateY(-120%);
509
- z-index:9999;padding:10px 24px;border-radius:10px;font-family:'Inter',sans-serif;
510
- font-size:14px;font-weight:600;display:flex;align-items:center;gap:8px;
511
- box-shadow:0 8px 24px rgba(0,0,0,.5);
512
- transition:transform .35s cubic-bezier(.34,1.56,.64,1),opacity .35s ease;opacity:0;pointer-events:none;
513
- }
514
- .toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}
515
- .toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}
516
- .toast-notification.warning{background:linear-gradient(135deg,#d97706,#b45309);color:#fff;border:1px solid rgba(255,255,255,.15)}
517
- .toast-notification.info{background:linear-gradient(135deg,#ea580c,#c2410c);color:#fff;border:1px solid rgba(255,255,255,.15)}
518
- .toast-notification .toast-icon{font-size:16px;line-height:1}
519
- .toast-notification .toast-text{line-height:1.3}
520
-
521
- .btn-run{
522
- display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
523
- background:linear-gradient(135deg,#FF4500,#E03E00);border:none;border-radius:10px;
524
- padding:12px 24px;cursor:pointer;font-size:15px;font-weight:600;font-family:'Inter',sans-serif;
525
- color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
526
- transition:all .2s ease;letter-spacing:-.2px;
527
- box-shadow:0 4px 16px rgba(255,69,0,.3),inset 0 1px 0 rgba(255,255,255,.1);
528
- }
529
- .btn-run:hover{
530
- background:linear-gradient(135deg,#FF6A33,#FF4500);transform:translateY(-1px);
531
- box-shadow:0 6px 24px rgba(255,69,0,.45),inset 0 1px 0 rgba(255,255,255,.15);
532
- }
533
- .btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(255,69,0,.3)}
534
- #custom-run-btn,#custom-run-btn *,#run-btn-label,.btn-run,.btn-run *{
535
- color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
536
- }
537
- body:not(.dark) .btn-run,body:not(.dark) .btn-run *,
538
- .dark .btn-run,.dark .btn-run *,
539
- .gradio-container .btn-run,.gradio-container .btn-run *,
540
- .gradio-container #custom-run-btn,.gradio-container #custom-run-btn *{
541
- color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
542
- }
543
-
544
- .output-frame{border-bottom:1px solid #27272a;display:flex;flex-direction:column;position:relative}
545
- .output-frame .out-title,
546
- .output-frame .out-title *,
547
- #output-title-label{
548
- color:#ffffff!important;
549
- -webkit-text-fill-color:#ffffff!important;
550
- }
551
- .output-frame .out-title{
552
- padding:10px 20px;font-size:13px;font-weight:700;
553
- text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
554
- display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
555
- }
556
- .out-title-right{display:flex;gap:8px;align-items:center}
557
- .out-action-btn{
558
- display:inline-flex;align-items:center;justify-content:center;background:rgba(255,69,0,.1);
559
- border:1px solid rgba(255,69,0,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
560
- font-size:11px;font-weight:500;color:#FF8A66!important;gap:4px;height:24px;transition:all .15s;
561
- }
562
- .out-action-btn:hover{background:rgba(255,69,0,.2);border-color:rgba(255,69,0,.35);color:#ffffff!important}
563
- .out-action-btn svg{width:12px;height:12px;fill:#FF8A66}
564
- .output-frame .out-body{
565
- flex:1;background:#09090b;display:flex;align-items:stretch;justify-content:stretch;
566
- overflow:hidden;min-height:320px;position:relative;
567
- }
568
- .output-scroll-wrap{
569
- width:100%;height:100%;padding:0;overflow:hidden;
570
- }
571
- .output-textarea{
572
- width:100%;height:320px;min-height:320px;max-height:320px;background:#09090b;color:#e4e4e7;
573
- border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
574
- font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
575
- }
576
- .output-textarea::placeholder{color:#52525b}
577
- .output-textarea.error-flash{
578
- box-shadow:inset 0 0 0 2px rgba(239,68,68,.6);
579
- }
580
- .modern-loader{
581
- display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(9,9,11,.92);
582
- z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
583
- }
584
- .modern-loader.active{display:flex}
585
- .modern-loader .loader-spinner{
586
- width:36px;height:36px;border:3px solid #27272a;border-top-color:#FF4500;
587
- border-radius:50%;animation:spin .8s linear infinite;
588
- }
589
- @keyframes spin{to{transform:rotate(360deg)}}
590
- .modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500}
591
- .loader-bar-track{width:200px;height:4px;background:#27272a;border-radius:2px;overflow:hidden}
592
- .loader-bar-fill{
593
- height:100%;background:linear-gradient(90deg,#FF4500,#FF7A4D,#FF4500);
594
- background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
595
- }
596
- @keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}}
597
-
598
- .settings-group{border:1px solid #27272a;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}
599
- .settings-group-title{
600
- font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
601
- padding:10px 16px;border-bottom:1px solid #27272a;background:rgba(24,24,27,.5);
602
- }
603
- .settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px}
604
- .slider-row{display:flex;align-items:center;gap:10px;min-height:28px}
605
- .slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}
606
- .slider-row input[type="range"]{
607
- flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#27272a;
608
- border-radius:3px;outline:none;min-width:0;
609
- }
610
- .slider-row input[type="range"]::-webkit-slider-thumb{
611
- -webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#FF4500,#E03E00);
612
- border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(255,69,0,.4);transition:transform .15s;
613
- }
614
- .slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)}
615
- .slider-row input[type="range"]::-moz-range-thumb{
616
- width:16px;height:16px;background:linear-gradient(135deg,#FF4500,#E03E00);
617
- border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(255,69,0,.4);
618
- }
619
- .slider-row .slider-val{
620
- min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
621
- font-weight:500;padding:3px 8px;background:#09090b;border:1px solid #27272a;
622
- border-radius:6px;color:#a1a1aa;flex-shrink:0;
623
- }
624
-
625
- .app-statusbar{
626
- background:#18181b;border-top:1px solid #27272a;padding:6px 20px;
627
- display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
628
- }
629
- .app-statusbar .sb-section{
630
- padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
631
- font-size:12px;color:#52525b;overflow:hidden;white-space:nowrap;
632
- }
633
- .app-statusbar .sb-section.sb-fixed{
634
- flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
635
- padding:3px 12px;background:rgba(255,69,0,.08);border-radius:6px;color:#FF8A66;font-weight:500;
636
- }
637
-
638
- .exp-note{padding:10px 20px;font-size:12px;color:#52525b;border-top:1px solid #27272a;text-align:center}
639
- .exp-note a{color:#FF8A66;text-decoration:none}
640
- .exp-note a:hover{text-decoration:underline}
641
-
642
- ::-webkit-scrollbar{width:8px;height:8px}
643
- ::-webkit-scrollbar-track{background:#09090b}
644
- ::-webkit-scrollbar-thumb{background:#27272a;border-radius:4px}
645
- ::-webkit-scrollbar-thumb:hover{background:#3f3f46}
646
-
647
- @media(max-width:980px){
648
- .app-main-row{flex-direction:column}
649
- .app-main-right{width:100%}
650
- .app-main-left{border-right:none;border-bottom:1px solid #27272a}
651
- }
652
- """
653
-
654
- gallery_js = r"""
655
- () => {
656
- function init() {
657
- if (window.__ocr2InitDone) return;
658
-
659
- const dropZone = document.getElementById('image-drop-zone');
660
- const uploadPrompt = document.getElementById('upload-prompt');
661
- const uploadClick = document.getElementById('upload-click-area');
662
- const fileInput = document.getElementById('custom-file-input');
663
- const previewWrap = document.getElementById('single-preview-wrap');
664
- const previewImg = document.getElementById('single-preview-img');
665
- const btnUpload = document.getElementById('preview-upload-btn');
666
- const btnClear = document.getElementById('preview-clear-btn');
667
- const promptInput = document.getElementById('custom-query-input');
668
- const runBtnEl = document.getElementById('custom-run-btn');
669
- const outputArea = document.getElementById('custom-output-textarea');
670
- const imgStatus = document.getElementById('sb-image-status');
671
- const exampleResultContainer = document.getElementById('example-result-data');
672
-
673
- if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg) {
674
- setTimeout(init, 250);
675
- return;
676
  }
 
 
677
 
678
- window.__ocr2InitDone = true;
679
- let imageState = null;
680
- let toastTimer = null;
681
-
682
- function showToast(message, type) {
683
- let toast = document.getElementById('app-toast');
684
- if (!toast) {
685
- toast = document.createElement('div');
686
- toast.id = 'app-toast';
687
- toast.className = 'toast-notification';
688
- toast.innerHTML = '<span class="toast-icon"></span><span class="toast-text"></span>';
689
- document.body.appendChild(toast);
690
- }
691
- const icon = toast.querySelector('.toast-icon');
692
- const text = toast.querySelector('.toast-text');
693
- toast.className = 'toast-notification ' + (type || 'error');
694
- if (type === 'warning') icon.textContent = '\u26A0';
695
- else if (type === 'info') icon.textContent = '\u2139';
696
- else icon.textContent = '\u2717';
697
- text.textContent = message;
698
- if (toastTimer) clearTimeout(toastTimer);
699
- void toast.offsetWidth;
700
- toast.classList.add('visible');
701
- toastTimer = setTimeout(() => toast.classList.remove('visible'), 3500);
702
- }
703
- window.__showToast = showToast;
704
-
705
- function showLoader() {
706
- const l = document.getElementById('output-loader');
707
- if (l) l.classList.add('active');
708
- const sb = document.getElementById('sb-run-state');
709
- if (sb) sb.textContent = 'Processing...';
710
- }
711
- function hideLoader() {
712
- const l = document.getElementById('output-loader');
713
- if (l) l.classList.remove('active');
714
- const sb = document.getElementById('sb-run-state');
715
- if (sb) sb.textContent = 'Done';
716
- }
717
- function setRunErrorState() {
718
- const l = document.getElementById('output-loader');
719
- if (l) l.classList.remove('active');
720
- const sb = document.getElementById('sb-run-state');
721
- if (sb) sb.textContent = 'Error';
722
- }
723
-
724
- window.__showLoader = showLoader;
725
- window.__hideLoader = hideLoader;
726
- window.__setRunErrorState = setRunErrorState;
727
-
728
- function flashPromptError() {
729
- promptInput.classList.add('error-flash');
730
- promptInput.focus();
731
- setTimeout(() => promptInput.classList.remove('error-flash'), 800);
732
- }
733
-
734
- function flashOutputError() {
735
- if (!outputArea) return;
736
- outputArea.classList.add('error-flash');
737
- setTimeout(() => outputArea.classList.remove('error-flash'), 800);
738
- }
739
-
740
- function setGradioValue(containerId, value) {
741
- const container = document.getElementById(containerId);
742
- if (!container) return;
743
- container.querySelectorAll('input, textarea').forEach(el => {
744
- if (el.type === 'file' || el.type === 'range' || el.type === 'checkbox') return;
745
- const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
746
- const ns = Object.getOwnPropertyDescriptor(proto, 'value');
747
- if (ns && ns.set) {
748
- ns.set.call(el, value);
749
- el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
750
- el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
751
- }
752
- });
753
- }
754
-
755
- function syncImageToGradio() {
756
- setGradioValue('hidden-image-b64', imageState ? imageState.b64 : '');
757
- const txt = imageState ? '1 image uploaded' : 'No image uploaded';
758
- if (imgStatus) imgStatus.textContent = txt;
759
- }
760
-
761
- function syncPromptToGradio() {
762
- setGradioValue('prompt-gradio-input', promptInput.value);
763
- }
764
-
765
- function syncModelToGradio(name) {
766
- setGradioValue('hidden-model-name', name);
767
- }
768
-
769
- function setPreview(b64, name) {
770
- imageState = {b64, name: name || 'image'};
771
- previewImg.src = b64;
772
- previewWrap.style.display = 'flex';
773
- if (uploadPrompt) uploadPrompt.style.display = 'none';
774
- syncImageToGradio();
775
- }
776
- window.__setPreview = setPreview;
777
-
778
- function clearPreview() {
779
- imageState = null;
780
- previewImg.src = '';
781
- previewWrap.style.display = 'none';
782
- if (uploadPrompt) uploadPrompt.style.display = 'flex';
783
- syncImageToGradio();
784
- }
785
- window.__clearPreview = clearPreview;
786
-
787
- function processFile(file) {
788
- if (!file) return;
789
- if (!file.type.startsWith('image/')) {
790
- showToast('Only image files are supported', 'error');
791
- return;
792
- }
793
- const reader = new FileReader();
794
- reader.onload = (e) => setPreview(e.target.result, file.name);
795
- reader.readAsDataURL(file);
796
- }
797
-
798
- fileInput.addEventListener('change', (e) => {
799
- const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
800
- if (file) processFile(file);
801
- e.target.value = '';
802
- });
803
-
804
- if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
805
- if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
806
- if (btnClear) btnClear.addEventListener('click', clearPreview);
807
-
808
- dropZone.addEventListener('dragover', (e) => {
809
- e.preventDefault();
810
- dropZone.classList.add('drag-over');
811
- });
812
- dropZone.addEventListener('dragleave', (e) => {
813
- e.preventDefault();
814
- dropZone.classList.remove('drag-over');
815
- });
816
- dropZone.addEventListener('drop', (e) => {
817
- e.preventDefault();
818
- dropZone.classList.remove('drag-over');
819
- if (e.dataTransfer.files && e.dataTransfer.files.length) processFile(e.dataTransfer.files[0]);
820
- });
821
-
822
- promptInput.addEventListener('input', syncPromptToGradio);
823
-
824
- function activateModelTab(name) {
825
- document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
826
- btn.classList.toggle('active', btn.getAttribute('data-model') === name);
827
- });
828
- syncModelToGradio(name);
829
- }
830
- window.__activateModelTab = activateModelTab;
831
-
832
- document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
833
- btn.addEventListener('click', () => {
834
- const model = btn.getAttribute('data-model');
835
- activateModelTab(model);
836
- });
837
- });
838
-
839
- activateModelTab('FireRed-OCR');
840
-
841
- function syncSlider(customId, gradioId) {
842
- const slider = document.getElementById(customId);
843
- const valSpan = document.getElementById(customId + '-val');
844
- if (!slider) return;
845
- slider.addEventListener('input', () => {
846
- if (valSpan) valSpan.textContent = slider.value;
847
- const container = document.getElementById(gradioId);
848
- if (!container) return;
849
- container.querySelectorAll('input[type="range"],input[type="number"]').forEach(el => {
850
- const ns = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
851
- if (ns && ns.set) {
852
- ns.set.call(el, slider.value);
853
- el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
854
- el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
855
- }
856
- });
857
- });
858
- }
859
-
860
- syncSlider('custom-max-new-tokens', 'gradio-max-new-tokens');
861
- syncSlider('custom-temperature', 'gradio-temperature');
862
- syncSlider('custom-top-p', 'gradio-top-p');
863
- syncSlider('custom-top-k', 'gradio-top-k');
864
- syncSlider('custom-repetition-penalty', 'gradio-repetition-penalty');
865
- syncSlider('custom-gpu-duration', 'gradio-gpu-duration');
866
-
867
- function validateBeforeRun() {
868
- const promptVal = promptInput.value.trim();
869
- if (!imageState && !promptVal) {
870
- showToast('Please upload an image and enter your OCR instruction', 'error');
871
- flashPromptError();
872
- return false;
873
- }
874
- if (!imageState) {
875
- showToast('Please upload an image', 'error');
876
- return false;
877
- }
878
- if (!promptVal) {
879
- showToast('Please enter your OCR/query instruction', 'warning');
880
- flashPromptError();
881
- return false;
882
- }
883
- const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
884
- if (!currentModel) {
885
- showToast('Please select a model', 'error');
886
- return false;
887
  }
888
- return true;
889
- }
890
-
891
- window.__clickGradioRunBtn = function() {
892
- if (!validateBeforeRun()) return;
893
- syncPromptToGradio();
894
- syncImageToGradio();
895
- const active = document.querySelector('.model-tab.active');
896
- if (active) syncModelToGradio(active.getAttribute('data-model'));
897
- if (outputArea) outputArea.value = '';
898
- showLoader();
899
- setTimeout(() => {
900
- const gradioBtn = document.getElementById('gradio-run-btn');
901
- if (!gradioBtn) {
902
- setRunErrorState();
903
- if (outputArea) outputArea.value = '[ERROR] Run button not found.';
904
- showToast('Run button not found', 'error');
905
- return;
906
- }
907
- const btn = gradioBtn.querySelector('button');
908
- if (btn) btn.click(); else gradioBtn.click();
909
- }, 180);
910
- };
911
-
912
- if (runBtnEl) runBtnEl.addEventListener('click', () => window.__clickGradioRunBtn());
913
-
914
- const copyBtn = document.getElementById('copy-output-btn');
915
- if (copyBtn) {
916
- copyBtn.addEventListener('click', async () => {
917
- try {
918
- const text = outputArea ? outputArea.value : '';
919
- if (!text.trim()) {
920
- showToast('No output to copy', 'warning');
921
- flashOutputError();
922
- return;
923
- }
924
- await navigator.clipboard.writeText(text);
925
- showToast('Output copied to clipboard', 'info');
926
- } catch(e) {
927
- showToast('Copy failed', 'error');
928
- }
929
- });
930
- }
931
-
932
- const saveBtn = document.getElementById('save-output-btn');
933
- if (saveBtn) {
934
- saveBtn.addEventListener('click', () => {
935
- const text = outputArea ? outputArea.value : '';
936
- if (!text.trim()) {
937
- showToast('No output to save', 'warning');
938
- flashOutputError();
939
- return;
940
- }
941
- const blob = new Blob([text], {type: 'text/plain;charset=utf-8'});
942
- const a = document.createElement('a');
943
- a.href = URL.createObjectURL(blob);
944
- a.download = 'multimodal_ocr2_output.txt';
945
- document.body.appendChild(a);
946
- a.click();
947
- setTimeout(() => {
948
- URL.revokeObjectURL(a.href);
949
- document.body.removeChild(a);
950
- }, 200);
951
- showToast('Output saved', 'info');
952
- });
953
- }
954
-
955
- document.querySelectorAll('.example-card[data-idx]').forEach(card => {
956
- card.addEventListener('click', () => {
957
- const idx = card.getAttribute('data-idx');
958
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
959
- card.classList.add('loading');
960
- showToast('Loading example...', 'info');
961
- setGradioValue('example-result-data', '');
962
- setGradioValue('example-idx-input', idx);
963
- setTimeout(() => {
964
- const btn = document.getElementById('example-load-btn');
965
- if (btn) {
966
- const b = btn.querySelector('button');
967
- if (b) b.click(); else btn.click();
968
- }
969
- }, 150);
970
- setTimeout(() => card.classList.remove('loading'), 12000);
971
- });
972
- });
973
-
974
- function checkExampleResult() {
975
- if (!exampleResultContainer) return;
976
- const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
977
- if (!el || !el.value) return;
978
- if (window.__lastExampleVal === el.value) return;
979
- try {
980
- const data = JSON.parse(el.value);
981
- if (data.status === 'ok') {
982
- window.__lastExampleVal = el.value;
983
- if (data.image) setPreview(data.image, data.name || 'example.jpg');
984
- if (data.query) {
985
- promptInput.value = data.query;
986
- syncPromptToGradio();
987
- }
988
- if (data.model) activateModelTab(data.model);
989
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
990
- showToast('Example loaded', 'info');
991
- } else if (data.status === 'error') {
992
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
993
- showToast(data.message || 'Failed to load example', 'error');
994
- }
995
- } catch(e) {}
996
- }
997
-
998
- const obsExample = new MutationObserver(checkExampleResult);
999
- if (exampleResultContainer) {
1000
- obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
1001
  }
1002
- setInterval(checkExampleResult, 500);
 
1003
 
1004
- if (outputArea) outputArea.value = '';
1005
- const sb = document.getElementById('sb-run-state');
1006
- if (sb) sb.textContent = 'Ready';
1007
- if (imgStatus) imgStatus.textContent = 'No image uploaded';
1008
- }
1009
- init();
1010
- }
1011
- """
1012
-
1013
- wire_outputs_js = r"""
1014
- () => {
1015
- function watchOutputs() {
1016
- const resultContainer = document.getElementById('gradio-result');
1017
- const outArea = document.getElementById('custom-output-textarea');
1018
- if (!resultContainer || !outArea) { setTimeout(watchOutputs, 500); return; }
1019
-
1020
- let lastText = '';
1021
-
1022
- function isErrorText(val) {
1023
- return typeof val === 'string' && val.trim().startsWith('[ERROR]');
1024
- }
1025
 
1026
- function syncOutput() {
1027
- const el = resultContainer.querySelector('textarea') || resultContainer.querySelector('input');
1028
- if (!el) return;
1029
- const val = el.value || '';
1030
- if (val !== lastText) {
1031
- lastText = val;
1032
- outArea.value = val;
1033
- outArea.scrollTop = outArea.scrollHeight;
 
 
1034
 
1035
- if (val.trim()) {
1036
- if (isErrorText(val)) {
1037
- if (window.__setRunErrorState) window.__setRunErrorState();
1038
- if (window.__showToast) window.__showToast('OCR failed', 'error');
1039
- } else {
1040
- if (window.__hideLoader) window.__hideLoader();
1041
- }
1042
- }
1043
- }
1044
- }
1045
 
1046
- const observer = new MutationObserver(syncOutput);
1047
- observer.observe(resultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
1048
- setInterval(syncOutput, 500);
 
1049
  }
1050
- watchOutputs();
 
1051
  }
1052
  """
1053
 
1054
- OCR_LOGO_SVG = """
1055
- <svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
1056
- <path d="M4 5.5A2.5 2.5 0 0 1 6.5 3H11v2H6.5a.5.5 0 0 0-.5.5V10H4V5.5Z" fill="white"/>
1057
- <path d="M20 10h-2V5.5a.5.5 0 0 0-.5-.5H13V3h4.5A2.5 2.5 0 0 1 20 5.5V10Z" fill="white"/>
1058
- <path d="M4 14h2v4.5a.5.5 0 0 0 .5.5H11v2H6.5A2.5 2.5 0 0 1 4 18.5V14Z" fill="white"/>
1059
- <path d="M20 14v4.5A2.5 2.5 0 0 1 17.5 21H13v-2h4.5a.5.5 0 0 0 .5-.5V14h2Z" fill="white"/>
1060
- <path d="M8 8h8v2H8V8Zm0 3h8v2H8v-2Zm0 3h5v2H8v-2Z" fill="white"/>
1061
- </svg>
1062
- """
1063
-
1064
- UPLOAD_PREVIEW_SVG = """
1065
- <svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
1066
- <rect x="8" y="14" width="64" height="52" rx="6" fill="none" stroke="#FF4500" stroke-width="2" stroke-dasharray="4 3"/>
1067
- <polygon points="12,62 30,40 42,50 54,34 68,62" fill="rgba(255,69,0,0.15)" stroke="#FF4500" stroke-width="1.5"/>
1068
- <circle cx="28" cy="30" r="6" fill="rgba(255,69,0,0.2)" stroke="#FF4500" stroke-width="1.5"/>
1069
- </svg>
1070
- """
1071
-
1072
- COPY_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16 1H4C2.9 1 2 1.9 2 3v12h2V3h12V1zm3 4H8C6.9 5 6 5.9 6 7v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"/></svg>"""
1073
- SAVE_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V7l-4-4zM7 5h8v4H7V5zm12 14H5v-6h14v6z"/></svg>"""
1074
-
1075
- MODEL_TABS_HTML = "".join([
1076
- f'<button class="model-tab{" active" if m == "FireRed-OCR" else ""}" data-model="{m}"><span class="model-tab-label">{m}</span></button>'
1077
- for m in MODEL_CHOICES
1078
- ])
1079
-
1080
- with gr.Blocks() as demo:
1081
- hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
1082
- prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1083
- hidden_model_name = gr.Textbox(value="FireRed-OCR", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1084
-
1085
- max_new_tokens = gr.Slider(minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, elem_id="gradio-max-new-tokens", elem_classes="hidden-input", container=False)
1086
- temperature = gr.Slider(minimum=0.1, maximum=4.0, step=0.1, value=0.6, elem_id="gradio-temperature", elem_classes="hidden-input", container=False)
1087
- top_p = gr.Slider(minimum=0.05, maximum=1.0, step=0.05, value=0.9, elem_id="gradio-top-p", elem_classes="hidden-input", container=False)
1088
- top_k = gr.Slider(minimum=1, maximum=1000, step=1, value=50, elem_id="gradio-top-k", elem_classes="hidden-input", container=False)
1089
- repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.2, elem_id="gradio-repetition-penalty", elem_classes="hidden-input", container=False)
1090
- gpu_duration_state = gr.Number(value=60, elem_id="gradio-gpu-duration", elem_classes="hidden-input", container=False)
1091
-
1092
- result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)
1093
-
1094
- example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
1095
- example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
1096
- example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")
1097
-
1098
- gr.HTML(f"""
1099
- <div class="app-shell">
1100
- <div class="app-header">
1101
- <div class="app-header-left">
1102
- <div class="app-logo">{OCR_LOGO_SVG}</div>
1103
- <span class="app-title">Multimodal OCR 2</span>
1104
- <span class="app-badge">vision enabled</span>
1105
- <span class="app-badge fast">OCR Suite</span>
1106
- </div>
1107
- </div>
1108
-
1109
- <div class="model-tabs-bar">
1110
- {MODEL_TABS_HTML}
1111
- </div>
1112
-
1113
- <div class="app-main-row">
1114
- <div class="app-main-left">
1115
- <div id="image-drop-zone">
1116
- <div id="upload-prompt" class="upload-prompt-modern">
1117
- <div id="upload-click-area" class="upload-click-area">
1118
- {UPLOAD_PREVIEW_SVG}
1119
- <span class="upload-main-text">Click or drag an image here</span>
1120
- <span class="upload-sub-text">Upload one document, page, receipt, screenshot, or scene image for OCR and vision tasks</span>
1121
- </div>
1122
- </div>
1123
-
1124
- <input id="custom-file-input" type="file" accept="image/*" style="display:none;" />
1125
-
1126
- <div id="single-preview-wrap" class="single-preview-wrap">
1127
- <div class="single-preview-card">
1128
- <img id="single-preview-img" src="" alt="Preview">
1129
- <div class="preview-overlay-actions">
1130
- <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1131
- <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
1132
- </div>
1133
- </div>
1134
- </div>
1135
- </div>
1136
-
1137
- <div class="hint-bar">
1138
- <b>Upload:</b> Click or drag to add an image &nbsp;&middot;&nbsp;
1139
- <b>Model:</b> Switch model tabs from the header &nbsp;&middot;&nbsp;
1140
- <kbd>Clear</kbd> removes the current image
1141
- </div>
1142
-
1143
- <div class="examples-section">
1144
- <div class="examples-title">Quick Examples</div>
1145
- <div class="examples-scroll">
1146
- {EXAMPLE_CARDS_HTML}
1147
- </div>
1148
- </div>
1149
- </div>
1150
-
1151
- <div class="app-main-right">
1152
- <div class="panel-card">
1153
- <div class="panel-card-title">OCR / Vision Instruction</div>
1154
- <div class="panel-card-body">
1155
- <label class="modern-label" for="custom-query-input">Query Input</label>
1156
- <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., perform OCR on the image precisely, extract all text, generate markdown, read the receipt, summarize the page..."></textarea>
1157
- </div>
1158
- </div>
1159
-
1160
- <div style="padding:12px 20px;">
1161
- <button id="custom-run-btn" class="btn-run">
1162
- <span id="run-btn-label">Run OCR</span>
1163
- </button>
1164
- </div>
1165
-
1166
- <div class="output-frame">
1167
- <div class="out-title">
1168
- <span id="output-title-label">Raw Output Stream</span>
1169
- <div class="out-title-right">
1170
- <button id="copy-output-btn" class="out-action-btn" title="Copy">{COPY_SVG} Copy</button>
1171
- <button id="save-output-btn" class="out-action-btn" title="Save">{SAVE_SVG} Save File</button>
1172
- </div>
1173
- </div>
1174
- <div class="out-body">
1175
- <div class="modern-loader" id="output-loader">
1176
- <div class="loader-spinner"></div>
1177
- <div class="loader-text">Running OCR...</div>
1178
- <div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
1179
- </div>
1180
- <div class="output-scroll-wrap">
1181
- <textarea id="custom-output-textarea" class="output-textarea" placeholder="Raw output will appear here..." readonly></textarea>
1182
- </div>
1183
- </div>
1184
- </div>
1185
-
1186
- <div class="settings-group">
1187
- <div class="settings-group-title">Advanced Settings</div>
1188
- <div class="settings-group-body">
1189
- <div class="slider-row">
1190
- <label>Max new tokens</label>
1191
- <input type="range" id="custom-max-new-tokens" min="1" max="{MAX_MAX_NEW_TOKENS}" step="1" value="{DEFAULT_MAX_NEW_TOKENS}">
1192
- <span class="slider-val" id="custom-max-new-tokens-val">{DEFAULT_MAX_NEW_TOKENS}</span>
1193
- </div>
1194
- <div class="slider-row">
1195
- <label>Temperature</label>
1196
- <input type="range" id="custom-temperature" min="0.1" max="4.0" step="0.1" value="0.6">
1197
- <span class="slider-val" id="custom-temperature-val">0.6</span>
1198
- </div>
1199
- <div class="slider-row">
1200
- <label>Top-p</label>
1201
- <input type="range" id="custom-top-p" min="0.05" max="1.0" step="0.05" value="0.9">
1202
- <span class="slider-val" id="custom-top-p-val">0.9</span>
1203
- </div>
1204
- <div class="slider-row">
1205
- <label>Top-k</label>
1206
- <input type="range" id="custom-top-k" min="1" max="1000" step="1" value="50">
1207
- <span class="slider-val" id="custom-top-k-val">50</span>
1208
- </div>
1209
- <div class="slider-row">
1210
- <label>Repetition penalty</label>
1211
- <input type="range" id="custom-repetition-penalty" min="1.0" max="2.0" step="0.05" value="1.2">
1212
- <span class="slider-val" id="custom-repetition-penalty-val">1.2</span>
1213
- </div>
1214
- <div class="slider-row">
1215
- <label>GPU Duration (seconds)</label>
1216
- <input type="range" id="custom-gpu-duration" min="60" max="300" step="30" value="60">
1217
- <span class="slider-val" id="custom-gpu-duration-val">60</span>
1218
- </div>
1219
- </div>
1220
- </div>
1221
- </div>
1222
- </div>
1223
-
1224
- <div class="exp-note">
1225
- Experimental OCR Suite &middot; Open on <a href="https://github.com/PRITHIVSAKTHIUR/Multimodal-OCR2" target="_blank">GitHub</a>
1226
- </div>
1227
-
1228
- <div class="app-statusbar">
1229
- <div class="sb-section" id="sb-image-status">No image uploaded</div>
1230
- <div class="sb-section sb-fixed" id="sb-run-state">Ready</div>
1231
- </div>
1232
- </div>
1233
- """)
1234
-
1235
- run_btn = gr.Button("Run", elem_id="gradio-run-btn")
1236
-
1237
- def b64_to_pil(b64_str):
1238
- if not b64_str:
1239
- return None
1240
- try:
1241
- if b64_str.startswith("data:image"):
1242
- _, data = b64_str.split(",", 1)
1243
- else:
1244
- data = b64_str
1245
- image_data = base64.b64decode(data)
1246
- return Image.open(BytesIO(image_data)).convert("RGB")
1247
- except Exception:
1248
- return None
1249
-
1250
- def run_ocr(model_name, text, image_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
1251
- try:
1252
- image = b64_to_pil(image_b64)
1253
- yield from generate_image(
1254
- model_name=model_name,
1255
- text=text,
1256
- image=image,
1257
- max_new_tokens=max_new_tokens_v,
1258
- temperature=temperature_v,
1259
- top_p=top_p_v,
1260
- top_k=top_k_v,
1261
- repetition_penalty=repetition_penalty_v,
1262
- gpu_timeout=gpu_timeout_v,
1263
  )
1264
- except Exception as e:
1265
- yield f"[ERROR] {str(e)}"
1266
-
1267
- demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
1268
- demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)
1269
-
1270
- run_btn.click(
1271
- fn=run_ocr,
1272
- inputs=[
1273
- hidden_model_name,
1274
- prompt,
1275
- hidden_image_b64,
1276
- max_new_tokens,
1277
- temperature,
1278
- top_p,
1279
- top_k,
1280
- repetition_penalty,
1281
- gpu_duration_state,
1282
- ],
1283
- outputs=[result],
1284
- js=r"""(m, p, img, mnt, t, tp, tk, rp, gd) => {
1285
- const modelEl = document.querySelector('.model-tab.active');
1286
- const model = modelEl ? modelEl.getAttribute('data-model') : m;
1287
- const promptEl = document.getElementById('custom-query-input');
1288
- const promptVal = promptEl ? promptEl.value : p;
1289
- const imgContainer = document.getElementById('hidden-image-b64');
1290
- let imgVal = img;
1291
- if (imgContainer) {
1292
- const inner = imgContainer.querySelector('textarea, input');
1293
- if (inner) imgVal = inner.value;
1294
- }
1295
- return [model, promptVal, imgVal, mnt, t, tp, tk, rp, gd];
1296
- }""",
1297
  )
1298
-
1299
- example_load_btn.click(
1300
- fn=load_example_data,
1301
- inputs=[example_idx],
1302
- outputs=[example_result],
1303
- queue=False,
1304
  )
1305
 
1306
  if __name__ == "__main__":
1307
- demo.queue(max_size=50).launch(
1308
- css=css,
1309
- mcp_server=True,
1310
- ssr_mode=False,
1311
- show_error=True,
1312
- allowed_paths=["examples"],
1313
- )
 
1
  import os
 
 
 
 
 
2
  import random
3
+ import uuid
4
+ import json
5
+ import time
6
+ import asyncio
7
  from threading import Thread
8
 
9
  import gradio as gr
10
  import spaces
11
  import torch
12
+ import numpy as np
13
  from PIL import Image, ImageOps
14
+ import cv2
15
 
16
  from transformers import (
17
+ Qwen2VLForConditionalGeneration,
18
  Qwen2_5_VLForConditionalGeneration,
 
19
  AutoModelForVision2Seq,
20
  AutoProcessor,
21
  TextIteratorStreamer,
22
  )
23
+ from transformers.image_utils import load_image
24
 
25
  from docling_core.types.doc import DoclingDocument, DocTagsDocument
26
 
27
+ import re
28
+ import ast
29
+ import html
30
 
31
+ # Constants for text generation
32
+ MAX_MAX_NEW_TOKENS = 2048
33
+ DEFAULT_MAX_NEW_TOKENS = 1024
34
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
35
 
36
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # Load Nanonets-OCR-s
39
  MODEL_ID_M = "nanonets/Nanonets-OCR-s"
40
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
41
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
42
  MODEL_ID_M,
 
43
  trust_remote_code=True,
44
  torch_dtype=torch.float16
45
  ).to(device).eval()
46
 
47
+ # Load MonkeyOCR
48
  MODEL_ID_G = "echo840/MonkeyOCR"
49
  SUBFOLDER = "Recognition"
50
  processor_g = AutoProcessor.from_pretrained(
 
54
  )
55
  model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
56
  MODEL_ID_G,
 
57
  trust_remote_code=True,
58
  subfolder=SUBFOLDER,
59
  torch_dtype=torch.float16
60
  ).to(device).eval()
61
 
62
+ # Load typhoon-ocr-7b
63
  MODEL_ID_L = "scb10x/typhoon-ocr-7b"
64
  processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
65
  model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
66
  MODEL_ID_L,
 
67
  trust_remote_code=True,
68
  torch_dtype=torch.float16
69
  ).to(device).eval()
70
 
71
+ #--------------------------------------------------#
72
+ # Load SmolDocling-256M-preview
73
  MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
74
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
75
  model_x = AutoModelForVision2Seq.from_pretrained(
 
77
  trust_remote_code=True,
78
  torch_dtype=torch.float16
79
  ).to(device).eval()
80
+ #--------------------------------------------------#
81
 
82
+ # Preprocessing functions for SmolDocling-256M
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
84
+ """Add random padding to an image based on its size."""
85
  image = image.convert("RGB")
86
  width, height = image.size
87
  pad_w_percent = random.uniform(min_percent, max_percent)
88
  pad_h_percent = random.uniform(min_percent, max_percent)
89
  pad_w = int(width * pad_w_percent)
90
  pad_h = int(height * pad_h_percent)
91
+ corner_pixel = image.getpixel((0, 0)) # Top-left corner
92
  padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
93
  return padded_image
94
 
 
95
  def normalize_values(text, target_max=500):
96
+ """Normalize numerical values in text to a target maximum."""
97
  def normalize_list(values):
98
  max_value = max(values) if values else 1
99
  return [round((v / max_value) * target_max) for v in values]
 
107
  normalized_text = re.sub(pattern, process_match, text)
108
  return normalized_text
109
 
110
def downsample_video(video_path):
    """Sample ten evenly spaced frame positions from a video.

    Args:
        video_path: Location of the video, passed to ``cv2.VideoCapture``.

    Returns:
        A list of ``(PIL.Image.Image, float)`` tuples holding each decoded
        frame in RGB order and its timestamp in seconds, rounded to two
        decimals. Positions whose frame cannot be read are skipped, so fewer
        than ten entries may be returned. The list is empty when the source
        reports no frames. When the source reports no usable frame rate the
        timestamp falls back to ``0.0`` instead of a non-finite value.
    """
    vidcap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        # With a non-positive count np.linspace would produce negative frame
        # indices to seek to, so bail out before touching the stream.
        if total_frames <= 0:
            return []

        fps = vidcap.get(cv2.CAP_PROP_FPS)
        frames = []
        for i in np.linspace(0, total_frames - 1, 10, dtype=int):
            frame_index = int(i)
            vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            success, image = vidcap.read()
            if not success:
                continue
            # OpenCV decodes to BGR; PIL expects RGB channel order.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            # Guard the division: a zero frame rate would otherwise yield
            # inf/nan timestamps.
            timestamp = round(frame_index / fps, 2) if fps > 0 else 0.0
            frames.append((Image.fromarray(image), timestamp))
        return frames
    finally:
        # Release the capture handle even if decoding raised part-way.
        vidcap.release()
127
+
128
@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """Stream a model response for one image plus a text instruction.

    This is a generator. While tokens arrive it yields the accumulated
    response, with ``<|im_end|>`` removed from each streamed chunk. For
    ``SmolDocling-256M-preview`` one further value is yielded after streaming
    ends: a Markdown rendering of the DocTags output when a known tag is
    present, otherwise the raw output with ``<end_of_utterance>`` stripped.

    Args:
        model_name: "Nanonets-OCR-s", "MonkeyOCR-Recognition",
            "SmolDocling-256M-preview" or "Typhoon-OCR-7B". Any other value
            yields "Invalid model selected." and stops.
        text: Instruction placed after the image in the user message.
        image: Input image. ``None`` yields "Please upload an image." and
            stops.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.

    Yields:
        str: The response text as described above.
    """
    smoldocling = "SmolDocling-256M-preview"
    available = (
        ("Nanonets-OCR-s", processor_m, model_m),
        ("MonkeyOCR-Recognition", processor_g, model_g),
        (smoldocling, processor_x, model_x),
        ("Typhoon-OCR-7B", processor_l, model_l),
    )
    chosen = next(
        ((proc, mdl) for name, proc, mdl in available if model_name == name),
        None,
    )
    if chosen is None:
        yield "Invalid model selected."
        return
    processor, model = chosen

    if image is None:
        yield "Please upload an image."
        return

    is_smoldocling = model_name == smoldocling
    # The processor takes a list of images; image inference uses exactly one.
    images = [image]

    if is_smoldocling:
        # Table/code prompts get a padded image; location-style prompts get
        # their numeric values rescaled to a 0-500 range.
        if any(marker in text for marker in ("OTSL", "code")):
            images = [add_random_padding(img) for img in images]
        if any(marker in text
               for marker in ("OCR at text at", "Identify element", "formula")):
            text = normalize_values(text, target_max=500)

    # One user turn: an image placeholder per image, then the instruction.
    content = [{"type": "image"} for _ in images]
    content.append({"type": "text", "text": text})
    messages = [{"role": "user", "content": content}]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # NOTE(review): the sampling knobs are passed without an explicit
    # do_sample flag; whether they take effect presumably depends on each
    # model's generation config - confirm.
    generation_kwargs = {**inputs}
    generation_kwargs.update(
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
    )
    # generate() runs on a worker thread so this generator can consume the
    # streamer while tokens are still being produced.
    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()

    raw_chunks = []
    visible = ""
    for chunk in streamer:
        raw_chunks.append(chunk)
        visible += chunk.replace("<|im_end|>", "")
        yield visible

    if not is_smoldocling:
        return

    # SmolDocling post-processing works on the unfiltered stream.
    cleaned_output = "".join(raw_chunks).replace("<end_of_utterance>", "").strip()
    doc_tags = ("<doctag>", "<otsl>", "<code>", "<chart>", "<formula>")
    if not any(tag in cleaned_output for tag in doc_tags):
        yield cleaned_output
        return

    if "<chart>" in cleaned_output:
        # Charts are re-labelled as OTSL tables before conversion, and the tag
        # that directly follows the last <loc_500> is dropped.
        cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
        cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
    doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
    markdown_output = doc.export_to_markdown()
    yield f"**MD Output:**\n\n{markdown_output}"
214
+
215
@spaces.GPU
def generate_video(model_name: str, text: str, video_path: str,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """Generate a streamed response for a video using the selected model.

    The video is reduced to a list of frames by ``downsample_video`` and the
    frames are sent to the model as ordinary images together with ``text``.

    Args:
        model_name: One of "Nanonets-OCR-s", "MonkeyOCR-Recognition",
            "SmolDocling-256M-preview" or "Typhoon-OCR-7B".
        text: The user prompt.
        video_path: Path of the video, or ``None`` when nothing was uploaded.
        max_new_tokens: Forwarded unchanged to ``model.generate``.
        temperature: Forwarded unchanged to ``model.generate``.
        top_p: Forwarded unchanged to ``model.generate``.
        top_k: Forwarded unchanged to ``model.generate``.
        repetition_penalty: Forwarded unchanged to ``model.generate``.

    Yields:
        The accumulated output text after every streamed chunk. For
        SmolDocling one extra, post-processed value follows: a Markdown
        export when the output contains doctags, otherwise the cleaned text.
        A single message string is yielded instead when the model name is
        unknown, no video was given, or no frame could be extracted.
    """
    # Model selection
    selectable = {
        "Nanonets-OCR-s": (processor_m, model_m),
        "MonkeyOCR-Recognition": (processor_g, model_g),
        "SmolDocling-256M-preview": (processor_x, model_x),
        "Typhoon-OCR-7B": (processor_l, model_l),
    }
    if model_name not in selectable:
        yield "Invalid model selected."
        return
    processor, model = selectable[model_name]
    is_smoldocling = model_name == "SmolDocling-256M-preview"

    if video_path is None:
        yield "Please upload a video."
        return

    # Extract frames from video; each item unpacks to a pair whose first
    # element is the frame, and only that element is used here.
    frames = downsample_video(video_path)
    images = [frame for frame, _ in frames]
    if not images:
        # Without this guard an empty image list would reach the processor.
        yield "Could not extract any frames from the video."
        return

    # SmolDocling-256M specific preprocessing
    if is_smoldocling:
        if "OTSL" in text or "code" in text:
            images = [add_random_padding(img) for img in images]
        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
            text = normalize_values(text, target_max=500)

    # Unified message structure for all models: one image slot per frame,
    # followed by the text prompt.
    messages = [
        {
            "role": "user",
            "content": [{"type": "image"} for _ in images] + [
                {"type": "text", "text": text}
            ]
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)

    # Generation with streaming: generate() runs in a worker thread and
    # pushes decoded text into the streamer that is consumed below.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream output and collect full response
    buffer = ""
    full_output = ""
    for new_text in streamer:
        full_output += new_text
        buffer += new_text.replace("<|im_end|>", "")
        yield buffer
    # The streamer is exhausted, so generation is finishing; reap the worker
    # instead of leaving it dangling.
    thread.join()

    # SmolDocling-256M specific postprocessing
    if is_smoldocling:
        cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
        if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
            if "<chart>" in cleaned_output:
                cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
                cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
            # There is exactly one doctags string, so it is paired with a
            # single frame: the doctags and image lists must have the same
            # length (NOTE(review): confirm against the installed
            # docling-core version).
            doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images[:1])
            doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
            markdown_output = doc.export_to_markdown()
            yield f"**MD Output:**\n\n{markdown_output}"
        else:
            yield cleaned_output
302
 
303
# Example [prompt, image path] rows shown under the "Image Inference" tab.
# The prompts keep the keywords ("OTSL", "code", "formula") that the
# SmolDocling preprocessing looks for in the query text.
image_examples = [
    ["OCR the image", "images/2.jpg"],
    ["Convert this page to docling", "images/1.png"],
    ["Convert this page to docling", "images/3.png"],
    ["Convert chart to OTSL.", "images/4.png"],
    ["Convert code to text", "images/5.jpg"],
    ["Convert this table to OTSL.", "images/6.jpg"],
    # Was "Convert formula to late." -- a truncated "LaTeX".
    ["Convert formula to LaTeX.", "images/7.jpg"],
]
313
 
314
# Example [prompt, video path] rows shown under the "Video Inference" tab;
# both bundled clips use the same prompt.
video_examples = [
    ["Explain the video in detail.", f"videos/{clip_number}.mp4"]
    for clip_number in (1, 2)
]
 
 
 
 
 
 
318
 
319
# Custom stylesheet handed to gr.Blocks: colors the buttons tagged with the
# "submit-btn" class and lightens them on hover.
css = """
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
"""
328
 
329
# Build the Gradio interface: query tabs and sampling controls in the left
# column, streamed output and model picker in the right column.
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
    with gr.Row():
        with gr.Column():
            with gr.Tabs():
                with gr.TabItem("Image Inference"):
                    image_query = gr.Textbox(
                        label="Query Input",
                        placeholder="Enter your query here...",
                    )
                    image_upload = gr.Image(type="pil", label="Image")
                    image_submit = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                with gr.TabItem("Video Inference"):
                    video_query = gr.Textbox(
                        label="Query Input",
                        placeholder="Enter your query here...",
                    )
                    video_upload = gr.Video(label="Video")
                    video_submit = gr.Button("Submit", elem_classes="submit-btn")
                    gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    minimum=1,
                    maximum=MAX_MAX_NEW_TOKENS,
                    step=1,
                    value=DEFAULT_MAX_NEW_TOKENS,
                )
                temperature = gr.Slider(
                    label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6
                )
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9
                )
                top_k = gr.Slider(
                    label="Top-k", minimum=1, maximum=1000, step=1, value=50
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2
                )
        with gr.Column():
            output = gr.Textbox(label="Output", interactive=False, lines=3, scale=2)
            model_choice = gr.Radio(
                choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "SmolDocling-256M-preview", "Typhoon-OCR-7B"],
                label="Select Model",
                value="Nanonets-OCR-s",
            )

            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")

            # One blockquote note per model, rendered in this order.
            model_notes = (
                "> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.",
                "> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.",
                "> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.",
                "> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.",
            )
            for note in model_notes:
                gr.Markdown(note)

    # Both submit buttons pass the same sampling controls, in the order the
    # generate_* functions declare them, and write to the shared output box.
    sampling_controls = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]
    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload, *sampling_controls],
        outputs=output,
    )
    video_submit.click(
        fn=generate_video,
        inputs=[model_choice, video_query, video_upload, *sampling_controls],
        outputs=output,
    )

if __name__ == "__main__":
    # Configure the request queue first, then start the server.
    demo.queue(max_size=30)
    demo.launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
 
 
 
 
 
 
examples/5.jpg DELETED

Git LFS Details

  • SHA256: 9dc887ffe90d5dea23caacb19e3f6d2877d73c5585183d996463d88564fe7dcc
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
examples/6.jpg DELETED

Git LFS Details

  • SHA256: d2c2e455d2841efb2faa2b3efdd1993f851dcec166b35cbcd8bd4e9729d8b375
  • Pointer size: 131 Bytes
  • Size of remote file: 623 kB
examples/1.jpg → images/1.png RENAMED
File without changes
{examples → images}/2.jpg RENAMED
File without changes
examples/3.jpg → images/3.png RENAMED
File without changes
images/4.png ADDED
images/5.jpg ADDED
images/6.jpg ADDED
images/7.jpg ADDED
pre-requirements.txt DELETED
@@ -1 +0,0 @@
1
- pip>=26.0.0
 
 
requirements.txt CHANGED
@@ -1,19 +1,15 @@
1
- git+https://github.com/huggingface/transformers.git@v4.57.6
2
- git+https://github.com/huggingface/accelerate.git
3
- git+https://github.com/huggingface/peft.git
4
- transformers-stream-generator
5
- huggingface_hub
6
- qwen-vl-utils
7
- sentencepiece
8
- opencv-python
9
- torch==2.8.0
10
- docling-core
11
- torchvision
12
- matplotlib
13
- requests
14
- kernels
15
- hf_xet
16
- spaces
17
- pillow
18
- gradio
19
  av
 
1
+ gradio
2
+ transformers
3
+ transformers-stream-generator
4
+ qwen-vl-utils
5
+ torchvision
6
+ docling-core
7
+ torch
8
+ requests
9
+ huggingface_hub
10
+ albumentations
11
+ spaces
12
+ accelerate
13
+ pillow
14
+ opencv-python
 
 
 
 
15
  av
examples/4.jpg → videos/1.mp4 RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b97b675f13f09b0c03f1e222d9c89be146370e067a38fed7bf971638717bd87
3
- size 176412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9127aaafccef6f02fce6812bc9c89e1e4026832cf133492481952cc4b94cb595
3
+ size 791367
videos/2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdf85ced4e76f2afd1a66b2c41e93868ccd9f928a02105de5e7db3c8651c692e
3
+ size 1040341