prithivMLmods committed on
Commit
63cec06
·
verified ·
1 Parent(s): 43c3626

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -322
app.py CHANGED
@@ -23,6 +23,8 @@ from transformers import (
23
  )
24
  from transformers.image_utils import load_image
25
 
 
 
26
  import re
27
  import ast
28
  import html
@@ -34,7 +36,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
34
 
35
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
36
 
37
- # --- Model Loading ---
38
  # Load Nanonets-OCR-s
39
  MODEL_ID_M = "nanonets/Nanonets-OCR-s"
40
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -86,7 +87,7 @@ model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
86
  torch_dtype=torch.float16
87
  ).to(device).eval()
88
 
89
- # --- Preprocessing and Helper Functions ---
90
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
91
  """Add random padding to an image based on its size."""
92
  image = image.convert("RGB")
@@ -120,7 +121,6 @@ def downsample_video(video_path):
120
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
121
  fps = vidcap.get(cv2.CAP_PROP_FPS)
122
  frames = []
123
- # Use 10 frames for video processing
124
  frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
125
  for i in frame_indices:
126
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -133,11 +133,76 @@ def downsample_video(video_path):
133
  vidcap.release()
134
  return frames
135
 
136
- # A placeholder function in case docling_core is not installed
137
- def format_smoldocling_output(buffer_text, images):
138
- cleaned_output = buffer_text.replace("<end_of_utterance>", "").strip()
139
- # Check if docling_core is available and was imported
140
- if 'DocTagsDocument' in globals() and 'DoclingDocument' in globals():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
142
  if "<chart>" in cleaned_output:
143
  cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
@@ -145,44 +210,43 @@ def format_smoldocling_output(buffer_text, images):
145
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
146
  doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
147
  markdown_output = doc.export_to_markdown()
148
- return buffer_text, markdown_output
149
- # Fallback if library is not available or tags are not present
150
- return buffer_text, cleaned_output
151
 
152
- # --- Core Generation Logic ---
153
- def get_model_and_processor(model_name):
154
- """Helper to select model and processor."""
 
 
 
 
 
155
  if model_name == "Nanonets-OCR-s":
156
- return processor_m, model_m
 
157
  elif model_name == "MonkeyOCR-Recognition":
158
- return processor_g, model_g
 
159
  elif model_name == "SmolDocling-256M-preview":
160
- return processor_x, model_x
 
161
  elif model_name == "Typhoon-OCR-7B":
162
- return processor_l, model_l
 
163
  elif model_name == "Thyme-RL":
164
- return processor_n, model_n
 
165
  else:
166
- return None, None
167
-
168
- @spaces.GPU
169
- def generate_response(model_name: str, text: str, media_input, media_type: str,
170
- max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
171
- """Unified generation function for both image and video."""
172
- processor, model = get_model_and_processor(model_name)
173
- if not processor or not model:
174
  yield "Invalid model selected.", "Invalid model selected."
175
  return
176
 
177
- if media_input is None:
178
- yield f"Please upload a {media_type}.", f"Please upload a {media_type}."
179
  return
180
 
181
- if media_type == "video":
182
- frames = downsample_video(media_input)
183
- images = [frame for frame, _ in frames]
184
- else: # image
185
- images = [media_input]
186
 
187
  if model_name == "SmolDocling-256M-preview":
188
  if "OTSL" in text or "code" in text:
@@ -191,7 +255,12 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
191
  text = normalize_values(text, target_max=500)
192
 
193
  messages = [
194
- {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": text}]}
 
 
 
 
 
195
  ]
196
  prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
197
  inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
@@ -211,24 +280,23 @@ def generate_response(model_name: str, text: str, media_input, media_type: str,
211
 
212
  buffer = ""
213
  for new_text in streamer:
214
- buffer += new_text.replace("<|end_of_text|>", "")
215
  yield buffer, buffer
216
 
217
  if model_name == "SmolDocling-256M-preview":
218
- raw_output, formatted_output = format_smoldocling_output(buffer, images)
219
- yield raw_output, formatted_output
220
- else:
221
- # For other models, the formatted output is just the cleaned buffer
222
- yield buffer, buffer.strip()
223
-
224
- def generate_image_wrapper(text, img, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
225
- yield from generate_response(model, text, img, "image", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
226
-
227
- def generate_video_wrapper(text, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty):
228
- yield from generate_response(model, text, vid, "video", max_tokens, temp, top_p_val, top_k_val, rep_penalty)
229
-
230
 
231
- # --- Examples ---
232
  image_examples = [
233
  ["Reconstruct the doc [table] as it is.", "images/0.png"],
234
  ["Describe the image!", "images/8.png"],
@@ -238,7 +306,7 @@ image_examples = [
238
  ["Convert chart to OTSL.", "images/4.png"],
239
  ["Convert code to text", "images/5.jpg"],
240
  ["Convert this table to OTSL.", "images/6.jpg"],
241
- ["Convert formula to latex.", "images/7.jpg"],
242
  ]
243
 
244
  video_examples = [
@@ -246,292 +314,84 @@ video_examples = [
246
  ["Explain the video in detail.", "videos/2.mp4"]
247
  ]
248
 
249
- # --- Custom CSS for the new UI ---
250
  css = """
251
- /* Left sidebar styles */
252
- .sidebar {
253
- background-color: #f8f9fa;
254
- border-right: 1px solid #e9ecef;
255
- padding: 20px;
256
- height: 100vh;
257
- }
258
-
259
- /* Main content area */
260
- .content-area {
261
- padding: 20px;
262
- }
263
-
264
- /* Document grid */
265
- .doc-grid {
266
- display: grid;
267
- grid-template-columns: repeat(5, 1fr);
268
- gap: 10px;
269
- margin: 20px 0;
270
- }
271
-
272
- .doc-item {
273
- border: 1px solid #dee2e6;
274
- border-radius: 8px;
275
- padding: 10px;
276
- text-align: center;
277
- height: 120px;
278
- background-color: #f8f9fa;
279
- cursor: pointer;
280
- transition: all 0.2s ease;
281
- }
282
-
283
- .doc-item:hover {
284
- border-color: #007bff;
285
- background-color: #e9f0ff;
286
- }
287
-
288
- /* Upload and controls area */
289
- .upload-controls {
290
- display: flex;
291
- align-items: center;
292
- gap: 10px;
293
- margin: 20px 0;
294
- padding: 15px;
295
- border: 1px solid #e9ecef;
296
- border-radius: 8px;
297
- }
298
-
299
- .file-upload {
300
- flex: 1;
301
- }
302
-
303
- .model-dropdown {
304
- width: 200px;
305
- }
306
-
307
  .submit-btn {
308
- background-color: #007bff;
309
- color: white;
310
- border: none;
311
- border-radius: 4px;
312
- padding: 10px 20px;
313
- font-size: 1.2rem;
314
- cursor: pointer;
315
- transition: background-color 0.2s;
316
  }
317
-
318
  .submit-btn:hover {
319
- background-color: #0069d9;
320
- }
321
-
322
- /* Output area */
323
- .output-area {
324
- margin-top: 20px;
325
  }
326
-
327
- /* Add conversation button */
328
- .add-conv-btn {
329
- background-color: #28a745;
330
- color: white;
331
- border: none;
332
- padding: 8px 15px;
333
- border-radius: 4px;
334
- cursor: pointer;
335
- }
336
-
337
- .add-conv-btn:hover {
338
- background-color: #218838;
339
- }
340
-
341
- /* Examples section */
342
- .examples-section {
343
- margin-top: 20px;
344
- }
345
-
346
- /* Header styles */
347
- .header {
348
- margin-bottom: 15px;
349
- }
350
-
351
- /* Media upload icon styling */
352
- .upload-icon {
353
- font-size: 1.5rem;
354
- color: #6c757d;
355
- margin-right: 10px;
356
- }
357
-
358
- /* Document icon styling */
359
- .doc-icon {
360
- font-size: 2rem;
361
- color: #6c757d;
362
- margin-bottom: 5px;
363
- }
364
-
365
- /* Query input */
366
- .query-input {
367
- margin: 15px 0;
368
- }
369
-
370
- /* Model dropdown styling */
371
- .model-dropdown .select {
372
- padding: 8px 12px;
373
- border: 1px solid #ced4da;
374
- border-radius: 4px;
375
- }
376
-
377
- /* Output styling */
378
- .output-text {
379
- border: 1px solid #ced4da;
380
- border-radius: 4px;
381
- padding: 10px;
382
- min-height: 150px;
383
- }
384
-
385
- /* Add some space between elements */
386
- .gr-box {
387
- margin-bottom: 15px;
388
  }
389
  """
390
 
391
- # --- Gradio Interface ---
392
- with gr.Blocks(css=css) as demo:
393
- # Initialize state variables that hold data
394
- image_upload_state = gr.State(None)
395
- video_upload_state = gr.State(None)
396
- media_type_state = gr.State("image")
397
-
398
  gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
399
-
400
  with gr.Row():
401
- # Left sidebar - OCR section
402
- with gr.Column(scale=1, min_width=250, elem_classes="sidebar"):
403
- gr.Markdown("## OCR")
404
- add_conv_btn = gr.Button("+ Add Conv", elem_classes="add-conv-btn")
405
-
406
- # Document grid
407
- gr.Markdown("### Documents")
408
- with gr.Group(elem_classes="doc-grid"):
409
- for i in range(5):
410
- with gr.Column():
411
- gr.Markdown(f'<div class="doc-item"><div class="doc-icon">📄</div>Doc {i+1}</div>')
412
-
413
- # Main content area
414
- with gr.Column(scale=3, elem_classes="content-area"):
415
- # Document processing section
416
- with gr.Group():
417
- gr.Markdown("## Multimodal OCR2")
418
-
419
- # Document grid (5 document thumbnails as shown in the sketch)
420
- with gr.Row(elem_classes="doc-grid"):
421
- for i in range(5):
422
- with gr.Column():
423
- doc_item = gr.Image(
424
- value=None,
425
- label=f"Document {i+1}",
426
- height=120,
427
- show_label=False,
428
- container=False,
429
- elem_classes="doc-item"
430
- )
431
-
432
- # Define input components before they are referenced by gr.Examples
433
- with gr.Group(elem_classes="upload-controls"):
434
- with gr.Column(elem_classes="file-upload"):
435
- file_upload = gr.File(
436
- label="Upload files (image/video)",
437
- file_types=["image", "video"],
438
- elem_classes="file-upload"
439
- )
440
-
441
- model_dropdown = gr.Dropdown(
442
- choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
443
- value="Nanonets-OCR-s",
444
- label="Select Model",
445
- elem_classes="model-dropdown"
446
  )
447
-
448
- submit_btn = gr.Button("", size="lg", elem_classes="submit-btn")
449
-
450
- query_input = gr.Textbox(
451
- label="Enter your query",
452
- placeholder="Describe the image, extract text, convert to markdown...",
453
- elem_classes="query-input"
454
- )
455
-
456
- # Examples section
457
- gr.Markdown("### Examples")
458
- with gr.Row():
459
- with gr.Column():
460
- gr.Examples(
461
- examples=image_examples,
462
- inputs=[query_input, file_upload], # Corrected inputs
463
- label="Image Examples"
464
- )
465
- with gr.Column():
466
- gr.Examples(
467
- examples=video_examples,
468
- inputs=[query_input, file_upload], # Corrected inputs
469
- label="Video Examples"
470
- )
471
 
472
- # Advanced options (hidden by default)
473
- with gr.Accordion("Advanced Options", open=False):
474
- max_new_tokens = gr.Slider(label="Max New Tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
475
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
476
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
477
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
478
- repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
479
 
480
- # Output area
481
- with gr.Group(elem_classes="output-area"):
482
- gr.Markdown("### Output")
483
- raw_output = gr.Textbox(
484
- label="Result",
485
- interactive=False,
486
- lines=10,
487
- elem_classes="output-text"
488
- )
489
-
490
- # --- Event Handlers ---
491
- def handle_file_upload(file):
492
- if file is None:
493
- return "image", None, None
494
- file_path = file.name
495
- if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')):
496
- return "image", Image.open(file_path), None
497
- elif file_path.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
498
- return "video", None, file_path
499
- return "image", None, None
500
-
501
- file_upload.change(
502
- fn=handle_file_upload,
503
- inputs=[file_upload],
504
- outputs=[media_type_state, image_upload_state, video_upload_state] # Corrected outputs
505
  )
506
-
507
- def generate_wrapper(text, img, vid, model, max_tokens, temp, top_p_val, top_k_val, rep_penalty, m_type):
508
- media_input = None
509
- if m_type == "image" and img is not None:
510
- media_input = img
511
- elif m_type == "video" and vid is not None:
512
- media_input = vid
513
- else:
514
- yield "Please upload a valid file.", "Please upload a valid file."
515
- return
516
-
517
- yield from generate_response(model, text, media_input, m_type, max_tokens, temp, top_p_val, top_k_val, rep_penalty)
518
-
519
- submit_btn.click(
520
- fn=generate_wrapper,
521
- inputs=[
522
- query_input,
523
- image_upload_state, # Corrected input state
524
- video_upload_state, # Corrected input state
525
- model_dropdown,
526
- max_new_tokens,
527
- temperature,
528
- top_p,
529
- top_k,
530
- repetition_penalty,
531
- media_type_state # Corrected input state
532
- ],
533
- outputs=[raw_output, raw_output]
534
  )
535
 
536
  if __name__ == "__main__":
537
- demo.queue(max_size=50).launch(share=True, show_error=True)
 
23
  )
24
  from transformers.image_utils import load_image
25
 
26
+ from docling_core.types.doc import DoclingDocument, DocTagsDocument
27
+
28
  import re
29
  import ast
30
  import html
 
36
 
37
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
38
 
 
39
  # Load Nanonets-OCR-s
40
  MODEL_ID_M = "nanonets/Nanonets-OCR-s"
41
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 
87
  torch_dtype=torch.float16
88
  ).to(device).eval()
89
 
90
+ # Preprocessing functions for SmolDocling-256M
91
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
92
  """Add random padding to an image based on its size."""
93
  image = image.convert("RGB")
 
121
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
122
  fps = vidcap.get(cv2.CAP_PROP_FPS)
123
  frames = []
 
124
  frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
125
  for i in frame_indices:
126
  vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
 
133
  vidcap.release()
134
  return frames
135
 
136
+ @spaces.GPU
137
+ def generate_image(model_name: str, text: str, image: Image.Image,
138
+ max_new_tokens: int = 1024,
139
+ temperature: float = 0.6,
140
+ top_p: float = 0.9,
141
+ top_k: int = 50,
142
+ repetition_penalty: float = 1.2):
143
+ """Generate responses for image input using the selected model."""
144
+ if model_name == "Nanonets-OCR-s":
145
+ processor = processor_m
146
+ model = model_m
147
+ elif model_name == "MonkeyOCR-Recognition":
148
+ processor = processor_g
149
+ model = model_g
150
+ elif model_name == "SmolDocling-256M-preview":
151
+ processor = processor_x
152
+ model = model_x
153
+ elif model_name == "Typhoon-OCR-7B":
154
+ processor = processor_l
155
+ model = model_l
156
+ elif model_name == "Thyme-RL":
157
+ processor = processor_n
158
+ model = model_n
159
+ else:
160
+ yield "Invalid model selected.", "Invalid model selected."
161
+ return
162
+
163
+ if image is None:
164
+ yield "Please upload an image.", "Please upload an image."
165
+ return
166
+
167
+ images = [image]
168
+
169
+ if model_name == "SmolDocling-256M-preview":
170
+ if "OTSL" in text or "code" in text:
171
+ images = [add_random_padding(img) for img in images]
172
+ if "OCR at text at" in text or "Identify element" in text or "formula" in text:
173
+ text = normalize_values(text, target_max=500)
174
+
175
+ messages = [
176
+ {
177
+ "role": "user",
178
+ "content": [{"type": "image"} for _ in images] + [
179
+ {"type": "text", "text": text}
180
+ ]
181
+ }
182
+ ]
183
+ prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
184
+ inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
185
+
186
+ streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
187
+ generation_kwargs = {
188
+ **inputs,
189
+ "streamer": streamer,
190
+ "max_new_tokens": max_new_tokens,
191
+ "temperature": temperature,
192
+ "top_p": top_p,
193
+ "top_k": top_k,
194
+ "repetition_penalty": repetition_penalty,
195
+ }
196
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
197
+ thread.start()
198
+
199
+ buffer = ""
200
+ for new_text in streamer:
201
+ buffer += new_text.replace("<|im_end|>", "")
202
+ yield buffer, buffer
203
+
204
+ if model_name == "SmolDocling-256M-preview":
205
+ cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
206
  if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
207
  if "<chart>" in cleaned_output:
208
  cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
 
210
  doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
211
  doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
212
  markdown_output = doc.export_to_markdown()
213
+ yield buffer, markdown_output
214
+ else:
215
+ yield buffer, cleaned_output
216
 
217
+ @spaces.GPU
218
+ def generate_video(model_name: str, text: str, video_path: str,
219
+ max_new_tokens: int = 1024,
220
+ temperature: float = 0.6,
221
+ top_p: float = 0.9,
222
+ top_k: int = 50,
223
+ repetition_penalty: float = 1.2):
224
+ """Generate responses for video input using the selected model."""
225
  if model_name == "Nanonets-OCR-s":
226
+ processor = processor_m
227
+ model = model_m
228
  elif model_name == "MonkeyOCR-Recognition":
229
+ processor = processor_g
230
+ model = model_g
231
  elif model_name == "SmolDocling-256M-preview":
232
+ processor = processor_x
233
+ model = model_x
234
  elif model_name == "Typhoon-OCR-7B":
235
+ processor = processor_l
236
+ model = model_l
237
  elif model_name == "Thyme-RL":
238
+ processor = processor_n
239
+ model = model_n
240
  else:
 
 
 
 
 
 
 
 
241
  yield "Invalid model selected.", "Invalid model selected."
242
  return
243
 
244
+ if video_path is None:
245
+ yield "Please upload a video.", "Please upload a video."
246
  return
247
 
248
+ frames = downsample_video(video_path)
249
+ images = [frame for frame, _ in frames]
 
 
 
250
 
251
  if model_name == "SmolDocling-256M-preview":
252
  if "OTSL" in text or "code" in text:
 
255
  text = normalize_values(text, target_max=500)
256
 
257
  messages = [
258
+ {
259
+ "role": "user",
260
+ "content": [{"type": "image"} for _ in images] + [
261
+ {"type": "text", "text": text}
262
+ ]
263
+ }
264
  ]
265
  prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
266
  inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
280
 
281
  buffer = ""
282
  for new_text in streamer:
283
+ buffer += new_text.replace("<|im_end|>", "")
284
  yield buffer, buffer
285
 
286
  if model_name == "SmolDocling-256M-preview":
287
+ cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
288
+ if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
289
+ if "<chart>" in cleaned_output:
290
+ cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
291
+ cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
292
+ doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
293
+ doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
294
+ markdown_output = doc.export_to_markdown()
295
+ yield buffer, markdown_output
296
+ else:
297
+ yield buffer, cleaned_output
 
298
 
299
+ # Define examples for image and video inference
300
  image_examples = [
301
  ["Reconstruct the doc [table] as it is.", "images/0.png"],
302
  ["Describe the image!", "images/8.png"],
 
306
  ["Convert chart to OTSL.", "images/4.png"],
307
  ["Convert code to text", "images/5.jpg"],
308
  ["Convert this table to OTSL.", "images/6.jpg"],
309
+ ["Convert formula to late.", "images/7.jpg"],
310
  ]
311
 
312
  video_examples = [
 
314
  ["Explain the video in detail.", "videos/2.mp4"]
315
  ]
316
 
317
+ #css
318
  css = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  .submit-btn {
320
+ background-color: #2980b9 !important;
321
+ color: white !important;
 
 
 
 
 
 
322
  }
 
323
  .submit-btn:hover {
324
+ background-color: #3498db !important;
 
 
 
 
 
325
  }
326
+ .canvas-output {
327
+ border: 2px solid #4682B4;
328
+ border-radius: 10px;
329
+ padding: 20px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  }
331
  """
332
 
333
+ # Create the Gradio Interface
334
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
 
 
 
 
 
335
  gr.Markdown("# **[Multimodal OCR2](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
 
336
  with gr.Row():
337
+ with gr.Column():
338
+ with gr.Tabs():
339
+ with gr.TabItem("Image Inference"):
340
+ image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
341
+ image_upload = gr.Image(type="pil", label="Image", height=290)
342
+ image_submit = gr.Button("Submit", elem_classes="submit-btn")
343
+ gr.Examples(
344
+ examples=image_examples,
345
+ inputs=[image_query, image_upload]
346
+ )
347
+ with gr.TabItem("Video Inference"):
348
+ video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
349
+ video_upload = gr.Video(label="Video", height=290)
350
+ video_submit = gr.Button("Submit", elem_classes="submit-btn")
351
+ gr.Examples(
352
+ examples=video_examples,
353
+ inputs=[video_query, video_upload]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  )
355
+ with gr.Accordion("Advanced options", open=False):
356
+ max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
357
+ temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
358
+ top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
359
+ top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
360
+ repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
+ with gr.Column():
363
+ with gr.Column(elem_classes="canvas-output"):
364
+ gr.Markdown("## Output")
365
+ raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5)
 
 
 
366
 
367
+ with gr.Accordion("(Result.md)", open=False):
368
+ formatted_output = gr.Markdown(label="(Result.md)")
369
+
370
+ model_choice = gr.Radio(
371
+ choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
372
+ label="Select Model",
373
+ value="Nanonets-OCR-s"
374
+ )
375
+
376
+ gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
377
+ gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
378
+ gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
379
+ gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
380
+ gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
381
+ gr.Markdown("> [Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL): Thyme: Think Beyond Images. Thyme transcends traditional ``thinking with images'' paradigms by autonomously generating and executing diverse image processing and computational operations through executable code, significantly enhancing performance on high-resolution perception and complex reasoning tasks.")
382
+ gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
383
+
384
+ image_submit.click(
385
+ fn=generate_image,
386
+ inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
387
+ outputs=[raw_output, formatted_output]
 
 
 
 
388
  )
389
+ video_submit.click(
390
+ fn=generate_video,
391
+ inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
392
+ outputs=[raw_output,
393
+ formatted_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  )
395
 
396
  if __name__ == "__main__":
397
+ demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)