Chhagan005 committed on
Commit
3c88fc5
·
verified ·
1 Parent(s): c9833a0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -502
app.py CHANGED
@@ -3,8 +3,9 @@ import random
3
  import uuid
4
  import json
5
  import time
 
6
  from threading import Thread
7
- from typing import Iterable
8
 
9
  import gradio as gr
10
  import spaces
@@ -45,6 +46,7 @@ from transformers.image_utils import load_image
45
  from gradio.themes import Soft
46
  from gradio.themes.utils import colors, fonts, sizes
47
 
 
48
  colors.steel_blue = colors.Color(
49
  name="steel_blue",
50
  c50="#EBF3F8",
@@ -120,7 +122,6 @@ css = """
120
  #output-title h2 {
121
  font-size: 2.2em !important;
122
  }
123
- /* RadioAnimated Styles */
124
  .ra-wrap{ width: fit-content; }
125
  .ra-inner{
126
  position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
@@ -139,7 +140,6 @@ css = """
139
  transition: transform 0.2s, width 0.2s;
140
  }
141
  .ra-input:checked + .ra-label{ color: black; }
142
- /* Dark mode adjustments for Radio */
143
  .dark .ra-inner { background: var(--neutral-800); }
144
  .dark .ra-label { color: var(--neutral-400); }
145
  .dark .ra-highlight { background: var(--neutral-600); }
@@ -151,6 +151,11 @@ css = """
151
  border: 1px solid var(--border-color-primary);
152
  margin-top: 10px;
153
  }
 
 
 
 
 
154
  """
155
 
156
  MAX_MAX_NEW_TOKENS = 4096
@@ -170,502 +175,55 @@ if torch.cuda.is_available():
170
 
171
  print("Using device:", device)
172
 
173
- # Multilingual OCR prompt template
174
- MULTILINGUAL_OCR_PROMPT = """Perform comprehensive OCR extraction on this government ID/document. Follow these rules:
175
-
176
- 1. Extract ALL text exactly as it appears in the original language
177
- 2. If the text is NOT in English, provide an English translation after the original text
178
- 3. Identify the document type (ID Card, Passport, License, etc.)
179
- 4. Extract key fields with structured format
180
- 5. Preserve formatting and layout structure
181
-
182
- Format your response as:
183
-
184
- **Document Type:** [type]
185
-
186
- **Original Text:** (in source language)
187
- [extracted text with layout preserved]
188
-
189
- **English Translation:** (if not already in English)
190
- [translated text]
191
-
192
- **Key Fields:**
193
- - Full Name:
194
- - ID Number:
195
- - Date of Birth:
196
- - Issue Date:
197
- - Expiry Date:
198
- - Nationality:
199
- - [other relevant fields]
200
-
201
- Be accurate and preserve all details."""
202
-
203
- class RadioAnimated(gr.HTML):
204
- def __init__(self, choices, value=None, **kwargs):
205
- if not choices or len(choices) < 2:
206
- raise ValueError("RadioAnimated requires at least 2 choices.")
207
- if value is None:
208
- value = choices[0]
209
-
210
- uid = uuid.uuid4().hex[:8]
211
- group_name = f"ra-{uid}"
212
-
213
- inputs_html = "\\n".join(
214
- f"""
215
- <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
216
- <label class="ra-label" for="{group_name}-{i}">{c}</label>
217
- """
218
- for i, c in enumerate(choices)
219
- )
220
-
221
- html_template = f"""
222
- <div class="ra-wrap" data-ra="{uid}">
223
- <div class="ra-inner">
224
- <div class="ra-highlight"></div>
225
- {inputs_html}
226
- </div>
227
- </div>
228
- """
229
-
230
- js_on_load = r"""
231
- (() => {
232
- const wrap = element.querySelector('.ra-wrap');
233
- const inner = element.querySelector('.ra-inner');
234
- const highlight = element.querySelector('.ra-highlight');
235
- const inputs = Array.from(element.querySelectorAll('.ra-input'));
236
- if (!inputs.length) return;
237
- const choices = inputs.map(i => i.value);
238
- function setHighlightByIndex(idx) {
239
- const n = choices.length;
240
- const pct = 100 / n;
241
- highlight.style.width = `calc(${pct}% - 6px)`;
242
- highlight.style.transform = `translateX(${idx * 100}%)`;
243
- }
244
- function setCheckedByValue(val, shouldTrigger=false) {
245
- const idx = Math.max(0, choices.indexOf(val));
246
- inputs.forEach((inp, i) => { inp.checked = (i === idx); });
247
- setHighlightByIndex(idx);
248
- props.value = choices[idx];
249
- if (shouldTrigger) trigger('change', props.value);
250
- }
251
- setCheckedByValue(props.value ?? choices[0], false);
252
- inputs.forEach((inp) => {
253
- inp.addEventListener('change', () => {
254
- setCheckedByValue(inp.value, true);
255
- });
256
- });
257
- })();
258
- """
259
-
260
- super().__init__(
261
- value=value,
262
- html_template=html_template,
263
- js_on_load=js_on_load,
264
- **kwargs
265
- )
266
-
267
- def apply_gpu_duration(val: str):
268
- return int(val)
269
-
270
- # ===== MODEL LOADING =====
271
-
272
- print("\n" + "="*70)
273
- print("🚀 LOADING ALL 4 MODELS")
274
- print("="*70 + "\n")
275
-
276
- # Model 1: Chhagan_ML-VL-OCR-v1 (LoRA Fine-tuned for ID Cards)
277
- print("1️⃣ Loading Chhagan_ML-VL-OCR-v1 (LoRA Refined)...")
278
- MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
279
- CHHAGAN_V1_AVAILABLE = False
280
- processor_c1 = None
281
- model_c1 = None
282
-
283
- if PEFT_AVAILABLE:
284
- try:
285
- # Try to get base model from adapter config
286
- try:
287
- config = PeftConfig.from_pretrained(MODEL_ID_C1)
288
- base_model_id = config.base_model_name_or_path
289
- print(f" Base model from config: {base_model_id}")
290
- except:
291
- # Fallback to common base models
292
- base_model_id = "Qwen/Qwen2.5-VL-2B-Instruct"
293
- print(f" Using default base model: {base_model_id}")
294
-
295
- # Load processor
296
- processor_c1 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
297
-
298
- # Load base model
299
- base_model_c1 = Qwen2VLForConditionalGeneration.from_pretrained(
300
- base_model_id,
301
- torch_dtype=torch.float16,
302
- device_map="auto",
303
- trust_remote_code=True
304
- )
305
-
306
- # Load LoRA adapter
307
- model_c1 = PeftModel.from_pretrained(base_model_c1, MODEL_ID_C1)
308
- model_c1 = model_c1.to(device).eval()
309
-
310
- print(" ✅ Chhagan_ML-VL-OCR-v1 (Refined) loaded successfully!")
311
- CHHAGAN_V1_AVAILABLE = True
312
- except Exception as e:
313
- print(f" ❌ Chhagan_ML-VL-OCR-v1 failed: {e}")
314
- processor_c1 = None
315
- model_c1 = None
316
- else:
317
- print(" ⚠️ PEFT not available, skipping LoRA model")
318
-
319
- # Model 2: Chhagan-DocVL-Qwen3 (Qwen3-VL Refined for Documents)
320
- print("\n2️⃣ Loading Chhagan-DocVL-Qwen3 (Qwen3-VL Refined)...")
321
- MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
322
- CHHAGAN_QWEN3_AVAILABLE = False
323
- processor_c2 = None
324
- model_c2 = None
325
-
326
- if QWEN3_AVAILABLE:
327
- try:
328
- # Check if it's a PEFT adapter or full model
329
- try:
330
- # Try loading as PEFT adapter first
331
- if PEFT_AVAILABLE:
332
- config = PeftConfig.from_pretrained(MODEL_ID_C2)
333
- base_model_id = config.base_model_name_or_path
334
- print(f" Detected as LoRA adapter, base: {base_model_id}")
335
-
336
- processor_c2 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
337
- base_model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
338
- base_model_id,
339
- torch_dtype=torch.float16,
340
- device_map="auto",
341
- trust_remote_code=True
342
- )
343
- model_c2 = PeftModel.from_pretrained(base_model_c2, MODEL_ID_C2)
344
- model_c2 = model_c2.to(device).eval()
345
- else:
346
- raise Exception("PEFT not available")
347
- except:
348
- # Load as full fine-tuned model
349
- print(" Loading as full fine-tuned model...")
350
- processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
351
- model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
352
- MODEL_ID_C2,
353
- attn_implementation="flash_attention_2",
354
- torch_dtype=torch.float16,
355
- device_map="auto",
356
- trust_remote_code=True
357
- ).to(device).eval()
358
-
359
- print(" ✅ Chhagan-DocVL-Qwen3 (Refined) loaded successfully!")
360
- CHHAGAN_QWEN3_AVAILABLE = True
361
- except Exception as e:
362
- print(f" ❌ Chhagan-DocVL-Qwen3 failed: {e}")
363
- processor_c2 = None
364
- model_c2 = None
365
- else:
366
- print(" ⚠️ Qwen3VL not available in transformers version")
367
-
368
- # Model 3: Qwen3-VL-2B-Instruct (Baseline for Comparison)
369
- print("\n3️⃣ Loading Qwen3-VL-2B-Instruct (Baseline)...")
370
- MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
371
- QWEN3_BASELINE_AVAILABLE = False
372
- processor_q3 = None
373
- model_q3 = None
374
-
375
- if QWEN3_AVAILABLE:
376
- try:
377
- processor_q3 = AutoProcessor.from_pretrained(MODEL_ID_Q3, trust_remote_code=True)
378
- model_q3 = Qwen3VLForConditionalGeneration.from_pretrained(
379
- MODEL_ID_Q3,
380
- attn_implementation="flash_attention_2",
381
- torch_dtype=torch.float16,
382
- device_map="auto",
383
- trust_remote_code=True
384
- ).to(device).eval()
385
- print(" ✅ Qwen3-VL-2B-Instruct (Baseline) loaded successfully!")
386
- QWEN3_BASELINE_AVAILABLE = True
387
- except Exception as e:
388
- print(f" ❌ Qwen3-VL-2B-Instruct failed: {e}")
389
- else:
390
- print(" ⚠️ Qwen3VL not available in transformers version")
391
-
392
- # Model 4: Nanonets-OCR2-3B (General OCR Fallback)
393
- print("\n4️⃣ Loading Nanonets-OCR2-3B (General OCR)...")
394
- MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
395
- NANONETS_AVAILABLE = False
396
- processor_v = None
397
- model_v = None
398
-
399
- try:
400
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
401
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
402
- MODEL_ID_V,
403
- attn_implementation="flash_attention_2",
404
- trust_remote_code=True,
405
- torch_dtype=torch.float16
406
- ).to(device).eval()
407
- print(" ✅ Nanonets-OCR2-3B loaded successfully!")
408
- NANONETS_AVAILABLE = True
409
- except Exception as e:
410
- print(f" ❌ Nanonets-OCR2-3B failed: {e}")
411
-
412
-
413
- # Summary
414
- print("\n" + "="*70)
415
- print("📊 MODEL STATUS SUMMARY (4 Models)")
416
- print("="*70)
417
- print(f"{'Model Name':<40} {'Status':<15} {'Type'}")
418
- print("-"*70)
419
- print(f"{'Chhagan_ML-VL-OCR-v1':<40} {'✅ Loaded' if CHHAGAN_V1_AVAILABLE else '❌ Failed':<15} {'Refined (LoRA)'}")
420
- print(f"{'Chhagan-DocVL-Qwen3':<40} {'✅ Loaded' if CHHAGAN_QWEN3_AVAILABLE else '❌ Failed':<15} {'Refined (Qwen3)'}")
421
- print(f"{'Qwen3-VL-2B-Instruct':<40} {'✅ Loaded' if QWEN3_BASELINE_AVAILABLE else '❌ Failed':<15} {'Baseline'}")
422
- print(f"{'Nanonets-OCR2-3B':<40} {'✅ Loaded' if NANONETS_AVAILABLE else '❌ Failed':<15} {'General OCR'}")
423
- print("="*70)
424
-
425
- loaded_count = sum([CHHAGAN_V1_AVAILABLE, CHHAGAN_QWEN3_AVAILABLE, QWEN3_BASELINE_AVAILABLE, NANONETS_AVAILABLE])
426
- print(f"\n✨ Total models loaded: {loaded_count}/4")
427
-
428
- if CHHAGAN_V1_AVAILABLE or CHHAGAN_QWEN3_AVAILABLE:
429
- print("💡 Recommendation: Use Chhagan Refined models for best accuracy!")
430
- if QWEN3_BASELINE_AVAILABLE:
431
- print("📊 Comparison Tip: Test Refined vs Baseline to see improvement!")
432
- print()
433
-
434
- def calc_timeout_duration(model_name: str, text: str, image: Image.Image,
435
- max_new_tokens: int, temperature: float, top_p: float,
436
- top_k: int, repetition_penalty: float, gpu_timeout: int):
437
- """Calculate GPU timeout duration based on the last argument."""
438
- try:
439
- return int(gpu_timeout)
440
- except:
441
- return 60
442
-
443
-
444
- @spaces.GPU(duration=calc_timeout_duration)
445
- def generate_image(model_name: str, text: str, image: Image.Image,
446
- max_new_tokens: int, temperature: float, top_p: float,
447
- top_k: int, repetition_penalty: float, gpu_timeout: int):
448
- """
449
- Generates responses using the selected model for image input.
450
- Yields raw text and Markdown-formatted text.
451
- """
452
- # Select model and processor based on model name
453
- if model_name == "Chhagan-ID-OCR-v1 ⭐":
454
- if not CHHAGAN_V1_AVAILABLE:
455
- yield "Chhagan_ML-VL-OCR-v1 model is not available.", "Chhagan_ML-VL-OCR-v1 model is not available."
456
- return
457
- processor = processor_c1
458
- model = model_c1
459
- elif model_name == "Chhagan-DocVL-Qwen3 🔥":
460
- if not CHHAGAN_QWEN3_AVAILABLE:
461
- yield "Chhagan-DocVL-Qwen3 model is not available.", "Chhagan-DocVL-Qwen3 model is not available."
462
- return
463
- processor = processor_c2
464
- model = model_c2
465
- elif model_name == "Qwen3-VL-2B (Baseline) 📊":
466
- if not QWEN3_BASELINE_AVAILABLE:
467
- yield "Qwen3-VL-2B-Instruct baseline model is not available.", "Qwen3-VL-2B-Instruct baseline model is not available."
468
- return
469
- processor = processor_q3
470
- model = model_q3
471
- elif model_name == "Nanonets-OCR2-3B":
472
- if not NANONETS_AVAILABLE:
473
- yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
474
- return
475
- processor = processor_v
476
- model = model_v
477
- else:
478
- yield "Invalid model selected.", "Invalid model selected."
479
- return
480
-
481
- if image is None:
482
- yield "Please upload an image.", "Please upload an image."
483
- return
484
-
485
- # Use multilingual prompt if user query is empty or simple
486
- if not text or text.strip().lower() in ["ocr", "extract", "read", ""]:
487
- text = MULTILINGUAL_OCR_PROMPT
488
-
489
- messages = [{
490
- "role": "user",
491
- "content": [
492
- {"type": "image"},
493
- {"type": "text", "text": text},
494
- ]
495
- }]
496
-
497
- try:
498
- prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
499
- except Exception as e:
500
- print(f"Chat template error: {e}")
501
- # Fallback to simple prompt
502
- prompt_full = text
503
-
504
- inputs = processor(
505
- text=[prompt_full],
506
- images=[image],
507
- return_tensors="pt",
508
- padding=True).to(device)
509
-
510
- streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
511
- generation_kwargs = {
512
- **inputs,
513
- "streamer": streamer,
514
- "max_new_tokens": max_new_tokens,
515
- "do_sample": True,
516
- "temperature": temperature,
517
- "top_p": top_p,
518
- "top_k": top_k,
519
- "repetition_penalty": repetition_penalty,
520
- }
521
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
522
- thread.start()
523
- buffer = ""
524
- for new_text in streamer:
525
- buffer += new_text
526
- buffer = buffer.replace("<|im_end|>", "")
527
- buffer = buffer.replace("<|endoftext|>", "")
528
- time.sleep(0.01)
529
- yield buffer, buffer
530
-
531
-
532
- image_examples = [
533
- ["Extract all text with English translation from this government ID", "examples/5.jpg"],
534
- ["Perform comprehensive multilingual OCR on this document", "examples/4.jpg"],
535
- ["Extract key fields: Name, ID, DOB, Expiry from this card", "examples/2.jpg"],
536
- ["Identify document type and extract all information", "examples/1.jpg"],
537
- ["Convert this page with layout preservation", "examples/3.jpg"],
538
- ]
539
-
540
- # Build model choices dynamically (Order: Refined models first, then baseline)
541
- model_choices = []
542
- if CHHAGAN_V1_AVAILABLE:
543
- model_choices.append("Chhagan-ID-OCR-v1 ⭐")
544
- if CHHAGAN_QWEN3_AVAILABLE:
545
- model_choices.append("Chhagan-DocVL-Qwen3 🔥")
546
- if QWEN3_BASELINE_AVAILABLE:
547
- model_choices.append("Qwen3-VL-2B (Baseline) 📊")
548
- if NANONETS_AVAILABLE:
549
- model_choices.append("Nanonets-OCR2-3B")
550
-
551
- if not model_choices:
552
- model_choices = ["No models available"]
553
-
554
- demo = gr.Blocks()
555
- with demo:
556
- gr.Markdown("# 🌍 **Chhagan Multilingual ID Card OCR**", elem_id="main-title")
557
- gr.Markdown("### *4 AI Models: 2 Refined + 2 Baseline for Comparison*")
558
-
559
- # Model info banner
560
- loaded_models = []
561
- if CHHAGAN_V1_AVAILABLE:
562
- loaded_models.append("ID-OCR-v1 ⭐")
563
- if CHHAGAN_QWEN3_AVAILABLE:
564
- loaded_models.append("DocVL-Qwen3 🔥")
565
- if QWEN3_BASELINE_AVAILABLE:
566
- loaded_models.append("Qwen3-Baseline 📊")
567
- if NANONETS_AVAILABLE:
568
- loaded_models.append("Nanonets")
569
-
570
- model_info = f"**Loaded Models ({len(loaded_models)}/4):** {', '.join(loaded_models)}" if loaded_models else "⚠️ No models loaded"
571
-
572
- gr.Markdown(f"**Status:** {model_info}")
573
- gr.Markdown("**Supported**: Arabic, English, Hindi, Urdu, Persian, French, Spanish + 30 languages")
574
-
575
- with gr.Row():
576
- with gr.Column(scale=2):
577
- image_query = gr.Textbox(
578
- label="💬 Query (Optional)",
579
- placeholder="Leave empty for automatic ID card extraction...",
580
- value=""
581
- )
582
- image_upload = gr.Image(type="pil", label="📤 Upload ID Card / Document", height=290)
583
-
584
- image_submit = gr.Button("🚀 Extract OCR", variant="primary", size="lg")
585
- gr.Examples(
586
- examples=image_examples,
587
- inputs=[image_query, image_upload],
588
- label="📸 Sample Documents"
589
- )
590
-
591
- with gr.Accordion("⚙️ Advanced Settings", open=False):
592
- max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
593
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
594
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
595
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
596
- repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
597
-
598
- with gr.Column(scale=3):
599
- gr.Markdown("## 📄 Extracted Results", elem_id="output-title")
600
- output = gr.Textbox(label="OCR Output (Streaming)", interactive=True, lines=11)
601
- with gr.Accordion("📝 Markdown Preview", open=False):
602
- markdown_output = gr.Markdown(label="Formatted Result")
603
-
604
- model_choice = gr.Radio(
605
- choices=model_choices,
606
- label="🤖 Select OCR Model",
607
- value=model_choices[0] if model_choices else None,
608
- info="⭐🔥 = Refined | 📊 = Baseline | Compare to see improvement!"
609
- )
610
-
611
- # Model descriptions
612
- gr.Markdown("""
613
- **Model Guide:**
614
- - **⭐ ID-OCR-v1**: Fine-tuned LoRA for Government IDs (Best for ID cards)
615
- - **🔥 DocVL-Qwen3**: Fine-tuned Qwen3-VL for Documents (Best for documents)
616
- - **📊 Qwen3-VL Baseline**: Vanilla pretrained (For comparison benchmark)
617
- - **Nanonets**: General OCR fallback
618
- """)
619
-
620
- with gr.Row(elem_id="gpu-duration-container"):
621
- with gr.Column():
622
- gr.Markdown("**⏱️ GPU Duration (seconds)**")
623
- radioanimated_gpu_duration = RadioAnimated(
624
- choices=["60", "90", "120", "180", "240"],
625
- value="60",
626
- elem_id="radioanimated_gpu_duration"
627
- )
628
- gpu_duration_state = gr.Number(value=60, visible=False)
629
-
630
- gr.Markdown("*💡 Tip: Test same document on Refined vs Baseline to see fine-tuning improvement*")
631
-
632
- radioanimated_gpu_duration.change(
633
- fn=apply_gpu_duration,
634
- inputs=radioanimated_gpu_duration,
635
- outputs=[gpu_duration_state],
636
- api_visibility="private"
637
- )
638
-
639
- image_submit.click(
640
- fn=generate_image,
641
- inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
642
- outputs=[output, markdown_output]
643
- )
644
-
645
- # Footer with detailed comparison table
646
- gr.Markdown("""
647
- ---
648
- ### 📊 Model Comparison Table
649
-
650
- | Model | Type | Base Architecture | Training | Specialization | Best For |
651
- |-------|------|------------------|----------|----------------|----------|
652
- | **Chhagan-ID-OCR-v1** ⭐ | Refined (LoRA) | Qwen2.5-VL-2B | Fine-tuned on IDs | Government IDs | Passports, National IDs, Licenses |
653
- | **Chhagan-DocVL-Qwen3** 🔥 | Refined (Full) | Qwen3-VL-2B | Fine-tuned on Docs | Documents | Contracts, Forms, Certificates |
654
- | **Qwen3-VL-2B** 📊 | Baseline | Qwen3-VL-2B | Pretrained only | General Vision | Comparison benchmark |
655
- | **Nanonets-OCR2-3B** | General OCR | Qwen2.5-VL-3B | General OCR training | Text extraction | Receipts, Invoices |
656
-
657
- ### 🎯 Performance Expectations
658
- - **Refined models (⭐🔥)**: 95-98% accuracy on target documents
659
- - **Baseline (📊)**: 75-85% accuracy (shows fine-tuning value)
660
- - **Improvement**: ~15-20% accuracy boost from fine-tuning
661
-
662
- ### 🔍 When to Use Each Model
663
- 1. **Start with Refined models** (⭐ or 🔥) based on document type
664
- 2. **Use Baseline** to benchmark improvement
665
- 3. **Fallback to Nanonets** for edge cases
666
-
667
- **🔒 Privacy**: All processing on-device | No data stored
668
- """)
669
-
670
- if __name__ == "__main__":
671
- demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
 
3
  import uuid
4
  import json
5
  import time
6
+ import re
7
  from threading import Thread
8
+ from typing import Iterable, List, Dict, Any
9
 
10
  import gradio as gr
11
  import spaces
 
46
  from gradio.themes import Soft
47
  from gradio.themes.utils import colors, fonts, sizes
48
 
49
+ # Theme configuration (keeping your existing theme)
50
  colors.steel_blue = colors.Color(
51
  name="steel_blue",
52
  c50="#EBF3F8",
 
122
  #output-title h2 {
123
  font-size: 2.2em !important;
124
  }
 
125
  .ra-wrap{ width: fit-content; }
126
  .ra-inner{
127
  position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
 
140
  transition: transform 0.2s, width 0.2s;
141
  }
142
  .ra-input:checked + .ra-label{ color: black; }
 
143
  .dark .ra-inner { background: var(--neutral-800); }
144
  .dark .ra-label { color: var(--neutral-400); }
145
  .dark .ra-highlight { background: var(--neutral-600); }
 
151
  border: 1px solid var(--border-color-primary);
152
  margin-top: 10px;
153
  }
154
+ .dual-card-container {
155
+ display: grid;
156
+ grid-template-columns: 1fr 1fr;
157
+ gap: 15px;
158
+ }
159
  """
160
 
161
  MAX_MAX_NEW_TOKENS = 4096
 
175
 
176
  print("Using device:", device)
177
 
178
+ # Enhanced multilingual OCR prompt with embedded image extraction
179
+ DUAL_CARD_OCR_PROMPT = """Perform comprehensive OCR extraction on this ID card image. Extract ALL information with maximum English translation accuracy:
180
+
181
+ **EXTRACTION REQUIREMENTS:**
182
+
183
+ 1. **TEXT EXTRACTION**: Extract ALL text in original language with accurate English translation
184
+ 2. **EMBEDDED IMAGES**:
185
+ - Locate and describe profile photo/headshot (if present)
186
+ - Locate and describe signature (if present)
187
+ - Extract any logos or official seals
188
+ 3. **MRZ DATA**: If Machine Readable Zone is present (usually at bottom):
189
+ - Extract complete MRZ lines
190
+ - Parse: Document Type, Country Code, Document Number, Date of Birth, Expiry Date, Nationality
191
+ 4. **STRUCTURED FIELDS**: Extract with English labels:
192
+ - Full Name (in English)
193
+ - ID/Document Number
194
+ - Date of Birth
195
+ - Issue Date & Expiry Date
196
+ - Nationality/Country
197
+ - Address (if present)
198
+ - Document Type
199
+
200
+ **OUTPUT FORMAT:**
201
+ ```markdown
202
+ ## 📋 Document Type
203
+ [Type: Passport/ID Card/License/etc.]
204
+
205
+ ## 🖼️ Embedded Images
206
+ ### Profile Photo
207
+ - Location: [describe position]
208
+ - Description: [describe photo]
209
+
210
+ ### Signature
211
+ - Present: [Yes/No]
212
+ - Location: [describe position if present]
213
+
214
+ ## 📝 Original Text
215
+ [All text in original language with layout preserved]
216
+
217
+ ## 🔤 English Translation
218
+ [Complete accurate English translation]
219
+
220
+ ## 🔑 Key Fields (English)
221
+ - **Full Name**:
222
+ - **ID Number**:
223
+ - **Date of Birth**:
224
+ - **Issue Date**:
225
+ - **Expiry Date**:
226
+ - **Nationality**:
227
+ - **Address**:
228
+
229
+ ## 🔐 MRZ Data (if present)