Chhagan005 commited on
Commit
6007a3e
ยท
verified ยท
1 Parent(s): d471039

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +475 -364
app.py CHANGED
@@ -14,7 +14,6 @@ import numpy as np
14
  from PIL import Image
15
  import cv2
16
 
17
- # Clear cache conflicts
18
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
19
  os.environ["HF_HOME"] = "/tmp/hf_home"
20
 
@@ -26,7 +25,6 @@ from transformers import (
26
  AutoConfig
27
  )
28
 
29
- # PEFT for loading LoRA adapters
30
  try:
31
  from peft import PeftModel, PeftConfig
32
  PEFT_AVAILABLE = True
@@ -34,7 +32,6 @@ except:
34
  PEFT_AVAILABLE = False
35
  print("โš ๏ธ PEFT not available, LoRA adapters cannot be loaded")
36
 
37
- # Try importing Qwen3VL
38
  try:
39
  from transformers import Qwen3VLForConditionalGeneration
40
  QWEN3_AVAILABLE = True
@@ -46,6 +43,7 @@ from transformers.image_utils import load_image
46
  from gradio.themes import Soft
47
  from gradio.themes.utils import colors, fonts, sizes
48
 
 
49
  colors.steel_blue = colors.Color(
50
  name="steel_blue",
51
  c50="#EBF3F8",
@@ -53,7 +51,7 @@ colors.steel_blue = colors.Color(
53
  c200="#A8CCE1",
54
  c300="#7DB3D2",
55
  c400="#529AC3",
56
- c500="#4682B4",
57
  c600="#3E72A0",
58
  c700="#36638C",
59
  c800="#2E5378",
@@ -111,16 +109,12 @@ class SteelBlueTheme(Soft):
111
  color_accent_soft="*primary_100",
112
  block_label_background_fill="*primary_200",
113
  )
114
-
115
  steel_blue_theme = SteelBlueTheme()
116
 
117
  css = """
118
- #main-title h1 {
119
- font-size: 2.3em !important;
120
- }
121
- #output-title h2 {
122
- font-size: 2.2em !important;
123
- }
124
  .ra-wrap{ width: fit-content; }
125
  .ra-inner{
126
  position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
@@ -166,147 +160,108 @@ print("cuda device count:", torch.cuda.device_count())
166
  if torch.cuda.is_available():
167
  print("current device:", torch.cuda.current_device())
168
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
169
-
170
  print("Using device:", device)
171
 
172
- # Enhanced multilingual OCR prompt with embedded image extraction
173
- DUAL_CARD_OCR_PROMPT = """Perform comprehensive OCR extraction on this ID card image. Extract ALL information with maximum English translation accuracy:
174
-
175
- EXTRACTION REQUIREMENTS:
176
-
177
- 1. TEXT EXTRACTION: Extract ALL text in original language with accurate English translation
178
- 2. EMBEDDED IMAGES:
179
- - Locate and describe profile photo/headshot (if present)
180
- - Locate and describe signature (if present)
181
- - Extract any logos or official seals
182
- 3. MRZ DATA: If Machine Readable Zone is present (usually at bottom):
183
- - Extract complete MRZ lines
184
- - Parse: Document Type, Country Code, Document Number, Date of Birth, Expiry Date, Nationality
185
- 4. STRUCTURED FIELDS: Extract with English labels:
186
- - Full Name (in English)
187
- - ID/Document Number
188
- - Date of Birth
189
- - Issue Date & Expiry Date
190
- - Nationality/Country
191
- - Address (if present)
192
- - Document Type
193
-
194
- OUTPUT FORMAT:
195
- Document Type: [Type: Passport/ID Card/License/etc.]
196
-
197
- Embedded Images:
198
- - Profile Photo Location: [describe position]
199
- - Profile Photo Description: [describe photo]
200
- - Signature Present: [Yes/No]
201
- - Signature Location: [describe position if present]
202
-
203
- Original Text:
204
- [All text in original language with layout preserved]
205
-
206
- English Translation:
207
- [Complete accurate English translation]
208
-
209
- Key Fields (English):
210
- - Full Name:
211
- - ID Number:
212
- - Date of Birth:
213
- - Issue Date:
214
- - Expiry Date:
215
- - Nationality:
216
- - Address:
217
-
218
- MRZ Data (if present):
219
- Raw MRZ Lines: [extract here]
220
- Parsed MRZ:
221
- - Document Type:
222
- - Country Code:
223
- - Document Number:
224
- - Date of Birth:
225
- - Expiry Date:
226
- - Nationality:
227
- - Sex:
228
-
229
- ACCURACY REQUIREMENTS:
230
- - English translation must be 95%+ accurate
231
- - Preserve all numbers and dates exactly
232
- - MRZ must be character-perfect
233
- - Do not skip any fields"""
234
-
235
- SINGLE_SIDE_PROMPT = """Extract all information from this ID card side:
236
- - All visible text (original + English translation)
237
- - Profile photo location and description
238
- - Signature (if present)
239
- - MRZ data (if present at bottom)
240
- - All key fields in structured format
241
-
242
- Provide complete extraction with high English accuracy."""
243
 
244
- class RadioAnimated(gr.HTML):
245
- def __init__(self, choices, value=None, **kwargs):
246
- if not choices or len(choices) < 2:
247
- raise ValueError("RadioAnimated requires at least 2 choices.")
248
- if value is None:
249
- value = choices[0]
250
 
251
- uid = uuid.uuid4().hex[:8]
252
- group_name = f"ra-{uid}"
253
 
254
- inputs_html = "\\n".join(
255
- f"""
256
- <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
257
- <label class="ra-label" for="{group_name}-{i}">{c}</label>
258
- """
259
- for i, c in enumerate(choices)
260
- )
261
 
262
- html_template = f"""
263
- <div class="ra-wrap" data-ra="{uid}">
264
- <div class="ra-inner">
265
- <div class="ra-highlight"></div>
266
- {inputs_html}
267
- </div>
268
- </div>
269
- """
270
 
271
- js_on_load = r"""
272
- (() => {
273
- const wrap = element.querySelector('.ra-wrap');
274
- const inner = element.querySelector('.ra-inner');
275
- const highlight = element.querySelector('.ra-highlight');
276
- const inputs = Array.from(element.querySelectorAll('.ra-input'));
277
- if (!inputs.length) return;
278
- const choices = inputs.map(i => i.value);
279
- function setHighlightByIndex(idx) {
280
- const n = choices.length;
281
- const pct = 100 / n;
282
- highlight.style.width = `calc(${pct}% - 6px)`;
283
- highlight.style.transform = `translateX(${idx * 100}%)`;
284
- }
285
- function setCheckedByValue(val, shouldTrigger=false) {
286
- const idx = Math.max(0, choices.indexOf(val));
287
- inputs.forEach((inp, i) => { inp.checked = (i === idx); });
288
- setHighlightByIndex(idx);
289
- props.value = choices[idx];
290
- if (shouldTrigger) trigger('change', props.value);
291
- }
292
- setCheckedByValue(props.value ?? choices[0], false);
293
- inputs.forEach((inp) => {
294
- inp.addEventListener('change', () => {
295
- setCheckedByValue(inp.value, true);
296
- });
297
- });
298
- })();
299
- """
300
 
301
- super().__init__(
302
- value=value,
303
- html_template=html_template,
304
- js_on_load=js_on_load,
305
- **kwargs
306
- )
307
 
308
- def apply_gpu_duration(val: str):
309
- return int(val)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  # ===== MODEL LOADING =====
312
 
@@ -314,7 +269,7 @@ print("\n" + "="*70)
314
  print("๐Ÿš€ LOADING ALL 4 MODELS")
315
  print("="*70 + "\n")
316
 
317
- # Model 1: Chhagan_ML-VL-OCR-v1 (LoRA Fine-tuned for ID Cards)
318
  print("1๏ธโƒฃ Loading Chhagan_ML-VL-OCR-v1 (LoRA Refined)...")
319
  MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
320
  CHHAGAN_V1_AVAILABLE = False
@@ -330,7 +285,7 @@ if PEFT_AVAILABLE:
330
  except:
331
  base_model_id = "Qwen/Qwen2.5-VL-2B-Instruct"
332
  print(f" Using default base model: {base_model_id}")
333
-
334
  processor_c1 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
335
  base_model_c1 = Qwen2VLForConditionalGeneration.from_pretrained(
336
  base_model_id,
@@ -340,15 +295,14 @@ if PEFT_AVAILABLE:
340
  )
341
  model_c1 = PeftModel.from_pretrained(base_model_c1, MODEL_ID_C1)
342
  model_c1 = model_c1.to(device).eval()
343
-
344
- print(" โœ… Chhagan_ML-VL-OCR-v1 (Refined) loaded successfully!")
345
  CHHAGAN_V1_AVAILABLE = True
346
  except Exception as e:
347
  print(f" โŒ Chhagan_ML-VL-OCR-v1 failed: {e}")
348
  else:
349
  print(" โš ๏ธ PEFT not available, skipping LoRA model")
350
 
351
- # Model 2: Chhagan-DocVL-Qwen3 (Qwen3-VL Refined for Documents)
352
  print("\n2๏ธโƒฃ Loading Chhagan-DocVL-Qwen3 (Qwen3-VL Refined)...")
353
  MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
354
  CHHAGAN_QWEN3_AVAILABLE = False
@@ -362,7 +316,6 @@ if QWEN3_AVAILABLE:
362
  config = PeftConfig.from_pretrained(MODEL_ID_C2)
363
  base_model_id = config.base_model_name_or_path
364
  print(f" Detected as LoRA adapter, base: {base_model_id}")
365
-
366
  processor_c2 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
367
  base_model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
368
  base_model_id,
@@ -384,15 +337,14 @@ if QWEN3_AVAILABLE:
384
  device_map="auto",
385
  trust_remote_code=True
386
  ).to(device).eval()
387
-
388
- print(" โœ… Chhagan-DocVL-Qwen3 (Refined) loaded successfully!")
389
  CHHAGAN_QWEN3_AVAILABLE = True
390
  except Exception as e:
391
  print(f" โŒ Chhagan-DocVL-Qwen3 failed: {e}")
392
  else:
393
  print(" โš ๏ธ Qwen3VL not available in transformers version")
394
 
395
- # Model 3: Qwen3-VL-2B-Instruct (Baseline for Comparison)
396
  print("\n3๏ธโƒฃ Loading Qwen3-VL-2B-Instruct (Baseline)...")
397
  MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
398
  QWEN3_BASELINE_AVAILABLE = False
@@ -409,14 +361,14 @@ if QWEN3_AVAILABLE:
409
  device_map="auto",
410
  trust_remote_code=True
411
  ).to(device).eval()
412
- print(" โœ… Qwen3-VL-2B-Instruct (Baseline) loaded successfully!")
413
  QWEN3_BASELINE_AVAILABLE = True
414
  except Exception as e:
415
  print(f" โŒ Qwen3-VL-2B-Instruct failed: {e}")
416
  else:
417
  print(" โš ๏ธ Qwen3VL not available in transformers version")
418
 
419
- # Model 4: Nanonets-OCR2-3B (General OCR Fallback)
420
  print("\n4๏ธโƒฃ Loading Nanonets-OCR2-3B (General OCR)...")
421
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
422
  NANONETS_AVAILABLE = False
@@ -436,7 +388,6 @@ try:
436
  except Exception as e:
437
  print(f" โŒ Nanonets-OCR2-3B failed: {e}")
438
 
439
- # Summary
440
  print("\n" + "="*70)
441
  print("๐Ÿ“Š MODEL STATUS SUMMARY (4 Models)")
442
  print("="*70)
@@ -447,14 +398,79 @@ print(f"{'Chhagan-DocVL-Qwen3':<40} {'โœ… Loaded' if CHHAGAN_QWEN3_AVAILABLE els
447
  print(f"{'Qwen3-VL-2B-Instruct':<40} {'โœ… Loaded' if QWEN3_BASELINE_AVAILABLE else 'โŒ Failed':<15} {'Baseline'}")
448
  print(f"{'Nanonets-OCR2-3B':<40} {'โœ… Loaded' if NANONETS_AVAILABLE else 'โŒ Failed':<15} {'General OCR'}")
449
  print("="*70)
450
-
451
  loaded_count = sum([CHHAGAN_V1_AVAILABLE, CHHAGAN_QWEN3_AVAILABLE, QWEN3_BASELINE_AVAILABLE, NANONETS_AVAILABLE])
452
  print(f"\nโœจ Total models loaded: {loaded_count}/4")
453
 
454
- def calc_timeout_duration(model_name: str, text: str, image_front: Image.Image, image_back: Image.Image,
455
- max_new_tokens: int, temperature: float, top_p: float,
456
- top_k: int, repetition_penalty: float, gpu_timeout: int):
457
- """Calculate GPU timeout duration - doubled for dual card processing"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  try:
459
  base_timeout = int(gpu_timeout)
460
  if image_front is not None and image_back is not None:
@@ -464,67 +480,195 @@ def calc_timeout_duration(model_name: str, text: str, image_front: Image.Image,
464
  return 120
465
 
466
 
467
- def extract_embedded_images_info(text_output: str) -> Dict[str, Any]:
468
- """Parse extracted text to identify profile photo and signature mentions"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  result = {
470
- "has_profile_photo": False,
471
- "profile_location": "",
472
- "has_signature": False,
473
- "signature_location": "",
474
- "mrz_data": ""
 
 
475
  }
476
-
477
- if re.search(r"(profile|photo|picture|image|headshot)", text_output, re.IGNORECASE):
478
- result["has_profile_photo"] = True
479
- photo_match = re.search(r"(top|bottom|left|right|corner|center).{0,20}(photo|image|picture)", text_output, re.IGNORECASE)
480
- if photo_match:
481
- result["profile_location"] = photo_match.group(0)
482
-
483
- if re.search(r"signature", text_output, re.IGNORECASE):
484
- result["has_signature"] = True
485
- sig_match = re.search(r"(signature).{0,50}", text_output, re.IGNORECASE)
486
- if sig_match:
487
- result["signature_location"] = sig_match.group(0)
488
-
489
- mrz_pattern = r"^[A-Z][<0-9A-Z]{20,}$"
490
- mrz_lines = [line.strip() for line in text_output.split("\n") if re.match(mrz_pattern, line.strip())]
491
- if mrz_lines:
492
- result["mrz_data"] = "\n".join(mrz_lines)
493
-
 
 
 
494
  return result
495
 
496
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  @spaces.GPU(duration=calc_timeout_duration)
498
- def generate_dual_card_ocr(model_name: str, text: str, image_front: Image.Image, image_back: Image.Image,
 
499
  max_new_tokens: int, temperature: float, top_p: float,
500
  top_k: int, repetition_penalty: float, gpu_timeout: int):
501
- """
502
- Enhanced OCR processing for front and back ID cards with embedded image extraction
503
- """
504
  if model_name == "Chhagan-ID-OCR-v1 โญ":
505
  if not CHHAGAN_V1_AVAILABLE:
506
  yield "Chhagan_ML-VL-OCR-v1 model is not available.", "Chhagan_ML-VL-OCR-v1 model is not available."
507
  return
508
- processor = processor_c1
509
- model = model_c1
510
  elif model_name == "Chhagan-DocVL-Qwen3 ๐Ÿ”ฅ":
511
  if not CHHAGAN_QWEN3_AVAILABLE:
512
  yield "Chhagan-DocVL-Qwen3 model is not available.", "Chhagan-DocVL-Qwen3 model is not available."
513
  return
514
- processor = processor_c2
515
- model = model_c2
516
  elif model_name == "Qwen3-VL-2B (Baseline) ๐Ÿ“Š":
517
  if not QWEN3_BASELINE_AVAILABLE:
518
  yield "Qwen3-VL-2B-Instruct baseline model is not available.", "Qwen3-VL-2B-Instruct baseline model is not available."
519
  return
520
- processor = processor_q3
521
- model = model_q3
522
  elif model_name == "Nanonets-OCR2-3B":
523
  if not NANONETS_AVAILABLE:
524
  yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
525
  return
526
- processor = processor_v
527
- model = model_v
528
  else:
529
  yield "Invalid model selected.", "Invalid model selected."
530
  return
@@ -533,131 +677,85 @@ def generate_dual_card_ocr(model_name: str, text: str, image_front: Image.Image,
533
  yield "Please upload at least one card image (front or back).", "Please upload at least one card image (front or back)."
534
  return
535
 
536
- if not text or text.strip().lower() in ["ocr", "extract", "read", ""]:
537
- text = DUAL_CARD_OCR_PROMPT
538
-
539
  full_output = ""
540
-
541
- # Process Front Card
 
 
542
  if image_front is not None:
543
- full_output += "# ๐ŸŽด FRONT CARD EXTRACTION\n\n"
 
544
  yield full_output, full_output
545
-
546
- messages_front = [{
547
- "role": "user",
548
- "content": [
549
- {"type": "image"},
550
- {"type": "text", "text": text},
551
- ]
552
- }]
553
-
554
- try:
555
- prompt_front = processor.apply_chat_template(messages_front, tokenize=False, add_generation_prompt=True)
556
- except:
557
- prompt_front = text
558
-
559
- inputs_front = processor(
560
- text=[prompt_front],
561
- images=[image_front],
562
- return_tensors="pt",
563
- padding=True).to(device)
564
-
565
- streamer_front = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
566
- generation_kwargs_front = {
567
- **inputs_front,
568
- "streamer": streamer_front,
569
- "max_new_tokens": max_new_tokens,
570
- "do_sample": True,
571
- "temperature": temperature,
572
- "top_p": top_p,
573
- "top_k": top_k,
574
- "repetition_penalty": repetition_penalty,
575
- }
576
- thread_front = Thread(target=model.generate, kwargs=generation_kwargs_front)
577
- thread_front.start()
578
-
579
- buffer_front = ""
580
- for new_text in streamer_front:
581
- buffer_front += new_text
582
- buffer_front = buffer_front.replace("<|im_end|>", "").replace("<|endoftext|>", "")
583
  time.sleep(0.01)
584
- current_output = full_output + buffer_front
585
- yield current_output, current_output
586
-
587
- full_output += buffer_front + "\n\n"
588
- thread_front.join()
589
-
590
- # Process Back Card
591
  if image_back is not None:
592
- full_output += "\n\n---\n\n# ๐ŸŽด BACK CARD EXTRACTION\n\n"
 
593
  yield full_output, full_output
594
-
595
- messages_back = [{
596
- "role": "user",
597
- "content": [
598
- {"type": "image"},
599
- {"type": "text", "text": text},
600
- ]
601
- }]
602
-
603
- try:
604
- prompt_back = processor.apply_chat_template(messages_back, tokenize=False, add_generation_prompt=True)
605
- except:
606
- prompt_back = text
607
-
608
- inputs_back = processor(
609
- text=[prompt_back],
610
- images=[image_back],
611
- return_tensors="pt",
612
- padding=True).to(device)
613
-
614
- streamer_back = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
615
- generation_kwargs_back = {
616
- **inputs_back,
617
- "streamer": streamer_back,
618
- "max_new_tokens": max_new_tokens,
619
- "do_sample": True,
620
- "temperature": temperature,
621
- "top_p": top_p,
622
- "top_k": top_k,
623
- "repetition_penalty": repetition_penalty,
624
- }
625
- thread_back = Thread(target=model.generate, kwargs=generation_kwargs_back)
626
- thread_back.start()
627
-
628
- buffer_back = ""
629
- for new_text in streamer_back:
630
- buffer_back += new_text
631
- buffer_back = buffer_back.replace("<|im_end|>", "").replace("<|endoftext|>", "")
632
  time.sleep(0.01)
633
- current_output = full_output + buffer_back
634
- yield current_output, current_output
635
-
636
- full_output += buffer_back
637
- thread_back.join()
638
-
639
- # Add summary section
640
- full_output += "\n\n---\n\n## ๐Ÿ“Š Extraction Summary\n"
641
-
642
- embedded_info = extract_embedded_images_info(full_output)
643
-
644
- full_output += f"\n### ๐Ÿ–ผ๏ธ Embedded Content Detection:\n"
645
- full_output += f"- **Profile Photo**: {'โœ… Detected' if embedded_info['has_profile_photo'] else 'โŒ Not found'}\n"
646
- if embedded_info['profile_location']:
647
- full_output += f" - Location: {embedded_info['profile_location']}\n"
648
- full_output += f"- **Signature**: {'โœ… Detected' if embedded_info['has_signature'] else 'โŒ Not found'}\n"
649
- if embedded_info['signature_location']:
650
- full_output += f" - Details: {embedded_info['signature_location']}\n"
651
-
652
- if embedded_info['mrz_data']:
653
- full_output += f"\n### ๐Ÿ” MRZ Data Extracted:\n```\n{embedded_info['mrz_data']}\n```\n"
654
-
655
- full_output += f"\n**โœจ Extraction Complete** | Model: {model_name}\n"
656
-
657
  yield full_output, full_output
658
 
659
 
660
- # Build model choices dynamically
 
661
  model_choices = []
662
  if CHHAGAN_V1_AVAILABLE:
663
  model_choices.append("Chhagan-ID-OCR-v1 โญ")
@@ -671,18 +769,20 @@ if NANONETS_AVAILABLE:
671
  if not model_choices:
672
  model_choices = ["No models available"]
673
 
674
- # Example images
675
  dual_card_examples = [
676
  ["Extract complete information from both sides", "examples/5.jpg", None],
677
  ["Multilingual OCR with MRZ extraction", "examples/4.jpg", None],
678
  ["Extract profile photo and signature locations", "examples/2.jpg", None],
679
  ]
680
 
 
 
 
681
  demo = gr.Blocks(css=css, theme=steel_blue_theme)
682
  with demo:
683
  gr.Markdown("# ๐ŸŒ **Chhagan Dual-Card ID OCR System**", elem_id="main-title")
684
- gr.Markdown("### *Advanced OCR with Profile Image, Signature & MRZ Extraction*")
685
-
686
  loaded_models = []
687
  if CHHAGAN_V1_AVAILABLE:
688
  loaded_models.append("ID-OCR-v1 โญ")
@@ -692,53 +792,52 @@ with demo:
692
  loaded_models.append("Qwen3-Baseline ๐Ÿ“Š")
693
  if NANONETS_AVAILABLE:
694
  loaded_models.append("Nanonets")
695
-
696
  model_info = f"**Loaded Models ({len(loaded_models)}/4):** {', '.join(loaded_models)}" if loaded_models else "โš ๏ธ No models loaded"
697
-
698
  gr.Markdown(f"**Status:** {model_info}")
699
- gr.Markdown("**Features**: โœ… Dual Card Upload | โœ… Profile Photo Detection | โœ… Signature Extraction | โœ… MRZ Reading | โœ… 95%+ English Accuracy")
700
-
701
  with gr.Row():
702
  with gr.Column(scale=2):
703
  image_query = gr.Textbox(
704
- label="๐Ÿ’ฌ Custom Query (Optional)",
705
- placeholder="Leave empty for automatic extraction of all data including images and MRZ...",
706
  value=""
707
  )
708
-
709
  gr.Markdown("### ๐Ÿ“ค Upload ID Cards")
710
  with gr.Row():
711
  image_front = gr.Image(type="pil", label="๐ŸŽด Front Card", height=250)
712
  image_back = gr.Image(type="pil", label="๐ŸŽด Back Card (Optional)", height=250)
713
 
714
- image_submit = gr.Button("๐Ÿš€ Extract Complete Data", variant="primary", size="lg")
715
-
716
  gr.Examples(
717
  examples=dual_card_examples,
718
  inputs=[image_query, image_front, image_back],
719
  label="๐Ÿ“ธ Sample ID Cards"
720
  )
721
-
722
  with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
723
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
724
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
725
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
726
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
727
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
728
-
729
  with gr.Column(scale=3):
730
- gr.Markdown("## ๐Ÿ“„ Complete Extraction Results", elem_id="output-title")
731
- output = gr.Textbox(label="OCR Output (Streaming)", interactive=True, lines=15)
732
- with gr.Accordion("๐Ÿ“ Markdown Preview", open=True):
733
  markdown_output = gr.Markdown(label="Formatted Result")
734
 
735
  model_choice = gr.Radio(
736
  choices=model_choices,
737
  label="๐Ÿค– Select OCR Model",
738
  value=model_choices[0] if model_choices else None,
739
- info="โญ๐Ÿ”ฅ = Refined for high accuracy | ๐Ÿ“Š = Baseline"
740
  )
741
-
742
  with gr.Row(elem_id="gpu-duration-container"):
743
  with gr.Column():
744
  gr.Markdown("**โฑ๏ธ GPU Duration (seconds)**")
@@ -748,62 +847,72 @@ with demo:
748
  elem_id="radioanimated_gpu_duration"
749
  )
750
  gpu_duration_state = gr.Number(value=120, visible=False)
751
-
752
  gr.Markdown("""
753
- **โœจ Extraction Includes:**
754
- - ๐Ÿ“ Complete text extraction (original + English)
755
- - ๐Ÿ–ผ๏ธ Profile photo location detection
756
- - โœ๏ธ Signature identification
757
- - ๐Ÿ” MRZ data parsing
758
- - ๐ŸŽฏ Structured key fields
 
 
759
  """)
760
 
761
  radioanimated_gpu_duration.change(
762
- fn=apply_gpu_duration,
763
- inputs=radioanimated_gpu_duration,
764
- outputs=[gpu_duration_state],
765
  api_visibility="private"
766
  )
767
 
768
  image_submit.click(
769
  fn=generate_dual_card_ocr,
770
- inputs=[model_choice, image_query, image_front, image_back, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
 
 
 
 
 
771
  outputs=[output, markdown_output]
772
  )
773
-
774
  gr.Markdown("""
775
  ---
776
- ### ๐ŸŽฏ Key Features
777
-
778
  | Feature | Status | Description |
779
  |---------|--------|-------------|
780
- | **Dual Card Upload** | โœ… | Process front and back simultaneously |
781
- | **Profile Photo Detection** | โœ… | Automatically locates and describes headshot |
782
- | **Signature Extraction** | โœ… | Identifies signature presence and location |
783
- | **MRZ Reading** | โœ… | Parses Machine Readable Zone data |
784
- | **English Translation** | โœ… | 95%+ accuracy for non-English text |
785
- | **Multilingual Support** | โœ… | 30+ languages including Arabic, Hindi, Urdu |
786
-
787
- ### ๐Ÿ“‹ Supported Document Types
788
- - Government ID Cards (front + back)
789
- - Passports (with MRZ)
790
- - Driver's Licenses
791
- - Residence Permits
792
- - Visas and Travel Documents
793
-
794
- ### ๐Ÿ”’ Privacy & Security
795
- - All processing on-device
 
 
 
796
  - No data stored or transmitted
797
  - GDPR compliant
798
-
799
- **๐Ÿ’ก Pro Tip**: Upload both front and back for complete extraction including hidden MRZ data on back side!
800
  """)
801
 
 
802
  if __name__ == "__main__":
803
  print("\n" + "="*70)
804
  print("๐Ÿš€ STARTING GRADIO INTERFACE...")
805
  print("="*70 + "\n")
806
-
807
  try:
808
  demo.queue(max_size=50).launch(
809
  server_name="0.0.0.0",
@@ -816,3 +925,5 @@ if __name__ == "__main__":
816
  print(f"โŒ Launch error: {e}")
817
  import traceback
818
  traceback.print_exc()
 
 
 
14
  from PIL import Image
15
  import cv2
16
 
 
17
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
18
  os.environ["HF_HOME"] = "/tmp/hf_home"
19
 
 
25
  AutoConfig
26
  )
27
 
 
28
  try:
29
  from peft import PeftModel, PeftConfig
30
  PEFT_AVAILABLE = True
 
32
  PEFT_AVAILABLE = False
33
  print("โš ๏ธ PEFT not available, LoRA adapters cannot be loaded")
34
 
 
35
  try:
36
  from transformers import Qwen3VLForConditionalGeneration
37
  QWEN3_AVAILABLE = True
 
43
  from gradio.themes import Soft
44
  from gradio.themes.utils import colors, fonts, sizes
45
 
46
+ # ===== THEME SETUP =====
47
  colors.steel_blue = colors.Color(
48
  name="steel_blue",
49
  c50="#EBF3F8",
 
51
  c200="#A8CCE1",
52
  c300="#7DB3D2",
53
  c400="#529AC3",
54
+ c500="#4682B4",
55
  c600="#3E72A0",
56
  c700="#36638C",
57
  c800="#2E5378",
 
109
  color_accent_soft="*primary_100",
110
  block_label_background_fill="*primary_200",
111
  )
112
+
113
  steel_blue_theme = SteelBlueTheme()
114
 
115
  css = """
116
+ #main-title h1 { font-size: 2.3em !important; }
117
+ #output-title h2 { font-size: 2.2em !important; }
 
 
 
 
118
  .ra-wrap{ width: fit-content; }
119
  .ra-inner{
120
  position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
 
160
  if torch.cuda.is_available():
161
  print("current device:", torch.cuda.current_device())
162
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
 
163
  print("Using device:", device)
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ # ===== PROMPTS =====
 
 
 
 
 
167
 
168
+ STEP1_EXTRACT_PROMPT = """You are a precision OCR engine. Your ONLY job is to extract raw text from this ID card image.
 
169
 
170
+ STRICT RULES:
171
+ - Copy ALL text EXACTLY as it appears in original language/script (Hindi, Arabic, Urdu, Chinese, Devanagari, etc.)
172
+ - DO NOT translate anything in this step
173
+ - DO NOT add any interpretation or explanation
174
+ - Preserve layout and line breaks exactly
175
+ - Extract every number, date, code, and character precisely
176
+ - Also detect visual element presence
177
 
178
+ Output ONLY in this exact structured format, nothing else:
 
 
 
 
 
 
 
179
 
180
+ PHOTO_PRESENT: yes/no
181
+ PHOTO_LOCATION: [top-left / top-right / center-left / center-right / bottom-left / not found]
182
+ SIGNATURE_PRESENT: yes/no
183
+ SIGNATURE_LOCATION: [bottom-center / bottom-right / bottom-left / not found]
184
+ MRZ_PRESENT: yes/no
185
+ DETECTED_LANGUAGE: [Hindi / Arabic / Urdu / Chinese / English / Mixed / etc.]
186
+ ---TEXT_START---
187
+ [Every piece of text in original script, line by line, layout preserved exactly]
188
+ ---TEXT_END---"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
 
 
 
 
 
 
190
 
191
+ STEP2_TEMPLATE = """You are a multilingual KYC document expert with 95%+ translation accuracy.
192
+
193
+ DOCUMENT METADATA (from Step 1 analysis):
194
+ - Photo Present: {photo_present} | Location: {photo_location}
195
+ - Signature Present: {sig_present} | Location: {sig_location}
196
+ - MRZ Present: {mrz_present}
197
+ - Detected Language: {detected_lang}
198
+
199
+ RAW EXTRACTED TEXT (original script):
200
+ {raw_text}
201
+
202
+ YOUR TASKS:
203
+ 1. If text is non-English โ†’ translate to English with 95%+ accuracy
204
+ 2. If text is already English โ†’ copy as-is
205
+ 3. Extract all key KYC fields
206
+ 4. Output EXACTLY in the format below โ€” no extra commentary
207
+
208
+ ---
209
+
210
+ ## ๐Ÿ–ผ๏ธ Visual Elements
211
+
212
+ | Element | Status | Location |
213
+ |---------|--------|----------|
214
+ | ๐Ÿ“ท Profile Photo | {photo_present} | {photo_location} |
215
+ | โœ๏ธ Signature | {sig_present} | {sig_location} |
216
+ | ๐Ÿ” MRZ Zone | {mrz_present} | Bottom strip |
217
+
218
+ ---
219
+
220
+ ## ๐Ÿ“œ Original Script
221
+
222
+ {raw_text}
223
+
224
+ ---
225
+
226
+ ## ๐ŸŒ English Translation
227
+ [Write complete English translation here. If already English, write: Already in English โ€” then copy text]
228
+
229
+ ---
230
+
231
+ ## ๐Ÿ—‚๏ธ Key Fields (English)
232
+
233
+ | Field | Value |
234
+ |-------|-------|
235
+ | ๐Ÿ“„ Document Type | |
236
+ | ๐Ÿ‘ค Full Name | |
237
+ | ๐Ÿ”ข ID / Document Number | |
238
+ | ๐ŸŽ‚ Date of Birth | |
239
+ | ๐Ÿ“… Issue Date | |
240
+ | โณ Expiry Date | |
241
+ | ๐ŸŒ Nationality | |
242
+ | โšง๏ธ Gender | |
243
+ | ๐Ÿ  Address | |
244
+ | ๐Ÿ‘จ Father / Guardian | |
245
+ | ๐Ÿ›๏ธ Issuing Authority | |
246
+
247
+ ---
248
+
249
+ ## ๐Ÿ” MRZ Data
250
+ [Raw MRZ lines here โ€” if not present write: NOT PRESENT]
251
+
252
+ **Parsed MRZ:**
253
+ | Field | Value |
254
+ |-------|-------|
255
+ | Document Type | |
256
+ | Country Code | |
257
+ | Document Number | |
258
+ | Date of Birth | |
259
+ | Expiry Date | |
260
+ | Nationality | |
261
+ | Sex | |
262
+
263
+ ---"""
264
+
265
 
266
  # ===== MODEL LOADING =====
267
 
 
269
  print("๐Ÿš€ LOADING ALL 4 MODELS")
270
  print("="*70 + "\n")
271
 
272
+ # Model 1: Chhagan_ML-VL-OCR-v1 (LoRA Fine-tuned)
273
  print("1๏ธโƒฃ Loading Chhagan_ML-VL-OCR-v1 (LoRA Refined)...")
274
  MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
275
  CHHAGAN_V1_AVAILABLE = False
 
285
  except:
286
  base_model_id = "Qwen/Qwen2.5-VL-2B-Instruct"
287
  print(f" Using default base model: {base_model_id}")
288
+
289
  processor_c1 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
290
  base_model_c1 = Qwen2VLForConditionalGeneration.from_pretrained(
291
  base_model_id,
 
295
  )
296
  model_c1 = PeftModel.from_pretrained(base_model_c1, MODEL_ID_C1)
297
  model_c1 = model_c1.to(device).eval()
298
+ print(" โœ… Chhagan_ML-VL-OCR-v1 loaded successfully!")
 
299
  CHHAGAN_V1_AVAILABLE = True
300
  except Exception as e:
301
  print(f" โŒ Chhagan_ML-VL-OCR-v1 failed: {e}")
302
  else:
303
  print(" โš ๏ธ PEFT not available, skipping LoRA model")
304
 
305
+ # Model 2: Chhagan-DocVL-Qwen3
306
  print("\n2๏ธโƒฃ Loading Chhagan-DocVL-Qwen3 (Qwen3-VL Refined)...")
307
  MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
308
  CHHAGAN_QWEN3_AVAILABLE = False
 
316
  config = PeftConfig.from_pretrained(MODEL_ID_C2)
317
  base_model_id = config.base_model_name_or_path
318
  print(f" Detected as LoRA adapter, base: {base_model_id}")
 
319
  processor_c2 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
320
  base_model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
321
  base_model_id,
 
337
  device_map="auto",
338
  trust_remote_code=True
339
  ).to(device).eval()
340
+ print(" โœ… Chhagan-DocVL-Qwen3 loaded successfully!")
 
341
  CHHAGAN_QWEN3_AVAILABLE = True
342
  except Exception as e:
343
  print(f" โŒ Chhagan-DocVL-Qwen3 failed: {e}")
344
  else:
345
  print(" โš ๏ธ Qwen3VL not available in transformers version")
346
 
347
+ # Model 3: Qwen3-VL-2B-Instruct (Baseline)
348
  print("\n3๏ธโƒฃ Loading Qwen3-VL-2B-Instruct (Baseline)...")
349
  MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
350
  QWEN3_BASELINE_AVAILABLE = False
 
361
  device_map="auto",
362
  trust_remote_code=True
363
  ).to(device).eval()
364
+ print(" โœ… Qwen3-VL-2B-Instruct loaded successfully!")
365
  QWEN3_BASELINE_AVAILABLE = True
366
  except Exception as e:
367
  print(f" โŒ Qwen3-VL-2B-Instruct failed: {e}")
368
  else:
369
  print(" โš ๏ธ Qwen3VL not available in transformers version")
370
 
371
+ # Model 4: Nanonets-OCR2-3B
372
  print("\n4๏ธโƒฃ Loading Nanonets-OCR2-3B (General OCR)...")
373
  MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
374
  NANONETS_AVAILABLE = False
 
388
  except Exception as e:
389
  print(f" โŒ Nanonets-OCR2-3B failed: {e}")
390
 
 
391
  print("\n" + "="*70)
392
  print("๐Ÿ“Š MODEL STATUS SUMMARY (4 Models)")
393
  print("="*70)
 
398
  print(f"{'Qwen3-VL-2B-Instruct':<40} {'โœ… Loaded' if QWEN3_BASELINE_AVAILABLE else 'โŒ Failed':<15} {'Baseline'}")
399
  print(f"{'Nanonets-OCR2-3B':<40} {'โœ… Loaded' if NANONETS_AVAILABLE else 'โŒ Failed':<15} {'General OCR'}")
400
  print("="*70)
 
401
  loaded_count = sum([CHHAGAN_V1_AVAILABLE, CHHAGAN_QWEN3_AVAILABLE, QWEN3_BASELINE_AVAILABLE, NANONETS_AVAILABLE])
402
  print(f"\nโœจ Total models loaded: {loaded_count}/4")
403
 
404
+
405
+ # ===== HELPER: RadioAnimated =====
406
+
407
class RadioAnimated(gr.HTML):
    """Segmented radio control rendered as custom HTML with an animated highlight.

    Subclasses ``gr.HTML``: the markup is a hidden-radio + label group, and the
    on-load JS slides a highlight bar under the checked option and forwards the
    selection back to Gradio via the component's ``props`` / ``trigger`` hooks.
    """

    def __init__(self, choices, value=None, **kwargs):
        # A segmented control needs at least two segments to toggle between.
        if not choices or len(choices) < 2:
            raise ValueError("RadioAnimated requires at least 2 choices.")
        # Default to the first choice when no initial value is given.
        if value is None:
            value = choices[0]
        # Random suffix keeps each instance's radio group name unique on the page,
        # so multiple RadioAnimated components don't steal each other's selection.
        uid = uuid.uuid4().hex[:8]
        group_name = f"ra-{uid}"
        # One <input>/<label> pair per choice; CSS styles .ra-input/.ra-label.
        inputs_html = "\n".join(
            f"""
            <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
            <label class="ra-label" for="{group_name}-{i}">{c}</label>
            """
            for i, c in enumerate(choices)
        )
        html_template = f"""
        <div class="ra-wrap" data-ra="{uid}">
        <div class="ra-inner">
        <div class="ra-highlight"></div>
        {inputs_html}
        </div>
        </div>
        """
        # Runs once when the component mounts: wires up highlight positioning
        # and change propagation. `element`, `props`, `trigger` are provided by
        # the Gradio HTML-component JS context.
        js_on_load = r"""
        (() => {
        const wrap = element.querySelector('.ra-wrap');
        const inner = element.querySelector('.ra-inner');
        const highlight = element.querySelector('.ra-highlight');
        const inputs = Array.from(element.querySelectorAll('.ra-input'));
        if (!inputs.length) return;
        const choices = inputs.map(i => i.value);
        function setHighlightByIndex(idx) {
        const n = choices.length;
        const pct = 100 / n;
        highlight.style.width = `calc(${pct}% - 6px)`;
        highlight.style.transform = `translateX(${idx * 100}%)`;
        }
        function setCheckedByValue(val, shouldTrigger=false) {
        const idx = Math.max(0, choices.indexOf(val));
        inputs.forEach((inp, i) => { inp.checked = (i === idx); });
        setHighlightByIndex(idx);
        props.value = choices[idx];
        if (shouldTrigger) trigger('change', props.value);
        }
        setCheckedByValue(props.value ?? choices[0], false);
        inputs.forEach((inp) => {
        inp.addEventListener('change', () => {
        setCheckedByValue(inp.value, true);
        });
        });
        })();
        """
        # NOTE(review): `html_template` / `js_on_load` kwargs assume a gr.HTML
        # variant that accepts them — confirm against the installed Gradio version.
        super().__init__(
            value=value,
            html_template=html_template,
            js_on_load=js_on_load,
            **kwargs
        )
465
+
466
+
467
def apply_gpu_duration(val: str):
    """Coerce the GPU-duration radio selection into an integer number of seconds."""
    seconds = int(val)
    return seconds
469
+
470
+
471
+ def calc_timeout_duration(model_name, text, image_front, image_back,
472
+ max_new_tokens, temperature, top_p,
473
+ top_k, repetition_penalty, gpu_timeout):
474
  try:
475
  base_timeout = int(gpu_timeout)
476
  if image_front is not None and image_back is not None:
 
480
  return 120
481
 
482
 
483
+ # ===== STEP 1: RAW EXTRACTION (NO TRANSLATION) =====
484
+
485
def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
    """Step 1 of the OCR pipeline: extract raw text from a card image.

    Builds a single-image chat prompt around STEP1_EXTRACT_PROMPT, runs one
    (blocking, non-streaming) generation pass capped at 512 new tokens, and
    returns only the newly generated text (prompt tokens stripped).

    Args:
        model / processor: loaded VLM pair selected by the caller.
        image: PIL image of one card side.
        device: torch device the inputs are moved to.
        temperature, top_p, top_k, repetition_penalty: sampling controls.

    Returns:
        str: decoded model output (original script, untranslated).
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": STEP1_EXTRACT_PROMPT},
        ]
    }]
    try:
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are not
        # swallowed. Some processors lack a chat template; fall back to raw prompt.
        prompt = STEP1_EXTRACT_PROMPT

    inputs = processor(
        text=[prompt],
        images=[image],
        return_tensors="pt",
        padding=True
    ).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
        )
    # Slice off the prompt so only newly generated tokens are decoded.
    input_len = inputs['input_ids'].shape[1]
    generated = output_ids[:, input_len:]
    return processor.batch_decode(generated, skip_special_tokens=True)[0]
518
+
519
+
520
+ # ===== PARSE STEP 1 OUTPUT =====
521
+
522
def parse_step1_output(raw_output: str) -> dict:
    """Parse the step-1 extraction transcript into a metadata dict.

    Reads the PHOTO/SIGNATURE/MRZ flags, locations, detected language, and the
    raw text delimited by ---TEXT_START---/---TEXT_END--- out of the model's
    free-form step-1 answer. Missing fields fall back to "N/A"/"Unknown", and
    the original transcript is kept when no text block is found.
    """
    def grab(pattern, fallback="N/A"):
        # Case-insensitive single-field lookup with a default.
        hit = re.search(pattern, raw_output, re.IGNORECASE)
        return hit.group(1).strip() if hit else fallback

    def yes_no(token):
        return "โœ… Yes" if token.lower() == "yes" else "โŒ No"

    parsed = {
        "photo_present": yes_no(grab(r"PHOTO_PRESENT:\s*(yes|no)")),
        "photo_location": grab(r"PHOTO_LOCATION:\s*([^\n]+)"),
        "sig_present": yes_no(grab(r"SIGNATURE_PRESENT:\s*(yes|no)")),
        "sig_location": grab(r"SIGNATURE_LOCATION:\s*([^\n]+)"),
        "mrz_present": yes_no(grab(r"MRZ_PRESENT:\s*(yes|no)")),
        "detected_lang": grab(r"DETECTED_LANGUAGE:\s*([^\n]+)", "Unknown"),
        "original_text": raw_output,
    }

    body = re.search(r"---TEXT_START---\n?(.*?)---TEXT_END---", raw_output, re.DOTALL)
    if body:
        parsed["original_text"] = body.group(1).strip()

    return parsed
554
 
555
 
556
+ # ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
557
+
558
def run_step2_structure(model, processor, metadata: dict, device,
                        max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """Step 2 of the pipeline: translate + structure the step-1 metadata.

    Fills STEP2_TEMPLATE with the parsed step-1 fields, starts generation on a
    background thread, and returns (streamer, thread) so the caller can yield
    partial text while generation is still running. This is a text-only pass:
    no image is re-sent to the model.

    Returns:
        tuple[TextIteratorStreamer, Thread]: streamer to iterate, thread to join.
    """
    step2_prompt = STEP2_TEMPLATE.format(
        photo_present=metadata["photo_present"],
        photo_location=metadata["photo_location"],
        sig_present=metadata["sig_present"],
        sig_location=metadata["sig_location"],
        mrz_present=metadata["mrz_present"],
        detected_lang=metadata["detected_lang"],
        raw_text=metadata["original_text"],
    )

    messages = [{"role": "user", "content": [{"type": "text", "text": step2_prompt}]}]
    try:
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are not
        # swallowed; processors without a chat template fall back to raw prompt.
        prompt = step2_prompt

    inputs = processor(text=[prompt], return_tensors="pt", padding=True).to(device)

    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
    # Generate on a worker thread so the streamer can be consumed immediately.
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    return streamer, thread
592
+
593
+
594
+ # ===== UNIFIED DEDUPLICATED SUMMARY =====
595
+
596
def build_unified_summary(front_result: str, back_result: str) -> str:
    """Merge the front/back "Key Fields" tables into one deduplicated record.

    Scrapes the Key Fields markdown table out of each side's structured output,
    then emits a single table marking each field as Front only, Back only,
    present on both sides, or a mismatch when the two sides disagree.
    """
    table_pattern = r"## ๐Ÿ—‚๏ธ Key Fields.*?\n\|.*?\n\|[-| ]+\n(.*?)(?=\n---|\Z)"

    def harvest(markdown):
        # Field -> Value pairs pulled from one side's Key Fields table.
        found = {}
        body = re.search(table_pattern, markdown, re.DOTALL)
        if body is None:
            return found
        for row in body.group(1).strip().split("\n"):
            cells = [c.strip() for c in row.split("|") if c.strip()]
            if len(cells) < 2:
                continue
            key = re.sub(r"[^\w\s/]", "", cells[0]).strip()
            val = cells[1].strip()
            if val and val != "โ€”":
                found[key] = val
        return found

    front = harvest(front_result)
    back = harvest(back_result)
    # First-occurrence order: all front keys, then any back-only keys.
    ordered = list(dict.fromkeys(list(front.keys()) + list(back.keys())))

    parts = [
        "## ๐Ÿ”„ Unified Deduplicated Record\n\n",
        "> *Unique fields from both sides merged. Conflicts flagged with โš ๏ธ.*\n\n",
        "| Field | Value | Source |\n",
        "|-------|-------|--------|\n",
    ]

    for key in ordered:
        fv = front.get(key, "")
        bv = back.get(key, "")
        if fv and bv:
            if fv.lower() == bv.lower():
                parts.append(f"| {key} | {fv} | Front + Back โœ… |\n")
            else:
                parts.append(f"| {key} | Front: **{fv}** / Back: **{bv}** | โš ๏ธ Mismatch |\n")
        elif fv:
            parts.append(f"| {key} | {fv} | Front only |\n")
        elif bv:
            parts.append(f"| {key} | {bv} | Back only |\n")

    return "".join(parts) + "\n"
637
+
638
+
639
+ # ===== MAIN OCR FUNCTION =====
640
+
641
  @spaces.GPU(duration=calc_timeout_duration)
642
+ def generate_dual_card_ocr(model_name: str, text: str,
643
+ image_front: Image.Image, image_back: Image.Image,
644
  max_new_tokens: int, temperature: float, top_p: float,
645
  top_k: int, repetition_penalty: float, gpu_timeout: int):
646
+
647
+ # Model selection
 
648
  if model_name == "Chhagan-ID-OCR-v1 โญ":
649
  if not CHHAGAN_V1_AVAILABLE:
650
  yield "Chhagan_ML-VL-OCR-v1 model is not available.", "Chhagan_ML-VL-OCR-v1 model is not available."
651
  return
652
+ processor, model = processor_c1, model_c1
653
+
654
  elif model_name == "Chhagan-DocVL-Qwen3 ๐Ÿ”ฅ":
655
  if not CHHAGAN_QWEN3_AVAILABLE:
656
  yield "Chhagan-DocVL-Qwen3 model is not available.", "Chhagan-DocVL-Qwen3 model is not available."
657
  return
658
+ processor, model = processor_c2, model_c2
659
+
660
  elif model_name == "Qwen3-VL-2B (Baseline) ๐Ÿ“Š":
661
  if not QWEN3_BASELINE_AVAILABLE:
662
  yield "Qwen3-VL-2B-Instruct baseline model is not available.", "Qwen3-VL-2B-Instruct baseline model is not available."
663
  return
664
+ processor, model = processor_q3, model_q3
665
+
666
  elif model_name == "Nanonets-OCR2-3B":
667
  if not NANONETS_AVAILABLE:
668
  yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
669
  return
670
+ processor, model = processor_v, model_v
671
+
672
  else:
673
  yield "Invalid model selected.", "Invalid model selected."
674
  return
 
677
  yield "Please upload at least one card image (front or back).", "Please upload at least one card image (front or back)."
678
  return
679
 
 
 
 
680
  full_output = ""
681
+ front_result = ""
682
+ back_result = ""
683
+
684
+ # ===== FRONT CARD =====
685
  if image_front is not None:
686
+ full_output += "# ๐ŸŽด FRONT CARD\n\n"
687
+ full_output += "โณ **Step 1 / 2 โ€” Extracting raw text (original script, no translation)...**\n\n"
688
  yield full_output, full_output
689
+
690
+ step1_raw = run_step1_extraction(
691
+ model, processor, image_front, device,
692
+ temperature, top_p, top_k, repetition_penalty
693
+ )
694
+ front_meta = parse_step1_output(step1_raw)
695
+
696
+ full_output += f"โœ… **Step 1 Complete** โ€” ๐ŸŒ Detected Language: **{front_meta['detected_lang']}**\n\n"
697
+ full_output += "โณ **Step 2 / 2 โ€” Translating to English & building structured output...**\n\n"
698
+ yield full_output, full_output
699
+
700
+ streamer_f, thread_f = run_step2_structure(
701
+ model, processor, front_meta, device,
702
+ max_new_tokens, temperature, top_p, top_k, repetition_penalty
703
+ )
704
+
705
+ buffer_f = ""
706
+ for new_text in streamer_f:
707
+ buffer_f += new_text
708
+ buffer_f = buffer_f.replace("<|im_end|>", "").replace("<|endoftext|>", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  time.sleep(0.01)
710
+ yield full_output + buffer_f, full_output + buffer_f
711
+
712
+ full_output += buffer_f + "\n\n"
713
+ front_result = buffer_f
714
+ thread_f.join()
715
+
716
+ # ===== BACK CARD =====
717
  if image_back is not None:
718
+ full_output += "\n\n---\n\n# ๐ŸŽด BACK CARD\n\n"
719
+ full_output += "โณ **Step 1 / 2 โ€” Extracting raw text (original script, no translation)...**\n\n"
720
  yield full_output, full_output
721
+
722
+ step1_raw_back = run_step1_extraction(
723
+ model, processor, image_back, device,
724
+ temperature, top_p, top_k, repetition_penalty
725
+ )
726
+ back_meta = parse_step1_output(step1_raw_back)
727
+
728
+ full_output += f"โœ… **Step 1 Complete** โ€” ๐ŸŒ Detected Language: **{back_meta['detected_lang']}**\n\n"
729
+ full_output += "โณ **Step 2 / 2 โ€” Translating to English & building structured output...**\n\n"
730
+ yield full_output, full_output
731
+
732
+ streamer_b, thread_b = run_step2_structure(
733
+ model, processor, back_meta, device,
734
+ max_new_tokens, temperature, top_p, top_k, repetition_penalty
735
+ )
736
+
737
+ buffer_b = ""
738
+ for new_text in streamer_b:
739
+ buffer_b += new_text
740
+ buffer_b = buffer_b.replace("<|im_end|>", "").replace("<|endoftext|>", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
741
  time.sleep(0.01)
742
+ yield full_output + buffer_b, full_output + buffer_b
743
+
744
+ full_output += buffer_b
745
+ back_result = buffer_b
746
+ thread_b.join()
747
+
748
+ # ===== UNIFIED SUMMARY (only when both sides uploaded) =====
749
+ if image_front is not None and image_back is not None:
750
+ full_output += "\n\n---\n\n"
751
+ full_output += build_unified_summary(front_result, back_result)
752
+
753
+ full_output += f"\n\n---\n\n**โœจ Extraction Complete** | Model: `{model_name}` | Pipeline: OCR โ†’ Language Detect โ†’ Translate โ†’ Structure\n"
 
 
 
 
 
 
 
 
 
 
 
 
754
  yield full_output, full_output
755
 
756
 
757
+ # ===== BUILD MODEL CHOICES =====
758
+
759
  model_choices = []
760
  if CHHAGAN_V1_AVAILABLE:
761
  model_choices.append("Chhagan-ID-OCR-v1 โญ")
 
769
  if not model_choices:
770
  model_choices = ["No models available"]
771
 
 
772
  dual_card_examples = [
773
  ["Extract complete information from both sides", "examples/5.jpg", None],
774
  ["Multilingual OCR with MRZ extraction", "examples/4.jpg", None],
775
  ["Extract profile photo and signature locations", "examples/2.jpg", None],
776
  ]
777
 
778
+
779
+ # ===== GRADIO UI =====
780
+
781
  demo = gr.Blocks(css=css, theme=steel_blue_theme)
782
  with demo:
783
  gr.Markdown("# ๐ŸŒ **Chhagan Dual-Card ID OCR System**", elem_id="main-title")
784
+ gr.Markdown("### *Advanced OCR โ€ข Auto Language Detection โ€ข English Translation โ€ข MRZ Parsing*")
785
+
786
  loaded_models = []
787
  if CHHAGAN_V1_AVAILABLE:
788
  loaded_models.append("ID-OCR-v1 โญ")
 
792
  loaded_models.append("Qwen3-Baseline ๐Ÿ“Š")
793
  if NANONETS_AVAILABLE:
794
  loaded_models.append("Nanonets")
795
+
796
  model_info = f"**Loaded Models ({len(loaded_models)}/4):** {', '.join(loaded_models)}" if loaded_models else "โš ๏ธ No models loaded"
 
797
  gr.Markdown(f"**Status:** {model_info}")
798
+ gr.Markdown("**Pipeline:** โœ… Step 1: Raw OCR (original script) โ†’ โœ… Step 2: Auto Translate to English โ†’ โœ… Structured Output โ†’ โœ… Front+Back Deduplication")
799
+
800
  with gr.Row():
801
  with gr.Column(scale=2):
802
  image_query = gr.Textbox(
803
+ label="๐Ÿ’ฌ Custom Query (Optional)",
804
+ placeholder="Leave empty for automatic full extraction (OCR + translate + structure)...",
805
  value=""
806
  )
807
+
808
  gr.Markdown("### ๐Ÿ“ค Upload ID Cards")
809
  with gr.Row():
810
  image_front = gr.Image(type="pil", label="๐ŸŽด Front Card", height=250)
811
  image_back = gr.Image(type="pil", label="๐ŸŽด Back Card (Optional)", height=250)
812
 
813
+ image_submit = gr.Button("๐Ÿš€ Extract + Translate + Structure", variant="primary", size="lg")
814
+
815
  gr.Examples(
816
  examples=dual_card_examples,
817
  inputs=[image_query, image_front, image_back],
818
  label="๐Ÿ“ธ Sample ID Cards"
819
  )
820
+
821
  with gr.Accordion("โš™๏ธ Advanced Settings", open=False):
822
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
823
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
824
  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
825
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
826
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
827
+
828
  with gr.Column(scale=3):
829
+ gr.Markdown("## ๐Ÿ“„ Extraction Results", elem_id="output-title")
830
+ output = gr.Textbox(label="Raw Output (Streaming)", interactive=True, lines=15)
831
+ with gr.Accordion("๐Ÿ“ Markdown Preview (Structured)", open=True):
832
  markdown_output = gr.Markdown(label="Formatted Result")
833
 
834
  model_choice = gr.Radio(
835
  choices=model_choices,
836
  label="๐Ÿค– Select OCR Model",
837
  value=model_choices[0] if model_choices else None,
838
+ info="โญ๐Ÿ”ฅ = Fine-tuned for ID Cards | ๐Ÿ“Š = Baseline | General OCR = Nanonets"
839
  )
840
+
841
  with gr.Row(elem_id="gpu-duration-container"):
842
  with gr.Column():
843
  gr.Markdown("**โฑ๏ธ GPU Duration (seconds)**")
 
847
  elem_id="radioanimated_gpu_duration"
848
  )
849
  gpu_duration_state = gr.Number(value=120, visible=False)
850
+
851
  gr.Markdown("""
852
+ **โœจ What This Extracts:**
853
+ - ๐Ÿ“œ Original script (Hindi, Arabic, Urdu, Chinese, etc.)
854
+ - ๐ŸŒ Auto English translation (95%+ accuracy)
855
+ - ๐Ÿ–ผ๏ธ Profile photo location & description
856
+ - โœ๏ธ Signature detection & location
857
+ - ๐Ÿ” MRZ raw lines + parsed fields
858
+ - ๐Ÿ—‚๏ธ Structured key fields (Name, DOB, ID No., etc.)
859
+ - ๐Ÿ”„ Front + Back unified deduplicated record
860
  """)
861
 
862
  radioanimated_gpu_duration.change(
863
+ fn=apply_gpu_duration,
864
+ inputs=radioanimated_gpu_duration,
865
+ outputs=[gpu_duration_state],
866
  api_visibility="private"
867
  )
868
 
869
  image_submit.click(
870
  fn=generate_dual_card_ocr,
871
+ inputs=[
872
+ model_choice, image_query,
873
+ image_front, image_back,
874
+ max_new_tokens, temperature, top_p,
875
+ top_k, repetition_penalty, gpu_duration_state
876
+ ],
877
  outputs=[output, markdown_output]
878
  )
879
+
880
  gr.Markdown("""
881
  ---
882
+ ### ๐ŸŽฏ Feature Matrix
883
+
884
  | Feature | Status | Description |
885
  |---------|--------|-------------|
886
+ | **Two-Step Pipeline** | โœ… | Step 1 = Raw OCR, Step 2 = Translate + Structure |
887
+ | **Auto Language Detect** | โœ… | Hindi, Arabic, Urdu, Chinese, 30+ languages |
888
+ | **English Translation** | โœ… | 95%+ accuracy, only when non-English detected |
889
+ | **Original Script Preserved** | โœ… | Both original + translated shown side by side |
890
+ | **Profile Photo Detection** | โœ… | Location described in visual elements box |
891
+ | **Signature Extraction** | โœ… | Detected and located per card side |
892
+ | **MRZ Parsing** | โœ… | Raw lines + structured parsed fields |
893
+ | **Dual Card Deduplication** | โœ… | Front + Back merged, mismatches flagged โš ๏ธ |
894
+ | **Markdown Structured Output** | โœ… | Tables, code blocks, section headers |
895
+
896
+ ### ๐Ÿ“‹ Supported Documents
897
+ - ๐Ÿ‡ฎ๐Ÿ‡ณ Aadhaar Card, PAN Card, Voter ID
898
+ - ๐ŸŒ International Passports (with MRZ)
899
+ - ๐Ÿชช Driver's Licenses
900
+ - ๐Ÿ›๏ธ Government ID Cards (30+ countries)
901
+ - ๐Ÿ“‹ Residence Permits & Visas
902
+
903
+ ### ๐Ÿ”’ Privacy
904
+ - All processing on-device (GPU)
905
  - No data stored or transmitted
906
  - GDPR compliant
907
+
908
+ **๐Ÿ’ก Pro Tip**: Upload both front and back for full deduplication and MRZ cross-validation!
909
  """)
910
 
911
+
912
  if __name__ == "__main__":
913
  print("\n" + "="*70)
914
  print("๐Ÿš€ STARTING GRADIO INTERFACE...")
915
  print("="*70 + "\n")
 
916
  try:
917
  demo.queue(max_size=50).launch(
918
  server_name="0.0.0.0",
 
925
  print(f"โŒ Launch error: {e}")
926
  import traceback
927
  traceback.print_exc()
928
+
929
+