Chhagan005 commited on
Commit
31a316e
·
verified ·
1 Parent(s): 589e015

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +839 -699
app.py CHANGED
@@ -1,18 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
- import random
3
  import uuid
4
- import json
5
  import time
6
  import re
 
7
  from threading import Thread
8
- from typing import Iterable, List, Dict, Any
9
 
10
  import gradio as gr
11
  import spaces
12
  import torch
13
- import numpy as np
14
  from PIL import Image
15
- import cv2
16
 
17
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
18
  os.environ["HF_HOME"] = "/tmp/hf_home"
@@ -22,7 +32,7 @@ from transformers import (
22
  Qwen2VLForConditionalGeneration,
23
  AutoProcessor,
24
  TextIteratorStreamer,
25
- AutoConfig
26
  )
27
 
28
  try:
@@ -30,7 +40,7 @@ try:
30
  PEFT_AVAILABLE = True
31
  except:
32
  PEFT_AVAILABLE = False
33
- print("⚠️ PEFT not available, LoRA adapters cannot be loaded")
34
 
35
  try:
36
  from transformers import Qwen3VLForConditionalGeneration
@@ -39,68 +49,36 @@ except:
39
  QWEN3_AVAILABLE = False
40
  print("⚠️ Qwen3VL not available in current transformers version")
41
 
42
- from transformers.image_utils import load_image
43
  from gradio.themes import Soft
44
  from gradio.themes.utils import colors, fonts, sizes
45
 
46
- # ===== THEME SETUP =====
47
  colors.steel_blue = colors.Color(
48
  name="steel_blue",
49
- c50="#EBF3F8",
50
- c100="#D3E5F0",
51
- c200="#A8CCE1",
52
- c300="#7DB3D2",
53
- c400="#529AC3",
54
- c500="#4682B4",
55
- c600="#3E72A0",
56
- c700="#36638C",
57
- c800="#2E5378",
58
- c900="#264364",
59
- c950="#1E3450",
60
  )
61
 
62
  class SteelBlueTheme(Soft):
63
- def __init__(
64
- self,
65
- *,
66
- primary_hue: colors.Color | str = colors.gray,
67
- secondary_hue: colors.Color | str = colors.steel_blue,
68
- neutral_hue: colors.Color | str = colors.slate,
69
- text_size: sizes.Size | str = sizes.text_lg,
70
- font: fonts.Font | str | Iterable[fonts.Font | str] = (
71
- fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
72
- ),
73
- font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
74
- fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
75
- ),
76
- ):
77
- super().__init__(
78
- primary_hue=primary_hue,
79
- secondary_hue=secondary_hue,
80
- neutral_hue=neutral_hue,
81
- text_size=text_size,
82
- font=font,
83
- font_mono=font_mono,
84
- )
85
  super().set(
86
  background_fill_primary="*primary_50",
87
  background_fill_primary_dark="*primary_900",
88
  body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
89
  body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
90
  button_primary_text_color="white",
91
- button_primary_text_color_hover="white",
92
  button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
93
  button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
94
- button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
95
- button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
96
  button_secondary_text_color="black",
97
- button_secondary_text_color_hover="white",
98
  button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
99
  button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
100
- button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
101
- button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
102
  slider_color="*secondary_500",
103
- slider_color_dark="*secondary_600",
104
  block_title_text_weight="600",
105
  block_border_width="3px",
106
  block_shadow="*shadow_drop_lg",
@@ -116,485 +94,436 @@ css = """
116
  #main-title h1 { font-size: 2.3em !important; }
117
  #output-title h2 { font-size: 2.2em !important; }
118
  .ra-wrap{ width: fit-content; }
119
- .ra-inner{
120
- position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
121
- background: var(--neutral-200); border-radius: 9999px; overflow: hidden;
122
- }
123
  .ra-input{ display: none; }
124
- .ra-label{
125
- position: relative; z-index: 2; padding: 8px 16px;
126
- font-family: inherit; font-size: 14px; font-weight: 600;
127
- color: var(--neutral-500); cursor: pointer; transition: color 0.2s; white-space: nowrap;
128
- }
129
- .ra-highlight{
130
- position: absolute; z-index: 1; top: 6px; left: 6px;
131
- height: calc(100% - 12px); border-radius: 9999px;
132
- background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);
133
- transition: transform 0.2s, width 0.2s;
134
- }
135
  .ra-input:checked + .ra-label{ color: black; }
136
  .dark .ra-inner { background: var(--neutral-800); }
137
  .dark .ra-label { color: var(--neutral-400); }
138
  .dark .ra-highlight { background: var(--neutral-600); }
139
  .dark .ra-input:checked + .ra-label { color: white; }
140
- #gpu-duration-container {
141
- padding: 10px;
142
- border-radius: 8px;
143
- background: var(--background-fill-secondary);
144
- border: 1px solid var(--border-color-primary);
145
- margin-top: 10px;
146
- }
147
  """
148
 
149
  MAX_MAX_NEW_TOKENS = 4096
150
  DEFAULT_MAX_NEW_TOKENS = 1024
151
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
152
-
153
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
154
 
155
- print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
156
- print("torch.__version__ =", torch.__version__)
157
- print("torch.version.cuda =", torch.version.cuda)
158
- print("cuda available:", torch.cuda.is_available())
159
- print("cuda device count:", torch.cuda.device_count())
160
  if torch.cuda.is_available():
161
- print("current device:", torch.cuda.current_device())
162
- print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
163
- print("Using device:", device)
164
-
165
 
166
- # ===== PROMPTS =====
167
 
168
- STEP1_EXTRACT_PROMPT = """You are a precision OCR engine. Your ONLY job is to extract raw text from this ID card image.
 
 
169
 
170
- STRICT RULES:
171
- - Copy ALL text EXACTLY as it appears in original language/script (Hindi, Arabic, Urdu, Chinese, Devanagari, etc.)
172
- - DO NOT translate anything in this step
173
- - DO NOT add any interpretation or explanation
174
- - Preserve layout and line breaks exactly
175
- - Extract every number, date, code, and character precisely
176
- - Also detect visual element presence
177
-
178
- Output ONLY in this exact structured format, nothing else:
179
 
 
180
  PHOTO_PRESENT: yes/no
181
- PHOTO_LOCATION: [top-left / top-right / center-left / center-right / bottom-left / not found]
182
  SIGNATURE_PRESENT: yes/no
183
- SIGNATURE_LOCATION: [bottom-center / bottom-right / bottom-left / not found]
184
  MRZ_PRESENT: yes/no
185
- DETECTED_LANGUAGE: [Hindi / Arabic / Urdu / Chinese / English / Mixed / etc.]
186
  ---TEXT_START---
187
- [Every piece of text in original script, line by line, layout preserved exactly]
188
- ---TEXT_END---"""
 
 
 
 
189
 
 
 
 
 
 
190
 
191
- STEP2_TEMPLATE = """You are a multilingual KYC document expert with 95%+ translation accuracy.
192
 
193
- DOCUMENT METADATA (from Step 1 analysis):
194
- - Photo Present: {photo_present} | Location: {photo_location}
195
- - Signature Present: {sig_present} | Location: {sig_location}
196
- - MRZ Present: {mrz_present}
197
- - Detected Language: {detected_lang}
198
 
199
- RAW EXTRACTED TEXT (original script):
200
- {raw_text}
201
 
202
- YOUR TASKS:
203
- 1. If text is non-English → translate to English with 95%+ accuracy
204
- 2. If text is already English → copy as-is
205
- 3. Extract all key KYC fields
206
- 4. Output EXACTLY in the format below
207
-
208
- ⚠️ CRITICAL EXTRACTION RULES — READ BEFORE EXTRACTING:
209
-
210
- RULE 1 — COUNTRY/INSTITUTION vs PERSON NAME:
211
- - Text appearing at the TOP of ID cards like "Sultanate of Oman", "SULTANATE OF OMAN",
212
- "Republic of India", "United Arab Emirates", "ROYAL OMAN POLICE" etc. is the
213
- ISSUING COUNTRY or INSTITUTION NAME — THIS IS NOT THE PERSON'S NAME
214
- - Extract person's name ONLY from explicit name labels:
215
- الإسم / الاسم (Arabic) | NAME: | 姓名 (Chinese) | नाम (Hindi) | ИМЯ (Russian)
216
- - In MRZ: TD1 Line 3 = person's name (e.g., FERIL<SUNNA = "Feril Sunna")
217
-
218
- RULE 2 — CIVIL ID vs BARCODE/CHIP ID:
219
- - Long hex strings printed on barcodes/chips (e.g., 7E400DD3D032A7C) are card
220
- SERIAL/CHIP numbers — NOT the Civil ID
221
- - The actual Civil/Document ID is under labels:
222
- الرقم المدني (Civil No.) | رقم الهوية (ID No.) | ID NO. | CIVIL NO.
223
- - Actual Civil ID is typically 8-12 alphanumeric characters (e.g., 73616576)
224
-
225
- RULE 3 — MRZ IS GROUND TRUTH (do not override it):
226
- - MRZ lines (uppercase A-Z, 0-9, < characters) are cryptographically verified
227
- - MRZ date format is YYMMDD: first 2 = year, middle 2 = month, last 2 = day
228
- Example: 030512 = year 03 → 2003, month 05, day 12 → 12/05/2003
229
- Example: 260908 = year 26 → 2026, month 09, day 08 → 08/09/2026
230
- - MRZ Sex: M = Male, F = Female
231
- - If MRZ present, extract name/DOB/sex/expiry/nationality FROM MRZ LINES, not from visual text
232
 
233
- ---
 
234
 
235
- ## 🖼️ Visual Elements
236
 
237
- | Element | Status | Location |
238
- |---------|--------|----------|
239
- | 📷 Profile Photo | {photo_present} | {photo_location} |
240
- | ✍️ Signature | {sig_present} | {sig_location} |
241
- | 🔐 MRZ Zone | {mrz_present} | Bottom strip |
242
 
243
- ---
 
244
 
245
- ## 📜 Original Script
246
 
247
- {raw_text}
 
 
 
248
 
 
 
 
 
249
 
250
- ---
 
 
 
251
 
252
- ## 🌐 English Translation
253
-
254
- [Write complete English translation here. If already English, write: Already in English — then copy text]
 
 
255
 
 
256
 
257
  ---
258
 
259
- ## 🗂️ Key Fields (English)
260
-
261
- | Field | Value |
262
- |-------|-------|
263
- | 📄 Document Type | |
264
- | 👤 Full Name | |
265
- | 🔢 Civil / Document Number | |
266
- | 🎂 Date of Birth | |
267
- | 📅 Issue Date | |
268
- | ⏳ Expiry Date | |
269
- | 🌍 Nationality | |
270
- | ⚧️ Gender | |
271
- | 🏠 Address | |
272
- | 👨 Father / Guardian | |
273
- | 🏛️ Issuing Authority | |
274
 
275
  ---
276
 
277
- ## 🔐 MRZ Data
 
 
 
 
278
 
279
- [Raw MRZ lines here — copy exactly as-is. If not present write: NOT PRESENT]
280
 
 
281
 
282
- **Parsed MRZ:**
283
- | Field | Value |
284
- |-------|-------|
285
- | Document Type | |
286
- | Country Code | |
287
- | Document Number | |
288
- | Date of Birth | |
289
- | Expiry Date | |
290
- | Nationality | |
291
- | Sex | |
292
 
293
  ---"""
294
 
295
 
296
- # ===== MODEL LOADING =====
 
 
297
 
298
  print("\n" + "="*70)
299
- print("🚀 LOADING ALL 4 MODELS")
300
- print("="*70 + "\n")
 
 
 
 
 
 
 
 
301
 
302
- # Model 1: Chhagan_ML-VL-OCR-v1 (LoRA Fine-tuned)
303
- print("1️⃣ Loading Chhagan_ML-VL-OCR-v1 (LoRA Refined)...")
304
  MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
305
  CHHAGAN_V1_AVAILABLE = False
306
- processor_c1 = None
307
- model_c1 = None
308
 
309
  if PEFT_AVAILABLE:
310
  try:
311
  try:
312
  config = PeftConfig.from_pretrained(MODEL_ID_C1)
313
- base_model_id = config.base_model_name_or_path
314
- print(f" Base model from config: {base_model_id}")
315
  except:
316
- base_model_id = "Qwen/Qwen2.5-VL-2B-Instruct"
317
- print(f" Using default base model: {base_model_id}")
318
-
319
- processor_c1 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
320
- base_model_c1 = Qwen2VLForConditionalGeneration.from_pretrained(
321
- base_model_id,
322
- torch_dtype=torch.float16,
323
- device_map="auto",
324
- trust_remote_code=True
325
- )
326
- model_c1 = PeftModel.from_pretrained(base_model_c1, MODEL_ID_C1)
327
- model_c1 = model_c1.to(device).eval()
328
- print(" ✅ Chhagan_ML-VL-OCR-v1 loaded successfully!")
329
  CHHAGAN_V1_AVAILABLE = True
330
  except Exception as e:
331
- print(f" ❌ Chhagan_ML-VL-OCR-v1 failed: {e}")
332
  else:
333
- print(" ⚠️ PEFT not available, skipping LoRA model")
334
 
335
- # Model 2: Chhagan-DocVL-Qwen3
336
- print("\n2️⃣ Loading Chhagan-DocVL-Qwen3 (Qwen3-VL Refined)...")
337
  MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
338
  CHHAGAN_QWEN3_AVAILABLE = False
339
- processor_c2 = None
340
- model_c2 = None
341
 
342
  if QWEN3_AVAILABLE:
343
  try:
344
  try:
345
  if PEFT_AVAILABLE:
346
  config = PeftConfig.from_pretrained(MODEL_ID_C2)
347
- base_model_id = config.base_model_name_or_path
348
- print(f" Detected as LoRA adapter, base: {base_model_id}")
349
- processor_c2 = AutoProcessor.from_pretrained(base_model_id, trust_remote_code=True)
350
- base_model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
351
- base_model_id,
352
- torch_dtype=torch.float16,
353
- device_map="auto",
354
- trust_remote_code=True
355
- )
356
- model_c2 = PeftModel.from_pretrained(base_model_c2, MODEL_ID_C2)
357
- model_c2 = model_c2.to(device).eval()
358
  else:
359
- raise Exception("PEFT not available")
360
  except:
361
- print(" Loading as full fine-tuned model...")
362
  processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
363
  model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
364
- MODEL_ID_C2,
365
- attn_implementation="flash_attention_2",
366
- torch_dtype=torch.float16,
367
- device_map="auto",
368
- trust_remote_code=True
369
  ).to(device).eval()
370
- print(" ✅ Chhagan-DocVL-Qwen3 loaded successfully!")
371
  CHHAGAN_QWEN3_AVAILABLE = True
372
  except Exception as e:
373
- print(f" ❌ Chhagan-DocVL-Qwen3 failed: {e}")
374
  else:
375
- print(" ⚠️ Qwen3VL not available in transformers version")
376
 
377
- # Model 3: Qwen3-VL-2B-Instruct (Baseline)
378
- print("\n3️⃣ Loading Qwen3-VL-2B-Instruct (Baseline)...")
379
- MODEL_ID_Q3 = "Qwen/Qwen3-VL-2B-Instruct"
380
- QWEN3_BASELINE_AVAILABLE = False
381
- processor_q3 = None
382
- model_q3 = None
383
 
384
- if QWEN3_AVAILABLE:
 
 
385
  try:
386
- processor_q3 = AutoProcessor.from_pretrained(MODEL_ID_Q3, trust_remote_code=True)
387
- model_q3 = Qwen3VLForConditionalGeneration.from_pretrained(
388
- MODEL_ID_Q3,
389
- attn_implementation="flash_attention_2",
390
  torch_dtype=torch.float16,
391
  device_map="auto",
392
- trust_remote_code=True
393
- ).to(device).eval()
394
- print(" ✅ Qwen3-VL-2B-Instruct loaded successfully!")
395
- QWEN3_BASELINE_AVAILABLE = True
396
- except Exception as e:
397
- print(f" ❌ Qwen3-VL-2B-Instruct failed: {e}")
398
- else:
399
- print(" ⚠️ Qwen3VL not available in transformers version")
 
 
 
 
 
 
 
 
 
 
400
 
401
- # Model 4: Nanonets-OCR2-3B
402
- print("\n4️⃣ Loading Nanonets-OCR2-3B (General OCR)...")
403
- MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
404
- NANONETS_AVAILABLE = False
405
- processor_v = None
406
- model_v = None
407
 
408
  try:
409
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
410
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
411
- MODEL_ID_V,
412
- attn_implementation="flash_attention_2",
413
- trust_remote_code=True,
414
- torch_dtype=torch.float16
415
- ).to(device).eval()
416
- print(" ✅ Nanonets-OCR2-3B loaded successfully!")
417
- NANONETS_AVAILABLE = True
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  except Exception as e:
419
- print(f" ❌ Nanonets-OCR2-3B failed: {e}")
420
 
421
  print("\n" + "="*70)
422
- print("📊 MODEL STATUS SUMMARY (4 Models)")
423
  print("="*70)
424
- print(f"{'Model Name':<40} {'Status':<15} {'Type'}")
425
- print("-"*70)
426
- print(f"{'Chhagan_ML-VL-OCR-v1':<40} {'✅ Loaded' if CHHAGAN_V1_AVAILABLE else '❌ Failed':<15} {'Refined (LoRA)'}")
427
- print(f"{'Chhagan-DocVL-Qwen3':<40} {'✅ Loaded' if CHHAGAN_QWEN3_AVAILABLE else '❌ Failed':<15} {'Refined (Qwen3)'}")
428
- print(f"{'Qwen3-VL-2B-Instruct':<40} {'✅ Loaded' if QWEN3_BASELINE_AVAILABLE else '❌ Failed':<15} {'Baseline'}")
429
- print(f"{'Nanonets-OCR2-3B':<40} {'✅ Loaded' if NANONETS_AVAILABLE else '❌ Failed':<15} {'General OCR'}")
 
 
430
  print("="*70)
431
- loaded_count = sum([CHHAGAN_V1_AVAILABLE, CHHAGAN_QWEN3_AVAILABLE, QWEN3_BASELINE_AVAILABLE, NANONETS_AVAILABLE])
432
- print(f"\n✨ Total models loaded: {loaded_count}/4")
433
-
434
-
435
- # ===== HELPER: RadioAnimated =====
436
-
437
- class RadioAnimated(gr.HTML):
438
- def __init__(self, choices, value=None, **kwargs):
439
- if not choices or len(choices) < 2:
440
- raise ValueError("RadioAnimated requires at least 2 choices.")
441
- if value is None:
442
- value = choices[0]
443
- uid = uuid.uuid4().hex[:8]
444
- group_name = f"ra-{uid}"
445
- inputs_html = "\n".join(
446
- f"""
447
- <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
448
- <label class="ra-label" for="{group_name}-{i}">{c}</label>
449
- """
450
- for i, c in enumerate(choices)
451
- )
452
- html_template = f"""
453
- <div class="ra-wrap" data-ra="{uid}">
454
- <div class="ra-inner">
455
- <div class="ra-highlight"></div>
456
- {inputs_html}
457
- </div>
458
- </div>
459
- """
460
- js_on_load = r"""
461
- (() => {
462
- const wrap = element.querySelector('.ra-wrap');
463
- const inner = element.querySelector('.ra-inner');
464
- const highlight = element.querySelector('.ra-highlight');
465
- const inputs = Array.from(element.querySelectorAll('.ra-input'));
466
- if (!inputs.length) return;
467
- const choices = inputs.map(i => i.value);
468
- function setHighlightByIndex(idx) {
469
- const n = choices.length;
470
- const pct = 100 / n;
471
- highlight.style.width = `calc(${pct}% - 6px)`;
472
- highlight.style.transform = `translateX(${idx * 100}%)`;
473
- }
474
- function setCheckedByValue(val, shouldTrigger=false) {
475
- const idx = Math.max(0, choices.indexOf(val));
476
- inputs.forEach((inp, i) => { inp.checked = (i === idx); });
477
- setHighlightByIndex(idx);
478
- props.value = choices[idx];
479
- if (shouldTrigger) trigger('change', props.value);
480
- }
481
- setCheckedByValue(props.value ?? choices[0], false);
482
- inputs.forEach((inp) => {
483
- inp.addEventListener('change', () => {
484
- setCheckedByValue(inp.value, true);
485
- });
486
- });
487
- })();
488
- """
489
- super().__init__(
490
- value=value,
491
- html_template=html_template,
492
- js_on_load=js_on_load,
493
- **kwargs
494
- )
495
-
496
-
497
- def apply_gpu_duration(val: str):
498
- return int(val)
499
-
500
-
501
- def calc_timeout_duration(model_name, text, image_front, image_back,
502
- max_new_tokens, temperature, top_p,
503
- top_k, repetition_penalty, gpu_timeout):
504
  try:
505
- base_timeout = int(gpu_timeout)
506
- if image_front is not None and image_back is not None:
507
- return base_timeout * 2
508
- return base_timeout
509
- except:
510
- return 120
511
-
 
 
 
 
 
 
 
 
 
 
 
 
512
 
513
- # ===== STEP 1: RAW EXTRACTION (NO TRANSLATION) =====
514
 
515
- def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
516
- messages = [{
517
- "role": "user",
518
- "content": [
519
- {"type": "image"},
520
- {"type": "text", "text": STEP1_EXTRACT_PROMPT},
521
- ]
522
- }]
523
  try:
524
- prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  except:
526
- prompt = STEP1_EXTRACT_PROMPT
527
-
528
- inputs = processor(
529
- text=[prompt],
530
- images=[image],
531
- return_tensors="pt",
532
- padding=True
533
- ).to(device)
534
-
535
- with torch.no_grad():
536
- output_ids = model.generate(
537
- **inputs,
538
- max_new_tokens=512,
539
- do_sample=True,
540
- temperature=temperature,
541
- top_p=top_p,
542
- top_k=top_k,
543
- repetition_penalty=repetition_penalty,
544
- )
545
- input_len = inputs['input_ids'].shape[1]
546
- generated = output_ids[:, input_len:]
547
- return processor.batch_decode(generated, skip_special_tokens=True)[0]
548
-
549
-
550
- # ===== PARSE STEP 1 OUTPUT =====
551
 
552
- def parse_step1_output(raw_output: str) -> dict:
553
- result = {
554
- "photo_present": "❌ Not detected",
555
- "photo_location": "N/A",
556
- "sig_present": "❌ Not detected",
557
- "sig_location": "N/A",
558
- "mrz_present": "❌ Not detected",
559
- "detected_lang": "Unknown",
560
- "original_text": raw_output
561
- }
562
-
563
- def extract_field(pattern, text, default="N/A"):
564
- match = re.search(pattern, text, re.IGNORECASE)
565
- return match.group(1).strip() if match else default
566
-
567
- photo = extract_field(r"PHOTO_PRESENT:\s*(yes|no)", raw_output)
568
- result["photo_present"] = "✅ Yes" if photo.lower() == "yes" else "❌ No"
569
- result["photo_location"] = extract_field(r"PHOTO_LOCATION:\s*([^\n]+)", raw_output)
570
-
571
- sig = extract_field(r"SIGNATURE_PRESENT:\s*(yes|no)", raw_output)
572
- result["sig_present"] = "✅ Yes" if sig.lower() == "yes" else "❌ No"
573
- result["sig_location"] = extract_field(r"SIGNATURE_LOCATION:\s*([^\n]+)", raw_output)
574
-
575
- mrz = extract_field(r"MRZ_PRESENT:\s*(yes|no)", raw_output)
576
- result["mrz_present"] = "✅ Yes" if mrz.lower() == "yes" else "❌ No"
577
- result["detected_lang"] = extract_field(r"DETECTED_LANGUAGE:\s*([^\n]+)", raw_output, "Unknown")
578
-
579
- text_match = re.search(r"---TEXT_START---\n?(.*?)---TEXT_END---", raw_output, re.DOTALL)
580
- if text_match:
581
- result["original_text"] = text_match.group(1).strip()
582
 
583
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
 
585
 
586
  def parse_mrz_lines(raw_text: str) -> dict:
587
- """
588
- Authoritative Python-based MRZ parser.
589
- Supports TD1 (ID cards, 3x~30 chars) and TD3 (Passports, 2x~44 chars).
590
- Returns verified dict. Does NOT rely on LLM for date/sex/name parsing.
591
- """
592
- import datetime
593
 
594
  lines = []
595
  for line in raw_text.split('\n'):
596
  clean = re.sub(r'\s+', '', line.strip())
597
- if re.match(r'^[A-Z0-9<]{20,}$', clean):
598
  lines.append(clean)
599
 
600
  if not lines:
@@ -602,169 +531,428 @@ def parse_mrz_lines(raw_text: str) -> dict:
602
 
603
  def decode_date(yymmdd: str, is_dob: bool = False) -> str:
604
  try:
605
- yy = int(yymmdd[0:2])
606
- mm = int(yymmdd[2:4])
607
- dd = int(yymmdd[4:6])
608
  if not (1 <= mm <= 12 and 1 <= dd <= 31):
609
  return f"Invalid ({yymmdd})"
610
- current_yy = datetime.datetime.now().year % 100
611
- year = (1900 + yy) if (is_dob and yy > current_yy) else (2000 + yy)
612
  return f"{dd:02d}/{mm:02d}/{year}"
613
  except:
614
  return yymmdd
615
 
616
- def clean_field(s: str) -> str:
617
  return re.sub(r'<+$', '', s).replace('<', ' ').strip()
618
 
 
 
 
 
 
 
 
 
 
619
  result = {}
620
 
621
- # TD1: 3 lines, 28-35 chars each
622
  td1 = [l for l in lines if 28 <= len(l) <= 36]
623
  if len(td1) >= 2:
624
  l1, l2 = td1[0], td1[1]
625
  l3 = td1[2] if len(td1) > 2 else ""
626
-
627
- if len(l1) >= 14:
628
- result['doc_type'] = clean_field(l1[0:2])
629
- result['country_code'] = clean_field(l1[2:5])
630
- result['doc_number'] = clean_field(l1[5:14])
631
-
632
- if len(l2) >= 18:
633
- result['dob'] = decode_date(l2[0:6], is_dob=True)
634
- sex_char = l2[7] if len(l2) > 7 else ''
635
- result['sex'] = 'Male' if sex_char == 'M' else ('Female' if sex_char == 'F' else sex_char)
636
- if len(l2) >= 14:
637
- result['expiry'] = decode_date(l2[8:14], is_dob=False)
638
- if len(l2) >= 18:
639
- result['nationality'] = clean_field(l2[15:18])
640
-
641
  if l3:
642
- name_clean = re.sub(r'<+$', '', l3)
643
- if '<<' in name_clean:
644
- parts = name_clean.split('<<')
645
- surname = parts[0].replace('<', ' ').strip()
646
- given = parts[1].replace('<', ' ').strip() if len(parts) > 1 else ''
647
- result['name'] = f"{given} {surname}".strip() if given else surname
648
- else:
649
- result['name'] = name_clean.replace('<', ' ').strip()
650
-
651
  result['mrz_format'] = 'TD1'
652
  return result
653
 
654
- # TD3: 2 lines, 40-48 chars each
655
  td3 = [l for l in lines if 40 <= len(l) <= 48]
656
  if len(td3) >= 2:
657
  l1, l2 = td3[0], td3[1]
658
-
659
- if len(l1) >= 5:
660
- result['doc_type'] = clean_field(l1[0:2])
661
- result['country_code'] = clean_field(l1[2:5])
662
- name_section = l1[5:min(44, len(l1))]
663
- if '<<' in name_section:
664
- parts = name_section.split('<<')
665
- surname = parts[0].replace('<', ' ').strip()
666
- given = parts[1].replace('<', ' ').strip() if len(parts) > 1 else ''
667
- result['name'] = f"{given} {surname}".strip() if given else surname
668
- else:
669
- result['name'] = name_section.replace('<', ' ').strip()
670
-
671
  if len(l2) >= 27:
672
- result['doc_number'] = clean_field(l2[0:9])
673
- result['nationality'] = clean_field(l2[10:13])
674
- result['dob'] = decode_date(l2[13:19], is_dob=True)
675
- sex_char = l2[20] if len(l2) > 20 else ''
676
- result['sex'] = 'Male' if sex_char == 'M' else ('Female' if sex_char == 'F' else sex_char)
677
- result['expiry'] = decode_date(l2[21:27], is_dob=False)
678
-
679
  result['mrz_format'] = 'TD3'
680
  return result
681
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  return {}
683
 
684
- # ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
685
 
686
  def run_step2_structure(model, processor, metadata: dict, device,
687
  max_new_tokens, temperature, top_p, top_k, repetition_penalty):
688
- step2_prompt = STEP2_TEMPLATE.format(
689
- photo_present=metadata["photo_present"],
690
- photo_location=metadata["photo_location"],
691
- sig_present=metadata["sig_present"],
692
- sig_location=metadata["sig_location"],
693
- mrz_present=metadata["mrz_present"],
694
- detected_lang=metadata["detected_lang"],
695
- raw_text=metadata["original_text"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696
  )
697
 
698
- messages = [{"role": "user", "content": [{"type": "text", "text": step2_prompt}]}]
699
  try:
700
  prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
701
  except:
702
- prompt = step2_prompt
703
 
704
  inputs = processor(text=[prompt], return_tensors="pt", padding=True).to(device)
705
 
706
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
707
  gen_kwargs = {
708
- **inputs,
709
- "streamer": streamer,
710
- "max_new_tokens": max_new_tokens,
711
- "do_sample": True,
712
- "temperature": temperature,
713
- "top_p": top_p,
714
- "top_k": top_k,
715
- "repetition_penalty": repetition_penalty,
716
  }
717
  thread = Thread(target=model.generate, kwargs=gen_kwargs)
718
  thread.start()
719
- return streamer, thread
720
 
 
 
721
 
722
- # ===== UNIFIED DEDUPLICATED SUMMARY =====
 
 
 
 
723
 
724
- def build_unified_summary(front_result: str, back_result: str) -> str:
725
- summary = "## 🔄 Unified Deduplicated Record\n\n"
726
- summary += "> *Unique fields from both sides merged. Conflicts flagged with ⚠️.*\n\n"
727
 
728
- def extract_table_rows(text):
729
- rows = {}
730
- table_match = re.search(
731
- r"## 🗂️ Key Fields.*?\n\|.*?\n\|[-| ]+\n(.*?)(?=\n---|\Z)", text, re.DOTALL
732
- )
733
- if table_match:
734
- for line in table_match.group(1).strip().split("\n"):
735
- parts = [p.strip() for p in line.split("|") if p.strip()]
736
- if len(parts) >= 2:
737
- field = re.sub(r"[^\w\s/]", "", parts[0]).strip()
738
- value = parts[1].strip()
739
- if value and value != "—":
740
- rows[field] = value
741
- return rows
742
 
743
- front_fields = extract_table_rows(front_result)
744
- back_fields = extract_table_rows(back_result)
745
- all_fields = list(dict.fromkeys(list(front_fields.keys()) + list(back_fields.keys())))
746
 
747
- summary += "| Field | Value | Source |\n"
748
- summary += "|-------|-------|--------|\n"
749
 
750
- for field in all_fields:
751
- f_val = front_fields.get(field, "")
752
- b_val = back_fields.get(field, "")
753
 
754
- if f_val and b_val:
755
- if f_val.lower() == b_val.lower():
756
- summary += f"| {field} | {f_val} | Front + Back ✅ |\n"
757
- else:
758
- summary += f"| {field} | Front: **{f_val}** / Back: **{b_val}** | ⚠️ Mismatch |\n"
759
- elif f_val:
760
- summary += f"| {field} | {f_val} | Front only |\n"
761
- elif b_val:
762
- summary += f"| {field} | {b_val} | Back only |\n"
763
 
764
- return summary + "\n"
765
 
 
 
766
 
767
- # ===== MAIN OCR FUNCTION =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768
 
769
  @spaces.GPU(duration=calc_timeout_duration)
770
  def generate_dual_card_ocr(model_name: str, text: str,
@@ -773,69 +961,57 @@ def generate_dual_card_ocr(model_name: str, text: str,
773
  top_k: int, repetition_penalty: float, gpu_timeout: int):
774
 
775
  # Model selection
776
- if model_name == "Chhagan-ID-OCR-v1 ⭐":
777
- if not CHHAGAN_V1_AVAILABLE:
778
- yield "Chhagan_ML-VL-OCR-v1 model is not available.", "Chhagan_ML-VL-OCR-v1 model is not available."
779
- return
780
- processor, model = processor_c1, model_c1
781
-
782
- elif model_name == "Chhagan-DocVL-Qwen3 🔥":
783
- if not CHHAGAN_QWEN3_AVAILABLE:
784
- yield "Chhagan-DocVL-Qwen3 model is not available.", "Chhagan-DocVL-Qwen3 model is not available."
785
- return
786
- processor, model = processor_c2, model_c2
787
-
788
- elif model_name == "Qwen3-VL-2B (Baseline) 📊":
789
- if not QWEN3_BASELINE_AVAILABLE:
790
- yield "Qwen3-VL-2B-Instruct baseline model is not available.", "Qwen3-VL-2B-Instruct baseline model is not available."
791
- return
792
- processor, model = processor_q3, model_q3
793
-
794
- elif model_name == "Nanonets-OCR2-3B":
795
- if not NANONETS_AVAILABLE:
796
- yield "Nanonets-OCR2-3B model is not available.", "Nanonets-OCR2-3B model is not available."
797
- return
798
- processor, model = processor_v, model_v
799
 
800
- else:
801
- yield "Invalid model selected.", "Invalid model selected."
802
- return
 
 
 
803
 
804
  if image_front is None and image_back is None:
805
- yield "Please upload at least one card image (front or back).", "Please upload at least one card image (front or back)."
806
- return
807
 
808
  full_output = ""
809
  front_result = ""
810
  back_result = ""
811
- front_meta_saved = {} # ← NEW: save for MRZ parsing
812
- back_meta_saved = {} # ← NEW: save for MRZ parsing
 
813
 
814
- # ===== FRONT CARD =====
815
  if image_front is not None:
816
  full_output += "# 🎴 FRONT CARD\n\n"
817
- full_output += "⏳ **Step 1 / 2 — Extracting raw text (original script, no translation)...**\n\n"
818
  yield full_output, full_output
819
 
820
- step1_raw = run_step1_extraction(
821
- model, processor, image_front, device,
822
- temperature, top_p, top_k, repetition_penalty
823
- )
824
  front_meta = parse_step1_output(step1_raw)
 
825
 
826
- full_output += f"✅ **Step 1 Complete** — 🌐 Detected Language: **{front_meta['detected_lang']}**\n\n"
827
- full_output += "⏳ **Step 2 / 2 — Translating to English & building structured output...**\n\n"
828
  yield full_output, full_output
829
 
830
- streamer_f, thread_f = run_step2_structure(
831
  model, processor, front_meta, device,
832
- max_new_tokens, temperature, top_p, top_k, repetition_penalty
833
- )
 
 
 
 
 
834
 
835
- buffer_f = ""
836
  for new_text in streamer_f:
837
- buffer_f += new_text
838
- buffer_f = buffer_f.replace("<|im_end|>", "").replace("<|endoftext|>", "")
839
  time.sleep(0.01)
840
  yield full_output + buffer_f, full_output + buffer_f
841
 
@@ -843,31 +1019,33 @@ def generate_dual_card_ocr(model_name: str, text: str,
843
  front_result = buffer_f
844
  thread_f.join()
845
 
846
- # ===== BACK CARD =====
847
  if image_back is not None:
848
  full_output += "\n\n---\n\n# 🎴 BACK CARD\n\n"
849
- full_output += "⏳ **Step 1 / 2 — Extracting raw text (original script, no translation)...**\n\n"
850
  yield full_output, full_output
851
 
852
- step1_raw_back = run_step1_extraction(
853
- model, processor, image_back, device,
854
- temperature, top_p, top_k, repetition_penalty
855
- )
856
  back_meta = parse_step1_output(step1_raw_back)
 
857
 
858
- full_output += f"✅ **Step 1 Complete** — 🌐 Detected Language: **{back_meta['detected_lang']}**\n\n"
859
- full_output += "⏳ **Step 2 / 2 — Translating to English & building structured output...**\n\n"
860
  yield full_output, full_output
861
 
862
- streamer_b, thread_b = run_step2_structure(
863
  model, processor, back_meta, device,
864
- max_new_tokens, temperature, top_p, top_k, repetition_penalty
865
- )
 
 
 
 
 
866
 
867
- buffer_b = ""
868
  for new_text in streamer_b:
869
- buffer_b += new_text
870
- buffer_b = buffer_b.replace("<|im_end|>", "").replace("<|endoftext|>", "")
871
  time.sleep(0.01)
872
  yield full_output + buffer_b, full_output + buffer_b
873
 
@@ -875,81 +1053,64 @@ def generate_dual_card_ocr(model_name: str, text: str,
875
  back_result = buffer_b
876
  thread_b.join()
877
 
878
- # ===== MRZ PYTHON PARSE (authoritative) =====
879
- # ← NEW BLOCK: Try back card first (MRZ usually on back), then front
880
- mrz_data = {}
881
- if back_meta_saved:
882
- mrz_data = parse_mrz_lines(back_meta_saved.get('original_text', ''))
883
- if not mrz_data and front_meta_saved:
884
- mrz_data = parse_mrz_lines(front_meta_saved.get('original_text', ''))
885
-
886
- if mrz_data:
887
- full_output += f"\n\n> ✅ **MRZ Python-parsed successfully** ({mrz_data.get('mrz_format','?')} format) — ground truth applied to summary below.\n"
888
-
889
- # ===== UNIFIED SUMMARY (only when both sides uploaded) =====
890
  if image_front is not None and image_back is not None:
891
  full_output += "\n\n---\n\n"
892
- full_output += build_unified_summary(front_result, back_result)
893
 
894
- full_output += f"\n\n---\n\n**✨ Extraction Complete** | Model: `{model_name}` | Pipeline: OCR Language Detect Translate → Structure\n"
 
895
  yield full_output, full_output
896
 
897
 
898
- # ===== BUILD MODEL CHOICES =====
 
 
899
 
900
  model_choices = []
901
- if CHHAGAN_V1_AVAILABLE:
902
- model_choices.append("Chhagan-ID-OCR-v1 ")
903
- if CHHAGAN_QWEN3_AVAILABLE:
904
- model_choices.append("Chhagan-DocVL-Qwen3 🔥")
905
- if QWEN3_BASELINE_AVAILABLE:
906
- model_choices.append("Qwen3-VL-2B (Baseline) 📊")
907
- if NANONETS_AVAILABLE:
908
- model_choices.append("Nanonets-OCR2-3B")
909
-
910
- if not model_choices:
911
- model_choices = ["No models available"]
912
 
913
  dual_card_examples = [
914
- ["Extract complete information from both sides", "examples/5.jpg", None],
915
- ["Multilingual OCR with MRZ extraction", "examples/4.jpg", None],
916
- ["Extract profile photo and signature locations", "examples/2.jpg", None],
917
  ]
918
 
919
 
920
- # ===== GRADIO UI =====
 
 
921
 
922
  demo = gr.Blocks(css=css, theme=steel_blue_theme)
923
  with demo:
924
- gr.Markdown("# 🌍 **Chhagan Dual-Card ID OCR System**", elem_id="main-title")
925
- gr.Markdown("### *Advanced OCR Auto Language Detection English Translation • MRZ Parsing*")
926
 
927
  loaded_models = []
928
- if CHHAGAN_V1_AVAILABLE:
929
- loaded_models.append("ID-OCR-v1 ")
930
- if CHHAGAN_QWEN3_AVAILABLE:
931
- loaded_models.append("DocVL-Qwen3 🔥")
932
- if QWEN3_BASELINE_AVAILABLE:
933
- loaded_models.append("Qwen3-Baseline 📊")
934
- if NANONETS_AVAILABLE:
935
- loaded_models.append("Nanonets")
936
-
937
- model_info = f"**Loaded Models ({len(loaded_models)}/4):** {', '.join(loaded_models)}" if loaded_models else "⚠️ No models loaded"
938
  gr.Markdown(f"**Status:** {model_info}")
939
- gr.Markdown("**Pipeline:** ✅ Step 1: Raw OCR (original script) → ✅ Step 2: Auto Translate to English → ✅ Structured Output → ✅ Front+Back Deduplication")
940
 
941
  with gr.Row():
942
  with gr.Column(scale=2):
943
  image_query = gr.Textbox(
944
  label="💬 Custom Query (Optional)",
945
- placeholder="Leave empty for automatic full extraction (OCR + translate + structure)...",
946
  value=""
947
  )
948
-
949
  gr.Markdown("### 📤 Upload ID Cards")
950
  with gr.Row():
951
  image_front = gr.Image(type="pil", label="🎴 Front Card", height=250)
952
- image_back = gr.Image(type="pil", label="🎴 Back Card (Optional)", height=250)
953
 
954
  image_submit = gr.Button("🚀 Extract + Translate + Structure", variant="primary", size="lg")
955
 
@@ -960,23 +1121,23 @@ with demo:
960
  )
961
 
962
  with gr.Accordion("⚙️ Advanced Settings", open=False):
963
- max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
964
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
965
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
966
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
967
- repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
968
 
969
  with gr.Column(scale=3):
970
  gr.Markdown("## 📄 Extraction Results", elem_id="output-title")
971
  output = gr.Textbox(label="Raw Output (Streaming)", interactive=True, lines=15)
972
- with gr.Accordion("📝 Markdown Preview (Structured)", open=True):
973
  markdown_output = gr.Markdown(label="Formatted Result")
974
 
975
  model_choice = gr.Radio(
976
  choices=model_choices,
977
- label="🤖 Select OCR Model",
978
  value=model_choices[0] if model_choices else None,
979
- info="⭐🔥 = Fine-tuned for ID Cards | 📊 = Baseline | General OCR = Nanonets"
980
  )
981
 
982
  with gr.Row(elem_id="gpu-duration-container"):
@@ -984,21 +1145,20 @@ with demo:
984
  gr.Markdown("**⏱️ GPU Duration (seconds)**")
985
  radioanimated_gpu_duration = RadioAnimated(
986
  choices=["60", "90", "120", "180", "240"],
987
- value="120",
988
  elem_id="radioanimated_gpu_duration"
989
  )
990
- gpu_duration_state = gr.Number(value=120, visible=False)
991
 
992
  gr.Markdown("""
993
- **✨ What This Extracts:**
994
- - 📜 Original script (Hindi, Arabic, Urdu, Chinese, etc.)
995
- - 🌐 Auto English translation (95%+ accuracy)
996
- - 🖼️ Profile photo location & description
997
- - Signature detection & location
998
- - 🔐 MRZ raw lines + parsed fields
999
- - 🗂️ Structured key fields (Name, DOB, ID No., etc.)
1000
- - 🔄 Front + Back unified deduplicated record
1001
- """)
1002
 
1003
  radioanimated_gpu_duration.change(
1004
  fn=apply_gpu_duration,
@@ -1009,62 +1169,42 @@ with demo:
1009
 
1010
  image_submit.click(
1011
  fn=generate_dual_card_ocr,
1012
- inputs=[
1013
- model_choice, image_query,
1014
- image_front, image_back,
1015
- max_new_tokens, temperature, top_p,
1016
- top_k, repetition_penalty, gpu_duration_state
1017
- ],
1018
  outputs=[output, markdown_output]
1019
  )
1020
 
1021
  gr.Markdown("""
1022
- ---
1023
- ### 🎯 Feature Matrix
1024
-
1025
- | Feature | Status | Description |
1026
- |---------|--------|-------------|
1027
- | **Two-Step Pipeline** | | Step 1 = Raw OCR, Step 2 = Translate + Structure |
1028
- | **Auto Language Detect** | | Hindi, Arabic, Urdu, Chinese, 30+ languages |
1029
- | **English Translation** | | 95%+ accuracy, only when non-English detected |
1030
- | **Original Script Preserved** | | Both original + translated shown side by side |
1031
- | **Profile Photo Detection** | | Location described in visual elements box |
1032
- | **Signature Extraction** | | Detected and located per card side |
1033
- | **MRZ Parsing** | | Raw lines + structured parsed fields |
1034
- | **Dual Card Deduplication** | | Front + Back merged, mismatches flagged ⚠️ |
1035
- | **Markdown Structured Output** | ✅ | Tables, code blocks, section headers |
1036
-
1037
- ### 📋 Supported Documents
1038
- - 🇮🇳 Aadhaar Card, PAN Card, Voter ID
1039
- - 🌍 International Passports (with MRZ)
1040
- - 🪪 Driver's Licenses
1041
- - 🏛️ Government ID Cards (30+ countries)
1042
- - 📋 Residence Permits & Visas
1043
-
1044
- ### 🔒 Privacy
1045
- - All processing on-device (GPU)
1046
- - No data stored or transmitted
1047
- - GDPR compliant
1048
-
1049
- **💡 Pro Tip**: Upload both front and back for full deduplication and MRZ cross-validation!
1050
- """)
1051
 
1052
 
1053
  if __name__ == "__main__":
1054
- print("\n" + "="*70)
1055
- print("🚀 STARTING GRADIO INTERFACE...")
1056
- print("="*70 + "\n")
1057
  try:
1058
  demo.queue(max_size=50).launch(
1059
- server_name="0.0.0.0",
1060
- server_port=7860,
1061
- show_error=True,
1062
- share=False
1063
- )
1064
- print("✅ Gradio app launched successfully!")
1065
  except Exception as e:
1066
- print(f"❌ Launch error: {e}")
1067
  import traceback
 
1068
  traceback.print_exc()
1069
-
1070
-
 
1
+ """
2
+ ╔══════════════════════════════════════════════════════════════════╗
3
+ ║ CSM DUAL-CARD ID OCR SYSTEM — ARCHITECTURE NOTE ║
4
+ ╠══════════════════════════════════════════════════════════════════╣
5
+ ║ MODEL TASKS (8B VLM): ║
6
+ ║ Step 1 → Raw OCR: All text, original script, no translate ║
7
+ ║ Step 2 → Doc classify + non-English gap fill only ║
8
+ ║ PYTHON TASKS (Authoritative): ║
9
+ ║ MRZ parse+verify | Numeral convert | Calendar convert ║
10
+ ║ English label extract | Script separate | Cross verify ║
11
+ ╚══════════════════════════════════════════════════════════════════╝
12
+ """
13
+
14
  import os
 
15
  import uuid
 
16
  import time
17
  import re
18
+ import datetime
19
  from threading import Thread
20
+ from typing import Iterable, Dict, Any
21
 
22
  import gradio as gr
23
  import spaces
24
  import torch
 
25
  from PIL import Image
 
26
 
27
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
28
  os.environ["HF_HOME"] = "/tmp/hf_home"
 
32
  Qwen2VLForConditionalGeneration,
33
  AutoProcessor,
34
  TextIteratorStreamer,
35
+ BitsAndBytesConfig,
36
  )
37
 
38
  try:
 
40
  PEFT_AVAILABLE = True
41
  except:
42
  PEFT_AVAILABLE = False
43
+ print("⚠️ PEFT not available")
44
 
45
  try:
46
  from transformers import Qwen3VLForConditionalGeneration
 
49
  QWEN3_AVAILABLE = False
50
  print("⚠️ Qwen3VL not available in current transformers version")
51
 
 
52
  from gradio.themes import Soft
53
  from gradio.themes.utils import colors, fonts, sizes
54
 
55
+ # ===== THEME =====
56
  colors.steel_blue = colors.Color(
57
  name="steel_blue",
58
+ c50="#EBF3F8", c100="#D3E5F0", c200="#A8CCE1", c300="#7DB3D2",
59
+ c400="#529AC3", c500="#4682B4", c600="#3E72A0", c700="#36638C",
60
+ c800="#2E5378", c900="#264364", c950="#1E3450",
 
 
 
 
 
 
 
 
61
  )
62
 
63
  class SteelBlueTheme(Soft):
64
+ def __init__(self, *, primary_hue=colors.gray, secondary_hue=colors.steel_blue,
65
+ neutral_hue=colors.slate, text_size=sizes.text_lg,
66
+ font=(fonts.GoogleFont("Outfit"), "Arial", "sans-serif"),
67
+ font_mono=(fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace")):
68
+ super().__init__(primary_hue=primary_hue, secondary_hue=secondary_hue,
69
+ neutral_hue=neutral_hue, text_size=text_size, font=font, font_mono=font_mono)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  super().set(
71
  background_fill_primary="*primary_50",
72
  background_fill_primary_dark="*primary_900",
73
  body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
74
  body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
75
  button_primary_text_color="white",
 
76
  button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
77
  button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
 
 
78
  button_secondary_text_color="black",
 
79
  button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
80
  button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
 
 
81
  slider_color="*secondary_500",
 
82
  block_title_text_weight="600",
83
  block_border_width="3px",
84
  block_shadow="*shadow_drop_lg",
 
94
  #main-title h1 { font-size: 2.3em !important; }
95
  #output-title h2 { font-size: 2.2em !important; }
96
  .ra-wrap{ width: fit-content; }
97
+ .ra-inner{ position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
98
+ background: var(--neutral-200); border-radius: 9999px; overflow: hidden; }
 
 
99
  .ra-input{ display: none; }
100
+ .ra-label{ position: relative; z-index: 2; padding: 8px 16px; font-family: inherit; font-size: 14px;
101
+ font-weight: 600; color: var(--neutral-500); cursor: pointer; transition: color 0.2s; white-space: nowrap; }
102
+ .ra-highlight{ position: absolute; z-index: 1; top: 6px; left: 6px; height: calc(100% - 12px);
103
+ border-radius: 9999px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);
104
+ transition: transform 0.2s, width 0.2s; }
 
 
 
 
 
 
105
  .ra-input:checked + .ra-label{ color: black; }
106
  .dark .ra-inner { background: var(--neutral-800); }
107
  .dark .ra-label { color: var(--neutral-400); }
108
  .dark .ra-highlight { background: var(--neutral-600); }
109
  .dark .ra-input:checked + .ra-label { color: white; }
110
+ #gpu-duration-container { padding: 10px; border-radius: 8px;
111
+ background: var(--background-fill-secondary); border: 1px solid var(--border-color-primary); margin-top: 10px; }
 
 
 
 
 
112
  """
113
 
114
  MAX_MAX_NEW_TOKENS = 4096
115
  DEFAULT_MAX_NEW_TOKENS = 1024
116
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
117
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
118
 
119
+ print("CUDA available:", torch.cuda.is_available())
 
 
 
 
120
  if torch.cuda.is_available():
121
+ print("Device:", torch.cuda.get_device_name(0))
122
+ print("Using:", device)
 
 
123
 
 
124
 
125
+ # ╔══════════════════════════════════════════╗
126
+ # ║ UNIVERSAL PROMPTS ║
127
+ # ╚══════════════════════════════════════════╝
128
 
129
+ STEP1_EXTRACT_PROMPT = """You are a universal OCR engine. Transcribe ALL visible text from this document image.
 
 
 
 
 
 
 
 
130
 
131
+ OUTPUT FORMAT — fill exactly as shown:
132
  PHOTO_PRESENT: yes/no
133
+ PHOTO_LOCATION: [describe position: top-left / top-right / center-left / not found]
134
  SIGNATURE_PRESENT: yes/no
135
+ SIGNATURE_LOCATION: [describe position: bottom-left / bottom-right / not found]
136
  MRZ_PRESENT: yes/no
137
+ DETECTED_LANGUAGE: [list all languages visible e.g. Arabic+English, Farsi+English, Hindi+English, Chinese, English]
138
  ---TEXT_START---
139
+ [Every word, number, symbol, label and value visible line by line]
140
+ [Original script preserved: Arabic, Farsi, Hindi, Chinese, Cyrillic etc. — DO NOT translate here]
141
+ [Copy label AND its value together: e.g. "DATE OF BIRTH 12/05/2003"]
142
+ [MRZ lines: copy character-perfect including ALL < symbols]
143
+ [Include corner text, watermarks, small print]
144
+ ---TEXT_END---
145
 
146
+ ABSOLUTE RULES:
147
+ - NEVER output pixel coordinates like (50,68) or bounding boxes — plain text ONLY
148
+ - DO NOT translate in this step — original script as-is
149
+ - DO NOT skip or summarize any field
150
+ - Copy every character exactly including < symbols in MRZ"""
151
 
 
152
 
153
+ STEP2_TEMPLATE = """You are a universal KYC document analyst.
154
+ The Python pipeline has already extracted English fields and parsed MRZ.
155
+ Your job is ONLY: classify document + fill gaps from non-English text.
 
 
156
 
157
+ ━━━ ALREADY EXTRACTED BY PYTHON (DO NOT RE-EXTRACT) ━━━
 
158
 
159
+ English Fields Found Directly on Card:
160
+ {python_fields_table}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
+ MRZ Python Parse Result:
163
+ {mrz_summary}
164
 
165
+ ━━━ YOUR INPUT DATA ━━━
166
 
167
+ English text block from card:
168
+ {english_block}
 
 
 
169
 
170
+ Non-English original script block:
171
+ {original_block}
172
 
173
+ ━━━ YOUR TASKS — ONLY THESE 3 ━━━
174
 
175
+ TASK 1: Identify document type and issuing info
176
+ - Read English block and original block
177
+ - Keywords: PASSPORT/RESIDENT CARD/NATIONAL ID/DRIVING LICENCE/بطاقة/جواز/رخصة/आधार/PAN
178
+ - Top of card = issuing country/institution (NOT person name)
179
 
180
+ TASK 2: Classify non-English labels → check if already in English fields above
181
+ - If نام (Farsi: Name) value already in Python English fields → SKIP
182
+ - If شماره ملی (National Number) already in Python fields → SKIP
183
+ - Only add fields GENUINELY missing from Python extraction
184
 
185
+ TASK 3: Transliterate non-English values NOT found in English block
186
+ - Example: محمد → Mohammad | چراغی → Cheraghi
187
+ - Dates in Shamsi/Hijri: write BOTH original AND note calendar type
188
+ (DO NOT convert — Python handles conversion)
189
 
190
+ RULES:
191
+ - NEVER copy template placeholders like [fill here] or [value]
192
+ - NEVER re-state what Python already found
193
+ - NEVER guess values not visible in card
194
+ - If all fields already covered → write "✅ All fields covered by Python extraction"
195
 
196
+ ━━━ OUTPUT FORMAT ━━━
197
 
198
  ---
199
 
200
+ ## 📋 Document Classification
201
+
202
+ | | |
203
+ |---|---|
204
+ | **Document Type** | |
205
+ | **Issuing Country** | |
206
+ | **Issuing Authority** | |
 
 
 
 
 
 
 
 
207
 
208
  ---
209
 
210
+ ## Additional Fields (non-English only — genuinely new)
211
+
212
+ | Label (Original) | Label (English) | Value (Original) | Value (Transliterated) |
213
+ |---|---|---|---|
214
+ | [only if not in Python fields above] | | | |
215
 
216
+ ---
217
 
218
+ ## 🗓️ Calendar Note (if non-Gregorian dates found)
219
 
220
+ | Original Date | Calendar System | Note |
221
+ |---|---|---|
222
+ | [date as on card] | [Solar Hijri / Lunar Hijri / Buddhist] | Python will convert |
 
 
 
 
 
 
 
223
 
224
  ---"""
225
 
226
 
227
+ # ╔══════════════════════════════════════════╗
228
+ # ║ MODEL LOADING ║
229
+ # ╚══════════════════════════════════════════╝
230
 
231
  print("\n" + "="*70)
232
+ print("🚀 LOADING 4 MODELS")
233
+ print("="*70)
234
+
235
+ # 4-bit BitsAndBytes config (shared for quantized models)
236
+ bnb_4bit_config = BitsAndBytesConfig(
237
+ load_in_4bit=True,
238
+ bnb_4bit_quant_type="nf4",
239
+ bnb_4bit_compute_dtype=torch.float16,
240
+ bnb_4bit_use_double_quant=True,
241
+ )
242
 
243
+ # ── Model 1: Chhagan_ML-VL-OCR-v1 (LoRA, keep) ──
244
+ print("\n1️⃣ Chhagan_ML-VL-OCR-v1 (LoRA Refined)...")
245
  MODEL_ID_C1 = "Chhagan005/Chhagan_ML-VL-OCR-v1"
246
  CHHAGAN_V1_AVAILABLE = False
247
+ processor_c1 = model_c1 = None
 
248
 
249
  if PEFT_AVAILABLE:
250
  try:
251
  try:
252
  config = PeftConfig.from_pretrained(MODEL_ID_C1)
253
+ base_id = config.base_model_name_or_path
 
254
  except:
255
+ base_id = "Qwen/Qwen2.5-VL-2B-Instruct"
256
+ processor_c1 = AutoProcessor.from_pretrained(base_id, trust_remote_code=True)
257
+ base_c1 = Qwen2VLForConditionalGeneration.from_pretrained(
258
+ base_id, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
259
+ model_c1 = PeftModel.from_pretrained(base_c1, MODEL_ID_C1).to(device).eval()
260
+ print(" ✅ Loaded!")
 
 
 
 
 
 
 
261
  CHHAGAN_V1_AVAILABLE = True
262
  except Exception as e:
263
+ print(f" ❌ Failed: {e}")
264
  else:
265
+ print(" ⚠️ PEFT not available")
266
 
267
+ # ── Model 2: Chhagan-DocVL-Qwen3 (Qwen3 fine-tuned, keep) ──
268
+ print("\n2️⃣ Chhagan-DocVL-Qwen3 (Qwen3-VL Refined)...")
269
  MODEL_ID_C2 = "Chhagan005/Chhagan-DocVL-Qwen3"
270
  CHHAGAN_QWEN3_AVAILABLE = False
271
+ processor_c2 = model_c2 = None
 
272
 
273
  if QWEN3_AVAILABLE:
274
  try:
275
  try:
276
  if PEFT_AVAILABLE:
277
  config = PeftConfig.from_pretrained(MODEL_ID_C2)
278
+ base_id = config.base_model_name_or_path
279
+ processor_c2 = AutoProcessor.from_pretrained(base_id, trust_remote_code=True)
280
+ base_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
281
+ base_id, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
282
+ model_c2 = PeftModel.from_pretrained(base_c2, MODEL_ID_C2).to(device).eval()
 
 
 
 
 
 
283
  else:
284
+ raise Exception("No PEFT")
285
  except:
286
+ print(" Loading as full fine-tuned...")
287
  processor_c2 = AutoProcessor.from_pretrained(MODEL_ID_C2, trust_remote_code=True)
288
  model_c2 = Qwen3VLForConditionalGeneration.from_pretrained(
289
+ MODEL_ID_C2, attn_implementation="flash_attention_2",
290
+ torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
 
 
 
291
  ).to(device).eval()
292
+ print(" ✅ Loaded!")
293
  CHHAGAN_QWEN3_AVAILABLE = True
294
  except Exception as e:
295
+ print(f" ❌ Failed: {e}")
296
  else:
297
+ print(" ⚠️ Qwen3VL not in transformers version")
298
 
299
+ # ── Model 3: CSM-DocExtract-VL-Q4KM (NEW, replaces Qwen3-2B) ──
300
+ print("\n3️⃣ CSM-DocExtract-VL-Q4KM (8B Q4KM Quantized)...")
301
+ MODEL_ID_Q4KM = "Chhagan005/CSM-DocExtract-VL-Q4KM"
302
+ CSM_Q4KM_AVAILABLE = False
303
+ processor_q4km = model_q4km = None
 
304
 
305
+ try:
306
+ processor_q4km = AutoProcessor.from_pretrained(MODEL_ID_Q4KM, trust_remote_code=True)
307
+ # Try loading as full quantized model first
308
  try:
309
+ model_q4km = Qwen2_5_VLForConditionalGeneration.from_pretrained(
310
+ MODEL_ID_Q4KM,
311
+ quantization_config=bnb_4bit_config,
 
312
  torch_dtype=torch.float16,
313
  device_map="auto",
314
+ trust_remote_code=True,
315
+ ).eval()
316
+ except:
317
+ # Fallback: try Qwen3VL architecture
318
+ if QWEN3_AVAILABLE:
319
+ model_q4km = Qwen3VLForConditionalGeneration.from_pretrained(
320
+ MODEL_ID_Q4KM,
321
+ quantization_config=bnb_4bit_config,
322
+ torch_dtype=torch.float16,
323
+ device_map="auto",
324
+ trust_remote_code=True,
325
+ ).eval()
326
+ else:
327
+ raise Exception("Neither Qwen2.5VL nor Qwen3VL architecture worked")
328
+ print(" ✅ Loaded! (~6-7GB VRAM)")
329
+ CSM_Q4KM_AVAILABLE = True
330
+ except Exception as e:
331
+ print(f" ❌ Failed: {e}")
332
 
333
+ # ── Model 4: CSM-DocExtract-VL 4BNB (NEW, replaces Nanonets) ──
334
+ print("\n4️⃣ CSM-DocExtract-VL 4BNB (BitsAndBytes 4-bit)...")
335
+ MODEL_ID_4BNB = "Chhagan005/CSM-DocExtract-VL"
336
+ CSM_4BNB_AVAILABLE = False
337
+ processor_4bnb = model_4bnb = None
 
338
 
339
  try:
340
+ processor_4bnb = AutoProcessor.from_pretrained(MODEL_ID_4BNB, trust_remote_code=True)
341
+ try:
342
+ model_4bnb = Qwen2_5_VLForConditionalGeneration.from_pretrained(
343
+ MODEL_ID_4BNB,
344
+ quantization_config=bnb_4bit_config,
345
+ torch_dtype=torch.float16,
346
+ device_map="auto",
347
+ trust_remote_code=True,
348
+ ).eval()
349
+ except:
350
+ if QWEN3_AVAILABLE:
351
+ model_4bnb = Qwen3VLForConditionalGeneration.from_pretrained(
352
+ MODEL_ID_4BNB,
353
+ quantization_config=bnb_4bit_config,
354
+ torch_dtype=torch.float16,
355
+ device_map="auto",
356
+ trust_remote_code=True,
357
+ ).eval()
358
+ else:
359
+ raise Exception("Architecture detection failed")
360
+ print(" ✅ Loaded! (~6-7GB VRAM)")
361
+ CSM_4BNB_AVAILABLE = True
362
  except Exception as e:
363
+ print(f" ❌ Failed: {e}")
364
 
365
  print("\n" + "="*70)
366
+ print("📊 MODEL STATUS")
367
  print("="*70)
368
+ status = [
369
+ ("Chhagan_ML-VL-OCR-v1", CHHAGAN_V1_AVAILABLE, "LoRA Fine-tuned"),
370
+ ("Chhagan-DocVL-Qwen3", CHHAGAN_QWEN3_AVAILABLE, "Qwen3-VL Fine-tuned"),
371
+ ("CSM-DocExtract-VL-Q4KM", CSM_Q4KM_AVAILABLE, "8B Q4KM ~6-7GB"),
372
+ ("CSM-DocExtract-VL 4BNB", CSM_4BNB_AVAILABLE, "BitsAndBytes 4-bit ~6-7GB"),
373
+ ]
374
+ for name, ok, note in status:
375
+ print(f" {'✅' if ok else '❌'} {name:<35} {note}")
376
  print("="*70)
377
+ loaded = sum(x[1] for x in status)
378
+ print(f" Total loaded: {loaded}/4\n")
379
+
380
+
381
+ # ╔══════════════════════════════════════════╗
382
+ # ║ PYTHON PIPELINE FUNCTIONS ║
383
+ # ╚══════════════════════════════════════════╝
384
+
385
+ def convert_eastern_numerals(text: str) -> str:
386
+ """P2: Convert Persian/Arabic/Devanagari numerals to Western 0-9"""
387
+ tables = [
388
+ str.maketrans('۰۱۲۳۴۵۶۷۸۹', '0123456789'), # Persian
389
+ str.maketrans('٠١٢٣٤٥٦٧٨٩', '0123456789'), # Arabic
390
+ str.maketrans('०१२३४५६७८९', '0123456789'), # Devanagari
391
+ str.maketrans('০১২৩৪৫৬৭৮৯', '0123456789'), # Bengali
392
+ str.maketrans('੦੧੨੩੪੫੬੭੮੯', '0123456789'), # Gurmukhi
393
+ ]
394
+ for table in tables:
395
+ text = text.translate(table)
396
+ return text
397
+
398
+
399
+ def detect_calendar_system(raw_text: str) -> str:
400
+ """Detect calendar system from country/language context"""
401
+ text_upper = raw_text.upper()
402
+ if any(kw in raw_text for kw in ['جمهوری اسلامی ایران', 'IRAN', 'AFGHANISTAN', 'افغانستان']):
403
+ return 'solar_hijri'
404
+ if any(kw in text_upper for kw in ['SAUDI', 'ARABIA', 'السعودية', 'KUWAIT', 'QATAR', 'BAHRAIN', 'JORDAN']):
405
+ return 'lunar_hijri'
406
+ return 'gregorian'
407
+
408
+
409
+ def convert_shamsi_to_gregorian(shamsi_date: str) -> str:
410
+ """P3: Solar Hijri (Shamsi) → Gregorian using khayyam library"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  try:
412
+ import khayyam
413
+ parts = re.split(r'[/\-\.]', shamsi_date.strip())
414
+ if len(parts) == 3:
415
+ y, m, d = int(parts[0]), int(parts[1]), int(parts[2])
416
+ jd = khayyam.JalaliDate(y, m, d)
417
+ greg = jd.todate()
418
+ return f"{greg.day:02d}/{greg.month:02d}/{greg.year}"
419
+ except ImportError:
420
+ # Approximate manual conversion if khayyam not installed
421
+ try:
422
+ parts = re.split(r'[/\-\.]', shamsi_date.strip())
423
+ y, m, d = int(parts[0]), int(parts[1]), int(parts[2])
424
+ greg_year = y + 621
425
+ return f"{d:02d}/{m:02d}/{greg_year} (approx)"
426
+ except:
427
+ pass
428
+ except Exception:
429
+ pass
430
+ return f"{shamsi_date} (Shamsi)"
431
 
 
432
 
433
+ def convert_hijri_to_gregorian(hijri_date: str) -> str:
434
+ """P3: Lunar Hijri → Gregorian using hijri library"""
 
 
 
 
 
 
435
  try:
436
+ from hijri_converter import convert
437
+ parts = re.split(r'[/\-\.]', hijri_date.strip())
438
+ if len(parts) == 3:
439
+ y, m, d = int(parts[0]), int(parts[1]), int(parts[2])
440
+ greg = convert.Hijri(y, m, d).to_gregorian()
441
+ return f"{greg.day:02d}/{greg.month:02d}/{greg.year}"
442
+ except ImportError:
443
+ try:
444
+ parts = re.split(r'[/\-\.]', hijri_date.strip())
445
+ y, m, d = int(parts[0]), int(parts[1]), int(parts[2])
446
+ greg_year = y - 43 + 622
447
+ return f"{d:02d}/{m:02d}/{greg_year} (approx)"
448
+ except:
449
+ pass
450
  except:
451
+ pass
452
+ return f"{hijri_date} (Hijri)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
+ def separate_scripts(raw_text: str) -> tuple:
456
+ """P5: Separate English/Latin lines from non-Latin script lines"""
457
+ english_lines = []
458
+ original_lines = []
459
+ for line in raw_text.split('\n'):
460
+ line = line.strip()
461
+ if not line:
462
+ continue
463
+ non_latin = sum(1 for c in line if ord(c) > 591)
464
+ total_alpha = sum(1 for c in line if c.isalpha())
465
+ if total_alpha == 0:
466
+ english_lines.append(line)
467
+ elif non_latin / max(total_alpha, 1) > 0.4:
468
+ original_lines.append(line)
469
+ else:
470
+ english_lines.append(line)
471
+ return '\n'.join(english_lines), '\n'.join(original_lines)
472
+
473
+
474
+ def extract_english_fields(raw_text: str) -> list:
475
+ """P4: Extract English label:value pairs directly from card text — no AI"""
476
+ results = []
477
+ patterns = [
478
+ (r'(?:FULL\s+)?NAME\s*[:\-.]?\s*([A-Za-z][A-Za-z\s\-\.\']{1,60})', 'NAME'),
479
+ (r'DATE\s+OF\s+BIRTH\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'DATE OF BIRTH'),
480
+ (r'\bDOB\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'DATE OF BIRTH'),
481
+ (r'BIRTH\s+DATE\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'DATE OF BIRTH'),
482
+ (r'EXPIRY\s+DATE\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'EXPIRY DATE'),
483
+ (r'DATE\s+OF\s+EXPIRY\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'EXPIRY DATE'),
484
+ (r'VALID(?:\s+THRU|\s+UNTIL|ITY)?\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'EXPIRY DATE'),
485
+ (r'EXPIRATION\s+DATE\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'EXPIRY DATE'),
486
+ (r'(?:DATE\s+OF\s+)?ISSUE\s+DATE\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'ISSUE DATE'),
487
+ (r'DATE\s+OF\s+ISSUE\s*[:\-.]?\s*(\d{1,2}[\s/\-\.]\d{1,2}[\s/\-\.]\d{2,4})', 'ISSUE DATE'),
488
+ (r'CIVIL\s+(?:NO\.?|NUMBER)\s*[:\-.]?\s*([A-Z0-9\-]{4,20})', 'CIVIL NUMBER'),
489
+ (r'PASSPORT\s+(?:NO\.?|NUMBER)\s*[:\-.]?\s*([A-Z0-9\-]{6,12})', 'PASSPORT NUMBER'),
490
+ (r'LICENCE\s+(?:NO\.?|NUMBER)\s*[:\-.]?\s*([A-Z0-9\-]{4,20})', 'LICENCE NUMBER'),
491
+ (r'LICENSE\s+(?:NO\.?|NUMBER)\s*[:\-.]?\s*([A-Z0-9\-]{4,20})', 'LICENCE NUMBER'),
492
+ (r'AADHAAR\s*(?:NO\.?|NUMBER)?\s*[:\-.]?\s*(\d{4}\s?\d{4}\s?\d{4})', 'AADHAAR NUMBER'),
493
+ (r'\bPAN\s*[:\-.]?\s*([A-Z]{5}\d{4}[A-Z])', 'PAN NUMBER'),
494
+ (r'EMIRATES\s+ID\s*[:\-.]?\s*(\d{3}-\d{4}-\d{7}-\d)', 'EMIRATES ID'),
495
+ (r'(?:NATIONAL\s+)?ID\s+(?:NO\.?|NUMBER)\s*[:\-.]?\s*([A-Z0-9\-]{4,20})', 'ID NUMBER'),
496
+ (r'DOCUMENT\s+(?:NO\.?|NUMBER)\s*[:\-.]?\s*([A-Z0-9\-]{4,20})', 'DOCUMENT NUMBER'),
497
+ (r'NATIONALITY\s*[:\-.]?\s*([A-Za-z]{3,30})', 'NATIONALITY'),
498
+ (r'(?:GENDER|SEX)\s*[:\-.]?\s*(MALE|FEMALE)', 'GENDER'),
499
+ (r'PLACE\s+OF\s+BIRTH\s*[:\-.]?\s*([A-Za-z\s,]{2,40})', 'PLACE OF BIRTH'),
500
+ (r'(?:PERMANENT\s+)?ADDRESS\s*[:\-.]?\s*(.{5,80})', 'ADDRESS'),
501
+ (r'BLOOD\s+(?:GROUP|TYPE)\s*[:\-.]?\s*([ABO]{1,2}[+-]?)', 'BLOOD GROUP'),
502
+ (r'(?:PROFESSION|OCCUPATION|JOB\s+TITLE)\s*[:\-.]?\s*(.{3,50})', 'PROFESSION'),
503
+ (r'FATHER(?:\'?S)?\s+NAME\s*[:\-.]?\s*([A-Za-z\s]{3,50})', "FATHER'S NAME"),
504
+ (r'MOTHER(?:\'?S)?\s+NAME\s*[:\-.]?\s*([A-Za-z\s]{3,50})', "MOTHER'S NAME"),
505
+ (r'EMPLOYER\s*[:\-.]?\s*(.{3,60})', 'EMPLOYER'),
506
+ ]
507
+ seen = set()
508
+ for pattern, label in patterns:
509
+ m = re.search(pattern, raw_text, re.IGNORECASE)
510
+ if m and label not in seen:
511
+ val = m.group(1).strip()
512
+ if val and len(val) > 1 and '[' not in val:
513
+ results.append((label, val))
514
+ seen.add(label)
515
+ return results
516
 
517
 
518
def parse_mrz_lines(raw_text: str) -> dict:
    """P1: Authoritative Python MRZ parser — TD1, TD3, MRVA, MRVB.

    Scans *raw_text* for ICAO-9303-style machine-readable-zone lines and
    decodes them positionally.

    Returns a dict that may contain: ``doc_type``, ``country_code``,
    ``doc_number``, ``name``, ``dob``, ``expiry``, ``nationality``, ``sex``
    and ``mrz_format``; an empty dict when no MRZ candidate lines are found.
    Check digits are NOT validated here.
    """
    # Normalize to western numerals so positional int() parsing works.
    # NOTE(review): convert_eastern_numerals is defined elsewhere in this file.
    raw_text = convert_eastern_numerals(raw_text)

    # Candidate MRZ lines: uppercase/digits/'<' only, 25-50 chars after
    # stripping ALL whitespace (OCR frequently injects spaces inside the strip).
    lines = []
    for line in raw_text.split('\n'):
        clean = re.sub(r'\s+', '', line.strip())
        if re.match(r'^[A-Z0-9<]{25,50}$', clean):
            lines.append(clean)

    if not lines:
        return {}

    def decode_date(yymmdd: str, is_dob: bool = False) -> str:
        """Decode a YYMMDD MRZ field to DD/MM/YYYY.

        Century rule: a birth year greater than the current 2-digit year is
        assumed 19xx; everything else (including all expiry dates) is 20xx.
        """
        try:
            yy, mm, dd = int(yymmdd[0:2]), int(yymmdd[2:4]), int(yymmdd[4:6])
            if not (1 <= mm <= 12 and 1 <= dd <= 31):
                return f"Invalid ({yymmdd})"
            cur_yy = datetime.datetime.now().year % 100
            year = (1900 + yy) if (is_dob and yy > cur_yy) else (2000 + yy)
            return f"{dd:02d}/{mm:02d}/{year}"
        except (ValueError, IndexError):
            # Non-numeric or truncated field: hand it back untouched.
            return yymmdd

    def clean_fill(s: str) -> str:
        """Strip trailing filler '<' and render the remaining '<' as spaces."""
        return re.sub(r'<+$', '', s).replace('<', ' ').strip()

    def decode_sex(code: str) -> str:
        """Map the single MRZ sex character (M/F/other) to a readable label."""
        return 'Male' if code == 'M' else ('Female' if code == 'F' else 'Unknown')

    def parse_name(name_field: str) -> str:
        """Split 'SURNAME<<GIVEN<NAMES' into 'Given Names Surname' (title case)."""
        name_clean = re.sub(r'<+$', '', name_field)
        if '<<' in name_clean:
            parts = name_clean.split('<<')
            surname = parts[0].replace('<', ' ').strip().title()
            given = parts[1].replace('<', ' ').strip().title() if len(parts) > 1 else ''
            return f"{given} {surname}".strip() if given else surname
        return name_clean.replace('<', ' ').strip().title()

    result = {}

    # TD1 (ID cards): 3 lines, nominally 30 chars; 28-36 tolerated for OCR noise.
    td1 = [l for l in lines if 28 <= len(l) <= 36]
    if len(td1) >= 2:
        l1, l2 = td1[0], td1[1]
        l3 = td1[2] if len(td1) > 2 else ""
        result['doc_type'] = clean_fill(l1[0:2])
        result['country_code'] = clean_fill(l1[2:5])
        result['doc_number'] = clean_fill(l1[5:14])
        if len(l2) >= 19:
            result['dob'] = decode_date(l2[0:6], is_dob=True)
            result['sex'] = decode_sex(l2[7] if len(l2) > 7 else '')
            result['expiry'] = decode_date(l2[8:14], is_dob=False)
            result['nationality'] = clean_fill(l2[15:18])
        if l3:
            result['name'] = parse_name(l3)
        result['mrz_format'] = 'TD1'
        return result

    # TD3 (passports): 2 lines, nominally 44 chars; 40-48 tolerated.
    td3 = [l for l in lines if 40 <= len(l) <= 48]
    if len(td3) >= 2:
        l1, l2 = td3[0], td3[1]
        result['doc_type'] = clean_fill(l1[0:2])
        result['country_code'] = clean_fill(l1[2:5])
        result['name'] = parse_name(l1[5:44])
        if len(l2) >= 27:
            result['doc_number'] = clean_fill(l2[0:9])
            result['nationality'] = clean_fill(l2[10:13])
            result['dob'] = decode_date(l2[13:19], is_dob=True)
            result['sex'] = decode_sex(l2[20] if len(l2) > 20 else '')
            result['expiry'] = decode_date(l2[21:27], is_dob=False)
        result['mrz_format'] = 'TD3'
        return result

    # MRVA/MRVB (visas): 2 lines around 36 chars.  MRV-A's 44-char lines are
    # already captured by the TD3 branch above, so only 36-38 reaches here.
    mrv = [l for l in lines if 36 <= len(l) <= 38]
    if len(mrv) >= 2:
        l1, l2 = mrv[0], mrv[1]
        result['doc_type'] = clean_fill(l1[0:2])
        result['country_code'] = clean_fill(l1[2:5])
        result['name'] = parse_name(l1[5:36])
        if len(l2) >= 27:
            result['doc_number'] = clean_fill(l2[0:9])
            result['nationality'] = clean_fill(l2[10:13])
            result['dob'] = decode_date(l2[13:19], is_dob=True)
            result['sex'] = decode_sex(l2[20] if len(l2) > 20 else '')
            result['expiry'] = decode_date(l2[21:27], is_dob=False)
        result['mrz_format'] = 'MRVA/MRVB'
        return result

    return {}
 
612
+
613
def build_mrz_table(mrz_data: dict) -> str:
    """Render a parsed-MRZ dict as a markdown table, or a no-MRZ notice.

    Only keys present in *mrz_data* produce rows; row order follows the
    canonical field order below.
    """
    if not mrz_data:
        return "No MRZ detected."

    # Canonical (key, display label) order for the table rows.
    ordered_labels = (
        ('mrz_format', 'MRZ Format'),
        ('doc_type', 'Document Type'),
        ('country_code', 'Issuing Country Code'),
        ('doc_number', 'Document / Civil Number'),
        ('name', 'Full Name'),
        ('dob', 'Date of Birth'),
        ('expiry', 'Expiry Date'),
        ('nationality', 'User Nationality'),
        ('sex', 'Gender'),
    )

    header = (
        f"**Python Parsed MRZ — Authoritative ({mrz_data.get('mrz_format','?')} format):**\n\n"
        "| Field | Verified Value |\n|---|---|\n"
    )
    rows = [
        f"| {label} | **{mrz_data[key]}** ✅ |\n"
        for key, label in ordered_labels
        if key in mrz_data
    ]
    return header + "".join(rows)
633
+
634
+
635
def build_unified_summary(front_result: str, back_result: str, mrz_data: dict) -> str:
    """P6: Merge front+back fields, MRZ as ground truth override.

    front_result / back_result are the markdown blocks streamed for each card
    side; mrz_data is the dict produced by parse_mrz_lines (may be empty).
    Returns a markdown "Unified Deduplicated Record" section in which fields
    appearing on both sides are deduplicated and any MRZ value overrides
    conflicting visual-zone values.
    """
    summary = "## 🔄 Unified Deduplicated Record\n\n"

    # Header: when an MRZ was parsed, declare it authoritative and show it first.
    if mrz_data:
        summary += f"> ✅ *MRZ Python-parsed ({mrz_data.get('mrz_format','?')}) — MRZ values are **ground truth**.*\n\n"
        summary += "### 🔐 MRZ Ground Truth\n\n"
        summary += build_mrz_table(mrz_data) + "\n\n---\n\n"
    else:
        summary += "> *No MRZ — fields merged from front+back. Conflicts flagged ⚠️.*\n\n"

    def get_rows(text: str) -> dict:
        # Pull {field: value} rows out of the first "## ✅ …" / "## 🗂️ …"
        # markdown table found in one side's result text.
        # NOTE(review): this regex is coupled to the exact section headers and
        # table shape emitted by run_step2_structure — confirm if those change.
        rows = {}
        m = re.search(r"## (?:✅|🗂️)[^\n]*\n\|[^\n]*\n\|[-| ]+\n(.*?)(?=\n---|\Z)", text, re.DOTALL)
        if m:
            for line in m.group(1).strip().split('\n'):
                parts = [p.strip() for p in line.split('|') if p.strip()]
                if len(parts) >= 2:
                    # Normalize field names: drop markdown/emoji punctuation.
                    field = re.sub(r'[^\w\s/\']', '', parts[0]).strip()
                    val = parts[1].strip()
                    # Skip placeholder values so they don't mask real data.
                    if val and val.lower() not in ('—', 'not on card', 'n/a', ''):
                        rows[field] = val
        return rows

    front_f = get_rows(front_result)
    back_f = get_rows(back_result)
    # Field names in first-seen order, front side first, no duplicates.
    all_f = list(dict.fromkeys(list(front_f.keys()) + list(back_f.keys())))

    # MRZ lookup: map substrings expected in field labels to MRZ values.
    mrz_map = {}
    if mrz_data:
        kw_map = {
            'name': ['name'],
            'doc_number': ['civil', 'document', 'id', 'passport', 'licence'],
            'dob': ['birth', 'dob'],
            'expiry': ['expiry', 'expiration'],
            'sex': ['gender', 'sex'],
            'nationality':['nationality'],
        }
        for mk, keywords in kw_map.items():
            if mk in mrz_data:
                for kw in keywords:
                    mrz_map[kw] = mrz_data[mk]

    def get_mrz(field: str):
        # Return the MRZ value whose keyword appears in the field label,
        # or None when the MRZ has nothing to say about this field.
        fl = field.lower()
        for kw, v in mrz_map.items():
            if kw in fl:
                return v
        return None

    summary += "### 📋 Field Comparison\n\n| Field | Value | Source |\n|---|---|---|\n"

    # One comparison row per field: front-vs-back agreement, MRZ confirmation,
    # MRZ override on conflict, or single-side provenance.
    for field in all_f:
        fv = front_f.get(field, '')
        bv = back_f.get(field, '')
        mv = get_mrz(field)

        if fv and bv:
            if fv.lower() == bv.lower():
                # Sides agree; note whether the MRZ corroborates or contradicts.
                note = f"✅ MRZ Confirmed" if mv and any(x in fv.lower() for x in mv.lower().split()) else ("⚠️ MRZ differs: **" + mv + "**" if mv else "")
                summary += f"| {field} | {fv} | Front+Back ✅ {note} |\n"
            else:
                if mv:
                    # Conflict resolved by the MRZ ground truth.
                    summary += f"| {field} | ~~{fv}~~ / ~~{bv}~~ → **{mv}** | ✅ MRZ Override |\n"
                else:
                    summary += f"| {field} | F: **{fv}** / B: **{bv}** | ⚠️ Mismatch |\n"
        elif fv:
            note = f"✅ MRZ Confirmed" if mv and any(x in fv.lower() for x in mv.lower().split()) else (f"⚠️ MRZ: **{mv}**" if mv else "")
            summary += f"| {field} | {fv} | Front only {note} |\n"
        elif bv:
            note = f"✅ MRZ Confirmed" if mv and any(x in bv.lower() for x in mv.lower().split()) else (f"⚠️ MRZ: **{mv}**" if mv else "")
            summary += f"| {field} | {bv} | Back only {note} |\n"

    return summary + "\n"
710
+
711
+
712
+ # ╔══════════════════════════════════════════╗
713
+ # ║ STEP PIPELINE FUNCTIONS ║
714
+ # ╚══════════════════════════════════════���═══╝
715
+
716
def run_step1_extraction(model, processor, image, device, temperature, top_p, top_k, repetition_penalty):
    """Step 1: LLM → Raw OCR, original script, NO translation, NO coordinates.

    Runs the vision-language model once with STEP1_EXTRACT_PROMPT; if the
    output looks like grounding coordinates (or lacks the ---TEXT_START---
    marker), retries once with a stricter plain-text prompt.  Returns the raw
    decoded model output string.
    """

    def _generate(prompt_text):
        # Single non-streamed generation pass for the given prompt + image.
        messages = [{"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ]}]
        try:
            prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except:
            # Any chat-template failure: fall back to the raw prompt text.
            prompt = prompt_text
        inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            out = model.generate(
                **inputs, max_new_tokens=600, do_sample=True,
                temperature=temperature, top_p=top_p, top_k=top_k,
                repetition_penalty=repetition_penalty,
            )
        # Slice off the prompt tokens; decode only the newly generated tail.
        gen = out[:, inputs['input_ids'].shape[1]:]
        return processor.batch_decode(gen, skip_special_tokens=True)[0]

    result = _generate(STEP1_EXTRACT_PROMPT)

    # Detect coordinate output (Qwen grounding mode triggered) → retry
    if re.search(r'\(\d{1,4},\s*\d{1,4}\)', result) or '---TEXT_START---' not in result:
        print(" ⚠️ Coordinate output detected, retrying...")
        # Stricter fallback prompt: forbids boxes and pins the output framing
        # that parse_step1_output expects.
        fallback = """Read all text from this document image and write it line by line in plain text.
Do NOT output coordinates or bounding boxes.
Start output with:
PHOTO_PRESENT: yes or no
SIGNATURE_PRESENT: yes or no
MRZ_PRESENT: yes or no
DETECTED_LANGUAGE: name the language(s)
---TEXT_START---
[all text here exactly as printed]
---TEXT_END---"""
        result = _generate(fallback)

    return result
756
+
757
+
758
def parse_step1_output(raw_output: str) -> dict:
    """Parse Step 1 structured output → metadata + original text.

    Reads the PHOTO/SIGNATURE/MRZ/DETECTED_LANGUAGE header lines and the
    ---TEXT_START---/---TEXT_END--- body from the model's Step-1 output.
    Missing fields fall back to "❌ No" / "N/A" / "Unknown"; when no text
    markers are found, the whole raw output is kept as original_text.
    """

    def first_match(pattern, default="N/A"):
        # Case-insensitive search; strip the captured group or use the default.
        found = re.search(pattern, raw_output, re.IGNORECASE)
        return found.group(1).strip() if found else default

    def yes_no(pattern):
        # "yes" (any case) → checkmark; anything else, including absent → cross.
        return "✅ Yes" if first_match(pattern).lower() == "yes" else "❌ No"

    meta = {
        "photo_present": yes_no(r'PHOTO_PRESENT:\s*(yes|no)'),
        "photo_location": first_match(r'PHOTO_LOCATION:\s*([^\n]+)'),
        "sig_present": yes_no(r'SIGNATURE_PRESENT:\s*(yes|no)'),
        "sig_location": first_match(r'SIGNATURE_LOCATION:\s*([^\n]+)'),
        "mrz_present": yes_no(r'MRZ_PRESENT:\s*(yes|no)'),
        "detected_lang": first_match(r'DETECTED_LANGUAGE:\s*([^\n]+)', "Unknown"),
        "original_text": raw_output,
    }

    body = re.search(r'---TEXT_START---\n?(.*?)---TEXT_END---', raw_output, re.DOTALL)
    if body:
        meta["original_text"] = body.group(1).strip()

    return meta
791
+
792
 
793
def run_step2_structure(model, processor, metadata: dict, device,
                        max_new_tokens, temperature, top_p, top_k, repetition_penalty):
    """Step 2: Python extracts English fields + MRZ. LLM only classifies + fills gaps.

    Takes the Step-1 metadata dict (from parse_step1_output), runs the
    deterministic Python extractors over its original_text, then kicks off a
    streamed LLM generation for classification only.

    Returns (streamer, thread, mrz_data, python_sections) — the caller must
    drain *streamer* and join *thread*; *python_sections* is pre-built
    markdown to prepend to the streamed output.
    """

    raw_text = metadata.get('original_text', '')

    # P2: Convert eastern numerals first
    raw_text_normalized = convert_eastern_numerals(raw_text)

    # P5: Separate scripts
    english_block, original_block = separate_scripts(raw_text_normalized)

    # P4: Direct English field extraction
    english_fields = extract_english_fields(raw_text_normalized)

    # P1: MRZ parse (authoritative)
    mrz_data = parse_mrz_lines(raw_text_normalized)

    # P3: Calendar detection + conversion (for display)
    calendar_sys = detect_calendar_system(raw_text)

    # Build python fields table (markdown) from the (label, value) pairs.
    if english_fields:
        tbl = "| Field (as printed on card) | Value (as printed) |\n|---|---|\n"
        for label, val in english_fields:
            tbl += f"| **{label}** | {val} |\n"
    else:
        tbl = "| — | No English label:value pairs detected |\n"

    # MRZ summary: one-line digest injected into the LLM prompt.
    if mrz_data:
        mrz_summary = " | ".join([f"{k}: {v}" for k, v in mrz_data.items() if k != 'mrz_format'])
        mrz_summary = f"✅ {mrz_data.get('mrz_format','?')} parsed: {mrz_summary}"
    else:
        mrz_summary = "❌ No MRZ detected"

    # Non-Gregorian note shown above the English-fields table.
    cal_note = ""
    if calendar_sys == 'solar_hijri':
        cal_note = "\n> ⚠️ **Solar Hijri (Shamsi) calendar detected** — Python will convert dates to Gregorian."
    elif calendar_sys == 'lunar_hijri':
        cal_note = "\n> ⚠️ **Lunar Hijri calendar detected** — Python will convert dates to Gregorian."

    # Build prompt for LLM (classification + gaps only)
    prompt_text = STEP2_TEMPLATE.format(
        python_fields_table=tbl,
        mrz_summary=mrz_summary,
        english_block=english_block or "None",
        original_block=original_block or "None",
    )

    # Text-only request: no image is passed in Step 2.
    messages = [{"role": "user", "content": [{"type": "text", "text": prompt_text}]}]
    try:
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except:
        # Any chat-template failure: fall back to the raw prompt text.
        prompt = prompt_text

    inputs = processor(text=[prompt], return_tensors="pt", padding=True).to(device)

    # Stream generation on a worker thread so the caller can yield tokens.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = {
        **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
        "do_sample": True, "temperature": temperature, "top_p": top_p,
        "top_k": top_k, "repetition_penalty": repetition_penalty,
    }
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    # Pre-build Python-verified sections (rendered before the LLM stream).
    # The MRZ Data section re-applies the same candidate-line filter used by
    # parse_mrz_lines to echo the raw MRZ strip.
    python_sections = f"""## 🖼️ Visual Elements

| Element | Status | Location |
|---------|--------|----------|
| 📷 Profile Photo | {metadata['photo_present']} | {metadata['photo_location']} |
| ✍️ Signature | {metadata['sig_present']} | {metadata['sig_location']} |
| 🔐 MRZ Zone | {metadata['mrz_present']} | Bottom strip |

---

## ✅ English Fields (Direct from Card — Not Modified)
{cal_note}

{tbl}

---

## 📜 Original Script

{raw_text}
---

## 🔐 MRZ Data

{chr(10).join([l for l in raw_text.split(chr(10)) if re.match(r'^[A-Z0-9<]{25,50}$', re.sub(r'\s+','',l.strip()))]) or 'NOT PRESENT'}
{build_mrz_table(mrz_data) if mrz_data else '_No MRZ detected._'}

---

"""
    return streamer, thread, mrz_data, python_sections
893
+
894
+
895
+ # ╔══════════════════════════════════════════╗
896
+ # ║ GRADIO HELPER CLASSES ║
897
+ # ╚══════════════════════════════════════════╝
898
+
899
class RadioAnimated(gr.HTML):
    # Segmented-control style radio built on gr.HTML: renders native
    # <input type="radio"> elements plus a sliding highlight bar, and syncs
    # the selected value back into the component's props via js_on_load.
    def __init__(self, choices, value=None, **kwargs):
        """Create the animated radio.

        choices: list of string options; at least 2 required.
        value:   initially selected option; defaults to the first choice.
        kwargs:  forwarded to gr.HTML (elem_id, etc.).
        """
        if not choices or len(choices) < 2:
            raise ValueError("RadioAnimated requires at least 2 choices.")
        if value is None:
            value = choices[0]
        # Random suffix so multiple instances don't share one radio group.
        uid = uuid.uuid4().hex[:8]
        group_name = f"ra-{uid}"
        inputs_html = "\n".join(
            f'<input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">'
            f'<label class="ra-label" for="{group_name}-{i}">{c}</label>'
            for i, c in enumerate(choices)
        )
        html_template = f"""
        <div class="ra-wrap" data-ra="{uid}">
        <div class="ra-inner"><div class="ra-highlight"></div>{inputs_html}</div>
        </div>"""
        # NOTE(review): inside setVal the boolean parameter `trigger` shadows
        # any outer `trigger` helper, yet `trigger('change', ...)` calls it as
        # a function — this looks like a latent JS bug; confirm against the
        # gradio js_on_load API before relying on the change event firing.
        js_on_load = r"""
        (() => {
        const highlight = element.querySelector('.ra-highlight');
        const inputs = Array.from(element.querySelectorAll('.ra-input'));
        if (!inputs.length) return;
        const choices = inputs.map(i => i.value);
        function setHighlight(idx) {
        highlight.style.width = `calc(${100/choices.length}% - 6px)`;
        highlight.style.transform = `translateX(${idx * 100}%)`;
        }
        function setVal(val, trigger=false) {
        const idx = Math.max(0, choices.indexOf(val));
        inputs.forEach((inp, i) => { inp.checked = (i === idx); });
        setHighlight(idx);
        props.value = choices[idx];
        if (trigger) trigger('change', props.value);
        }
        setVal(props.value ?? choices[0], false);
        inputs.forEach(inp => inp.addEventListener('change', () => setVal(inp.value, true)));
        })();"""
        super().__init__(value=value, html_template=html_template, js_on_load=js_on_load, **kwargs)
937
+
938
+
939
def apply_gpu_duration(val: str) -> int:
    """Convert the GPU-duration radio selection to an integer second count.

    Falls back to 180 (the UI's default) when *val* is missing or not a
    number, matching calc_timeout_duration's fallback instead of raising
    into the Gradio event handler.
    """
    try:
        return int(val)
    except (TypeError, ValueError):
        return 180
941
+
942
+
943
def calc_timeout_duration(model_name, text, image_front, image_back,
                          max_new_tokens, temperature, top_p, top_k,
                          repetition_penalty, gpu_timeout):
    """Compute the @spaces.GPU duration (seconds) for one request.

    Doubles the user-selected timeout when both card sides are supplied,
    since the pipeline then runs twice.  The unused parameters mirror the
    generate_dual_card_ocr signature required by the duration callback.
    Returns 180 when *gpu_timeout* is missing or not numeric.
    """
    try:
        base = int(gpu_timeout)
    except (TypeError, ValueError):
        # Bad or missing selection → safe default.
        return 180
    return base * 2 if (image_front is not None and image_back is not None) else base
951
+
952
+
953
+ # ╔══════════════════════════════════════════╗
954
+ # ║ MAIN PIPELINE FUNCTION ║
955
+ # ╚══════════════════════════════════════════╝
956
 
957
@spaces.GPU(duration=calc_timeout_duration)
def generate_dual_card_ocr(model_name: str, text: str,
                           image_front, image_back,
                           max_new_tokens: int, temperature: float, top_p: float,
                           top_k: int, repetition_penalty: float, gpu_timeout: int):
    """Full dual-card pipeline, streamed as a generator.

    Runs Step 1 (raw OCR) and Step 2 (Python extraction + LLM classification)
    on the front and/or back image, then appends a unified deduplicated
    summary when both sides were provided.  Yields (raw_text, markdown)
    tuples so both Gradio outputs update together while tokens stream.

    NOTE(review): *text* and *gpu_timeout* are accepted to mirror the UI
    inputs and the @spaces.GPU duration callback; they are not read in this
    function body.
    """

    # Model selection: display label → (loaded?, processor, model) globals.
    model_map = {
        "Chhagan-ID-OCR-v1 ⭐": (CHHAGAN_V1_AVAILABLE, processor_c1, model_c1),
        "Chhagan-DocVL-Qwen3 🔥": (CHHAGAN_QWEN3_AVAILABLE, processor_c2, model_c2),
        "CSM-DocExtract-Q4KM 🏆": (CSM_Q4KM_AVAILABLE, processor_q4km, model_q4km),
        "CSM-DocExtract-4BNB 💎": (CSM_4BNB_AVAILABLE, processor_4bnb, model_4bnb),
    }

    if model_name not in model_map:
        yield "Invalid model.", "Invalid model."; return

    available, processor, model = model_map[model_name]
    if not available:
        yield f"{model_name} not available.", f"{model_name} not available."; return

    if image_front is None and image_back is None:
        yield "Please upload at least one card image.", "Please upload at least one card image."; return

    full_output = ""
    front_result = ""
    back_result = ""
    all_mrz_data = {}
    # NOTE(review): the *_meta_saved dicts are assigned but not read in this
    # function — presumably kept for future use; confirm before removing.
    front_meta_saved = {}
    back_meta_saved = {}

    # ───── FRONT CARD ─────
    if image_front is not None:
        full_output += "# 🎴 FRONT CARD\n\n"
        full_output += "⏳ **Step 1/2 — Raw OCR (original script, no translation)...**\n\n"
        yield full_output, full_output

        # Step 1: raw OCR pass (blocking).
        step1_raw = run_step1_extraction(model, processor, image_front, device,
                                         temperature, top_p, top_k, repetition_penalty)
        front_meta = parse_step1_output(step1_raw)
        front_meta_saved = front_meta

        full_output += f"✅ **Step 1 Done** — 🌐 Language: **{front_meta['detected_lang']}**\n\n"
        full_output += "⏳ **Step 2/2 — Python extract + LLM classify...**\n\n"
        yield full_output, full_output

        # Step 2: Python extraction + streamed LLM classification.
        streamer_f, thread_f, mrz_f, python_sections_f = run_step2_structure(
            model, processor, front_meta, device,
            max_new_tokens, temperature, top_p, top_k, repetition_penalty)

        if mrz_f:
            all_mrz_data = mrz_f

        # Seed the stream buffer with the pre-built Python-verified sections.
        buffer_f = python_sections_f
        yield full_output + buffer_f, full_output + buffer_f

        for new_text in streamer_f:
            # Strip chat-template terminators from the streamed tokens.
            buffer_f += new_text.replace("<|im_end|>", "").replace("<|endoftext|>", "")
            time.sleep(0.01)
            yield full_output + buffer_f, full_output + buffer_f

        front_result = buffer_f
        thread_f.join()

    # ───── BACK CARD ─────
    if image_back is not None:
        full_output += "\n\n---\n\n# 🎴 BACK CARD\n\n"
        full_output += "⏳ **Step 1/2 — Raw OCR (original script, no translation)...**\n\n"
        yield full_output, full_output

        step1_raw_back = run_step1_extraction(model, processor, image_back, device,
                                              temperature, top_p, top_k, repetition_penalty)
        back_meta = parse_step1_output(step1_raw_back)
        back_meta_saved = back_meta

        full_output += f"✅ **Step 1 Done** — 🌐 Language: **{back_meta['detected_lang']}**\n\n"
        full_output += "⏳ **Step 2/2 — Python extract + LLM classify...**\n\n"
        yield full_output, full_output

        streamer_b, thread_b, mrz_b, python_sections_b = run_step2_structure(
            model, processor, back_meta, device,
            max_new_tokens, temperature, top_p, top_k, repetition_penalty)

        # Keep the front side's MRZ when both sides carry one.
        if mrz_b and not all_mrz_data:
            all_mrz_data = mrz_b

        buffer_b = python_sections_b
        yield full_output + buffer_b, full_output + buffer_b

        for new_text in streamer_b:
            buffer_b += new_text.replace("<|im_end|>", "").replace("<|endoftext|>", "")
            time.sleep(0.01)
            yield full_output + buffer_b, full_output + buffer_b

        back_result = buffer_b
        thread_b.join()

    # ───── UNIFIED SUMMARY ─────
    # Only meaningful when both sides were processed.
    if image_front is not None and image_back is not None:
        full_output += "\n\n---\n\n"
        full_output += build_unified_summary(front_result, back_result, all_mrz_data)

    mrz_note = f"MRZ: {all_mrz_data.get('mrz_format','?')} verified" if all_mrz_data else "MRZ: Not detected"
    full_output += f"\n\n---\n\n**✨ Complete** | Model: `{model_name}` | {mrz_note} | Pipeline: OCR → Python Extract → LLM Classify\n"
    yield full_output, full_output
1064
 
1065
 
1066
+ # ╔══════════════════════════════════════════╗
1067
+ # ║ MODEL CHOICES ║
1068
+ # ╚══════════════════════════════════════════╝
1069
 
1070
# Build the model picker from whichever checkpoints loaded successfully,
# falling back to a single placeholder entry when none did.
model_choices = [
    label
    for loaded, label in (
        (CHHAGAN_V1_AVAILABLE, "Chhagan-ID-OCR-v1 ⭐"),
        (CHHAGAN_QWEN3_AVAILABLE, "Chhagan-DocVL-Qwen3 🔥"),
        (CSM_Q4KM_AVAILABLE, "CSM-DocExtract-Q4KM 🏆"),
        (CSM_4BNB_AVAILABLE, "CSM-DocExtract-4BNB 💎"),
    )
    if loaded
]
if not model_choices:
    model_choices = ["No models available"]

# Example rows — presumably [query, front image path, back image path];
# confirm against the gr.Examples inputs wiring.
dual_card_examples = [
    ["Extract complete information", "examples/5.jpg", None],
    ["Multilingual OCR with MRZ", "examples/4.jpg", None],
    ["Extract profile photo and signature", "examples/2.jpg", None],
]
1082
 
1083
 
1084
+ # ╔══════════════════════════════════════════╗
1085
+ # ║ GRADIO UI ║
1086
+ # ╚══════════════════════════════════════════╝
1087
 
1088
  demo = gr.Blocks(css=css, theme=steel_blue_theme)
1089
  with demo:
1090
+ gr.Markdown("# 🌍 **CSM Dual-Card ID OCR System**", elem_id="main-title")
1091
+ gr.Markdown("### *Universal Document Extraction MRZ + Multilingual + Auto Calendar*")
1092
 
1093
  loaded_models = []
1094
+ if CHHAGAN_V1_AVAILABLE: loaded_models.append("ID-OCR-v1 ⭐")
1095
+ if CHHAGAN_QWEN3_AVAILABLE: loaded_models.append("DocVL-Qwen3 🔥")
1096
+ if CSM_Q4KM_AVAILABLE: loaded_models.append("Q4KM 🏆")
1097
+ if CSM_4BNB_AVAILABLE: loaded_models.append("4BNB 💎")
1098
+
1099
+ model_info = f"**Loaded ({len(loaded_models)}/4):** {', '.join(loaded_models)}" if loaded_models else "⚠️ No models"
 
 
 
 
1100
  gr.Markdown(f"**Status:** {model_info}")
1101
+ gr.Markdown("**Pipeline:** ✅ Step1: Raw OCR → ✅ Python: MRZ+English Extract → ✅ LLM: Classify+Gaps → ✅ Deduplicate")
1102
 
1103
  with gr.Row():
1104
  with gr.Column(scale=2):
1105
  image_query = gr.Textbox(
1106
  label="💬 Custom Query (Optional)",
1107
+ placeholder="Leave empty for automatic full extraction...",
1108
  value=""
1109
  )
 
1110
  gr.Markdown("### 📤 Upload ID Cards")
1111
  with gr.Row():
1112
  image_front = gr.Image(type="pil", label="🎴 Front Card", height=250)
1113
+ image_back = gr.Image(type="pil", label="🎴 Back Card (Optional)", height=250)
1114
 
1115
  image_submit = gr.Button("🚀 Extract + Translate + Structure", variant="primary", size="lg")
1116
 
 
1121
  )
1122
 
1123
  with gr.Accordion("⚙️ Advanced Settings", open=False):
1124
+ max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
1125
+ temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
1126
+ top_p = gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
1127
+ top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
1128
+ repetition_penalty= gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
1129
 
1130
  with gr.Column(scale=3):
1131
  gr.Markdown("## 📄 Extraction Results", elem_id="output-title")
1132
  output = gr.Textbox(label="Raw Output (Streaming)", interactive=True, lines=15)
1133
+ with gr.Accordion("📝 Structured Preview", open=True):
1134
  markdown_output = gr.Markdown(label="Formatted Result")
1135
 
1136
  model_choice = gr.Radio(
1137
  choices=model_choices,
1138
+ label="🤖 Select Model",
1139
  value=model_choices[0] if model_choices else None,
1140
+ info="🏆💎 = 8B Quantized (best) | 🔥 = Qwen3 Fine-tuned | = LoRA"
1141
  )
1142
 
1143
  with gr.Row(elem_id="gpu-duration-container"):
 
1145
  gr.Markdown("**⏱️ GPU Duration (seconds)**")
1146
  radioanimated_gpu_duration = RadioAnimated(
1147
  choices=["60", "90", "120", "180", "240"],
1148
+ value="180",
1149
  elem_id="radioanimated_gpu_duration"
1150
  )
1151
+ gpu_duration_state = gr.Number(value=180, visible=False)
1152
 
1153
  gr.Markdown("""
1154
+ **✨ What This Extracts:**
1155
+ - 🔐 MRZ: TD1/TD3/MRVA/MRVB Python parsed, 100% accurate
1156
+ - English fields: Direct from card, not modified
1157
+ - 📜 Original script: Arabic/Farsi/Hindi/Chinese as-is
1158
+ - 🗓Calendar: Shamsi/Hijri Gregorian conversion
1159
+ - 🔢 Eastern numerals: ۱۲۳ 123 automatic
1160
+ - 🔄 Front+Back: Deduplicated, MRZ-verified
1161
+ """)
 
1162
 
1163
  radioanimated_gpu_duration.change(
1164
  fn=apply_gpu_duration,
 
1169
 
1170
  image_submit.click(
1171
  fn=generate_dual_card_ocr,
1172
+ inputs=[model_choice, image_query, image_front, image_back,
1173
+ max_new_tokens, temperature, top_p, top_k,
1174
+ repetition_penalty, gpu_duration_state],
 
 
 
1175
  outputs=[output, markdown_output]
1176
  )
1177
 
1178
  gr.Markdown("""
1179
+ ---
1180
+ ### 🎯 Feature Matrix
1181
+
1182
+ | Feature | Method | Accuracy |
1183
+ |---------|--------|---------|
1184
+ | MRZ Parse (TD1/TD3/MRVA) | Python | 100% |
1185
+ | English Labels Extract | Python Regex | 100% |
1186
+ | Eastern Numeral Convert | Python char map | 100% |
1187
+ | Shamsi/Hijri Calendar | Python library | 100% |
1188
+ | Raw OCR (32+ scripts) | 8B VLM | 90%+ |
1189
+ | Doc Type Classification | 8B VLM | 95%+ |
1190
+ | Non-English Translation | 8B VLM | 90%+ |
1191
+ | Front+Back Deduplication | Python | 100% |
1192
+
1193
+ ### 📋 Supported Documents
1194
+ 🇮🇳 Aadhaar, PAN, Passport | 🇦🇪 Emirates ID | 🇸🇦 Iqama | 🇴🇲 Oman Resident Card
1195
+ 🌍 International Passports (MRZ) | 🚗 Driving Licences | 🇮🇷 Iranian National ID (Shamsi)
1196
+
1197
+ ### 🔒 Privacy
1198
+ All processing on-device | No data stored | GDPR compliant
1199
+ """)
 
 
 
 
 
 
 
 
1200
 
1201
 
1202
if __name__ == "__main__":
    # Launch the Gradio app; any startup failure is printed with a traceback
    # instead of dying silently.
    print("\n🚀 STARTING...")
    try:
        app = demo.queue(max_size=50)
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            show_error=True,
            share=False,
        )
    except Exception as e:
        import traceback
        print(f"❌ {e}")
        traceback.print_exc()