Chhagan005 committed on
Commit
589e015
·
verified ·
1 Parent(s): 6007a3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -3
app.py CHANGED
@@ -203,7 +203,32 @@ YOUR TASKS:
203
  1. If text is non-English → translate to English with 95%+ accuracy
204
  2. If text is already English → copy as-is
205
  3. Extract all key KYC fields
206
- 4. Output EXACTLY in the format below — no extra commentary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
  ---
209
 
@@ -221,11 +246,14 @@ YOUR TASKS:
221
 
222
  {raw_text}
223
 
 
224
  ---
225
 
226
  ## ๐ŸŒ English Translation
 
227
  [Write complete English translation here. If already English, write: Already in English โ€” then copy text]
228
 
 
229
  ---
230
 
231
  ## ๐Ÿ—‚๏ธ Key Fields (English)
@@ -234,7 +262,7 @@ YOUR TASKS:
234
  |-------|-------|
235
  | ๐Ÿ“„ Document Type | |
236
  | ๐Ÿ‘ค Full Name | |
237
- | ๐Ÿ”ข ID / Document Number | |
238
  | ๐ŸŽ‚ Date of Birth | |
239
  | ๐Ÿ“… Issue Date | |
240
  | โณ Expiry Date | |
@@ -247,7 +275,9 @@ YOUR TASKS:
247
  ---
248
 
249
  ## ๐Ÿ” MRZ Data
250
- [Raw MRZ lines here โ€” if not present write: NOT PRESENT]
 
 
251
 
252
  **Parsed MRZ:**
253
  | Field | Value |
@@ -553,6 +583,104 @@ def parse_step1_output(raw_output: str) -> dict:
553
  return result
554
 
555
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
556
  # ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
557
 
558
  def run_step2_structure(model, processor, metadata: dict, device,
@@ -680,6 +808,8 @@ def generate_dual_card_ocr(model_name: str, text: str,
680
  full_output = ""
681
  front_result = ""
682
  back_result = ""
 
 
683
 
684
  # ===== FRONT CARD =====
685
  if image_front is not None:
@@ -745,6 +875,17 @@ def generate_dual_card_ocr(model_name: str, text: str,
745
  back_result = buffer_b
746
  thread_b.join()
747
 
 
 
 
 
 
 
 
 
 
 
 
748
  # ===== UNIFIED SUMMARY (only when both sides uploaded) =====
749
  if image_front is not None and image_back is not None:
750
  full_output += "\n\n---\n\n"
 
203
  1. If text is non-English → translate to English with 95%+ accuracy
204
  2. If text is already English → copy as-is
205
  3. Extract all key KYC fields
206
+ 4. Output EXACTLY in the format below
207
+
208
+ โš ๏ธ CRITICAL EXTRACTION RULES โ€” READ BEFORE EXTRACTING:
209
+
210
+ RULE 1 — COUNTRY/INSTITUTION vs PERSON NAME:
211
+ - Text appearing at the TOP of ID cards like "Sultanate of Oman", "SULTANATE OF OMAN",
212
+ "Republic of India", "United Arab Emirates", "ROYAL OMAN POLICE" etc. is the
213
+ ISSUING COUNTRY or INSTITUTION NAME — THIS IS NOT THE PERSON'S NAME
214
+ - Extract person's name ONLY from explicit name labels:
215
+ الإسم / الاسم (Arabic) | NAME: | 姓名 (Chinese) | नाम (Hindi) | ИМЯ (Russian)
216
+ - In MRZ: TD1 Line 3 = person's name (e.g., FERIL<SUNNA = "Feril Sunna")
217
+
218
+ RULE 2 — CIVIL ID vs BARCODE/CHIP ID:
219
+ - Long hex strings printed on barcodes/chips (e.g., 7E400DD3D032A7C) are card
220
+ SERIAL/CHIP numbers — NOT the Civil ID
221
+ - The actual Civil/Document ID is under labels:
222
+ الرقم المدني (Civil No.) | رقم الهوية (ID No.) | ID NO. | CIVIL NO.
223
+ - Actual Civil ID is typically 8-12 alphanumeric characters (e.g., 73616576)
224
+
225
+ RULE 3 — MRZ IS GROUND TRUTH (do not override it):
226
+ - MRZ lines (uppercase A-Z, 0-9, < characters) are cryptographically verified
227
+ - MRZ date format is YYMMDD: first 2 = year, middle 2 = month, last 2 = day
228
+ Example: 030512 = year 03 → 2003, month 05, day 12 → 12/05/2003
229
+ Example: 260908 = year 26 → 2026, month 09, day 08 → 08/09/2026
230
+ - MRZ Sex: M = Male, F = Female
231
+ - If MRZ present, extract name/DOB/sex/expiry/nationality FROM MRZ LINES, not from visual text
232
 
233
  ---
234
 
 
246
 
247
  {raw_text}
248
 
249
+
250
  ---
251
 
252
  ## ๐ŸŒ English Translation
253
+
254
  [Write complete English translation here. If already English, write: Already in English — then copy text]
255
 
256
+
257
  ---
258
 
259
  ## ๐Ÿ—‚๏ธ Key Fields (English)
 
262
  |-------|-------|
263
  | 📄 Document Type | |
264
  | 👤 Full Name | |
265
+ | 🔢 Civil / Document Number | |
266
  | 🎂 Date of Birth | |
267
  | 📅 Issue Date | |
268
  | ⏳ Expiry Date | |
 
275
  ---
276
 
277
  ## ๐Ÿ” MRZ Data
278
+
279
+ [Raw MRZ lines here — copy exactly as-is. If not present write: NOT PRESENT]
280
+
281
 
282
  **Parsed MRZ:**
283
  | Field | Value |
 
583
  return result
584
 
585
 
586
def parse_mrz_lines(raw_text: str) -> dict:
    """
    Parse Machine Readable Zone (MRZ) lines out of OCR text in pure Python.

    Supports TD1 (ID cards, 3 lines of ~30 chars) and TD3 (passports,
    2 lines of ~44 chars). Field positions are fixed-offset slices; the MRZ
    check digits are skipped, not validated.

    Args:
        raw_text: OCR text that may contain MRZ lines among other content.
            ``None`` or an empty string is treated as "no MRZ present".

    Returns:
        An empty dict when no TD1/TD3 line set is found. Otherwise a dict
        with ``mrz_format`` ('TD1' or 'TD3') plus ``doc_type``,
        ``country_code``, ``doc_number``, ``dob``, ``sex``, ``expiry``,
        ``nationality`` and ``name``. For TD1, ``name`` is only present when
        a third line was detected. Dates are formatted ``DD/MM/YYYY``; a
        numeric but impossible date becomes ``"Invalid (<yymmdd>)"`` and a
        non-numeric date field is returned unchanged.
    """
    import datetime

    # Callers pass metadata.get('original_text', ''), which can still be None.
    if not raw_text:
        return {}

    # A candidate MRZ line is 20+ characters drawn only from A-Z, 0-9 and '<'
    # once all whitespace has been removed.
    mrz_line_re = re.compile(r'^[A-Z0-9<]{20,}$')
    lines = []
    for line in raw_text.split('\n'):
        clean = re.sub(r'\s+', '', line)
        if mrz_line_re.match(clean):
            lines.append(clean)

    if not lines:
        return {}

    def decode_date(yymmdd: str, is_dob: bool = False) -> str:
        """Convert a YYMMDD field to DD/MM/YYYY, flagging impossible dates."""
        try:
            yy = int(yymmdd[0:2])
            mm = int(yymmdd[2:4])
            dd = int(yymmdd[4:6])
        except ValueError:
            # Filler characters or a truncated field: hand back the raw text.
            return yymmdd
        # Birth years later than the current two-digit year must be 19xx;
        # everything else (including all expiry dates) is read as 20xx.
        current_yy = datetime.date.today().year % 100
        year = (1900 + yy) if (is_dob and yy > current_yy) else (2000 + yy)
        try:
            # Real calendar validation (rejects e.g. 31 February), not just
            # a 1-12 / 1-31 range check.
            datetime.date(year, mm, dd)
        except ValueError:
            return f"Invalid ({yymmdd})"
        return f"{dd:02d}/{mm:02d}/{year}"

    def clean_field(s: str) -> str:
        """Drop trailing '<' filler and turn inner '<' into spaces."""
        return re.sub(r'<+$', '', s).replace('<', ' ').strip()

    def decode_sex(sex_char: str) -> str:
        """Map M/F to words; any other character is returned unchanged."""
        if sex_char == 'M':
            return 'Male'
        if sex_char == 'F':
            return 'Female'
        return sex_char

    def decode_name(field: str) -> str:
        """Turn 'SURNAME<<GIVEN<NAMES' into 'GIVEN NAMES SURNAME'."""
        if '<<' not in field:
            return field.replace('<', ' ').strip()
        parts = field.split('<<')
        surname = parts[0].replace('<', ' ').strip()
        given = parts[1].replace('<', ' ').strip()
        return f"{given} {surname}".strip() if given else surname

    result = {}

    # TD1: 3 lines, accepted at 28-36 chars each to tolerate OCR drift.
    # Two lines are enough to proceed; the name line is optional.
    td1 = [l for l in lines if 28 <= len(l) <= 36]
    if len(td1) >= 2:
        l1, l2 = td1[0], td1[1]
        l3 = td1[2] if len(td1) > 2 else ""

        # Line 1: document type, issuing country, document number.
        result['doc_type'] = clean_field(l1[0:2])
        result['country_code'] = clean_field(l1[2:5])
        result['doc_number'] = clean_field(l1[5:14])

        # Line 2: DOB, sex, expiry, nationality (indexes 6 and 14 are the
        # check digits and are skipped).
        result['dob'] = decode_date(l2[0:6], is_dob=True)
        result['sex'] = decode_sex(l2[7])
        result['expiry'] = decode_date(l2[8:14], is_dob=False)
        result['nationality'] = clean_field(l2[15:18])

        # Line 3: holder name, when that line was recognised.
        if l3:
            result['name'] = decode_name(re.sub(r'<+$', '', l3))

        result['mrz_format'] = 'TD1'
        return result

    # TD3: 2 lines, accepted at 40-48 chars each.
    td3 = [l for l in lines if 40 <= len(l) <= 48]
    if len(td3) >= 2:
        l1, l2 = td3[0], td3[1]

        # Line 1: document type, issuing country, then the name field.
        result['doc_type'] = clean_field(l1[0:2])
        result['country_code'] = clean_field(l1[2:5])
        result['name'] = decode_name(l1[5:44])

        # Line 2: document number, nationality, DOB, sex, expiry (indexes
        # 9 and 19 are the check digits and are skipped).
        result['doc_number'] = clean_field(l2[0:9])
        result['nationality'] = clean_field(l2[10:13])
        result['dob'] = decode_date(l2[13:19], is_dob=True)
        result['sex'] = decode_sex(l2[20])
        result['expiry'] = decode_date(l2[21:27], is_dob=False)

        result['mrz_format'] = 'TD3'
        return result

    return {}
683
+
684
  # ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
685
 
686
  def run_step2_structure(model, processor, metadata: dict, device,
 
808
  full_output = ""
809
  front_result = ""
810
  back_result = ""
811
+ front_meta_saved = {} # โ† NEW: save for MRZ parsing
812
+ back_meta_saved = {} # โ† NEW: save for MRZ parsing
813
 
814
  # ===== FRONT CARD =====
815
  if image_front is not None:
 
875
  back_result = buffer_b
876
  thread_b.join()
877
 
878
+ # ===== MRZ PYTHON PARSE (authoritative) =====
879
+ # โ† NEW BLOCK: Try back card first (MRZ usually on back), then front
880
+ mrz_data = {}
881
+ if back_meta_saved:
882
+ mrz_data = parse_mrz_lines(back_meta_saved.get('original_text', ''))
883
+ if not mrz_data and front_meta_saved:
884
+ mrz_data = parse_mrz_lines(front_meta_saved.get('original_text', ''))
885
+
886
+ if mrz_data:
887
+ full_output += f"\n\n> โœ… **MRZ Python-parsed successfully** ({mrz_data.get('mrz_format','?')} format) โ€” ground truth applied to summary below.\n"
888
+
889
  # ===== UNIFIED SUMMARY (only when both sides uploaded) =====
890
  if image_front is not None and image_back is not None:
891
  full_output += "\n\n---\n\n"