Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -203,7 +203,32 @@ YOUR TASKS:
|
|
| 203 |
1. If text is non-English → translate to English with 95%+ accuracy
|
| 204 |
2. If text is already English → copy as-is
|
| 205 |
3. Extract all key KYC fields
|
| 206 |
-
4. Output EXACTLY in the format below
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
---
|
| 209 |
|
|
@@ -221,11 +246,14 @@ YOUR TASKS:
|
|
| 221 |
|
| 222 |
{raw_text}
|
| 223 |
|
|
|
|
| 224 |
---
|
| 225 |
|
| 226 |
## ๐ English Translation
|
|
|
|
| 227 |
[Write complete English translation here. If already English, write: Already in English — then copy text]
|
| 228 |
|
|
|
|
| 229 |
---
|
| 230 |
|
| 231 |
## 🗂️ Key Fields (English)
|
|
@@ -234,7 +262,7 @@ YOUR TASKS:
|
|
| 234 |
|-------|-------|
|
| 235 |
| ๐ Document Type | |
|
| 236 |
| 👤 Full Name | |
|
| 237 |
-
| 🔢
|
| 238 |
| ๐ Date of Birth | |
|
| 239 |
| 📅 Issue Date | |
|
| 240 |
| ⏳ Expiry Date | |
|
|
@@ -247,7 +275,9 @@ YOUR TASKS:
|
|
| 247 |
---
|
| 248 |
|
| 249 |
## ๐ MRZ Data
|
| 250 |
-
|
|
|
|
|
|
|
| 251 |
|
| 252 |
**Parsed MRZ:**
|
| 253 |
| Field | Value |
|
|
@@ -553,6 +583,104 @@ def parse_step1_output(raw_output: str) -> dict:
|
|
| 553 |
return result
|
| 554 |
|
| 555 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
# ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
|
| 557 |
|
| 558 |
def run_step2_structure(model, processor, metadata: dict, device,
|
|
@@ -680,6 +808,8 @@ def generate_dual_card_ocr(model_name: str, text: str,
|
|
| 680 |
full_output = ""
|
| 681 |
front_result = ""
|
| 682 |
back_result = ""
|
|
|
|
|
|
|
| 683 |
|
| 684 |
# ===== FRONT CARD =====
|
| 685 |
if image_front is not None:
|
|
@@ -745,6 +875,17 @@ def generate_dual_card_ocr(model_name: str, text: str,
|
|
| 745 |
back_result = buffer_b
|
| 746 |
thread_b.join()
|
| 747 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 748 |
# ===== UNIFIED SUMMARY (only when both sides uploaded) =====
|
| 749 |
if image_front is not None and image_back is not None:
|
| 750 |
full_output += "\n\n---\n\n"
|
|
|
|
| 203 |
1. If text is non-English → translate to English with 95%+ accuracy
|
| 204 |
2. If text is already English → copy as-is
|
| 205 |
3. Extract all key KYC fields
|
| 206 |
+
4. Output EXACTLY in the format below
|
| 207 |
+
|
| 208 |
+
⚠️ CRITICAL EXTRACTION RULES — READ BEFORE EXTRACTING:
|
| 209 |
+
|
| 210 |
+
RULE 1 — COUNTRY/INSTITUTION vs PERSON NAME:
|
| 211 |
+
- Text appearing at the TOP of ID cards like "Sultanate of Oman", "SULTANATE OF OMAN",
|
| 212 |
+
"Republic of India", "United Arab Emirates", "ROYAL OMAN POLICE" etc. is the
|
| 213 |
+
ISSUING COUNTRY or INSTITUTION NAME — THIS IS NOT THE PERSON'S NAME
|
| 214 |
+
- Extract person's name ONLY from explicit name labels:
|
| 215 |
+
الإسم / الاسم (Arabic) | NAME: | 姓名 (Chinese) | नाम (Hindi) | ИМЯ (Russian)
|
| 216 |
+
- In MRZ: TD1 Line 3 = person's name (e.g., FERIL<SUNNA = "Feril Sunna")
|
| 217 |
+
|
| 218 |
+
RULE 2 — CIVIL ID vs BARCODE/CHIP ID:
|
| 219 |
+
- Long hex strings printed on barcodes/chips (e.g., 7E400DD3D032A7C) are card
|
| 220 |
+
SERIAL/CHIP numbers — NOT the Civil ID
|
| 221 |
+
- The actual Civil/Document ID is under labels:
|
| 222 |
+
الرقم المدني (Civil No.) | رقم الهوية (ID No.) | ID NO. | CIVIL NO.
|
| 223 |
+
- Actual Civil ID is typically 8-12 alphanumeric characters (e.g., 73616576)
|
| 224 |
+
|
| 225 |
+
RULE 3 — MRZ IS GROUND TRUTH (do not override it):
|
| 226 |
+
- MRZ lines (uppercase A-Z, 0-9, < characters) are cryptographically verified
|
| 227 |
+
- MRZ date format is YYMMDD: first 2 = year, middle 2 = month, last 2 = day
|
| 228 |
+
Example: 030512 = year 03 → 2003, month 05, day 12 → 12/05/2003
|
| 229 |
+
Example: 260908 = year 26 → 2026, month 09, day 08 → 08/09/2026
|
| 230 |
+
- MRZ Sex: M = Male, F = Female
|
| 231 |
+
- If MRZ present, extract name/DOB/sex/expiry/nationality FROM MRZ LINES, not from visual text
|
| 232 |
|
| 233 |
---
|
| 234 |
|
|
|
|
| 246 |
|
| 247 |
{raw_text}
|
| 248 |
|
| 249 |
+
|
| 250 |
---
|
| 251 |
|
| 252 |
## ๐ English Translation
|
| 253 |
+
|
| 254 |
[Write complete English translation here. If already English, write: Already in English — then copy text]
|
| 255 |
|
| 256 |
+
|
| 257 |
---
|
| 258 |
|
| 259 |
## 🗂️ Key Fields (English)
|
|
|
|
| 262 |
|-------|-------|
|
| 263 |
| ๐ Document Type | |
|
| 264 |
| 👤 Full Name | |
|
| 265 |
+
| 🔢 Civil / Document Number | |
|
| 266 |
| ๐ Date of Birth | |
|
| 267 |
| 📅 Issue Date | |
|
| 268 |
| ⏳ Expiry Date | |
|
|
|
|
| 275 |
---
|
| 276 |
|
| 277 |
## ๐ MRZ Data
|
| 278 |
+
|
| 279 |
+
[Raw MRZ lines here — copy exactly as-is. If not present write: NOT PRESENT]
|
| 280 |
+
|
| 281 |
|
| 282 |
**Parsed MRZ:**
|
| 283 |
| Field | Value |
|
|
|
|
| 583 |
return result
|
| 584 |
|
| 585 |
|
| 586 |
+
def parse_mrz_lines(raw_text: str) -> dict:
    """Parse machine-readable-zone (MRZ) lines found in OCR text.

    Each input line has all whitespace removed and is kept as an MRZ
    candidate when it is at least 20 characters drawn from ``A-Z``,
    ``0-9`` and ``<``.  Candidates made of letters only are discarded:
    once spaces are stripped, printed headers such as
    "SULTANATE OF OMAN ROYAL OMAN POLICE" would otherwise look like an
    MRZ line and shift every real line by one position.

    Two layouts are recognised, tried in this order:

    * TD1 (ID cards): at least two candidates of 28-36 characters; the
      third one, when present, carries the holder's name.
    * TD3 (passports): at least two candidates of 40-48 characters.

    Args:
        raw_text: OCR text, one physical line per ``\\n``.  Empty or
            ``None`` input yields an empty dict.

    Returns:
        A dict with the keys ``doc_type``, ``country_code``,
        ``doc_number``, ``dob``, ``sex``, ``expiry``, ``nationality``,
        ``mrz_format`` and, when a name line is available, ``name``.
        Dates are formatted ``DD/MM/YYYY``; a date that is numeric but
        not a real calendar date is reported as ``"Invalid (YYMMDD)"``
        and a non-numeric date field is returned unchanged.  An empty
        dict is returned when no supported MRZ layout is found.
    """
    # Local imports keep the function self-contained.
    import datetime
    import re

    if not raw_text:
        return {}

    def decode_date(yymmdd: str, is_dob: bool = False) -> str:
        """Convert an MRZ ``YYMMDD`` field to ``DD/MM/YYYY``."""
        try:
            yy = int(yymmdd[0:2])
            mm = int(yymmdd[2:4])
            dd = int(yymmdd[4:6])
        except ValueError:
            # OCR noise or filler in the field: hand back the raw text.
            return yymmdd
        if not (1 <= mm <= 12 and 1 <= dd <= 31):
            return f"Invalid ({yymmdd})"
        # MRZ years have two digits.  A birth year greater than the current
        # two-digit year must be in the 1900s; everything else (including
        # all expiry dates) is read as 20YY.
        current_yy = datetime.datetime.now().year % 100
        year = (1900 + yy) if (is_dob and yy > current_yy) else (2000 + yy)
        try:
            datetime.date(year, mm, dd)
        except ValueError:
            # In range per field but not a real day, e.g. 31 February.
            return f"Invalid ({yymmdd})"
        return f"{dd:02d}/{mm:02d}/{year}"

    def clean_field(field: str) -> str:
        """Drop trailing ``<`` filler and turn inner ``<`` into spaces."""
        return field.rstrip('<').replace('<', ' ').strip()

    def decode_sex(sex_char: str) -> str:
        """Map the MRZ sex character to a word; unknown codes pass through."""
        if sex_char == 'M':
            return 'Male'
        if sex_char == 'F':
            return 'Female'
        return sex_char

    def format_name(section: str) -> str:
        """Turn ``SURNAME<<GIVEN<NAMES`` into ``GIVEN NAMES SURNAME``."""
        trimmed = section.rstrip('<')
        if '<<' not in trimmed:
            return trimmed.replace('<', ' ').strip()
        parts = trimmed.split('<<')
        surname = parts[0].replace('<', ' ').strip()
        given = parts[1].replace('<', ' ').strip()
        return f"{given} {surname}".strip() if given else surname

    candidates = []
    for raw_line in raw_text.split('\n'):
        compact = re.sub(r'\s+', '', raw_line)
        if not re.fullmatch(r'[A-Z0-9<]{20,}', compact):
            continue
        if compact.isalpha():
            # Letters only (no digit, no '<' filler): printed text, not MRZ.
            continue
        candidates.append(compact)

    if not candidates:
        return {}

    # TD1: three lines of ~30 characters.  Two lines are accepted so that a
    # missed name line does not discard the remaining fields.  Every
    # candidate here has at least 28 characters, so the fixed slices below
    # are always in range.
    td1 = [c for c in candidates if 28 <= len(c) <= 36]
    if len(td1) >= 2:
        line1, line2 = td1[0], td1[1]
        result = {
            'doc_type': clean_field(line1[0:2]),
            'country_code': clean_field(line1[2:5]),
            'doc_number': clean_field(line1[5:14]),
            'dob': decode_date(line2[0:6], is_dob=True),
            'sex': decode_sex(line2[7]),
            'expiry': decode_date(line2[8:14], is_dob=False),
            'nationality': clean_field(line2[15:18]),
        }
        if len(td1) > 2:
            result['name'] = format_name(td1[2])
        result['mrz_format'] = 'TD1'
        return result

    # TD3: two lines of ~44 characters; the name shares line 1 with the
    # document type and issuing state.
    td3 = [c for c in candidates if 40 <= len(c) <= 48]
    if len(td3) >= 2:
        line1, line2 = td3[0], td3[1]
        return {
            'doc_type': clean_field(line1[0:2]),
            'country_code': clean_field(line1[2:5]),
            'name': format_name(line1[5:44]),
            'doc_number': clean_field(line2[0:9]),
            'nationality': clean_field(line2[10:13]),
            'dob': decode_date(line2[13:19], is_dob=True),
            'sex': decode_sex(line2[20]),
            'expiry': decode_date(line2[21:27], is_dob=False),
            'mrz_format': 'TD3',
        }

    return {}
|
| 683 |
+
|
| 684 |
# ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
|
| 685 |
|
| 686 |
def run_step2_structure(model, processor, metadata: dict, device,
|
|
|
|
| 808 |
full_output = ""
|
| 809 |
front_result = ""
|
| 810 |
back_result = ""
|
| 811 |
+
front_meta_saved = {} # โ NEW: save for MRZ parsing
|
| 812 |
+
back_meta_saved = {} # โ NEW: save for MRZ parsing
|
| 813 |
|
| 814 |
# ===== FRONT CARD =====
|
| 815 |
if image_front is not None:
|
|
|
|
| 875 |
back_result = buffer_b
|
| 876 |
thread_b.join()
|
| 877 |
|
| 878 |
+
# ===== MRZ PYTHON PARSE (authoritative) =====
|
| 879 |
+
# โ NEW BLOCK: Try back card first (MRZ usually on back), then front
|
| 880 |
+
mrz_data = {}
|
| 881 |
+
if back_meta_saved:
|
| 882 |
+
mrz_data = parse_mrz_lines(back_meta_saved.get('original_text', ''))
|
| 883 |
+
if not mrz_data and front_meta_saved:
|
| 884 |
+
mrz_data = parse_mrz_lines(front_meta_saved.get('original_text', ''))
|
| 885 |
+
|
| 886 |
+
if mrz_data:
|
| 887 |
+
full_output += f"\n\n> โ
**MRZ Python-parsed successfully** ({mrz_data.get('mrz_format','?')} format) โ ground truth applied to summary below.\n"
|
| 888 |
+
|
| 889 |
# ===== UNIFIED SUMMARY (only when both sides uploaded) =====
|
| 890 |
if image_front is not None and image_back is not None:
|
| 891 |
full_output += "\n\n---\n\n"
|