Spaces:

Chhagan005
/

Multi_ML_OCR

Sleeping

App Files Files Community

Chhagan005 committed 10 days ago

Commit

589e015

verified ·

1 Parent(s): 6007a3e

Update app.py

Browse files

Files changed (1) hide show

app.py +144 -3

app.py CHANGED Viewed

@@ -203,7 +203,32 @@ YOUR TASKS:
 1. If text is non-English → translate to English with 95%+ accuracy
 2. If text is already English → copy as-is
 3. Extract all key KYC fields
-4. Output EXACTLY in the format below — no extra commentary
 ---
@@ -221,11 +246,14 @@ YOUR TASKS:
 {raw_text}
 ---
 ## 🌐 English Translation
 [Write complete English translation here. If already English, write: Already in English — then copy text]
 ---
 ## 🗂️ Key Fields (English)
@@ -234,7 +262,7 @@ YOUR TASKS:
 |-------|-------|
 | 📄 Document Type | |
 | 👤 Full Name | |
-| 🔢 ID / Document Number | |
 | 🎂 Date of Birth | |
 | 📅 Issue Date | |
 | ⏳ Expiry Date | |
@@ -247,7 +275,9 @@ YOUR TASKS:
 ---
 ## 🔐 MRZ Data
-[Raw MRZ lines here — if not present write: NOT PRESENT]
 **Parsed MRZ:**
 | Field | Value |
@@ -553,6 +583,104 @@ def parse_step1_output(raw_output: str) -> dict:
     return result
 # ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
 def run_step2_structure(model, processor, metadata: dict, device,
@@ -680,6 +808,8 @@ def generate_dual_card_ocr(model_name: str, text: str,
     full_output = ""
     front_result = ""
     back_result = ""
     # ===== FRONT CARD =====
     if image_front is not None:
@@ -745,6 +875,17 @@ def generate_dual_card_ocr(model_name: str, text: str,
         back_result = buffer_b
         thread_b.join()
     # ===== UNIFIED SUMMARY (only when both sides uploaded) =====
     if image_front is not None and image_back is not None:
         full_output += "\n\n---\n\n"

 1. If text is non-English → translate to English with 95%+ accuracy
 2. If text is already English → copy as-is
 3. Extract all key KYC fields
+4. Output EXACTLY in the format below
+⚠️ CRITICAL EXTRACTION RULES — READ BEFORE EXTRACTING:
+RULE 1 — COUNTRY/INSTITUTION vs PERSON NAME:
+- Text appearing at the TOP of ID cards like "Sultanate of Oman", "SULTANATE OF OMAN",
+  "Republic of India", "United Arab Emirates", "ROYAL OMAN POLICE" etc. is the
+  ISSUING COUNTRY or INSTITUTION NAME — THIS IS NOT THE PERSON'S NAME
+- Extract person's name ONLY from explicit name labels:
+  الإسم / الاسم (Arabic) | NAME: | 姓名 (Chinese) | नाम (Hindi) | ИМЯ (Russian)
+- In MRZ: TD1 Line 3 = person's name (e.g., FERIL<SUNNA = "Feril Sunna")
+RULE 2 — CIVIL ID vs BARCODE/CHIP ID:
+- Long hex strings printed on barcodes/chips (e.g., 7E400DD3D032A7C) are card
+  SERIAL/CHIP numbers — NOT the Civil ID
+- The actual Civil/Document ID is under labels:
+  الرقم المدني (Civil No.) | رقم الهوية (ID No.) | ID NO. | CIVIL NO.
+- Actual Civil ID is typically 8-12 alphanumeric characters (e.g., 73616576)
+RULE 3 — MRZ IS GROUND TRUTH (do not override it):
+- MRZ lines (uppercase A-Z, 0-9, < characters) are cryptographically verified
+- MRZ date format is YYMMDD: first 2 = year, middle 2 = month, last 2 = day
+  Example: 030512 = year 03 → 2003, month 05, day 12 → 12/05/2003
+  Example: 260908 = year 26 → 2026, month 09, day 08 → 08/09/2026
+- MRZ Sex: M = Male, F = Female
+- If MRZ present, extract name/DOB/sex/expiry/nationality FROM MRZ LINES, not from visual text
 ---
 {raw_text}
 ---
 ## 🌐 English Translation
 [Write complete English translation here. If already English, write: Already in English — then copy text]
 ---
 ## 🗂️ Key Fields (English)
 |-------|-------|
 | 📄 Document Type | |
 | 👤 Full Name | |
+| 🔢 Civil / Document Number | |
 | 🎂 Date of Birth | |
 | 📅 Issue Date | |
 | ⏳ Expiry Date | |
 ---
 ## 🔐 MRZ Data
+[Raw MRZ lines here — copy exactly as-is. If not present write: NOT PRESENT]
 **Parsed MRZ:**
 | Field | Value |
     return result
def parse_mrz_lines(raw_text: str) -> dict:
    """Parse machine-readable-zone (MRZ) lines out of OCR text.

    Each input line has all whitespace removed and is kept as an MRZ
    candidate when it consists solely of ``A-Z``, ``0-9`` and ``<`` and is at
    least 20 characters long. Two layouts are recognised, tried in this order:

    * TD1 (ID cards): at least two candidate lines of 28-36 characters.
      A third line, when present, is read as the name line.
    * TD3 (passports): at least two candidate lines of 40-48 characters.

    Args:
        raw_text: OCR text, one physical line per ``\\n``. ``None`` or an
            empty string yields an empty result.

    Returns:
        A dict with the keys ``doc_type``, ``country_code``, ``doc_number``,
        ``dob``, ``sex``, ``expiry``, ``nationality``, ``mrz_format`` and
        (when a name line is available) ``name``. Dates are formatted as
        ``DD/MM/YYYY``; a date that is out of range or does not exist on the
        calendar is reported as ``"Invalid (<yymmdd>)"``, and a date field
        that is not numeric is returned unchanged. Returns ``{}`` when no
        supported MRZ layout is found.
    """
    import datetime

    # Robustness: callers may hand over a missing/None OCR result.
    if not raw_text:
        return {}

    lines = []
    for line in raw_text.split('\n'):
        clean = re.sub(r'\s+', '', line)
        if re.match(r'^[A-Z0-9<]{20,}$', clean):
            lines.append(clean)
    if not lines:
        return {}

    def decode_date(yymmdd: str, is_dob: bool = False) -> str:
        """Convert an MRZ ``YYMMDD`` field to ``DD/MM/YYYY``."""
        try:
            yy = int(yymmdd[0:2])
            mm = int(yymmdd[2:4])
            dd = int(yymmdd[4:6])
        except ValueError:
            # Filler ('<') or OCR noise in the date field: hand it back as-is.
            return yymmdd
        if not (1 <= mm <= 12 and 1 <= dd <= 31):
            return f"Invalid ({yymmdd})"
        # Two-digit years: a birth year later than the current year must
        # belong to the previous century; everything else maps to 20YY.
        current_yy = datetime.datetime.now().year % 100
        year = (1900 + yy) if (is_dob and yy > current_yy) else (2000 + yy)
        try:
            # Reject dates that pass the range check but do not exist
            # (e.g. 31 February).
            datetime.date(year, mm, dd)
        except ValueError:
            return f"Invalid ({yymmdd})"
        return f"{dd:02d}/{mm:02d}/{year}"

    def clean_field(s: str) -> str:
        """Drop trailing ``<`` filler and turn inner ``<`` into spaces."""
        return re.sub(r'<+$', '', s).replace('<', ' ').strip()

    def decode_sex(sex_char: str) -> str:
        """Map the MRZ sex character to a word; unknown values pass through."""
        if sex_char == 'M':
            return 'Male'
        if sex_char == 'F':
            return 'Female'
        return sex_char

    def decode_name(field: str) -> str:
        """Turn ``SURNAME<<GIVEN<NAMES<<<`` into ``GIVEN NAMES SURNAME``."""
        trimmed = re.sub(r'<+$', '', field)
        if '<<' not in trimmed:
            return trimmed.replace('<', ' ').strip()
        parts = trimmed.split('<<')
        surname = parts[0].replace('<', ' ').strip()
        given = parts[1].replace('<', ' ').strip()
        return f"{given} {surname}".strip() if given else surname

    result = {}

    # TD1: 3 lines, nominally 30 chars each; 28-36 is accepted to tolerate
    # OCR length errors. Every slice below is in range because of this filter.
    # NOTE(review): a 36-char two-line TD2 document would also land here and
    # be read with the TD1 layout — confirm whether TD2 must be supported.
    td1 = [l for l in lines if 28 <= len(l) <= 36]
    if len(td1) >= 2:
        l1, l2 = td1[0], td1[1]
        l3 = td1[2] if len(td1) > 2 else ""
        result['doc_type'] = clean_field(l1[0:2])
        result['country_code'] = clean_field(l1[2:5])
        result['doc_number'] = clean_field(l1[5:14])
        # Line 2 layout: DOB(0-5) check(6) sex(7) expiry(8-13) check(14)
        # nationality(15-17).
        result['dob'] = decode_date(l2[0:6], is_dob=True)
        result['sex'] = decode_sex(l2[7])
        result['expiry'] = decode_date(l2[8:14], is_dob=False)
        result['nationality'] = clean_field(l2[15:18])
        if l3:
            result['name'] = decode_name(l3)
        result['mrz_format'] = 'TD1'
        return result

    # TD3: 2 lines, nominally 44 chars each; 40-48 is accepted to tolerate
    # OCR length errors.
    td3 = [l for l in lines if 40 <= len(l) <= 48]
    if len(td3) >= 2:
        l1, l2 = td3[0], td3[1]
        result['doc_type'] = clean_field(l1[0:2])
        result['country_code'] = clean_field(l1[2:5])
        result['name'] = decode_name(l1[5:44])
        # Line 2 layout: number(0-8) check(9) nationality(10-12) DOB(13-18)
        # check(19) sex(20) expiry(21-26).
        result['doc_number'] = clean_field(l2[0:9])
        result['nationality'] = clean_field(l2[10:13])
        result['dob'] = decode_date(l2[13:19], is_dob=True)
        result['sex'] = decode_sex(l2[20])
        result['expiry'] = decode_date(l2[21:27], is_dob=False)
        result['mrz_format'] = 'TD3'
        return result

    return {}
 # ===== STEP 2: TRANSLATE + STRUCTURE (STREAMING) =====
 def run_step2_structure(model, processor, metadata: dict, device,
     full_output = ""
     front_result = ""
     back_result = ""
+    front_meta_saved = {}   # ← NEW: save for MRZ parsing
+    back_meta_saved = {}    # ← NEW: save for MRZ parsing
     # ===== FRONT CARD =====
     if image_front is not None:
         back_result = buffer_b
         thread_b.join()
+ # ===== MRZ PYTHON PARSE (authoritative) =====
+    # ← NEW BLOCK: Try back card first (MRZ usually on back), then front
+    mrz_data = {}
+    if back_meta_saved:
+        mrz_data = parse_mrz_lines(back_meta_saved.get('original_text', ''))
+    if not mrz_data and front_meta_saved:
+        mrz_data = parse_mrz_lines(front_meta_saved.get('original_text', ''))
+    if mrz_data:
+        full_output += f"\n\n> ✅ **MRZ Python-parsed successfully** ({mrz_data.get('mrz_format','?')} format) — ground truth applied to summary below.\n"
     # ===== UNIFIED SUMMARY (only when both sides uploaded) =====
     if image_front is not None and image_back is not None:
         full_output += "\n\n---\n\n"