File size: 4,579 Bytes
ed1108a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124

import re
import json
import logging
from typing import Dict, Any, Optional

# Optional dependency: the `mrz` package supplies the per-format MRZ checkers.
# HAS_MRZ records whether the import succeeded; parse_mrz() returns an empty
# dict when it is False, so the rest of the module keeps working without it.
try:
    from mrz.checker.td1 import TD1CodeChecker
    from mrz.checker.td2 import TD2CodeChecker
    from mrz.checker.td3 import TD3CodeChecker
    from mrz.checker.mrva import MRVACodeChecker
    from mrz.checker.mrvb import MRVBCodeChecker
    HAS_MRZ = True
except ImportError:
    HAS_MRZ = False
    # Emitted once, at import time, on the root logger.
    logging.warning("mrz library not found. MRZ validation will be limited.")

def clean_json_output(text: str) -> Dict[str, Any]:
    """
    Extract the first valid JSON object embedded in ``text``.

    The text may contain markdown code fences, leading/trailing prose, or
    stray braces after the JSON payload. Every ``{`` is tried in order as a
    potential start of an object and the first one that decodes to a dict is
    returned, so trailing text containing ``}`` no longer breaks parsing.

    Args:
        text: Raw model/OCR output that is expected to contain a JSON object.

    Returns:
        The decoded JSON object, or an error dict of the form
        ``{"error": <message>, "raw_text": <text>}`` where the message is
        ``"No JSON found"`` (no ``{...}`` span at all) or
        ``"Invalid JSON format"`` (braces present but nothing decodes).
    """
    if not isinstance(text, str):
        # Nothing to scan; report it the same way as text without JSON.
        return {"error": "No JSON found", "raw_text": text}

    # Markdown fences are stripped for a second pass and for the error payload
    # (the "Invalid JSON format" result has always carried the fence-free text).
    fence_free = re.sub(r'```\s*', '', re.sub(r'```json\s*', '', text))

    decoder = json.JSONDecoder()
    for candidate in (text, fence_free):
        start = candidate.find('{')
        while start != -1:
            try:
                # raw_decode stops at the end of the first complete value and
                # ignores whatever follows it.
                parsed, _ = decoder.raw_decode(candidate, start)
            except ValueError:  # json.JSONDecodeError is a ValueError
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            start = candidate.find('{', start + 1)

    if re.search(r'\{.*\}', text, re.DOTALL) is None:
        return {"error": "No JSON found", "raw_text": text}
    return {"error": "Invalid JSON format", "raw_text": fence_free}

def parse_mrz(mrz_lines: list) -> Dict[str, Any]:
    """
    Parse MRZ lines with the ``mrz`` library, if it is available.

    The document format is chosen from the number of lines and the length of
    the first line:

    * 3 lines x 30 chars -> TD1
    * 2 lines x 36 chars -> TD2, or MRV-B when the document code starts with "V"
    * 2 lines x 44 chars -> TD3, or MRV-A when the document code starts with "V"

    Args:
        mrz_lines: The MRZ text lines (a single newline-separated string is
            also accepted). All whitespace inside a line is removed and
            empty or non-string entries are ignored.

    Returns:
        A dict with the keys ``document_number``, ``expiry_date``,
        ``date_of_birth``, ``nationality``, ``sex``, ``surname``,
        ``given_names``, ``issuing_country`` and ``mrz_valid``; or an empty
        dict when the library is missing, the layout is not recognised, the
        checker evaluates falsy, or parsing raises.
    """
    if not mrz_lines or not HAS_MRZ:
        return {}

    if isinstance(mrz_lines, str):
        mrz_lines = mrz_lines.splitlines()

    # Drop every whitespace character (OCR output often carries tabs or
    # trailing newlines, not only spaces) and skip empty/non-text entries.
    clean_lines = []
    for line in mrz_lines:
        if not isinstance(line, str):
            continue
        compact = "".join(line.split())
        if compact:
            clean_lines.append(compact)
    if not clean_lines:
        return {}

    line_count = len(clean_lines)
    width = len(clean_lines[0])
    # Visas share their line geometry with TD2/TD3 documents, so the leading
    # document code is what tells them apart.
    is_visa = clean_lines[0].startswith("V")
    mrz_code = "\n".join(clean_lines)

    try:
        checker = None
        if line_count == 3 and width == 30:
            checker = TD1CodeChecker(mrz_code)
        elif line_count == 2 and width == 44:
            checker = MRVACodeChecker(mrz_code) if is_visa else TD3CodeChecker(mrz_code)
        elif line_count == 2 and width == 36:
            checker = MRVBCodeChecker(mrz_code) if is_visa else TD2CodeChecker(mrz_code)

        # NOTE(review): the truthiness of the checker gates the result, as it
        # always has; presumably bool(checker) is the library's overall
        # validity verdict, so failing MRZs yield {} - confirm against mrz docs.
        fields = checker.fields() if checker else None
        if fields:
            return {
                "document_number": fields.document_number,
                "expiry_date": fields.expiry_date,
                "date_of_birth": fields.birth_date,
                "nationality": fields.nationality,
                "sex": fields.sex,
                "surname": fields.surname,
                "given_names": fields.name,
                "issuing_country": fields.country,
                "mrz_valid": bool(checker)
            }
    except Exception as e:
        # Best effort: a malformed MRZ must not break the calling pipeline.
        logging.error(f"MRZ Parsing Error: {e}")

    return {}

def _entry_has_value(entry: Any) -> bool:
    """Return True when a merged entry already carries usable data."""
    if isinstance(entry, dict):
        return bool(entry.get("value"))
    # Non-dict entries (lists, strings, None) count as present when truthy.
    return bool(entry)


def _set_field_value(merged: Dict, key: str, value: Any,
                     confidence: Optional[float] = None) -> None:
    """Set ``merged[key]["value"]`` on a fresh copy of the entry."""
    current = merged.get(key)
    # Copy so the caller's front-scan dict is never mutated; start from an
    # empty entry when the field is missing or is not a dict.
    entry = dict(current) if isinstance(current, dict) else {}
    entry["value"] = value
    if confidence is not None:
        entry["confidence"] = confidence
    merged[key] = entry


def merge_results(front_data: Dict, back_data: Dict) -> Dict:
    """
    Merge the extraction results of the front and back scans.

    Strategy:
    1. Start from the front-scan data.
    2. If the back scan has ``mrz_lines`` that parse successfully, store the
       parsed MRZ under ``mrz_parsed`` and let it overwrite the ``value`` of
       ``document_number`` (confidence set to 1.0), ``date_of_birth`` and
       ``expiry_date``.
    3. Copy every back-scan field that is missing or empty in the result.

    Args:
        front_data: Field dict from the front scan; entries are normally
            dicts with a ``"value"`` key. ``None`` is treated as empty.
        back_data: Field dict from the back scan, optionally containing
            ``"mrz_lines"``. ``None`` is treated as empty.

    Returns:
        A new merged dict. Neither input dict nor the field entries of
        ``front_data`` are modified.
    """
    front_data = front_data or {}
    back_data = back_data or {}
    merged = dict(front_data)

    # If the back scan has an MRZ, prioritize it for verification.
    mrz_lines = back_data.get("mrz_lines")
    if mrz_lines:
        mrz_data = parse_mrz(mrz_lines)
        if mrz_data:
            merged["mrz_parsed"] = mrz_data
            # Overwrite potential OCR hallucinations with strict MRZ data.
            if mrz_data.get("document_number"):
                _set_field_value(
                    merged, "document_number", mrz_data["document_number"],
                    confidence=1.0,  # MRZ trusted
                )
            if mrz_data.get("date_of_birth"):
                _set_field_value(merged, "date_of_birth", mrz_data["date_of_birth"])
            if mrz_data.get("expiry_date"):
                _set_field_value(merged, "expiry_date", mrz_data["expiry_date"])

    # Fill in fields from the back scan that the front scan did not provide.
    for key, val in back_data.items():
        if key not in merged or not _entry_has_value(merged[key]):
            merged[key] = val

    return merged