import copy
import json
import logging
import re
from typing import Any, Dict, Optional

try:
    from mrz.checker.td1 import TD1CodeChecker
    from mrz.checker.td2 import TD2CodeChecker
    from mrz.checker.td3 import TD3CodeChecker
    from mrz.checker.mrva import MRVACodeChecker
    from mrz.checker.mrvb import MRVBCodeChecker
    HAS_MRZ = True
except ImportError:
    HAS_MRZ = False
    logging.warning("mrz library not found. MRZ validation will be limited.")


# A "{ ... }" span anywhere in the text; only used to tell "no JSON at all"
# apart from "something brace-delimited that does not parse".
_JSON_SPAN_RE = re.compile(r'\{.*\}', re.DOTALL)
# Markdown code fences, with or without the "json" language tag.
_MARKDOWN_FENCE_RE = re.compile(r'```(?:json)?\s*')


def _decode_leading_object(text: str) -> Optional[Dict[str, Any]]:
    """Decode the JSON object that starts at the first ``{`` of *text*.

    Unlike ``json.loads`` on a regex span, ``raw_decode`` stops at the end of
    the object, so trailing prose (even prose containing ``}``) is ignored.
    Returns ``None`` when there is no ``{`` or the object does not parse.
    """
    start = text.find("{")
    if start == -1:
        return None
    try:
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def clean_json_output(text: str) -> Dict[str, Any]:
    """
    Extracts the first JSON object from a string, handling markdown fences
    and extra text before or after the object.

    Args:
        text: Raw text that is expected to contain a JSON object.

    Returns:
        The decoded object on success. Otherwise a dict with an ``"error"``
        message (``"No JSON found"`` when the text contains no ``{...}`` span,
        ``"Invalid JSON format"`` when a span exists but cannot be decoded)
        and the unmodified input under ``"raw_text"``.
    """
    if not isinstance(text, str) or not _JSON_SPAN_RE.search(text):
        return {"error": "No JSON found", "raw_text": text}

    parsed = _decode_leading_object(text)
    if parsed is None:
        # Fallback: drop markdown fences in case one ended up inside the
        # object itself, then try again.
        parsed = _decode_leading_object(_MARKDOWN_FENCE_RE.sub("", text))
    if parsed is not None:
        return parsed

    return {"error": "Invalid JSON format", "raw_text": text}


def parse_mrz(mrz_lines: list) -> Dict[str, Any]:
    """
    Parses MRZ lines using the mrz library if available.

    The document format is chosen from the number of lines and the length of
    the first line:
        TD1: 3 lines, 30 chars
        TD2 / MRV-B: 2 lines, 36 chars
        TD3 / MRV-A: 2 lines, 44 chars
    Two-line documents whose first character is ``V`` are treated as visas
    (MRV-A / MRV-B); all other two-line documents as TD3 / TD2.

    Args:
        mrz_lines: The MRZ text lines. Whitespace inside a line is removed;
            blank and non-string entries are skipped.

    Returns:
        A dict with the keys document_number, expiry_date, date_of_birth,
        nationality, sex, surname, given_names, issuing_country and
        mrz_valid. An empty dict is returned when the mrz library is missing,
        the input is empty, the format is not recognised, the checker
        evaluates falsy, or parsing raises.
    """
    if not mrz_lines or not HAS_MRZ:
        return {}

    # Remove every kind of whitespace (not just spaces) so that stray tabs or
    # trailing newlines do not break the length-based format detection.
    clean_lines = [
        "".join(line.split())
        for line in mrz_lines
        if isinstance(line, str) and line.strip()
    ]
    if not clean_lines:
        return {}

    line_count = len(clean_lines)
    width = len(clean_lines[0])
    is_visa = clean_lines[0].startswith("V")

    try:
        checker_cls = None
        if line_count == 3 and width == 30:
            checker_cls = TD1CodeChecker
        elif line_count == 2 and width == 44:
            checker_cls = MRVACodeChecker if is_visa else TD3CodeChecker
        elif line_count == 2 and width == 36:
            checker_cls = MRVBCodeChecker if is_visa else TD2CodeChecker

        if checker_cls is None:
            return {}

        checker = checker_cls("\n".join(clean_lines))
        # NOTE(review): the truthiness test on the checker is kept from the
        # original code. With the mrz library this appears to reflect the
        # validation result, so MRZs that fail validation yield {} - confirm
        # before relaxing, since callers treat the returned data as trusted.
        if not checker:
            return {}

        fields = checker.fields()
        if not fields:
            return {}

        return {
            "document_number": fields.document_number,
            "expiry_date": fields.expiry_date,
            "date_of_birth": fields.birth_date,
            "nationality": fields.nationality,
            "sex": fields.sex,
            "surname": fields.surname,
            "given_names": fields.name,
            "issuing_country": fields.country,
            "mrz_valid": bool(checker),
        }
    except Exception as e:
        # Best effort: the third-party checker can raise on malformed input;
        # log it and fall through to the empty result.
        logging.error("MRZ Parsing Error: %s", e)

    return {}


def _has_value(field: Any) -> bool:
    """Return True when *field* already carries usable data.

    Dict fields are judged by their ``"value"`` entry; anything else (plain
    strings, lists, ...) by its own truthiness.
    """
    if isinstance(field, dict):
        return bool(field.get("value"))
    return bool(field)


def _set_field_value(
    merged: Dict[str, Any],
    key: str,
    value: Any,
    confidence: Optional[float] = None,
) -> None:
    """Write *value* (and optionally *confidence*) into ``merged[key]``.

    Creates the field dict when it is missing or is not a dict.
    """
    field = merged.get(key)
    if not isinstance(field, dict):
        field = {}
        merged[key] = field
    field["value"] = value
    if confidence is not None:
        field["confidence"] = confidence


def merge_results(front_data: Dict, back_data: Dict) -> Dict:
    """
    Merges data from Front and Back scans.

    Strategy:
    1. If MRZ exists (usually Back), use MRZ values for core fields.
    2. Document Number from Front often has better OCR than Back (non-MRZ).
    3. Address usually on Back.

    Args:
        front_data: Extracted fields of the front scan. ``None`` is treated
            as an empty scan.
        back_data: Extracted fields of the back scan, optionally containing
            ``"mrz_lines"``. ``None`` is treated as an empty scan.

    Returns:
        A new dict; neither input is modified. When the MRZ parses, the result
        holds the parsed data under ``"mrz_parsed"`` and the MRZ values replace
        document_number (confidence 1.0), date_of_birth and expiry_date.
        Every back-scan key that is missing or empty in the front scan is
        copied over.
    """
    # Deep copy: the MRZ overrides below write into nested field dicts and
    # must not leak into the caller's front_data.
    merged = copy.deepcopy(front_data) if front_data else {}
    back_data = back_data or {}

    # If Back has MRZ, prioritize it for verification
    mrz_lines = back_data.get("mrz_lines")
    if mrz_lines:
        mrz_data = parse_mrz(mrz_lines)
        if mrz_data:
            merged["mrz_parsed"] = mrz_data

            # Overwrite potential hallucinations with strict MRZ data
            if mrz_data.get("document_number"):
                # MRZ trusted
                _set_field_value(
                    merged,
                    "document_number",
                    mrz_data["document_number"],
                    confidence=1.0,
                )
            if mrz_data.get("date_of_birth"):
                _set_field_value(
                    merged, "date_of_birth", mrz_data["date_of_birth"]
                )
            if mrz_data.get("expiry_date"):
                _set_field_value(
                    merged, "expiry_date", mrz_data["expiry_date"]
                )

    # Merge other fields from back if missing in front
    for key, val in back_data.items():
        if not _has_value(merged.get(key)):
            merged[key] = copy.deepcopy(val)

    return merged