Spaces:
Running on Zero
Running on Zero
| import re | |
| import json | |
| import logging | |
| from typing import Dict, Any, Optional | |
| try: | |
| from mrz.checker.td1 import TD1CodeChecker | |
| from mrz.checker.td2 import TD2CodeChecker | |
| from mrz.checker.td3 import TD3CodeChecker | |
| from mrz.checker.mrva import MRVACodeChecker | |
| from mrz.checker.mrvb import MRVBCodeChecker | |
| HAS_MRZ = True | |
| except ImportError: | |
| HAS_MRZ = False | |
| logging.warning("mrz library not found. MRZ validation will be limited.") | |
def _first_json_object(text: str) -> Optional[Dict[str, Any]]:
    """Return the first substring of *text* that decodes as a JSON object.

    Each ``{`` is tried in order as a potential start. ``raw_decode`` stops at
    the end of the object it parsed, so trailing prose, markdown fences or
    further braces after the object do not matter. Returns ``None`` when no
    position yields a valid object.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a valid object at this brace; move on to the next one.
            start = text.find("{", start + 1)
            continue
        return obj
    return None


def clean_json_output(text: str) -> Dict[str, Any]:
    """
    Extracts the first valid JSON object from a string, handling markdown
    fences and extra text before or after the object.

    Args:
        text: Raw text (e.g. model output) that may embed a JSON object.

    Returns:
        The decoded JSON object on success. Otherwise an error dict:
        ``{"error": "No JSON found", "raw_text": text}`` when the text holds
        no ``{ ... }`` span at all (or is not a string), or
        ``{"error": "Invalid JSON format", "raw_text": <fence-stripped text>}``
        when a brace span exists but nothing in it decodes.
    """
    if not isinstance(text, str):
        return {"error": "No JSON found", "raw_text": text}

    # Same condition as matching r'\{.*\}' with DOTALL: an opening brace
    # followed somewhere later by a closing brace.
    first_open = text.find("{")
    if first_open == -1 or text.rfind("}") < first_open:
        return {"error": "No JSON found", "raw_text": text}

    # Try the untouched text first so a literal ``` inside a JSON string value
    # is never corrupted; only then fall back to the fence-stripped variant.
    stripped = re.sub(r'```json\s*', '', text)
    stripped = re.sub(r'```\s*', '', stripped)
    for candidate in (text, stripped):
        parsed = _first_json_object(candidate)
        if parsed is not None:
            return parsed

    return {"error": "Invalid JSON format", "raw_text": stripped}
def parse_mrz(mrz_lines: list) -> Dict[str, Any]:
    """
    Parses MRZ lines using the mrz library if available.

    The checker is chosen from the number of lines, the length of the first
    line and the leading document code:
        TD1:   3 lines, 30 chars
        TD2:   2 lines, 36 chars
        TD3:   2 lines, 44 chars
        MRV-A: 2 lines, 44 chars, first char 'V' (visa)
        MRV-B: 2 lines, 36 chars, first char 'V' (visa)

    Args:
        mrz_lines: MRZ text lines. Whitespace inside each line is removed and
            blank or non-string entries are skipped. A single multi-line
            string is also accepted and split on line breaks.

    Returns:
        A dict with keys document_number, expiry_date, date_of_birth,
        nationality, sex, surname, given_names, issuing_country and mrz_valid,
        or an empty dict when the input is empty, the mrz library is missing,
        the layout is not recognised, the checker evaluates falsy, or parsing
        raises (the error is logged).
    """
    if not mrz_lines or not HAS_MRZ:
        return {}

    if isinstance(mrz_lines, str):
        mrz_lines = mrz_lines.splitlines()

    # Drop every whitespace character (not only spaces) so stray tabs or
    # carriage returns from OCR do not break the length-based detection.
    clean_lines = []
    for line in mrz_lines:
        if not isinstance(line, str):
            continue
        compact = "".join(line.split())
        if compact:
            clean_lines.append(compact)
    if not clean_lines:
        return {}

    line_count = len(clean_lines)
    width = len(clean_lines[0])
    # Visas share their line geometry with TD2/TD3 and differ only by the
    # leading document code 'V', so that is what selects the MRV checkers.
    is_visa = clean_lines[0].startswith("V")
    mrz_code = "\n".join(clean_lines)

    try:
        checker = None
        if line_count == 3 and width == 30:
            checker = TD1CodeChecker(mrz_code)
        elif line_count == 2 and width == 44:
            checker = MRVACodeChecker(mrz_code) if is_visa else TD3CodeChecker(mrz_code)
        elif line_count == 2 and width == 36:
            checker = MRVBCodeChecker(mrz_code) if is_visa else TD2CodeChecker(mrz_code)

        # NOTE(review): the checker's truthiness presumably reflects the
        # library's validation result; if so, "mrz_valid" below is always
        # True whenever fields are returned. Confirm against the mrz library.
        if not checker:
            return {}

        fields = checker.fields()
        if not fields:
            return {}

        return {
            "document_number": fields.document_number,
            "expiry_date": fields.expiry_date,
            "date_of_birth": fields.birth_date,
            "nationality": fields.nationality,
            "sex": fields.sex,
            "surname": fields.surname,
            "given_names": fields.name,
            "issuing_country": fields.country,
            "mrz_valid": bool(checker),
        }
    except Exception as e:
        # Best-effort parsing: a malformed MRZ must not abort the pipeline.
        logging.error(f"MRZ Parsing Error: {e}")
    return {}
def _is_blank_field(entry: Any) -> bool:
    """Tell whether an already-merged entry carries no usable value."""
    if isinstance(entry, dict):
        return not entry.get("value")
    # Plain (non-dict) entries count as blank only when None or empty.
    return entry is None or (isinstance(entry, (str, list, tuple)) and not entry)


def _override_field(merged: Dict, key: str, value: Any, confidence: Optional[float] = None) -> None:
    """Set ``merged[key]["value"]`` on a fresh dict so the caller's input is not mutated."""
    existing = merged.get(key)
    field = dict(existing) if isinstance(existing, dict) else {}
    field["value"] = value
    if confidence is not None:
        field["confidence"] = confidence
    merged[key] = field


def merge_results(front_data: Dict, back_data: Dict) -> Dict:
    """
    Merges data from Front and Back scans.

    Strategy:
    1. If the Back scan has MRZ lines that parse, store the parsed MRZ under
       "mrz_parsed" and use its document number (confidence 1.0), date of
       birth and expiry date in place of the Front values.
    2. Every other Front field is kept when it already has a value.
    3. Fields missing or blank on the Front are filled from the Back.

    Args:
        front_data: Fields extracted from the front scan; ``None`` is treated
            as empty. Field entries are expected to be dicts with a "value"
            key, but plain values (e.g. an "error" string) are tolerated.
        back_data: Fields extracted from the back scan, optionally with an
            "mrz_lines" list; ``None`` is treated as empty.

    Returns:
        A new merged dict. Neither input dict nor its nested field dicts are
        modified.
    """
    merged = dict(front_data) if front_data else {}
    back_data = back_data or {}

    # If Back has MRZ, prioritize it for verification
    mrz_lines = back_data.get("mrz_lines")
    if mrz_lines:
        mrz_data = parse_mrz(mrz_lines)
        if mrz_data:
            merged["mrz_parsed"] = mrz_data
            # Overwrite potential hallucinations with strict MRZ data
            if mrz_data.get("document_number"):
                # MRZ trusted
                _override_field(merged, "document_number", mrz_data["document_number"], confidence=1.0)
            if mrz_data.get("date_of_birth"):
                _override_field(merged, "date_of_birth", mrz_data["date_of_birth"])
            if mrz_data.get("expiry_date"):
                _override_field(merged, "expiry_date", mrz_data["expiry_date"])

    # Merge other fields from back if missing in front
    for key, val in back_data.items():
        if key not in merged or _is_blank_field(merged[key]):
            merged[key] = val
    return merged