Spaces:

Chhagan005
/

Chhagan-DocVL-Demo

Sleeping

File size: 4,579 Bytes

ed1108a


import copy
import json
import logging
import re
from typing import Dict, Any, Optional

# Optional dependency: the `mrz` package provides the check-digit validators
# used by parse_mrz(). When it cannot be imported, HAS_MRZ is set to False and
# a warning is logged once at import time instead of failing the module load.
try:
    from mrz.checker.td1 import TD1CodeChecker
    from mrz.checker.td2 import TD2CodeChecker
    from mrz.checker.td3 import TD3CodeChecker
    from mrz.checker.mrva import MRVACodeChecker
    from mrz.checker.mrvb import MRVBCodeChecker
    # Feature flag read by parse_mrz() before it touches any checker class.
    HAS_MRZ = True
except ImportError:
    HAS_MRZ = False
    logging.warning("mrz library not found. MRZ validation will be limited.")

def clean_json_output(text: str) -> Dict[str, Any]:
    """
    Extract the first valid JSON object embedded in a string.

    The text may wrap the object in markdown fences or surround it with
    arbitrary prose. Each ``{`` is tried in order as a possible start of an
    object, and the first one that decodes cleanly wins. Trailing text that
    happens to contain braces therefore no longer breaks extraction.

    Args:
        text: Raw model/OCR output that is expected to contain a JSON object.

    Returns:
        The decoded object as a dict on success. Otherwise a dict with an
        ``"error"`` message and the untouched input under ``"raw_text"``:
        ``"No JSON found"`` when there is no ``{...}`` span at all (or the
        input is not a string), ``"Invalid JSON format"`` when braces exist
        but nothing between them decodes.
    """
    if not isinstance(text, str):
        return {"error": "No JSON found", "raw_text": text}

    first_open = text.find("{")
    last_close = text.rfind("}")
    if first_open == -1 or last_close < first_open:
        return {"error": "No JSON found", "raw_text": text}

    decoder = json.JSONDecoder()
    start = first_open
    # A candidate must begin before the final closing brace to be an object.
    while start != -1 and start < last_close:
        try:
            # raw_decode stops at the end of the object and ignores whatever
            # follows, so fences and trailing commentary need no pre-cleaning.
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return parsed

    return {"error": "Invalid JSON format", "raw_text": text}

def parse_mrz(mrz_lines: list) -> Dict[str, Any]:
    """
    Parse MRZ lines using the mrz library if available.

    The document format is picked from the number of lines and the length of
    the first line:

    * 3 lines x 30 chars -> TD1
    * 2 lines x 36 chars -> TD2, or MRV-B when the first character is ``V``
    * 2 lines x 44 chars -> TD3, or MRV-A when the first character is ``V``

    Args:
        mrz_lines: MRZ text lines. A single string is also accepted and is
            split on line breaks. Blank and non-string entries are ignored
            and all whitespace inside a line is removed.

    Returns:
        A dict with ``document_number``, ``expiry_date``, ``date_of_birth``,
        ``nationality``, ``sex``, ``surname``, ``given_names``,
        ``issuing_country`` and ``mrz_valid``. An empty dict is returned when
        the input is empty, the mrz library is missing, the layout is not
        recognised, the checker evaluates falsy, or parsing raises.
    """
    if not mrz_lines or not HAS_MRZ:
        return {}

    if isinstance(mrz_lines, str):
        mrz_lines = mrz_lines.splitlines()

    # Strip every whitespace character, not only spaces: a stray tab or
    # trailing newline would otherwise defeat the length-based detection.
    clean_lines = [
        "".join(line.split())
        for line in mrz_lines
        if isinstance(line, str) and line.strip()
    ]
    if not clean_lines:
        return {}

    line_count = len(clean_lines)
    line_length = len(clean_lines[0])
    # Visas share line lengths with TD2/TD3, so they must be told apart by
    # the document code rather than by a later elif that could never run.
    is_visa = clean_lines[0].startswith("V")

    try:
        checker_cls = None
        if line_count == 3 and line_length == 30:
            checker_cls = TD1CodeChecker
        elif line_count == 2 and line_length == 44:
            checker_cls = MRVACodeChecker if is_visa else TD3CodeChecker
        elif line_count == 2 and line_length == 36:
            checker_cls = MRVBCodeChecker if is_visa else TD2CodeChecker

        if checker_cls is None:
            return {}

        checker = checker_cls("\n".join(clean_lines))
        # A falsy checker is skipped without reading its fields.
        fields = checker.fields() if checker else None
        if fields:
            return {
                "document_number": fields.document_number,
                "expiry_date": fields.expiry_date,
                "date_of_birth": fields.birth_date,
                "nationality": fields.nationality,
                "sex": fields.sex,
                "surname": fields.surname,
                "given_names": fields.name,
                "issuing_country": fields.country,
                # NOTE(review): a falsy checker returns {} above, so this is
                # presumably always True when present - confirm against the
                # mrz library's truthiness semantics.
                "mrz_valid": bool(checker),
            }
    except Exception as e:
        # Best-effort boundary around the third-party parser: any failure is
        # logged and reported to the caller as "no MRZ data".
        logging.error("MRZ Parsing Error: %s", e)

    return {}

def merge_results(front_data: Dict, back_data: Dict) -> Dict:
    """
    Merge the extraction results of the Front and Back scans.

    Strategy:
    1. Start from a deep copy of the front data, so neither input is mutated.
    2. If the back scan carries ``mrz_lines`` that parse successfully, store
       the parsed result under ``mrz_parsed`` and overwrite the ``value`` of
       ``document_number``, ``date_of_birth`` and ``expiry_date`` with the
       MRZ values. ``document_number`` also gets ``confidence`` 1.0.
    3. Any back-scan key that is absent from the front data, or whose front
       entry is empty, is taken from the back scan.

    Args:
        front_data: Fields extracted from the front side; ``None`` is treated
            as an empty dict. Entries are normally dicts holding a
            ``"value"`` key, but other value types are tolerated.
        back_data: Fields extracted from the back side; ``None`` is treated
            as an empty dict.

    Returns:
        A new dict with the merged fields.
    """
    merged = copy.deepcopy(front_data) if front_data else {}
    back = back_data or {}

    def _is_missing(entry: Any) -> bool:
        """Return True when a front entry carries no usable value."""
        if isinstance(entry, dict):
            return not entry.get("value")
        # Non-dict entries (lists, strings, None) count as missing when empty.
        return not entry

    # If Back has MRZ, prioritize it for verification
    mrz_lines = back.get("mrz_lines")
    if mrz_lines:
        mrz_data = parse_mrz(mrz_lines)
        if mrz_data:
            merged["mrz_parsed"] = mrz_data
            # Overwrite potential hallucinations with strict MRZ data
            for field in ("document_number", "date_of_birth", "expiry_date"):
                mrz_value = mrz_data.get(field)
                if not mrz_value:
                    continue
                entry = merged.get(field)
                if not isinstance(entry, dict):
                    # The front scan may lack the field or hold a bare value;
                    # create the entry instead of raising.
                    entry = {}
                    merged[field] = entry
                entry["value"] = mrz_value
                if field == "document_number":
                    entry["confidence"] = 1.0  # MRZ trusted

    # Merge other fields from back if missing in front
    for key, val in back.items():
        if key not in merged or _is_missing(merged[key]):
            merged[key] = copy.deepcopy(val)

    return merged