Spaces:

Chhagan005
/

Chhagan-DocVL-Demo

Running on Zero

App Files Files Community

Chhagan-DocVL-Demo / utils.py

Chhagan005

Upload utils.py with huggingface_hub

ed1108a verified 6 days ago

raw

history blame contribute delete

4.58 kB


	import re
	import json
	import logging
	from typing import Dict, Any, Optional

	try:
	from mrz.checker.td1 import TD1CodeChecker
	from mrz.checker.td2 import TD2CodeChecker
	from mrz.checker.td3 import TD3CodeChecker
	from mrz.checker.mrva import MRVACodeChecker
	from mrz.checker.mrvb import MRVBCodeChecker
	HAS_MRZ = True
	except ImportError:
	HAS_MRZ = False
	logging.warning("mrz library not found. MRZ validation will be limited.")

	def clean_json_output(text: str) -> Dict[str, Any]:
	    """
	    Extract the first valid JSON object embedded in ``text``.

	    The input may wrap the object in markdown fences or surround it with
	    extra prose. Every ``{`` is tried in order as a possible start of an
	    object, and the first one that decodes cleanly is returned. Text after
	    the object (closing fences, commentary, further objects, stray braces)
	    is ignored, so it cannot invalidate an otherwise valid object.

	    Note: if the outermost object is malformed (e.g. truncated), a complete
	    object nested inside it may be returned instead.

	    Args:
	        text: Raw text that may contain a JSON object.

	    Returns:
	        The decoded object as a dict. On failure, a dict with an ``"error"``
	        key (``"No JSON found"`` when there is no ``{...}`` span at all,
	        ``"Invalid JSON format"`` when no candidate decodes) and the
	        untouched input under ``"raw_text"``.
	    """
	    if not isinstance(text, str) or not text:
	        return {"error": "No JSON found", "raw_text": text}

	    # No "{" followed somewhere by "}" means there is nothing to parse.
	    first_brace = text.find("{")
	    if first_brace == -1 or text.rfind("}") < first_brace:
	        return {"error": "No JSON found", "raw_text": text}

	    decoder = json.JSONDecoder()
	    start = first_brace
	    while start != -1:
	        try:
	            # raw_decode stops at the end of the object, so trailing text
	            # (fences, notes, other braces) does not cause a failure.
	            parsed, _end = decoder.raw_decode(text, start)
	        except json.JSONDecodeError:
	            pass
	        else:
	            return parsed
	        start = text.find("{", start + 1)

	    return {"error": "Invalid JSON format", "raw_text": text}

	def parse_mrz(mrz_lines: list) -> Dict[str, Any]:
	    """
	    Parse MRZ lines with the ``mrz`` library, when it is available.

	    The document format is chosen from the number of non-empty lines and
	    the length of the first line:

	    * 3 lines of 30 chars -> TD1
	    * 2 lines of 36 chars -> TD2, or MRV-B when the first line starts with "V"
	    * 2 lines of 44 chars -> TD3, or MRV-A when the first line starts with "V"

	    Args:
	        mrz_lines: MRZ text lines (a single newline-separated string is also
	            accepted). All whitespace inside a line is removed before the
	            length checks; non-string entries are ignored.

	    Returns:
	        A dict with ``document_number``, ``expiry_date``, ``date_of_birth``,
	        ``nationality``, ``sex``, ``surname``, ``given_names``,
	        ``issuing_country`` and ``mrz_valid``. An empty dict is returned
	        when there is no usable input, the ``mrz`` library is missing, the
	        layout is not recognised, the checker is falsy, or parsing raises.
	    """
	    if not mrz_lines:
	        return {}

	    if isinstance(mrz_lines, str):
	        mrz_lines = mrz_lines.splitlines()

	    # Strip every kind of whitespace (not only spaces) so that stray tabs or
	    # newlines from OCR output do not break the length-based detection.
	    clean_lines = []
	    for line in mrz_lines:
	        if not isinstance(line, str):
	            continue
	        compact = "".join(line.split())
	        if compact:
	            clean_lines.append(compact)

	    if not clean_lines or not HAS_MRZ:
	        return {}

	    line_count = len(clean_lines)
	    width = len(clean_lines[0])
	    # Assumption (ICAO 9303 visa layout): visa MRZs start with "V". This is
	    # what distinguishes MRV-A/MRV-B from TD3/TD2, which share the same
	    # line count and width. TODO confirm against real samples.
	    is_visa = clean_lines[0].startswith("V")
	    mrz_text = "\n".join(clean_lines)

	    try:
	        checker = None
	        if line_count == 3 and width == 30:
	            checker = TD1CodeChecker(mrz_text)
	        elif line_count == 2 and width == 44:
	            checker = MRVACodeChecker(mrz_text) if is_visa else TD3CodeChecker(mrz_text)
	        elif line_count == 2 and width == 36:
	            checker = MRVBCodeChecker(mrz_text) if is_visa else TD2CodeChecker(mrz_text)

	        # NOTE(review): the truthiness of the checker object is kept as a
	        # gate exactly as before; presumably it reflects the library's
	        # overall validation result, so failing MRZs yield {} -- confirm.
	        if checker:
	            fields = checker.fields()
	            if fields:
	                return {
	                    "document_number": fields.document_number,
	                    "expiry_date": fields.expiry_date,
	                    "date_of_birth": fields.birth_date,
	                    "nationality": fields.nationality,
	                    "sex": fields.sex,
	                    "surname": fields.surname,
	                    "given_names": fields.name,
	                    "issuing_country": fields.country,
	                    "mrz_valid": bool(checker),
	                }
	    except Exception as e:
	        # Best-effort parsing: a malformed MRZ must not break the caller.
	        logging.error("MRZ Parsing Error: %s", e)

	    return {}

	def merge_results(front_data: Dict, back_data: Dict) -> Dict:
	    """
	    Merge the extraction results of the front and back scans.

	    Strategy:
	    1. Start from a copy of the front data (the inputs are not mutated).
	    2. If the back data has ``mrz_lines`` that parse successfully, store the
	       parsed result under ``mrz_parsed`` and overwrite the ``value`` of
	       ``document_number`` (confidence set to 1.0), ``date_of_birth`` and
	       ``expiry_date`` with the MRZ values.
	    3. Copy every back-side field that is missing from the front data or
	       whose front value is empty.

	    Args:
	        front_data: Fields extracted from the front scan (``None`` is
	            treated as empty).
	        back_data: Fields extracted from the back scan (``None`` is treated
	            as empty).

	    Returns:
	        A new dict with the merged fields.
	    """
	    front_data = front_data or {}
	    back_data = back_data or {}

	    # Copy the per-field dicts as well, so the MRZ overrides below never
	    # write into the caller's front_data.
	    merged = {
	        key: (dict(val) if isinstance(val, dict) else val)
	        for key, val in front_data.items()
	    }

	    # If the back side has an MRZ, prioritize it for verification.
	    if back_data.get("mrz_lines"):
	        mrz_data = parse_mrz(back_data["mrz_lines"])
	        if mrz_data:
	            merged["mrz_parsed"] = mrz_data
	            # Overwrite potential hallucinations with strict MRZ data.
	            # The second item is the confidence to record (None = leave as is).
	            overrides = (
	                ("document_number", 1.0),  # MRZ trusted
	                ("date_of_birth", None),
	                ("expiry_date", None),
	            )
	            for field, confidence in overrides:
	                mrz_value = mrz_data.get(field)
	                if not mrz_value:
	                    continue
	                entry = merged.get(field)
	                if not isinstance(entry, dict):
	                    # Field absent on the front (or not in dict form).
	                    entry = {}
	                entry["value"] = mrz_value
	                if confidence is not None:
	                    entry["confidence"] = confidence
	                merged[field] = entry

	    # Merge other fields from the back if they are missing in the front.
	    for key, val in back_data.items():
	        if key not in merged:
	            merged[key] = val
	            continue
	        current = merged[key]
	        if isinstance(current, dict):
	            is_empty = not current.get("value")
	        else:
	            # Plain (non-dict) values: only replace when they are empty.
	            is_empty = not current
	        if is_empty:
	            merged[key] = val

	    return merged