Spaces:

Midnightar
/

document-validator

Sleeping

App Files Files Community

document-validator / app.py

Midnightar

Update app.py

968cf82 verified 25 days ago

raw

history blame

14.7 kB

	from fastapi import FastAPI
	from pydantic import BaseModel
	import easyocr
	import cv2
	import numpy as np
	import re
	import os

	# FastAPI application instance; the route decorators further down
	# (@app.get / @app.post) register their handlers on this object, and the
	# __main__ guard at the bottom of the file hands it to uvicorn.
	app = FastAPI()

	# =========================
	# LOAD OCR MODEL
	# =========================

	# Module-level EasyOCR reader configured for English only (['en']).
	# It is created once at import time and reused by extract_text() for
	# every request instead of being rebuilt per call.
	# NOTE(review): constructing the reader presumably loads (and may download)
	# model weights, which would make the first import slow - confirm.
	reader = easyocr.Reader(['en'])

	# =========================
	# REQUEST MODEL
	# =========================

	class ImageRequest(BaseModel):
	    """Request body accepted by the POST /validate endpoint."""

	    # Filesystem path of the image to validate. validate_document() checks
	    # that it exists and then reads it with cv2.imread().
	    image_path: str

	# =========================
	# IMAGE QUALITY CHECKS
	# =========================

	def is_blurry(image, threshold=100.0):
	    """Return True when the image looks out of focus.

	    Sharpness is estimated as the variance of the Laplacian of the
	    grayscale image: few strong edges give a low variance.

	    Args:
	        image: NumPy image array as produced by cv2.imread(). A 3-channel
	            array is treated as BGR, a 4-channel array as BGRA, and a
	            2-D array as already grayscale.
	        threshold: Variance below which the image is considered blurry.
	            Defaults to 100, the value previously hard-coded here.

	    Returns:
	        bool: True if the Laplacian variance is below ``threshold``.
	    """
	    if image.ndim == 2:
	        # Already single-channel; cvtColor(BGR2GRAY) would raise on it.
	        gray = image
	    elif image.shape[2] == 4:
	        # Images loaded with an alpha channel need the BGRA conversion.
	        gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
	    else:
	        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

	    # CV_64F keeps negative Laplacian responses instead of clipping them.
	    variance = cv2.Laplacian(gray, cv2.CV_64F).var()

	    # Convert the NumPy boolean to a plain bool for callers/serializers.
	    return bool(variance < threshold)


	def is_dark(image, threshold=50):
	    """Return True when the image is too dark overall.

	    Brightness is the arithmetic mean of every value in the array (all
	    pixels and all channels together).

	    Args:
	        image: NumPy image array, e.g. the result of cv2.imread().
	        threshold: Mean intensity below which the image counts as dark.
	            Defaults to 50, the value previously hard-coded here.

	    Returns:
	        bool: True if the mean intensity is strictly below ``threshold``.
	    """
	    brightness = np.mean(image)

	    # Convert the NumPy boolean to a plain bool for callers/serializers.
	    return bool(brightness < threshold)


	# =========================
	# OCR TEXT EXTRACTION
	# =========================

	def extract_text(image_path):
	    """Run OCR on an image and return its text as one lowercase string.

	    Args:
	        image_path: Value handed straight to the module-level
	            ``reader.readtext`` call (the endpoint passes a file path).

	    Returns:
	        str: The element at index 1 of every OCR result (taken to be the
	        recognized text), joined with single spaces and lowercased.
	        Empty when OCR produced no results.
	    """
	    detections = reader.readtext(image_path)
	    fragments = (detection[1] for detection in detections)
	    return " ".join(fragments).lower()


	# =========================
	# DOCUMENT DETECTION
	# =========================

	def _compile_keywords(keywords):
	    """Pair each keyword with a whole-word (optionally plural) regex."""
	    return tuple(
	        (
	            keyword,
	            # Lookarounds instead of a plain substring test so that short
	            # keywords do not fire inside longer words ("nin" in "evening",
	            # "rent" in "current", "uba" in "dubai"). The optional "s"
	            # keeps simple plurals such as "drivers" matching "driver".
	            re.compile(
	                r"(?<![a-z0-9])" + re.escape(keyword) + r"s?(?![a-z0-9])"
	            ),
	        )
	        for keyword in keywords
	    )


	def _find_keywords(text, compiled_keywords):
	    """Return the keywords (in table order) that occur in the text."""
	    return [
	        keyword
	        for keyword, regex in compiled_keywords
	        if regex.search(text)
	    ]


	# Keyword rules, evaluated in order; the first rule that reaches its
	# minimum number of keyword hits decides the document type.
	# Each entry: (document_type, confidence, minimum_hits, keywords).
	_DOCUMENT_RULES = (
	    ("nin", 95, 1, _compile_keywords((
	        "national identification number",
	        "national identity",
	        "nin",
	        "nimc",
	    ))),
	    # NOTE(review): "federal republic of nigeria" is presumably printed on
	    # other federal documents too; because this rule runs before the
	    # driver's licence and voter card rules, such documents may be labelled
	    # as passports. Confirm whether that is intended.
	    ("passport", 94, 1, _compile_keywords((
	        "passport",
	        "federal republic of nigeria",
	        "nigeria passport",
	    ))),
	    # Needs two hits: "driver" or "license" alone is too generic.
	    ("drivers_license", 92, 2, _compile_keywords((
	        "driver",
	        "license",
	        "drivers licence",
	        "driver licence",
	        "frsc",
	    ))),
	    ("voters_card", 90, 1, _compile_keywords((
	        "voter",
	        "inec",
	        "permanent voter",
	        "polling unit",
	    ))),
	    ("utility_bill", 90, 1, _compile_keywords((
	        # General
	        "electricity",
	        "electric bill",
	        "power bill",
	        "meter number",
	        # Nigerian DISCOs
	        "ibedc",
	        "ibadan electricity",
	        "ikedc",
	        "ikeja electric",
	        "ekedc",
	        "eko electric",
	        "aedc",
	        "abuja electricity",
	        "eedc",
	        "enugu electricity",
	        "bedc",
	        "benin electricity",
	        "jed",
	        "jos electricity",
	        "kedco",
	        "kano electricity",
	        "kaedco",
	        "kaduna electric",
	        "phed",
	        "port harcourt electricity",
	        "yedc",
	        "yola electricity",
	        # Common terms
	        "prepaid",
	        "postpaid",
	        "disco",
	        "energy charge",
	        "tariff",
	    ))),
	    ("bank_statement", 91, 1, _compile_keywords((
	        "account statement",
	        "statement of account",
	        "transaction",
	        "balance",
	        "account number",
	        "credit",
	        "debit",
	        "withdrawal",
	        "deposit",
	        # Nigerian banks
	        "access bank",
	        "gtbank",
	        "uba",
	        "zenith bank",
	        "first bank",
	        "opay",
	        "moniepoint",
	        "kuda",
	        "fcmb",
	        "sterling bank",
	    ))),
	    ("tenancy_agreement", 89, 1, _compile_keywords((
	        "tenancy agreement",
	        "landlord",
	        "tenant",
	        "rent",
	        "property",
	        "lease agreement",
	        "rental agreement",
	    ))),
	)

	_VEHICLE_KEYWORDS = _compile_keywords((
	    "toyota",
	    "honda",
	    "lexus",
	    "benz",
	    "mercedes",
	    "ford",
	    "jeep",
	    "hyundai",
	    "kia",
	    "nissan",
	    "camry",
	    "corolla",
	    "rav4",
	    "pilot",
	    "highlander",
	    "vehicle",
	    "plate number",
	))

	_NIGERIAN_STATES = _compile_keywords((
	    "lagos",
	    "abuja",
	    "kano",
	    "kaduna",
	    "oyo",
	    "ogun",
	    "ondo",
	    "osun",
	    "kwara",
	    "imo",
	    "anambra",
	    "enugu",
	    "rivers",
	    "delta",
	    "edo",
	    "cross river",
	    "akwa ibom",
	    "bayelsa",
	    "plateau",
	    "benue",
	    "kogi",
	    "ekiti",
	    "niger",
	    "zamfara",
	    "sokoto",
	    "katsina",
	    "borno",
	    "yobe",
	    "adamawa",
	    "taraba",
	    "gombe",
	    "bauchi",
	    "jigawa",
	    "nasarawa",
	    "kebbi",
	    "ebonyi",
	))

	# Plate layouts, matched against the upper-cased text. The lookarounds stop
	# a pattern from matching across unrelated words (e.g. "BOX 123 LA" inside
	# "BOX 123 LAGOS").
	_PLATE_PATTERNS = tuple(
	    re.compile(r"(?<![A-Z0-9])(?:" + layout + r")(?![A-Z0-9])")
	    for layout in (
	        r"[A-Z]{3}-?\d{3}[A-Z]{2}",
	        r"[A-Z]{2}\d{3}[A-Z]{3}",
	        r"[A-Z]{3}\s\d{3}\s[A-Z]{2}",
	    )
	)

	# A lone 4-8 character alphanumeric token is treated as OCR noise.
	_GARBAGE_WORD = re.compile(r'^[a-z0-9]{4,8}$')


	def detect_document(text):
	    """Classify OCR text as one of the supported document types.

	    The text is lower-cased, stripped of everything except letters,
	    digits, whitespace and hyphens, and collapsed to single spaces.
	    Keyword rules are then tried in a fixed order (NIN, passport, driver's
	    licence, voter card, utility bill, bank statement, tenancy agreement),
	    followed by number-plate and vehicle-keyword detection. Only when
	    nothing matched is the "OCR garbage" heuristic applied, so a short but
	    meaningful text such as "passport" or a bare plate number is no longer
	    rejected as noise.

	    Args:
	        text: Raw OCR text. ``None`` or an empty string yields ``None``.

	    Returns:
	        dict | None: ``None`` when nothing matched. Otherwise a dict with
	        ``document_type``, ``confidence`` and ``matched_keywords``. When
	        the text looks like OCR noise the dict has ``document_type``
	        ``"unknown"``, ``confidence`` 5 and an extra ``reason`` key.
	    """
	    if not text:
	        return None

	    # Replace symbols with spaces, then collapse runs of whitespace so that
	    # multi-word keywords still match after punctuation was removed.
	    stripped = re.sub(r'[^a-zA-Z0-9\s-]', ' ', text.lower().strip())
	    words = stripped.split()
	    cleaned_text = " ".join(words)

	    # Keyword-based document types, first satisfied rule wins.
	    for document_type, confidence, minimum_hits, keywords in _DOCUMENT_RULES:
	        matched_keywords = _find_keywords(cleaned_text, keywords)
	        if len(matched_keywords) >= minimum_hits:
	            return {
	                "document_type": document_type,
	                "confidence": confidence,
	                "matched_keywords": matched_keywords
	            }

	    # Vehicle images: a plate number is the strongest signal; any state
	    # names found are reported alongside it.
	    upper_text = cleaned_text.upper()
	    detected_plate = None
	    for plate_pattern in _PLATE_PATTERNS:
	        plate_match = plate_pattern.search(upper_text)
	        if plate_match:
	            detected_plate = plate_match.group()
	            break

	    if detected_plate:
	        state_matches = _find_keywords(cleaned_text, _NIGERIAN_STATES)
	        return {
	            "document_type": "vehicle_plate",
	            "confidence": 97,
	            "matched_keywords": [detected_plate] + state_matches
	        }

	    # Vehicle without a readable plate.
	    vehicle_matches = _find_keywords(cleaned_text, _VEHICLE_KEYWORDS)
	    if vehicle_matches:
	        return {
	            "document_type": "vehicle_image",
	            "confidence": 75,
	            "matched_keywords": vehicle_matches
	        }

	    # Nothing recognizable: flag very short texts that look like noise.
	    if len(words) <= 2:
	        for word in words:
	            if _GARBAGE_WORD.match(word):
	                return {
	                    "document_type": "unknown",
	                    "confidence": 5,
	                    "matched_keywords": [word],
	                    "reason": (
	                        "OCR detected unreadable or "
	                        "meaningless text."
	                    )
	                }

	    # Unknown document.
	    return None


	# =========================
	# HOME ROUTE
	# =========================

	@app.get("/")
	def home():
	    """Landing route: report that the API is up and what it supports.

	    Returns:
	        dict: ``success`` flag, a status ``message`` and the list of
	        ``supported_documents`` (human-readable names).
	    """
	    supported_documents = [
	        "National ID (NIN)",
	        "International Passport",
	        "Driver License",
	        "Voter Card",
	        "Vehicle with Plate Number",
	        "Utility Bill",
	        "Bank Statement",
	        "Tenancy Agreement",
	    ]

	    response = {
	        "success": True,
	        "message": "Document Validation API Running",
	    }
	    response["supported_documents"] = supported_documents
	    return response


	# =========================
	# VALIDATION ENDPOINT
	# =========================

	@app.post("/validate")
	async def validate_document(
	    request: ImageRequest
	):
	    """Validate the image at ``request.image_path`` as a supported document.

	    Steps, in order: check the path exists, decode the image, reject
	    blurry or dark images, run OCR, classify the text. Every outcome,
	    including failures, is returned as a JSON object with a ``success``
	    flag rather than as an HTTP error.

	    Args:
	        request: Request body carrying ``image_path``.

	    Returns:
	        dict: On success ``success`` is True and the dict carries
	        ``document_type``, ``confidence``, ``matched_keywords`` and
	        ``ocr_preview`` (first 300 characters of the OCR text). On any
	        rejection ``success`` is False with ``message`` and ``reason``,
	        plus ``suggestion`` or diagnostic lists where available.
	    """
	    supported_documents = [
	        "National ID (NIN)",
	        "International Passport",
	        "Driver License",
	        "Voter Card",
	        "Vehicle with Plate Number",
	        "Utility Bill",
	        "Bank Statement",
	        "Tenancy Agreement"
	    ]

	    try:

	        # NOTE(review): image_path comes straight from the request body and
	        # is used as a server-side filesystem path, so a client can probe
	        # for or OCR any readable image on this host. Consider restricting
	        # it to a known upload directory.
	        image_path = request.image_path

	        # Check file exists.
	        if not os.path.exists(image_path):
	            return {
	                "success": False,
	                "message": "Image not found",
	                "reason": (
	                    "The provided image path "
	                    "does not exist."
	                )
	            }

	        # Read image. cv2.imread returns None instead of raising when the
	        # file cannot be decoded.
	        # NOTE(review): imread and the OCR call below are synchronous and
	        # run inside an ``async def`` handler, so they block the event loop
	        # while they execute - confirm this is acceptable for the expected
	        # load.
	        image = cv2.imread(image_path)

	        if image is None:
	            return {
	                "success": False,
	                "message": "Invalid image",
	                "reason": (
	                    "The file could not be "
	                    "read as an image."
	                ),
	                "suggestion": (
	                    "Provide a valid JPG or PNG image."
	                )
	            }

	        # Blur check.
	        if is_blurry(image):
	            return {
	                "success": False,
	                "message": "Image rejected",
	                "reason": "The image is blurry.",
	                "suggestion": (
	                    "Retake the photo with "
	                    "better focus."
	                )
	            }

	        # Dark image check.
	        if is_dark(image):
	            return {
	                "success": False,
	                "message": "Image rejected",
	                "reason": "The image is too dark.",
	                "suggestion": (
	                    "Use better lighting."
	                )
	            }

	        # OCR text extraction.
	        text = extract_text(image_path)

	        # No text found.
	        if not text.strip():
	            return {
	                "success": False,
	                "message": "Document rejected",
	                "reason": (
	                    "No readable text was detected."
	                ),
	                "suggestion": (
	                    "Ensure the document is "
	                    "clear and fully visible."
	                )
	            }

	        # Document detection.
	        document_result = detect_document(text)

	        # Unsupported document. detect_document reports OCR noise as a
	        # dict with document_type "unknown"; that must be rejected too,
	        # otherwise it would be returned as a successful verification.
	        is_unrecognised = (
	            document_result is None
	            or document_result.get("document_type") == "unknown"
	        )

	        if is_unrecognised:
	            reason = (
	                "The uploaded image does not "
	                "match any supported document type."
	            )
	            # Prefer the detector's own explanation when it supplied one.
	            if document_result and document_result.get("reason"):
	                reason = document_result["reason"]

	            return {
	                "success": False,
	                "message": "Document rejected",
	                "reason": reason,
	                "supported_documents": supported_documents,
	                "possible_issues": [
	                    "Image is cropped",
	                    "Text is unreadable",
	                    "Unsupported document uploaded",
	                    "Poor lighting",
	                    "Low image quality",
	                    "Document too far from camera",
	                    "Document partially hidden"
	                ],
	                "ocr_preview": text[:300]
	            }

	        # Success response.
	        return {
	            "success": True,
	            "message": "Document verified successfully",
	            "document_type": document_result["document_type"],
	            "confidence": document_result["confidence"],
	            "matched_keywords": document_result["matched_keywords"],
	            "ocr_preview": text[:300]
	        }

	    except Exception as e:
	        # Top-level boundary: report any unexpected failure in the same
	        # JSON shape as the other rejections.
	        # NOTE(review): str(e) may expose internal details to the client.
	        return {
	            "success": False,
	            "message": "System error",
	            "reason": str(e)
	        }


	# =========================
	# RUN SERVER
	# =========================

	if __name__ == "__main__":
	    # Only start a server when the file is executed directly; importing
	    # the module (e.g. by an external ASGI runner) must not start one.
	    import uvicorn

	    # Listen on every interface, port 7860.
	    uvicorn.run(app, host="0.0.0.0", port=7860)