Spaces:

Midnightar
/

document-validator

Sleeping

App Files Files Community

document-validator / app.py

Midnightar

Update app.py

419ca28 verified 9 days ago

raw

history blame contribute delete

14.8 kB

	from fastapi import FastAPI
	from pydantic import BaseModel
	import easyocr
	import cv2
	import numpy as np
	import re
	import os
	import requests

	# FastAPI application instance; the route decorators further down
	# (@app.get / @app.post) register their handlers on this object.
	app = FastAPI()

	# =========================
	# LOAD OCR MODEL
	# =========================

	# Build the EasyOCR reader once, at import time, with the English ('en')
	# language list. extract_text() reuses this single instance for every call.
	reader = easyocr.Reader(['en'])

	# =========================
	# REQUEST MODEL
	# =========================

	class ImageRequest(BaseModel):
	    """Request body accepted by the ``/validate`` endpoint."""

	    # URL the service downloads the document image from.
	    image_url: str
	    # Document category the caller expects the image to show; the endpoint
	    # compares it against its list of supported type names.
	    document_type: str

	# =========================
	# DOWNLOAD IMAGE
	# =========================

	def download_image(url):
	    """Download ``url`` and save the response body to a local file.

	    Args:
	        url: Address of the image, fetched with an HTTP GET request
	            (30 second timeout).

	    Returns:
	        The path of the written file (``"temp.jpg"``) on success, or
	        ``None`` when the server does not answer with status 200, the
	        request fails, or the file cannot be written.
	    """
	    image_path = "temp.jpg"

	    try:
	        response = requests.get(url, timeout=30)

	        if response.status_code != 200:
	            return None

	        with open(image_path, "wb") as f:
	            f.write(response.content)
	    except (requests.RequestException, OSError, ValueError):
	        # Network failures, malformed URLs and disk errors are expected
	        # here and are reported as "no image". Unlike a bare ``except``,
	        # this lets KeyboardInterrupt/SystemExit and genuine programming
	        # errors propagate instead of being silently swallowed.
	        return None

	    return image_path

	# =========================
	# IMAGE QUALITY CHECKS
	# =========================

	def is_blurry(image, threshold=100.0):
	    """Report whether ``image`` looks out of focus.

	    Sharpness is estimated as the variance of the Laplacian of the
	    grayscale image; a low variance means few strong edges.

	    Args:
	        image: Image array accepted by ``cv2.cvtColor`` with
	            ``cv2.COLOR_BGR2GRAY``.
	        threshold: Variance below which the image counts as blurry. The
	            default of 100 is the cut-off this function always used.

	    Returns:
	        ``True`` when the Laplacian variance is below ``threshold``.
	    """
	    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	    sharpness = cv2.Laplacian(gray, cv2.CV_64F).var()

	    # bool() turns the NumPy scalar comparison into a plain Python bool.
	    return bool(sharpness < threshold)


	def is_dark(image, threshold=50.0):
	    """Report whether ``image`` is too dark.

	    Args:
	        image: Image array; brightness is the mean over all of its values.
	        threshold: Mean value below which the image counts as dark. The
	            default of 50 is the cut-off this function always used.

	    Returns:
	        ``True`` when the mean value is strictly below ``threshold``.
	    """
	    brightness = np.mean(image)

	    # bool() turns the NumPy scalar comparison into a plain Python bool.
	    return bool(brightness < threshold)

	# =========================
	# OCR TEXT EXTRACTION
	# =========================

	def extract_text(image_path):
	    """Run OCR on the file at ``image_path`` and return its text.

	    The second element of every entry returned by ``reader.readtext`` is
	    taken as the recognised string; the strings are joined with single
	    spaces and lower-cased.
	    """
	    detections = reader.readtext(image_path)
	    fragments = (entry[1] for entry in detections)
	    joined = " ".join(fragments)

	    return joined.lower()

	# =========================
	# DOCUMENT VALIDATION
	# =========================

	# Keyword tables used by validate_document_type. Each document type maps
	# to one or more (points_per_match, keywords) groups; keywords are checked
	# in the order listed.
	_DOCUMENT_KEYWORD_GROUPS = {
	    "National ID (NIN)": (
	        (25, (
	            "national identification number",
	            "national identity",
	            "nimc",
	            "nin",
	        )),
	    ),
	    "International Passport": (
	        (25, (
	            "passport",
	            "federal republic of nigeria",
	            "nigeria passport",
	            "international passport",
	        )),
	    ),
	    "Driver License": (
	        (20, (
	            "driver",
	            "license",
	            "drivers licence",
	            "driver licence",
	            "frsc",
	        )),
	    ),
	    "Voter Card": (
	        (25, (
	            "voter",
	            "inec",
	            "permanent voter",
	            "polling unit",
	        )),
	    ),
	    "Utility Bill": (
	        (15, (
	            # General
	            "electricity",
	            "electric bill",
	            "power bill",
	            "meter number",
	            "meter no",
	            "token",
	            "kwh",
	            "prepaid",
	            "postpaid",
	            "energy charge",
	            "tariff",
	            # Nigerian DISCOs
	            "ibedc",
	            "ibadan electricity",
	            "ikedc",
	            "ikeja electric",
	            "ekedc",
	            "eko electric",
	            "aedc",
	            "abuja electricity",
	            "eedc",
	            "enugu electricity",
	            "bedc",
	            "benin electricity",
	            "jed",
	            "jos electricity",
	            "kedco",
	            "kano electricity",
	            "kaedco",
	            "kaduna electric",
	            "phed",
	            "port harcourt electricity",
	            "yedc",
	            "yola electricity",
	        )),
	    ),
	    "Bank Statement": (
	        (15, (
	            "account statement",
	            "statement of account",
	            "transaction",
	            "balance",
	            "account number",
	            "credit",
	            "debit",
	            "withdrawal",
	            "deposit",
	            # Nigerian Banks
	            "access bank",
	            "gtbank",
	            "uba",
	            "zenith bank",
	            "first bank",
	            "opay",
	            "moniepoint",
	            "kuda",
	            "fcmb",
	            "sterling bank",
	            "wema bank",
	            "providus",
	            "fidelity bank",
	            "union bank",
	        )),
	    ),
	    "Tenancy Agreement": (
	        (20, (
	            "tenancy agreement",
	            "landlord",
	            "tenant",
	            "rent",
	            "property",
	            "lease agreement",
	            "rental agreement",
	        )),
	    ),
	    "Vehicle with Plate Number": (
	        # Vehicle makes, models and generic terms.
	        (10, (
	            "toyota",
	            "honda",
	            "lexus",
	            "benz",
	            "mercedes",
	            "ford",
	            "jeep",
	            "hyundai",
	            "kia",
	            "nissan",
	            "camry",
	            "corolla",
	            "rav4",
	            "pilot",
	            "highlander",
	            "vehicle",
	            "plate number",
	        )),
	        # Nigerian states
	        (5, (
	            "lagos",
	            "abuja",
	            "kano",
	            "kaduna",
	            "oyo",
	            "ogun",
	            "ondo",
	            "osun",
	            "kwara",
	            "imo",
	            "anambra",
	            "enugu",
	            "rivers",
	            "delta",
	            "edo",
	            "cross river",
	            "akwa ibom",
	            "bayelsa",
	            "plateau",
	            "benue",
	            "kogi",
	            "ekiti",
	            "niger",
	            "zamfara",
	            "sokoto",
	            "katsina",
	            "borno",
	            "yobe",
	            "adamawa",
	            "taraba",
	            "gombe",
	            "bauchi",
	            "jigawa",
	            "nasarawa",
	            "kebbi",
	            "ebonyi",
	        )),
	    ),
	}

	# Plate-number handling applies to this document type only.
	_PLATE_DOCUMENT_TYPE = "Vehicle with Plate Number"
	_PLATE_POINTS = 50
	# Matched against the upper-cased text; every pattern that matches scores.
	_PLATE_PATTERNS = (
	    r"[A-Z]{3}-?\d{3}[A-Z]{2}",
	    r"[A-Z]{2}\d{3}[A-Z]{3}",
	    r"[A-Z]{3}\s\d{3}\s[A-Z]{2}",
	)


	def _contains_keyword(keyword, text):
	    """Return True when ``keyword`` occurs in ``text`` as a whole word/phrase.

	    ``text`` must already be lower-cased. A single trailing "s" is
	    tolerated so plurals such as "drivers" or "voters" still count.
	    """
	    pattern = r"(?<![a-z0-9])" + re.escape(keyword) + r"s?(?![a-z0-9])"
	    return re.search(pattern, text) is not None


	def validate_document_type(
	    text,
	    document_type
	):
	    """Score how well OCR ``text`` matches the expected ``document_type``.

	    The text is lower-cased, every character other than letters, digits,
	    whitespace and "-" is replaced by a space, and runs of whitespace are
	    collapsed. Each keyword configured for ``document_type`` that is
	    found adds its group's points to the confidence. For
	    "Vehicle with Plate Number", every plate pattern that matches the
	    upper-cased text adds 50 points and the matched plate is recorded.

	    Keywords are matched as whole words (optionally followed by "s"),
	    not as raw substrings. Substring matching accepted unrelated text:
	    "nin" inside "evening", "uba" inside "cuba", "oyo" inside "toyota",
	    "niger" inside "nigeria".

	    Args:
	        text: Text extracted from the image.
	        document_type: One of the supported document type names.

	    Returns:
	        ``None`` when nothing matched (including an unknown
	        ``document_type``); otherwise a dict with ``document_type``,
	        ``confidence`` (capped at 99) and ``matched_keywords`` in the
	        order they were checked.
	    """
	    normalized = text.lower().strip()
	    cleaned_text = re.sub(r'[^a-zA-Z0-9\s-]', ' ', normalized)
	    # Removing punctuation can leave double spaces ("meter no.: 1" ->
	    # "meter no   1"); collapse them so multi-word keywords still match.
	    cleaned_text = " ".join(cleaned_text.split())

	    matched_keywords = []
	    confidence = 0

	    for points, keywords in _DOCUMENT_KEYWORD_GROUPS.get(document_type, ()):
	        for keyword in keywords:
	            if _contains_keyword(keyword, cleaned_text):
	                matched_keywords.append(keyword)
	                confidence += points

	    if document_type == _PLATE_DOCUMENT_TYPE:
	        upper_text = cleaned_text.upper()
	        for pattern in _PLATE_PATTERNS:
	            plate_match = re.search(pattern, upper_text)
	            if plate_match:
	                matched_keywords.append(plate_match.group())
	                confidence += _PLATE_POINTS

	    # No evidence at all: the caller treats None as a rejection.
	    if confidence <= 0:
	        return None

	    return {
	        "document_type": document_type,
	        "confidence": min(confidence, 99),
	        "matched_keywords": matched_keywords
	    }

	# =========================
	# HOME ROUTE
	# =========================

	@app.get("/")
	def home():
	    """Root route: report that the API is running and list the document
	    type names it supports."""
	    supported = [
	        "National ID (NIN)",
	        "International Passport",
	        "Driver License",
	        "Voter Card",
	        "Vehicle with Plate Number",
	        "Utility Bill",
	        "Bank Statement",
	        "Tenancy Agreement"
	    ]

	    payload = {
	        "success": True,
	        "message": "Document Validation API Running",
	        "supported_documents": supported
	    }

	    return payload

	# =========================
	# VALIDATION ENDPOINT
	# =========================

	@app.post("/validate")
	async def validate_document(
	    request: ImageRequest
	):
	    """Download the image named in ``request`` and check that it shows
	    the expected document type.

	    Steps, each of which can end the request with a ``success: False``
	    payload: document type check, download, image decoding, blur check,
	    darkness check, OCR, keyword validation. Any unexpected exception is
	    reported as a "System error" payload. The downloaded file is removed
	    before returning, whatever the outcome.

	    NOTE(review): the handler is ``async`` but contains no ``await``, so
	    the download and OCR run to completion without yielding. The shared
	    "temp.jpg" file appears to rely on that; revisit both together
	    before moving this work off the event loop.

	    Args:
	        request: Body carrying ``image_url`` and ``document_type``.

	    Returns:
	        A dict with ``success`` and ``message`` plus, depending on the
	        outcome, ``reason``/``suggestion`` details or the validation
	        result (``document_type``, ``confidence``, ``matched_keywords``,
	        ``ocr_preview``).
	    """
	    # Path returned by download_image, once known; used by the cleanup.
	    image_path = None

	    try:

	        # =========================
	        # VALID DOCUMENT TYPES
	        # =========================

	        valid_document_types = [
	            "National ID (NIN)",
	            "International Passport",
	            "Driver License",
	            "Voter Card",
	            "Vehicle with Plate Number",
	            "Utility Bill",
	            "Bank Statement",
	            "Tenancy Agreement"
	        ]

	        if request.document_type not in valid_document_types:
	            return {
	                "success": False,
	                "message": "Invalid document type",
	                "supported_document_types": valid_document_types
	            }

	        # =========================
	        # DOWNLOAD IMAGE
	        # =========================

	        image_path = download_image(request.image_url)

	        if image_path is None:
	            return {
	                "success": False,
	                "message": "Image download failed",
	                "reason": "Could not download image from URL."
	            }

	        # =========================
	        # READ IMAGE
	        # =========================

	        image = cv2.imread(image_path)

	        if image is None:
	            return {
	                "success": False,
	                "message": "Invalid image",
	                "reason": (
	                    "The downloaded file could not be read as an image."
	                ),
	                "suggestion": (
	                    "Ensure the URL points directly to an image."
	                )
	            }

	        # =========================
	        # BLUR CHECK
	        # =========================

	        if is_blurry(image):
	            return {
	                "success": False,
	                "message": "Image rejected",
	                "reason": "The uploaded image is blurry.",
	                "suggestion": "Retake the photo with better focus."
	            }

	        # =========================
	        # DARK CHECK
	        # =========================

	        if is_dark(image):
	            return {
	                "success": False,
	                "message": "Image rejected",
	                "reason": "The uploaded image is too dark.",
	                "suggestion": "Take the photo in a brighter environment."
	            }

	        # =========================
	        # OCR EXTRACTION
	        # =========================

	        text = extract_text(image_path)

	        # =========================
	        # NO TEXT FOUND
	        # =========================

	        if not text.strip():
	            return {
	                "success": False,
	                "message": "Document rejected",
	                "reason": "No readable text was detected in the image.",
	                "suggestion": "Ensure the document is clear and visible."
	            }

	        # =========================
	        # VALIDATE DOCUMENT
	        # =========================

	        document_result = validate_document_type(
	            text,
	            request.document_type
	        )

	        # =========================
	        # DOCUMENT FAILED
	        # =========================

	        if document_result is None:
	            return {
	                "success": False,
	                "message": "Document rejected",
	                "reason": (
	                    "The uploaded image does not match "
	                    "the expected document type: "
	                    f"{request.document_type}"
	                ),
	                "ocr_preview": text[:300],
	                "possible_issues": [
	                    "Wrong document uploaded",
	                    "Image is blurry",
	                    "Image is cropped",
	                    "Poor lighting",
	                    "Text not readable",
	                    "Document partially hidden"
	                ]
	            }

	        # =========================
	        # SUCCESS RESPONSE
	        # =========================

	        return {
	            "success": True,
	            "message": "Document verified successfully",
	            "document_type": document_result["document_type"],
	            "confidence": document_result["confidence"],
	            "matched_keywords": document_result["matched_keywords"],
	            "ocr_preview": text[:300]
	        }

	    except Exception as e:
	        # Top-level boundary: report the failure in the response body
	        # instead of letting the request fail.
	        return {
	            "success": False,
	            "message": "System error",
	            "reason": str(e)
	        }

	    finally:

	        # =========================
	        # CLEAN TEMP FILE
	        # =========================

	        # Remove the file download_image actually reported. "temp.jpg" is
	        # also tried because that is the name download_image writes to,
	        # and a failed download may leave a partial file while returning
	        # None.
	        for leftover in {image_path, "temp.jpg"}:
	            if leftover is None:
	                continue
	            try:
	                os.remove(leftover)
	            except FileNotFoundError:
	                # Nothing was written, or it is already gone.
	                pass