Spaces:

Midnightar
/

document-validator

Sleeping

App Files Files Community

Midnightar committed 13 days ago

Commit

0fc2bf7

verified ·

1 Parent(s): d86e67c

Update app.py

Browse files

Files changed (1) hide show

app.py +262 -139

app.py CHANGED Viewed

@@ -1,9 +1,11 @@
-from fastapi import FastAPI, UploadFile, File
 import easyocr
 import cv2
 import numpy as np
 import re
 import os
 app = FastAPI()
@@ -13,18 +15,62 @@ app = FastAPI()
 reader = easyocr.Reader(['en'])
 # =========================
 # IMAGE QUALITY CHECKS
 # =========================
 def is_blurry(image):
-    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    variance = cv2.Laplacian(gray, cv2.CV_64F).var()
     return variance < 100
 def is_dark(image):
     brightness = np.mean(image)
     return brightness < 50
@@ -33,8 +79,13 @@ def is_dark(image):
 # =========================
 def extract_text(image_path):
     results = reader.readtext(image_path)
-    text = " ".join([r[1] for r in results]).lower()
     return text
@@ -44,36 +95,64 @@ def extract_text(image_path):
 def detect_document(text):
     # CLEAN TEXT
     text = text.lower().strip()
-    # REMOVE EXTRA SYMBOLS
-    cleaned_text = re.sub(r'[^a-zA-Z0-9\s-]', ' ', text)
-    # SPLIT WORDS
     words = cleaned_text.split()
     # =========================
-    # REJECT RANDOM OCR GARBAGE
     # =========================
     garbage_patterns = [
         r'^[a-z0-9]{4,8}$'
     ]
     for pattern in garbage_patterns:
         for word in words:
             if re.match(pattern, word):
-                if len(words) <= 2:
-                    return {
-                        "document_type": "unknown",
-                        "confidence": 5,
-                        "matched_keywords": [word],
-                        "reason": (
-                            "OCR detected unreadable or "
-                            "meaningless text."
-                        )
-                    }
     # =========================
     # NIN
@@ -82,22 +161,21 @@ def detect_document(text):
     nin_keywords = [
         "national identification number",
         "national identity",
-        "nin",
         "nimc"
     ]
-    matched_keywords = []
     for keyword in nin_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
-    if len(matched_keywords) > 0:
-        return {
-            "document_type": "nin",
-            "confidence": 95,
-            "matched_keywords": matched_keywords
-        }
     # =========================
     # PASSPORT
@@ -106,21 +184,16 @@ def detect_document(text):
     passport_keywords = [
         "passport",
         "federal republic of nigeria",
-        "nigeria passport"
     ]
-    matched_keywords = []
     for keyword in passport_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
-    if len(matched_keywords) > 0:
-        return {
-            "document_type": "passport",
-            "confidence": 94,
-            "matched_keywords": matched_keywords
-        }
     # =========================
     # DRIVER LICENSE
@@ -134,18 +207,12 @@ def detect_document(text):
         "frsc"
     ]
-    matched_keywords = []
     for keyword in license_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
-    if len(matched_keywords) >= 2:
-        return {
-            "document_type": "drivers_license",
-            "confidence": 92,
-            "matched_keywords": matched_keywords
-        }
     # =========================
     # VOTER CARD
@@ -158,21 +225,15 @@ def detect_document(text):
         "polling unit"
     ]
-    matched_keywords = []
     for keyword in voter_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
-    if len(matched_keywords) > 0:
-        return {
-            "document_type": "voters_card",
-            "confidence": 90,
-            "matched_keywords": matched_keywords
-        }
     # =========================
-    # ELECTRICITY COMPANIES
     # =========================
     electricity_keywords = [
@@ -182,57 +243,62 @@ def detect_document(text):
         "electric bill",
         "power bill",
         "meter number",
         # Nigerian DISCOs
         "ibedc",
         "ibadan electricity",
         "ikedc",
         "ikeja electric",
         "ekedc",
         "eko electric",
         "aedc",
         "abuja electricity",
         "eedc",
         "enugu electricity",
         "bedc",
         "benin electricity",
         "jed",
         "jos electricity",
         "kedco",
         "kano electricity",
         "kaedco",
         "kaduna electric",
         "phed",
         "port harcourt electricity",
-        "yedc",
-        "yola electricity",
-        # Common Nigerian utility terms
-        "prepaid",
-        "postpaid",
-        "disco",
-        "energy charge",
-        "tariff"
     ]
-    matched_keywords = []
     for keyword in electricity_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
-    if len(matched_keywords) > 0:
-        return {
-            "document_type": "utility_bill",
-            "confidence": 90,
-            "matched_keywords": matched_keywords
-        }
     # =========================
     # BANK STATEMENT
     # =========================
     bank_keywords = [
         "account statement",
         "statement of account",
         "transaction",
@@ -253,21 +319,19 @@ def detect_document(text):
         "moniepoint",
         "kuda",
         "fcmb",
-        "sterling bank"
     ]
-    matched_keywords = []
     for keyword in bank_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
-    if len(matched_keywords) > 0:
-        return {
-            "document_type": "bank_statement",
-            "confidence": 91,
-            "matched_keywords": matched_keywords
-        }
     # =========================
     # TENANCY AGREEMENT
@@ -283,18 +347,12 @@ def detect_document(text):
         "rental agreement"
     ]
-    matched_keywords = []
     for keyword in tenancy_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
-    if len(matched_keywords) > 0:
-        return {
-            "document_type": "tenancy_agreement",
-            "confidence": 89,
-            "matched_keywords": matched_keywords
-        }
     # =========================
     # VEHICLE KEYWORDS
@@ -320,11 +378,12 @@ def detect_document(text):
         "plate number"
     ]
-    matched_keywords = []
     for keyword in vehicle_keywords:
         if keyword in cleaned_text:
-            matched_keywords.append(keyword)
     # =========================
     # NIGERIAN STATES
@@ -369,11 +428,12 @@ def detect_document(text):
         "ebonyi"
     ]
-    state_matches = []
     for state in nigeria_states:
         if state in cleaned_text:
-            state_matches.append(state)
     # =========================
     # NIGERIAN PLATE PATTERNS
@@ -385,39 +445,71 @@ def detect_document(text):
         r"[A-Z]{3}\s\d{3}\s[A-Z]{2}"
     ]
-    detected_plate = None
     for pattern in plate_patterns:
-        plate_match = re.search(pattern, cleaned_text.upper())
         if plate_match:
-            detected_plate = plate_match.group()
-            break
     # =========================
-    # VEHICLE DETECTION
     # =========================
-    if detected_plate:
-        return {
-            "document_type": "vehicle_plate",
-            "confidence": 97,
-            "matched_keywords": [detected_plate] + state_matches
-        }
-    # VEHICLE WITHOUT CLEAR PLATE
-    if len(matched_keywords) > 0:
-        return {
-            "document_type": "vehicle_image",
-            "confidence": 75,
-            "matched_keywords": matched_keywords
-        }
     # =========================
-    # UNKNOWN DOCUMENT
     # =========================
-    return None
 # =========================
@@ -426,6 +518,7 @@ def detect_document(text):
 @app.get("/")
 def home():
     return {
         "success": True,
         "message": "Document Validation API Running",
@@ -447,18 +540,30 @@ def home():
 # =========================
 @app.post("/validate")
-async def validate_document(file: UploadFile = File(...)):
     try:
         # =========================
-        # SAVE FILE
         # =========================
-        image_path = "temp.jpg"
-        with open(image_path, "wb") as f:
-            f.write(await file.read())
         # =========================
         # READ IMAGE
@@ -467,15 +572,17 @@ async def validate_document(file: UploadFile = File(...)):
         image = cv2.imread(image_path)
         if image is None:
             return {
                 "success": False,
                 "message": "Invalid image",
                 "reason": (
-                    "The uploaded file could not "
-                    "be read as an image."
                 ),
                 "suggestion": (
-                    "Upload a valid JPG or PNG image."
                 )
             }
@@ -484,12 +591,16 @@ async def validate_document(file: UploadFile = File(...)):
         # =========================
         if is_blurry(image):
             return {
                 "success": False,
                 "message": "Image rejected",
-                "reason": "The uploaded image is blurry.",
                 "suggestion": (
-                    "Retake the photo with better focus."
                 )
             }
@@ -498,12 +609,16 @@ async def validate_document(file: UploadFile = File(...)):
         # =========================
         if is_dark(image):
             return {
                 "success": False,
                 "message": "Image rejected",
-                "reason": "The uploaded image is too dark.",
                 "suggestion": (
-                    "Take the photo in a brighter environment."
                 )
             }
@@ -518,16 +633,17 @@ async def validate_document(file: UploadFile = File(...)):
         # =========================
         if len(text.strip()) == 0:
             return {
                 "success": False,
                 "message": "Document rejected",
                 "reason": (
-                    "No readable text was detected "
-                    "in the image."
                 ),
                 "suggestion": (
-                    "Ensure the document is clear "
-                    "and fully visible."
                 )
             }
@@ -535,19 +651,23 @@ async def validate_document(file: UploadFile = File(...)):
         # DOCUMENT DETECTION
         # =========================
-        document_result = detect_document(text)
         # =========================
         # UNSUPPORTED DOCUMENT
         # =========================
         if document_result is None:
             return {
                 "success": False,
                 "message": "Document rejected",
                 "reason": (
-                    "The uploaded image does not match "
-                    "any supported document type."
                 ),
                 "supported_documents": [
                     "National ID (NIN)",
@@ -587,6 +707,9 @@ async def validate_document(file: UploadFile = File(...)):
             "matched_keywords": (
                 document_result["matched_keywords"]
             ),
             "ocr_preview": text[:300]
         }

+from fastapi import FastAPI
+from pydantic import BaseModel
 import easyocr
 import cv2
 import numpy as np
 import re
 import os
+import requests
 app = FastAPI()
 reader = easyocr.Reader(['en'])
+# =========================
+# REQUEST MODEL
+# =========================
+class ImageRequest(BaseModel):
+    image_url: str
+# =========================
+# DOWNLOAD IMAGE
+# =========================
def download_image(url):
    """Download the image at ``url`` and save it as ``temp.jpg``.

    Args:
        url: Address of the image to fetch. Only ``http`` and ``https``
            URLs are attempted; anything else is rejected up front.

    Returns:
        The path of the saved file (``"temp.jpg"``), or ``None`` when the
        URL is not a well-formed HTTP(S) URL, the request fails, the
        server answers with a status other than 200, or the file cannot
        be written.
    """
    # Function-scope import so the block does not depend on file-level edits.
    from urllib.parse import urlparse

    # Reject non-HTTP(S) input explicitly instead of relying on the HTTP
    # client to raise for it.
    if not isinstance(url, str):
        return None
    try:
        scheme = urlparse(url).scheme.lower()
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (e.g. bad IPv6).
        return None
    if scheme not in ("http", "https"):
        return None

    # Catch only network/HTTP failures; a bare ``except`` would also hide
    # KeyboardInterrupt, SystemExit and genuine programming errors.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    image_path = "temp.jpg"
    try:
        with open(image_path, "wb") as f:
            f.write(response.content)
    except OSError:
        # Disk full, permission denied, etc. -- report as a failed download.
        return None

    return image_path
 # =========================
 # IMAGE QUALITY CHECKS
 # =========================
 def is_blurry(image):
     """Report whether ``image`` is considered blurry.

     The image is converted from BGR to grayscale, the Laplacian of the
     grayscale image is computed with 64-bit float output, and the
     variance of that response is compared against a fixed cutoff.

     Args:
         image: Image array accepted by ``cv2.cvtColor`` with
             ``cv2.COLOR_BGR2GRAY``.

     Returns:
         Truthy when the Laplacian variance is below 100.
     """
     grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
     edge_response = cv2.Laplacian(grayscale, cv2.CV_64F)
     sharpness = edge_response.var()
     return sharpness < 100
 def is_dark(image):
     """Report whether ``image`` is considered too dark.

     Args:
         image: Array of pixel values; the mean is taken over every
             element, across all axes.

     Returns:
         Truthy when the overall mean pixel value is below 50.
     """
     mean_level = np.mean(image)
     return mean_level < 50
 # =========================
 def extract_text(image_path):
     """Run OCR on the image at ``image_path`` and return its text.

     Args:
         image_path: Image location passed straight to
             ``reader.readtext``.

     Returns:
         A single lower-cased string made by joining, with single
         spaces, the element at index 1 of every OCR result entry
         (presumably the recognised text of each detection -- confirm
         against the EasyOCR output format).
     """
     detections = reader.readtext(image_path)
     fragments = [detection[1] for detection in detections]
     joined = " ".join(fragments)
     return joined.lower()
 def detect_document(text):
+    # =========================
     # CLEAN TEXT
+    # =========================
     text = text.lower().strip()
+    cleaned_text = re.sub(
+        r'[^a-zA-Z0-9\s-]',
+        ' ',
+        text
+    )
     words = cleaned_text.split()
     # =========================
+    # SCORE SYSTEM
+    # =========================
+    scores = {
+        "nin": 0,
+        "passport": 0,
+        "drivers_license": 0,
+        "voters_card": 0,
+        "utility_bill": 0,
+        "bank_statement": 0,
+        "tenancy_agreement": 0,
+        "vehicle_plate": 0,
+        "vehicle_image": 0
+    }
+    matched_keywords = {
+        "nin": [],
+        "passport": [],
+        "drivers_license": [],
+        "voters_card": [],
+        "utility_bill": [],
+        "bank_statement": [],
+        "tenancy_agreement": [],
+        "vehicle_plate": [],
+        "vehicle_image": []
+    }
+    # =========================
+    # GARBAGE OCR DETECTION
     # =========================
     garbage_patterns = [
         r'^[a-z0-9]{4,8}$'
     ]
+    garbage_count = 0
     for pattern in garbage_patterns:
         for word in words:
             if re.match(pattern, word):
+                garbage_count += 1
     # =========================
     # NIN
     nin_keywords = [
         "national identification number",
         "national identity",
         "nimc"
     ]
     for keyword in nin_keywords:
         if keyword in cleaned_text:
+            scores["nin"] += 5
+            matched_keywords["nin"].append(keyword)
+    # Weak standalone nin
+    if " nin " in f" {cleaned_text} ":
+        scores["nin"] += 1
+        matched_keywords["nin"].append("nin")
     # =========================
     # PASSPORT
     passport_keywords = [
         "passport",
         "federal republic of nigeria",
+        "nigeria passport",
+        "international passport"
     ]
     for keyword in passport_keywords:
         if keyword in cleaned_text:
+            scores["passport"] += 5
+            matched_keywords["passport"].append(keyword)
     # =========================
     # DRIVER LICENSE
         "frsc"
     ]
     for keyword in license_keywords:
         if keyword in cleaned_text:
+            scores["drivers_license"] += 3
+            matched_keywords["drivers_license"].append(keyword)
     # =========================
     # VOTER CARD
         "polling unit"
     ]
     for keyword in voter_keywords:
         if keyword in cleaned_text:
+            scores["voters_card"] += 4
+            matched_keywords["voters_card"].append(keyword)
     # =========================
+    # UTILITY BILL
     # =========================
     electricity_keywords = [
         "electric bill",
         "power bill",
         "meter number",
+        "meter no",
+        "token",
+        "kwh",
+        "prepaid",
+        "postpaid",
+        "energy charge",
+        "tariff",
         # Nigerian DISCOs
         "ibedc",
         "ibadan electricity",
         "ikedc",
         "ikeja electric",
         "ekedc",
         "eko electric",
         "aedc",
         "abuja electricity",
         "eedc",
         "enugu electricity",
         "bedc",
         "benin electricity",
         "jed",
         "jos electricity",
         "kedco",
         "kano electricity",
         "kaedco",
         "kaduna electric",
         "phed",
         "port harcourt electricity",
+        "yedc",
+        "yola electricity"
     ]
     for keyword in electricity_keywords:
         if keyword in cleaned_text:
+            scores["utility_bill"] += 4
+            matched_keywords["utility_bill"].append(keyword)
     # =========================
     # BANK STATEMENT
     # =========================
     bank_keywords = [
         "account statement",
         "statement of account",
         "transaction",
         "moniepoint",
         "kuda",
         "fcmb",
+        "sterling bank",
+        "wema bank",
+        "providus",
+        "fidelity bank",
+        "union bank"
     ]
     for keyword in bank_keywords:
         if keyword in cleaned_text:
+            scores["bank_statement"] += 3
+            matched_keywords["bank_statement"].append(keyword)
     # =========================
     # TENANCY AGREEMENT
         "rental agreement"
     ]
     for keyword in tenancy_keywords:
         if keyword in cleaned_text:
+            scores["tenancy_agreement"] += 3
+            matched_keywords["tenancy_agreement"].append(keyword)
     # =========================
     # VEHICLE KEYWORDS
         "plate number"
     ]
     for keyword in vehicle_keywords:
         if keyword in cleaned_text:
+            scores["vehicle_image"] += 3
+            matched_keywords["vehicle_image"].append(keyword)
     # =========================
     # NIGERIAN STATES
         "ebonyi"
     ]
     for state in nigeria_states:
         if state in cleaned_text:
+            scores["vehicle_plate"] += 1
+            matched_keywords["vehicle_plate"].append(state)
     # =========================
     # NIGERIAN PLATE PATTERNS
         r"[A-Z]{3}\s\d{3}\s[A-Z]{2}"
     ]
     for pattern in plate_patterns:
+        plate_match = re.search(
+            pattern,
+            cleaned_text.upper()
+        )
         if plate_match:
+            scores["vehicle_plate"] += 10
+            matched_keywords[
+                "vehicle_plate"
+            ].append(
+                plate_match.group()
+            )
     # =========================
+    # OCR GARBAGE PENALTY
     # =========================
+    if garbage_count >= 5:
+        for key in scores:
+            scores[key] -= 2
     # =========================
+    # BEST MATCH
     # =========================
+    best_doc = max(
+        scores,
+        key=scores.get
+    )
+    best_score = scores[best_doc]
+    # =========================
+    # LOW CONFIDENCE
+    # =========================
+    if best_score <= 0:
+        return None
+    # =========================
+    # CONFIDENCE
+    # =========================
+    confidence = min(
+        99,
+        max(50, best_score * 5)
+    )
+    # =========================
+    # RETURN RESULT
+    # =========================
+    return {
+        "document_type": best_doc,
+        "confidence": confidence,
+        "matched_keywords": (
+            matched_keywords[best_doc]
+        ),
+        "all_scores": scores
+    }
 # =========================
 @app.get("/")
 def home():
     return {
         "success": True,
         "message": "Document Validation API Running",
 # =========================
 @app.post("/validate")
+async def validate_document(
+    request: ImageRequest
+):
     try:
         # =========================
+        # DOWNLOAD IMAGE
         # =========================
+        image_path = download_image(
+            request.image_url
+        )
+        if image_path is None:
+            return {
+                "success": False,
+                "message": "Image download failed",
+                "reason": (
+                    "Could not download image "
+                    "from URL."
+                )
+            }
         # =========================
         # READ IMAGE
         image = cv2.imread(image_path)
         if image is None:
             return {
                 "success": False,
                 "message": "Invalid image",
                 "reason": (
+                    "The downloaded file could "
+                    "not be read as an image."
                 ),
                 "suggestion": (
+                    "Ensure the URL points "
+                    "directly to an image."
                 )
             }
         # =========================
         if is_blurry(image):
             return {
                 "success": False,
                 "message": "Image rejected",
+                "reason": (
+                    "The uploaded image is blurry."
+                ),
                 "suggestion": (
+                    "Retake the photo "
+                    "with better focus."
                 )
             }
         # =========================
         if is_dark(image):
             return {
                 "success": False,
                 "message": "Image rejected",
+                "reason": (
+                    "The uploaded image is too dark."
+                ),
                 "suggestion": (
+                    "Take the photo in a "
+                    "brighter environment."
                 )
             }
         # =========================
         if len(text.strip()) == 0:
             return {
                 "success": False,
                 "message": "Document rejected",
                 "reason": (
+                    "No readable text was "
+                    "detected in the image."
                 ),
                 "suggestion": (
+                    "Ensure the document "
+                    "is clear and visible."
                 )
             }
         # DOCUMENT DETECTION
         # =========================
+        document_result = detect_document(
+            text
+        )
         # =========================
         # UNSUPPORTED DOCUMENT
         # =========================
         if document_result is None:
             return {
                 "success": False,
                 "message": "Document rejected",
                 "reason": (
+                    "The uploaded image "
+                    "does not match any "
+                    "supported document type."
                 ),
                 "supported_documents": [
                     "National ID (NIN)",
             "matched_keywords": (
                 document_result["matched_keywords"]
             ),
+            "score_breakdown": (
+                document_result["all_scores"]
+            ),
             "ocr_preview": text[:300]
         }