Spaces:
Sleeping
Sleeping
File size: 4,126 Bytes
ae2e698 c70099c 254fdf9 5ebcb93 ae2e698 5968a12 254fdf9 a8683a1 ae2e698 8324e53 5968a12 8324e53 5968a12 8324e53 ae2e698 8324e53 ae2e698 5968a12 254fdf9 b07dfbb 5968a12 254fdf9 a726fb2 2c3e33d 5968a12 65bef46 2c3e33d 254fdf9 5968a12 65bef46 a726fb2 254fdf9 a726fb2 5968a12 a8683a1 8324e53 254fdf9 5968a12 65bef46 5968a12 65bef46 5968a12 65bef46 5968a12 65bef46 5968a12 65bef46 5968a12 65bef46 5968a12 65bef46 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
from paddleocr import PaddleOCR
import re
# Initialize OCR
# A single engine instance is created at import time and reused by
# extract_kyc_fields() for every document.
# NOTE(review): use_angle_cls=True presumably enables PaddleOCR's text-angle
# classifier and lang='en' selects the English model - confirm against the
# installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Helper regex patterns
# PAN number: five uppercase letters, four digits, one uppercase letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar number: twelve digits in three groups of four, each pair of groups
# optionally separated by one whitespace character or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# Date-of-birth formats. extract_dob() tries them in this order on each line,
# so the loose year-only pattern is only reached when no fuller format matched
# on that same line.
DOB_REGEX = [
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', # 12/04/1980 or 12-04-1980
    r'\b\d{4}-\d{2}-\d{2}\b', # 1980-04-12
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b', # 12-APR-1980
    r'\b(19|20)\d{2}\b' # Year only
]
# Gender labels that extract_gender() looks for in the upper-cased OCR lines.
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]
def _ocr_text_lines(result):
    """Flatten a PaddleOCR result into a list of whitespace-normalized strings.

    PaddleOCR returns one entry per page/image, and that entry is ``None``
    when no text was detected on it; such entries (and an empty/``None``
    result) simply contribute no lines instead of raising.
    """
    lines = []
    for block in result or []:
        for line in block or []:
            # line[1][0] is the recognized text of one detected text box.
            text = re.sub(r'\s+', ' ', line[1][0].strip())
            if text:
                lines.append(text)
    return lines


def extract_kyc_fields(file_path, force_type=None):
    """Run OCR on a document image and extract PAN / Aadhaar KYC fields.

    Args:
        file_path: Image passed straight to ``ocr.ocr``.
        force_type: Optional card type ("PAN" or "AADHAAR", any case). When
            given, auto-detection is skipped and this type is used as-is.

    Returns:
        A dict that always contains ``"card_type"`` on the success path, plus:

        * PAN: ``"pan_number"``, ``"dob"``, ``"name"``
        * AADHAAR: ``"aadhaar_number"``, ``"dob"``, ``"gender"``, ``"name"``
        * anything else: ``"error"`` explaining the document was not identified

        Fields that cannot be located hold the string ``"Not found"``. If OCR
        or parsing raises, a dict with only an ``"error"`` key is returned;
        this function never propagates exceptions to the caller.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        # Normalize OCR text (tolerates pages on which nothing was detected).
        lines = _ocr_text_lines(result)
        full_text = "\n".join(lines)

        # Search once; the matches serve both type detection and extraction.
        pan_match = re.search(PAN_REGEX, full_text)
        aadhaar_match = re.search(AADHAAR_REGEX, full_text)

        # Determine card type; a PAN number takes precedence over an Aadhaar one.
        if force_type:
            card_type = force_type.upper()
        elif pan_match:
            card_type = "PAN"
        elif aadhaar_match:
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}
        if card_type == "PAN":
            response["pan_number"] = pan_match.group(0) if pan_match else "Not found"
            response["dob"] = extract_dob(lines)
            response["name"] = extract_pan_name(lines)
        elif card_type == "AADHAAR":
            response["aadhaar_number"] = aadhaar_match.group(0) if aadhaar_match else "Not found"
            response["dob"] = extract_dob(lines)
            response["gender"] = extract_gender(lines)
            response["name"] = extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
        return response
    except Exception as e:
        # Top-level boundary: callers expect an error dict, not an exception.
        return {"error": f"OCR processing failed: {str(e)}"}
def extract_dob(lines):
    """Return the first date-like substring found in the OCR lines.

    Lines are visited in order, and for each line the DOB_REGEX patterns are
    tried in their listed order (case-insensitively). The text of the first
    match is returned; "Not found" is returned when nothing matches.
    """
    # Lazy generator: searching stops as soon as the first hit is consumed.
    attempts = (
        re.search(regex, text, re.IGNORECASE)
        for text in lines
        for regex in DOB_REGEX
    )
    first_hit = next((found for found in attempts if found), None)
    if first_hit is None:
        return "Not found"
    return first_hit.group(0)
def extract_gender(lines):
    """Return the first gender label from GENDERS present in the OCR lines.

    Lines are scanned in order and compared case-insensitively. A label only
    counts when it is not embedded in a longer run of letters: a plain
    substring test reported "MALE" for every "FEMALE" card (and for words
    such as "MALEGAON"), because "MALE" is contained in them and is listed
    first in GENDERS.

    Args:
        lines: OCR text lines.

    Returns:
        The matching entry of GENDERS, or "Not found".
    """
    for line in lines:
        upper_line = line.upper()
        for gender in GENDERS:
            # Letter-bounded match: no A-Z directly before or after the label.
            pattern = r'(?<![A-Z])' + re.escape(gender) + r'(?![A-Z])'
            if re.search(pattern, upper_line):
                return gender
    return "Not found"
def extract_pan_name(lines):
    """Pick the card holder's name out of PAN card OCR lines.

    Heuristic: the name is the first line after the "INCOME TAX DEPARTMENT"
    header that consists solely of uppercase letters, whitespace and dots,
    and that does not mention INDIA, GOVT or DEPARTMENT. Returns "Not found"
    when the header is missing or no later line qualifies.
    """
    boilerplate = ("INDIA", "GOVT", "DEPARTMENT")

    # Anchor on the first header line; later headers could only rescan a
    # subset of the same lines, so one scan is sufficient.
    header_at = next(
        (pos for pos, text in enumerate(lines)
         if "INCOME TAX DEPARTMENT" in text.upper()),
        None,
    )
    if header_at is None:
        return "Not found"

    for raw in lines[header_at + 1:]:
        candidate = raw.strip()
        if not re.match(r'^[A-Z\s.]+$', candidate):
            continue
        if re.search(r'\d', candidate):
            continue
        shouted = candidate.upper()
        # Skip words like INDIA, GOVT, DEPARTMENT
        if any(word in shouted for word in boilerplate):
            continue
        return candidate
    return "Not found"
# Words that mark a line as card boilerplate rather than a person's name.
_AADHAAR_NON_NAME_WORDS = r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b'


def _looks_like_aadhaar_name(candidate):
    """Return True if *candidate* has >= 2 words, no digits and no boilerplate word."""
    if re.search(r'\d', candidate) or len(candidate.split()) < 2:
        return False
    # Whole-word test: a substring test would reject real names that merely
    # contain a listed word, e.g. "KAMALESH KUMAR" contains "MALE".
    return not re.search(_AADHAAR_NON_NAME_WORDS, candidate.upper())


def extract_aadhaar_name(lines):
    """Guess the card holder's name from Aadhaar card OCR lines.

    Heuristic: the name is usually the line directly above the date of birth.
    Every line matching one of the DOB_REGEX patterns (case-insensitively,
    the same way extract_dob matches them) is considered in order, and the
    line above it is returned if it looks like a name. If none qualifies, the
    first line in the document that looks like a name is returned.

    Args:
        lines: OCR text lines.

    Returns:
        The stripped name line, or "Not found".
    """
    # Heuristic: Name is usually above DOB
    for i, line in enumerate(lines):
        if i == 0:
            continue  # nothing above the first line
        if any(re.search(p, line, re.IGNORECASE) for p in DOB_REGEX):
            candidate_name = lines[i - 1].strip()
            if _looks_like_aadhaar_name(candidate_name):
                return candidate_name

    # Fallback: First line with >=2 words and no digits
    for line in lines:
        candidate_name = line.strip()
        if _looks_like_aadhaar_name(candidate_name):
            return candidate_name
    return "Not found"
|