File size: 4,126 Bytes
ae2e698
c70099c
 
254fdf9
5ebcb93
ae2e698
5968a12
 
 
 
 
 
 
 
 
 
 
 
254fdf9
a8683a1
ae2e698
8324e53
5968a12
 
8324e53
 
5968a12
8324e53
 
ae2e698
8324e53
ae2e698
5968a12
254fdf9
 
b07dfbb
5968a12
 
254fdf9
 
 
 
 
a726fb2
 
 
2c3e33d
5968a12
 
65bef46
 
2c3e33d
254fdf9
5968a12
 
65bef46
 
 
a726fb2
254fdf9
 
 
a726fb2
5968a12
a8683a1
8324e53
254fdf9
 
 
 
5968a12
 
 
 
 
65bef46
 
 
 
5968a12
 
 
65bef46
 
 
 
5968a12
 
 
65bef46
5968a12
 
 
 
 
65bef46
 
 
 
5968a12
65bef46
5968a12
 
 
 
 
 
 
65bef46
5968a12
 
 
 
65bef46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from paddleocr import PaddleOCR
import re

# Initialize OCR
# A single module-level PaddleOCR instance, created at import time with
# angle classification enabled and the English language model selected.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Helper regex patterns
# PAN: five uppercase letters, four digits, one uppercase letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar: twelve digits in three groups of four; each group may be separated
# by one whitespace character or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# Date-of-birth patterns, ordered from most specific to least specific.
# The final year-only pattern is very permissive: it matches any standalone
# four-digit number starting with 19 or 20.
DOB_REGEX = [
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',  # 12/04/1980 or 12-04-1980
    r'\b\d{4}-\d{2}-\d{2}\b',          # 1980-04-12
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',  # 12-APR-1980
    r'\b(19|20)\d{2}\b'                # Year only
]

# Uppercase gender labels looked up in the OCR text.
# NOTE: "MALE" is a substring of "FEMALE", so plain substring matching must
# not test "MALE" before "FEMALE".
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]

def extract_kyc_fields(file_path, force_type=None):
    """Run OCR on a document image and extract PAN or Aadhaar KYC fields.

    Args:
        file_path: Image input handed straight to ``ocr.ocr``.
        force_type: Optional card type ("PAN" or "AADHAAR", any case). When
            given, auto-detection is skipped and this type is used.

    Returns:
        A dict that always contains ``card_type`` on success, plus:
          * PAN: ``pan_number``, ``dob``, ``name``
          * AADHAAR: ``aadhaar_number``, ``dob``, ``gender``, ``name``
          * anything else: ``error``
        Fields that cannot be located hold the string "Not found".
        If any exception is raised while processing, a dict containing only
        ``error`` is returned instead of propagating the exception.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        # Normalize OCR text.
        # PaddleOCR reports a page with no detected text as None rather than
        # an empty list, so guard both levels before iterating.
        lines = []
        for block in result or []:
            for line in block or []:
                # assumes each entry is [box, (text, confidence)] -- TODO confirm
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)

        full_text = "\n".join(lines)

        # Search once; the matches serve both type detection and extraction.
        pan_match = re.search(PAN_REGEX, full_text)
        aadhaar_match = re.search(AADHAAR_REGEX, full_text)

        # Determine card type (a PAN match takes priority over Aadhaar).
        if force_type:
            card_type = force_type.upper()
        elif pan_match:
            card_type = "PAN"
        elif aadhaar_match:
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}

        if card_type == "PAN":
            response["pan_number"] = pan_match.group(0) if pan_match else "Not found"
            response["dob"] = extract_dob(lines)
            response["name"] = extract_pan_name(lines)

        elif card_type == "AADHAAR":
            response["aadhaar_number"] = aadhaar_match.group(0) if aadhaar_match else "Not found"
            response["dob"] = extract_dob(lines)
            response["gender"] = extract_gender(lines)
            response["name"] = extract_aadhaar_name(lines)

        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."

        return response

    except Exception as e:
        # Deliberate catch-all boundary: callers receive an error dict
        # instead of an exception.
        return {"error": f"OCR processing failed: {str(e)}"}


def extract_dob(lines, patterns=None):
    """Return the best date-of-birth match found in the OCR lines.

    Each pattern is tried against every line before the next pattern is
    considered, so a more specific pattern (a full date) anywhere in the text
    wins over a looser one (a bare year) that happens to appear on an earlier
    line.

    Args:
        lines: Iterable of OCR text lines.
        patterns: Optional sequence of regex strings ordered from most to
            least preferred. Defaults to the module-level ``DOB_REGEX``.

    Returns:
        The matched text, or "Not found" when no pattern matches any line.
    """
    if patterns is None:
        patterns = DOB_REGEX
    # Materialize once: the lines are scanned once per pattern.
    lines = list(lines)
    for pattern in patterns:
        for line in lines:
            match = re.search(pattern, line, re.IGNORECASE)
            if match:
                return match.group(0)
    return "Not found"


def extract_gender(lines, genders=None):
    """Return the gender label found in the first OCR line that contains one.

    Matching is a case-insensitive substring test. Labels are tried longest
    first so that a label embedded in a longer one (e.g. "MALE" inside
    "FEMALE") cannot shadow the longer label.

    Args:
        lines: Iterable of OCR text lines.
        genders: Optional sequence of uppercase labels to look for. Defaults
            to the module-level ``GENDERS``.

    Returns:
        The matching label exactly as listed in ``genders``, or "Not found".
    """
    if genders is None:
        genders = GENDERS
    # Longest label first; sorted() is stable, so ties keep their given order.
    ordered = sorted(genders, key=len, reverse=True)
    for line in lines:
        upper_line = line.upper()
        for gender in ordered:
            if gender in upper_line:
                return gender
    return "Not found"


def extract_pan_name(lines):
    """Pick the card holder's name from the OCR lines of a PAN card.

    Heuristic: once a line containing "INCOME TAX DEPARTMENT" is seen
    (case-insensitively), the first later line made up solely of uppercase
    letters, whitespace and dots is taken as the name, unless it contains one
    of the header words INDIA, GOVT or DEPARTMENT.

    Returns:
        The stripped name line, or "Not found" when no line qualifies.
    """
    header_words = ["INDIA", "GOVT", "DEPARTMENT"]

    for position, text in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in text.upper():
            continue

        for following in lines[position + 1:]:
            name = following.strip()

            # Only uppercase letters, whitespace and dots are allowed.
            if re.search(r'\d', name) or not re.match(r'^[A-Z\s.]+$', name):
                continue

            # Skip words like INDIA, GOVT, DEPARTMENT
            uppercased = name.upper()
            if any(word in uppercased for word in header_words):
                continue

            return name

    return "Not found"


def _is_plausible_aadhaar_name(text):
    """Return True if *text* has 2+ words, no digits and no card boilerplate word."""
    if len(text.split()) < 2 or re.search(r'\d', text):
        return False
    # Whole-word match: a plain substring test would reject real names that
    # merely contain a keyword (e.g. "KAMALESH" contains "MALE").
    return not re.search(r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b', text.upper())


def extract_aadhaar_name(lines, dob_patterns=None):
    """Pick the card holder's name from the OCR lines of an Aadhaar card.

    Heuristic: the name is usually the line directly above the date of birth.
    If no such line qualifies, fall back to the first line in the text that
    looks like a name (two or more words, no digits, no boilerplate word such
    as DOB, INDIA, MALE, FEMALE or GOVERNMENT).

    Args:
        lines: Iterable of OCR text lines.
        dob_patterns: Optional sequence of regex strings used to recognise a
            date-of-birth line. Defaults to the module-level ``DOB_REGEX``.

    Returns:
        The stripped name line, or "Not found" when no line qualifies.
    """
    if dob_patterns is None:
        dob_patterns = DOB_REGEX
    lines = list(lines)

    # Primary: the line immediately above a DOB line. Matching is
    # case-insensitive, consistent with extract_dob.
    for above, line in zip(lines, lines[1:]):
        if any(re.search(p, line, re.IGNORECASE) for p in dob_patterns):
            candidate_name = above.strip()
            if _is_plausible_aadhaar_name(candidate_name):
                return candidate_name

    # Fallback: first line with >=2 words and no digits
    for line in lines:
        candidate_name = line.strip()
        if _is_plausible_aadhaar_name(candidate_name):
            return candidate_name

    return "Not found"