File size: 6,673 Bytes
7147400
c70099c
7147400
482b26a
7147400
 
c70099c
482b26a
 
 
 
 
ae2e698
482b26a
7147400
 
 
 
 
 
 
 
 
482b26a
 
 
 
7147400
482b26a
 
 
 
7147400
a8683a1
482b26a
8324e53
 
 
7147400
8324e53
 
ae2e698
8324e53
ae2e698
254fdf9
 
b07dfbb
254fdf9
7147400
254fdf9
7147400
254fdf9
a726fb2
482b26a
a726fb2
2c3e33d
482b26a
 
 
254fdf9
482b26a
 
 
254fdf9
482b26a
 
 
 
254fdf9
a8683a1
482b26a
254fdf9
482b26a
7147400
 
254fdf9
7147400
254fdf9
7147400
 
 
 
65bef46
7147400
 
 
65bef46
 
7147400
 
 
65bef46
7147400
 
 
 
65bef46
 
7147400
65bef46
7147400
 
 
 
 
 
 
 
 
 
 
65bef46
7147400
65bef46
 
7147400
482b26a
7147400
 
 
 
 
 
 
482b26a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7147400
 
482b26a
 
 
 
 
 
 
 
 
 
 
 
 
7147400
482b26a
 
 
 
 
 
 
 
 
7147400
 
482b26a
 
 
 
7147400
 
482b26a
 
eee95a6
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os
import re
from datetime import datetime
from typing import Optional, Dict, Any
from simple_salesforce import Salesforce
from paddleocr import PaddleOCR

# ------------------------------
# OCR setup (stable on small CPU)
# ------------------------------
# Default OMP_NUM_THREADS to "1" unless the environment already defines it.
# NOTE(review): set before the OCR engine below is constructed, presumably so
# the native thread pools pick it up -- confirm before reordering.
os.environ.setdefault("OMP_NUM_THREADS", "1")
# Single module-level OCR engine, built once at import time and reused by
# extract_kyc_fields().
_ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Regexes
# PAN: five uppercase letters, four digits, one uppercase letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar: three groups of four digits, each optionally separated by one
# whitespace character or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# Date-of-birth patterns. The LAST entry is a bare year and is treated
# specially by _extract_dob(), which indexes it as DOB_REGEXES[-1]; keep it last.
DOB_REGEXES = [
    # 2 digits, 2 digits, 4 digits separated by '.', '/' or '-'
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
    # 4 digits - 2 digits - 2 digits (ISO-style)
    r'\b\d{4}-\d{2}-\d{2}\b',
    # 2 digits, three-letter English month abbreviation, 4 digits
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
    # Bare four-digit year starting with 19 or 20
    r'\b(19|20)\d{2}\b'
]

# ------------------------------
# OCR helpers
# ------------------------------
def extract_kyc_fields(file_path: str, force_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract KYC fields from an Aadhaar or PAN card image.

    Args:
        file_path: Path of the image handed to the OCR engine.
        force_type: Optional card type ("PAN" or "AADHAAR", any case,
            surrounding whitespace ignored). When empty or None the type is
            auto-detected from the recognised text, PAN being checked first.

    Returns:
        A dict that always contains ``card_type``. On success it also holds
        ``name`` and ``dob`` plus ``pan_number`` or ``aadhaar_number`` for the
        matching card type; missing values are the string "Not found".
        When the card type cannot be determined, or OCR raises, an ``error``
        key describes the problem. This function does not raise.
    """
    try:
        result = _ocr.ocr(file_path, cls=True)
        lines = []
        # Some PaddleOCR versions return None / [None] when no text is
        # detected; skip those entries instead of crashing on iteration and
        # misreporting a blank image as an OCR failure.
        for block in result or []:
            if not block:
                continue
            for line in block:
                # line[1][0] is the recognised text; collapse whitespace runs.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)

        full_text = "\n".join(lines)

        # Normalise the caller-supplied type so "pan " or " Aadhaar" still
        # select the right branch; a blank value falls back to detection.
        forced = (force_type or "").strip().upper()
        if forced:
            card_type = forced
        else:
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        out = {"card_type": card_type}

        if card_type == "PAN":
            out["pan_number"] = _first(PAN_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            out["aadhaar_number"] = _first(AADHAAR_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_aadhaar_name(lines)
        else:
            out["name"] = _extract_generic_name(lines)
            out["dob"] = _extract_dob(lines)
            out["error"] = "Could not identify document as PAN or Aadhaar."
        return out

    except Exception as e:
        # Deliberate boundary: callers receive an error dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}", "card_type": "UNKNOWN"}

def _first(pattern: str, text: str, flags: int = 0) -> Optional[str]:
    m = re.search(pattern, text, flags)
    return m.group(0) if m else None

def _extract_dob(lines):
    """
    Return the first date-like string found in *lines*, or "Not found".

    Pass 1 tries every pattern in DOB_REGEXES except the last one
    (case-insensitively), line by line. Pass 2 tries the last pattern (a bare
    year) and accepts it only when the same line carries a birth-related label.
    """
    full_date_patterns = DOB_REGEXES[:-1]
    for text in lines:
        for pattern in full_date_patterns:
            hit = re.search(pattern, text, re.IGNORECASE)
            if hit:
                return hit.group(0)

    birth_labels = ["YOB", "YEAR", "BIRTH", "DOB"]
    for text in lines:
        hit = re.search(DOB_REGEXES[-1], text)
        if not hit:
            continue
        upper_text = text.upper()
        # A lone year is too ambiguous without a label on the same line.
        if any(label in upper_text for label in birth_labels):
            return hit.group(0)

    return "Not found"

def _extract_pan_name(lines):
    for i, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" in line.upper():
            for j in range(i + 1, len(lines)):
                candidate = lines[j].strip()
                if re.match(r'^[A-Z\s.]+$', candidate) and not re.search(r'\d', candidate):
                    if not any(x in candidate.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"]):
                        return candidate
    return "Not found"

def _extract_aadhaar_name(lines):
    """
    Return the holder name from Aadhaar card OCR lines, or "Not found".

    First preference: the line immediately above any line matching one of
    DOB_REGEXES, provided it looks like a name. Otherwise the first line in
    the document that looks like a name.
    """
    # Walk (previous, current) pairs; the first line has no predecessor.
    for previous, current in zip(lines, lines[1:]):
        has_date = any(re.search(p, current, re.IGNORECASE) for p in DOB_REGEXES)
        if not has_date:
            continue
        candidate = previous.strip()
        if _looks_like_name(candidate):
            return candidate

    for raw in lines:
        stripped = raw.strip()
        if _looks_like_name(stripped):
            return stripped

    return "Not found"

def _extract_generic_name(lines):
    """Return the first stripped line that looks like a name, or "Not found"."""
    stripped_lines = (raw.strip() for raw in lines)
    return next(
        (text for text in stripped_lines if _looks_like_name(text)),
        "Not found",
    )

def _looks_like_name(text: str) -> bool:
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False
    banned = ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"]
    return not any(b in text.upper() for b in banned)

# ------------------------------
# Salesforce helpers
# ------------------------------
def connect_salesforce(
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    domain: Optional[str] = None,
):
    """
    Open a Salesforce session.

    Each argument takes precedence over its environment variable
    (SF_USERNAME, SF_PASSWORD, SF_TOKEN, SF_DOMAIN). The domain defaults to
    "login" when neither source supplies a non-blank value.

    Raises:
        RuntimeError: if username, password or token is still empty after the
            environment fallback. Errors from the Salesforce client itself
            propagate unchanged.
    """
    credentials = {
        "username": username or os.getenv("SF_USERNAME", ""),
        "password": password or os.getenv("SF_PASSWORD", ""),
        "security_token": token or os.getenv("SF_TOKEN", ""),
    }

    resolved_domain = (domain or os.getenv("SF_DOMAIN", "login")).strip()
    if not resolved_domain:
        resolved_domain = "login"

    if not all(credentials.values()):
        raise RuntimeError(
            "Missing Salesforce credentials. Provide username/password/token (via UI or env vars)."
        )

    # domain: "login"=prod, "test"=sandbox
    return Salesforce(domain=resolved_domain, **credentials)

def create_kyc_record(
    sf,
    aadhaar: Optional[Dict[str, Any]],
    pan: Optional[Dict[str, Any]],
    agent_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Insert one KYC_Record__c built from the Aadhaar and PAN OCR results.

    Fields written (exact API names):

      Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c
      Pan_Name__c, Pan_DOB__c, PAN_Number__c
      Agent__c (only when agent_id is truthy)

    Missing documents, missing keys and the placeholder "Not found" are all
    sent as empty strings.

    Returns:
        {"status": "success", "record_id": ..., "payload_sent": ...} on
        success, or {"status": "error", "message": ...} if anything raises.
    """
    def pick(document, key):
        # Blank out absent documents, falsy values and the OCR placeholder.
        value = document.get(key) if document else ""
        return value if value and value != "Not found" else ""

    try:
        payload = {
            "Aadhaar_Name__c":   pick(aadhaar, "name"),
            "Aadhaar_DOB__c":    pick(aadhaar, "dob"),
            "Aadhaar_Number__c": pick(aadhaar, "aadhaar_number"),
            "Pan_Name__c":       pick(pan, "name"),
            "Pan_DOB__c":        pick(pan, "dob"),
            "PAN_Number__c":     pick(pan, "pan_number"),
        }

        # only if you have this field in org
        if agent_id:
            payload["Agent__c"] = agent_id

        created = sf.KYC_Record__c.create(payload)
        return {
            "status": "success",
            "record_id": created.get("id"),
            "payload_sent": payload,
        }
    except Exception as exc:
        return {"status": "error", "message": str(exc)}