Spaces:
Sleeping
Sleeping
File size: 6,673 Bytes
7147400 c70099c 7147400 482b26a 7147400 c70099c 482b26a ae2e698 482b26a 7147400 482b26a 7147400 482b26a 7147400 a8683a1 482b26a 8324e53 7147400 8324e53 ae2e698 8324e53 ae2e698 254fdf9 b07dfbb 254fdf9 7147400 254fdf9 7147400 254fdf9 a726fb2 482b26a a726fb2 2c3e33d 482b26a 254fdf9 482b26a 254fdf9 482b26a 254fdf9 a8683a1 482b26a 254fdf9 482b26a 7147400 254fdf9 7147400 254fdf9 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 65bef46 7147400 482b26a 7147400 482b26a 7147400 482b26a 7147400 482b26a 7147400 482b26a 7147400 482b26a eee95a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
import os
import re
from datetime import datetime
from typing import Optional, Dict, Any
from simple_salesforce import Salesforce
from paddleocr import PaddleOCR
# ------------------------------
# OCR setup (stable on small CPU)
# ------------------------------
# Default OMP_NUM_THREADS to "1" unless the environment already defines it.
os.environ.setdefault("OMP_NUM_THREADS", "1")
# Single module-level OCR engine, built once at import time and reused by
# extract_kyc_fields(). Angle classification is enabled; language is English.
_ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Regexes
# PAN number: five uppercase letters, four digits, one uppercase letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar number: twelve digits in three groups of four, each pair of groups
# optionally separated by one whitespace character or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# Date-of-birth patterns. The order matters to the helpers in this file:
# _extract_dob() treats every entry except the last as a full date and the
# last entry as a bare year that is only accepted next to a birth label.
DOB_REGEXES = [
# NN<sep>NN<sep>NNNN with '.', '/' or '-' as separator (presumably DD/MM/YYYY).
r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
# NNNN-NN-NN (presumably ISO-style YYYY-MM-DD).
r'\b\d{4}-\d{2}-\d{2}\b',
# NN<sep>MON<sep>NNNN with a three-letter English month abbreviation.
r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
# Bare four-digit year starting with 19 or 20.
r'\b(19|20)\d{2}\b'
]
# ------------------------------
# OCR helpers
# ------------------------------
def extract_kyc_fields(file_path: str, force_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract KYC fields from an Aadhaar or PAN card image.

    Args:
        file_path: Path of the image; passed unchanged to the OCR engine.
        force_type: Optional card type ("PAN" or "AADHAAR", any case,
            surrounding whitespace ignored). When omitted, the type is
            detected from the OCR text: a PAN-number match wins, then an
            Aadhaar-number match, otherwise the type is "UNKNOWN".

    Returns:
        A dict that always contains "card_type". On the normal path it also
        contains "name" and "dob", plus "pan_number" or "aadhaar_number" for
        the matching card type; anything that could not be read is the
        string "Not found". A document that is neither PAN nor Aadhaar also
        carries an "error" message. If OCR or parsing raises, the result is
        {"error": "OCR processing failed: ...", "card_type": "UNKNOWN"}.
    """
    try:
        result = _ocr.ocr(file_path, cls=True)

        lines = []
        # PaddleOCR can report a page with no detected text as None instead
        # of an empty list (confirm against the pinned version); skip such
        # pages so a blank image yields "Not found" values rather than a
        # misleading "OCR processing failed" error.
        for block in result or []:
            if not block:
                continue
            for line in block:
                # line[1][0] is the recognised text; collapse runs of whitespace.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        if force_type:
            # Strip so that values such as " pan" still select the PAN branch.
            card_type = force_type.strip().upper()
        else:
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        out = {"card_type": card_type}
        if card_type == "PAN":
            out["pan_number"] = _first(PAN_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            out["aadhaar_number"] = _first(AADHAAR_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_aadhaar_name(lines)
        else:
            out["name"] = _extract_generic_name(lines)
            out["dob"] = _extract_dob(lines)
            out["error"] = "Could not identify document as PAN or Aadhaar."
        return out
    except Exception as e:
        # Deliberate catch-all boundary: callers get an error dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}", "card_type": "UNKNOWN"}
def _first(pattern: str, text: str, flags: int = 0) -> Optional[str]:
m = re.search(pattern, text, flags)
return m.group(0) if m else None
def _extract_dob(lines):
    """
    Find a date of birth in the OCR lines.

    First pass: return the first full-date match (every DOB_REGEXES entry
    except the last, case-insensitive), scanning line by line.
    Second pass: return a bare year (last DOB_REGEXES entry), but only from
    a line that also mentions YOB / YEAR / BIRTH / DOB.
    Returns "Not found" when neither pass matches.
    """
    full_date_patterns = DOB_REGEXES[:-1]
    for text in lines:
        for date_pattern in full_date_patterns:
            hit = re.search(date_pattern, text, re.IGNORECASE)
            if hit:
                return hit.group(0)

    birth_labels = ("YOB", "YEAR", "BIRTH", "DOB")
    for text in lines:
        year_hit = re.search(DOB_REGEXES[-1], text)
        if not year_hit:
            continue
        upper_text = text.upper()
        # A bare year is too ambiguous on its own; require a birth label on the same line.
        if any(label in upper_text for label in birth_labels):
            return year_hit.group(0)

    return "Not found"
def _extract_pan_name(lines):
for i, line in enumerate(lines):
if "INCOME TAX DEPARTMENT" in line.upper():
for j in range(i + 1, len(lines)):
candidate = lines[j].strip()
if re.match(r'^[A-Z\s.]+$', candidate) and not re.search(r'\d', candidate):
if not any(x in candidate.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"]):
return candidate
return "Not found"
def _extract_aadhaar_name(lines):
    """
    Pick the holder's name from Aadhaar card OCR lines.

    Preferred: the line directly above the first line that matches any
    DOB_REGEXES pattern, provided it passes _looks_like_name().
    Fallback: the first line anywhere that passes _looks_like_name().
    Returns "Not found" when nothing qualifies.
    """
    # Start at 1: a date on the very first line has no line above it to inspect.
    for pos in range(1, len(lines)):
        has_date = False
        for date_pattern in DOB_REGEXES:
            if re.search(date_pattern, lines[pos], re.IGNORECASE):
                has_date = True
                break
        if not has_date:
            continue
        above = lines[pos - 1].strip()
        if _looks_like_name(above):
            return above

    stripped_lines = (raw.strip() for raw in lines)
    return next((s for s in stripped_lines if _looks_like_name(s)), "Not found")
def _extract_generic_name(lines):
    """
    Return the first OCR line (stripped) that passes _looks_like_name(),
    or "Not found" when no line does.
    """
    stripped_lines = (raw.strip() for raw in lines)
    return next((s for s in stripped_lines if _looks_like_name(s)), "Not found")
def _looks_like_name(text: str) -> bool:
if re.search(r'\d', text):
return False
if len(text.split()) < 2:
return False
banned = ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"]
return not any(b in text.upper() for b in banned)
# ------------------------------
# Salesforce helpers
# ------------------------------
def connect_salesforce(
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    domain: Optional[str] = None,
):
    """
    Open a Salesforce session.

    Each argument takes precedence over its environment variable
    (SF_USERNAME, SF_PASSWORD, SF_TOKEN, SF_DOMAIN). The domain defaults to
    "login" when it is missing or blank.

    Returns the Salesforce client. Raises RuntimeError when username,
    password or token is missing; errors raised by the Salesforce
    constructor itself propagate unchanged.
    """
    def _resolve(explicit, env_name, default=""):
        # An explicit (truthy) argument wins; otherwise read the environment.
        return explicit or os.getenv(env_name, default)

    sf_username = _resolve(username, "SF_USERNAME")
    sf_password = _resolve(password, "SF_PASSWORD")
    sf_token = _resolve(token, "SF_TOKEN")

    raw_domain = _resolve(domain, "SF_DOMAIN", "login")
    sf_domain = raw_domain.strip() or "login"

    if not sf_username or not sf_password or not sf_token:
        raise RuntimeError(
            "Missing Salesforce credentials. Provide username/password/token (via UI or env vars)."
        )

    # domain: "login"=prod, "test"=sandbox
    client = Salesforce(
        username=sf_username,
        password=sf_password,
        security_token=sf_token,
        domain=sf_domain,
    )
    return client
def create_kyc_record(
    sf,
    aadhaar: Optional[Dict[str, Any]],
    pan: Optional[Dict[str, Any]],
    agent_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Create one KYC_Record__c from the Aadhaar and PAN OCR results.

    Fields written (exact API names):
      Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c,
      Pan_Name__c, Pan_DOB__c, PAN_Number__c
    plus Agent__c when `agent_id` is given.

    Missing inputs, empty values and the "Not found" placeholder are all
    sent as empty strings.

    Returns {"status": "success", "record_id": ..., "payload_sent": ...},
    or {"status": "error", "message": ...} if anything raises.
    """
    def _blank_if_missing(value):
        # Normalise falsy values and the OCR placeholder to an empty string.
        if value and value != "Not found":
            return value
        return ""

    # (Salesforce field, source OCR dict, key inside that dict)
    field_sources = (
        ("Aadhaar_Name__c", aadhaar, "name"),
        ("Aadhaar_DOB__c", aadhaar, "dob"),
        ("Aadhaar_Number__c", aadhaar, "aadhaar_number"),
        ("Pan_Name__c", pan, "name"),
        ("Pan_DOB__c", pan, "dob"),
        ("PAN_Number__c", pan, "pan_number"),
    )

    try:
        payload = {}
        for api_name, source, key in field_sources:
            raw_value = source.get(key) if source else ""
            payload[api_name] = _blank_if_missing(raw_value)

        if agent_id:
            payload["Agent__c"] = agent_id  # only if you have this field in org

        response = sf.KYC_Record__c.create(payload)
        return {
            "status": "success",
            "record_id": response.get("id"),
            "payload_sent": payload,
        }
    except Exception as exc:
        return {"status": "error", "message": str(exc)}
|