# SMART_KYC_OCR / utils.py
# gopichandra's picture
# Update utils.py
# eee95a6 verified
import os
import re
from datetime import datetime
from typing import Optional, Dict, Any
from simple_salesforce import Salesforce
from paddleocr import PaddleOCR
# ------------------------------
# OCR engine (module-level singleton)
# ------------------------------
# Default OpenMP to a single thread unless the environment already chose a
# value (the original author's note: "stable on small CPU").
# NOTE(review): this runs after `paddleocr` has already been imported at the
# top of the file -- confirm the setting is still honored at that point.
if "OMP_NUM_THREADS" not in os.environ:
    os.environ["OMP_NUM_THREADS"] = "1"

# One shared English PaddleOCR instance with angle classification enabled.
_ocr = PaddleOCR(lang='en', use_angle_cls=True)
# Regexes (kept as plain pattern strings so callers can pass their own flags)

# PAN: five capital letters, four digits, one capital letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'

# Aadhaar: three groups of four digits, each optionally separated by one
# whitespace character or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'

# Two digits, two digits, four digits, separated by '.', '/' or '-'.
_DOB_NUMERIC = r'\b\d{2}[./-]\d{2}[./-]\d{4}\b'
# Four digits, two digits, two digits, separated by '-'.
_DOB_ISO = r'\b\d{4}-\d{2}-\d{2}\b'
# Two digits, three-letter month abbreviation, four digits.
_DOB_MONTH_ABBREV = r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b'
# A bare four-digit year starting with 19 or 20.
_DOB_YEAR_ONLY = r'\b(19|20)\d{2}\b'

# Order matters: the year-only pattern must stay LAST, because the DOB
# extractor treats the final entry separately from the full-date patterns.
DOB_REGEXES = [
    _DOB_NUMERIC,
    _DOB_ISO,
    _DOB_MONTH_ABBREV,
    _DOB_YEAR_ONLY,
]
# ------------------------------
# OCR helpers
# ------------------------------
def extract_kyc_fields(file_path: str, force_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts KYC fields from an Aadhaar or PAN image.

    Args:
        file_path: Image path handed straight to the OCR engine.
        force_type: Optional card type. When truthy it is stripped,
            upper-cased and used instead of auto-detection.

    Returns:
        A dict with keys:
            card_type, name, dob, aadhaar_number, pan_number
        (number keys present only when relevant). Values that could not be
        extracted are the string "Not found". An "error" key is added when
        the document is neither PAN nor Aadhaar, or when OCR processing
        raises; in the latter case only "error" and "card_type" are present.
    """
    try:
        result = _ocr.ocr(file_path, cls=True)

        lines = []
        # PaddleOCR can report a page with no detected text as None (and the
        # overall result may itself be empty). Skip those instead of letting
        # the iteration raise and be misreported as an OCR failure.
        for block in result or []:
            if not block:
                continue
            for line in block:
                # line[1][0] holds the recognized text; collapse runs of
                # whitespace so the regexes see single spaces.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        if force_type:
            # Tolerate stray whitespace from callers (e.g. UI input).
            card_type = force_type.strip().upper()
        elif re.search(PAN_REGEX, full_text):
            # PAN is checked first; Aadhaar is only considered if no PAN
            # number is present anywhere in the text.
            card_type = "PAN"
        elif re.search(AADHAAR_REGEX, full_text):
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        out = {"card_type": card_type}
        if card_type == "PAN":
            out["pan_number"] = _first(PAN_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            out["aadhaar_number"] = _first(AADHAAR_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_aadhaar_name(lines)
        else:
            # Unknown (or unrecognized forced) type: still return best-effort
            # name/DOB, flagged with an error message.
            out["name"] = _extract_generic_name(lines)
            out["dob"] = _extract_dob(lines)
            out["error"] = "Could not identify document as PAN or Aadhaar."
        return out
    except Exception as e:
        # Boundary handler: callers get an error dict rather than an exception.
        return {"error": f"OCR processing failed: {str(e)}", "card_type": "UNKNOWN"}
def _first(pattern: str, text: str, flags: int = 0) -> Optional[str]:
    """Return the text of the first match of `pattern` in `text`, or None if nothing matches."""
    match = re.search(pattern, text, flags)
    if match is None:
        return None
    return match.group(0)
def _extract_dob(lines):
    """
    Find a date of birth in the OCR text lines.

    First pass: return the first match of any full-date pattern (every entry
    of DOB_REGEXES except the last), matched case-insensitively, scanning
    lines in order. Second pass: accept a match of the last pattern only on
    a line that also contains one of the labels YOB / YEAR / BIRTH / DOB.
    Returns the matched text, or "Not found".
    """
    full_date_patterns = DOB_REGEXES[:-1]
    full_date_hits = (
        re.search(date_pattern, text, re.IGNORECASE)
        for text in lines
        for date_pattern in full_date_patterns
    )
    first_full_date = next((hit for hit in full_date_hits if hit), None)
    if first_full_date is not None:
        return first_full_date.group(0)

    birth_labels = ("YOB", "YEAR", "BIRTH", "DOB")
    for text in lines:
        year_hit = re.search(DOB_REGEXES[-1], text)
        if year_hit is None:
            continue
        upper_text = text.upper()
        if any(label in upper_text for label in birth_labels):
            return year_hit.group(0)
    return "Not found"
def _extract_pan_name(lines):
    """
    Pick the holder name from PAN card OCR lines.

    Looks for a line containing "INCOME TAX DEPARTMENT" (case-insensitive)
    and returns the first later line that consists only of capital letters,
    whitespace and dots, and does not contain INDIA, GOVT or DEPARTMENT.
    Returns "Not found" when there is no header or no such line.
    """
    header_index = next(
        (idx for idx, text in enumerate(lines) if "INCOME TAX DEPARTMENT" in text.upper()),
        None,
    )
    if header_index is None:
        return "Not found"

    # Scanning after the FIRST header is sufficient: the lines following any
    # later header are a subset of the lines examined here.
    header_words = ("INDIA", "GOVT", "DEPARTMENT")
    for raw in lines[header_index + 1:]:
        candidate = raw.strip()
        if not re.match(r'^[A-Z\s.]+$', candidate):
            continue
        if re.search(r'\d', candidate):
            continue
        upper_candidate = candidate.upper()
        if any(word in upper_candidate for word in header_words):
            continue
        return candidate
    return "Not found"
def _extract_aadhaar_name(lines):
    """
    Pick the holder name from Aadhaar card OCR lines.

    Preferred: the line directly above the first line matching any
    DOB_REGEXES pattern (case-insensitive), provided that line passes
    `_looks_like_name`. Fallback: the first line anywhere that passes
    `_looks_like_name`. Returns "Not found" otherwise.
    """
    # Pair each line with its predecessor; the very first line has none and
    # therefore can never act as the "date" line.
    for previous, current in zip(lines, lines[1:]):
        has_date = any(re.search(pattern, current, re.IGNORECASE) for pattern in DOB_REGEXES)
        if not has_date:
            continue
        above = previous.strip()
        if _looks_like_name(above):
            return above

    stripped_lines = (text.strip() for text in lines)
    return next((text for text in stripped_lines if _looks_like_name(text)), "Not found")
def _extract_generic_name(lines):
    """Return the first stripped line that passes `_looks_like_name`, or "Not found"."""
    stripped_lines = (text.strip() for text in lines)
    return next((text for text in stripped_lines if _looks_like_name(text)), "Not found")
def _looks_like_name(text: str) -> bool:
    """
    Heuristically decide whether an OCR line could be a person's name.

    A line qualifies when it contains no digit, has at least two
    whitespace-separated words, and contains none of the card labels
    DOB / INDIA / MALE / FEMALE / GOVERNMENT as a whole word
    (case-insensitive).

    Args:
        text: A single OCR text line.

    Returns:
        True if the line passes all three checks, else False.
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False
    # Match the labels as whole words only. A plain substring test rejects
    # genuine names that merely contain a label, e.g. "KAMALESH" (MALE) or
    # "DOBRIYAL" (DOB).
    banned = r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b'
    return re.search(banned, text, re.IGNORECASE) is None
# ------------------------------
# Salesforce helpers
# ------------------------------
def connect_salesforce(
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    domain: Optional[str] = None,
):
    """
    Open a Salesforce session.

    Each argument falls back to an environment variable when it is falsy:
    SF_USERNAME, SF_PASSWORD, SF_TOKEN and SF_DOMAIN. The domain is stripped
    and defaults to "login" when missing or blank.

    Returns:
        The client object built by `Salesforce(...)`.

    Raises:
        RuntimeError: if username, password or token is still empty after the
            environment fallback. Errors raised by `Salesforce(...)` itself
            propagate unchanged.
    """
    sf_username = username if username else os.getenv("SF_USERNAME", "")
    sf_password = password if password else os.getenv("SF_PASSWORD", "")
    sf_token = token if token else os.getenv("SF_TOKEN", "")

    raw_domain = domain if domain else os.getenv("SF_DOMAIN", "login")
    sf_domain = raw_domain.strip()
    if not sf_domain:
        sf_domain = "login"

    if not all((sf_username, sf_password, sf_token)):
        raise RuntimeError(
            "Missing Salesforce credentials. Provide username/password/token (via UI or env vars)."
        )

    login_kwargs = {
        "username": sf_username,
        "password": sf_password,
        "security_token": sf_token,
        "domain": sf_domain,  # "login"=prod, "test"=sandbox
    }
    return Salesforce(**login_kwargs)
def create_kyc_record(
    sf,
    aadhaar: Optional[Dict[str, Any]],
    pan: Optional[Dict[str, Any]],
    agent_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Build one KYC_Record__c payload from the Aadhaar and PAN OCR dicts and
    create it through `sf.KYC_Record__c.create`.

    Fields written (exact API names):
        Aadhaar_Name__c
        Aadhaar_DOB__c
        Aadhaar_Number__c
        Pan_Name__c
        Pan_DOB__c
        PAN_Number__c
    plus Agent__c when `agent_id` is truthy.

    Returns:
        {"status": "success", "record_id": ..., "payload_sent": ...} on
        success, or {"status": "error", "message": ...} if anything raises.
    """
    try:
        def field_value(ocr_result, key):
            # A missing OCR dict, a missing/empty value and the "Not found"
            # placeholder are all sent as an empty string.
            value = ocr_result.get(key) if ocr_result else ""
            if value and value != "Not found":
                return value
            return ""

        record = {
            "Aadhaar_Name__c": field_value(aadhaar, "name"),
            "Aadhaar_DOB__c": field_value(aadhaar, "dob"),
            "Aadhaar_Number__c": field_value(aadhaar, "aadhaar_number"),
            "Pan_Name__c": field_value(pan, "name"),
            "Pan_DOB__c": field_value(pan, "dob"),
            "PAN_Number__c": field_value(pan, "pan_number"),
        }
        if agent_id:
            record["Agent__c"] = agent_id  # only if you have this field in org

        response = sf.KYC_Record__c.create(record)
        return {
            "status": "success",
            "record_id": response.get("id"),
            "payload_sent": record,
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}