import os
import re
from datetime import datetime
from typing import Optional, Dict, Any

from simple_salesforce import Salesforce

# ------------------------------
# OCR setup (stable on small CPU)
# ------------------------------
# The thread cap is set *before* paddleocr is imported: OpenMP runtimes read
# OMP_NUM_THREADS when they initialise, so setting it after the native
# libraries have been loaded may have no effect.
os.environ.setdefault("OMP_NUM_THREADS", "1")

from paddleocr import PaddleOCR  # noqa: E402  (must follow the env setup above)

_ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Regexes
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
DOB_REGEXES = [
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
    r'\b\d{4}-\d{2}-\d{2}\b',
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
    # Bare year; only trusted when the line also carries a birth label
    # (see _extract_dob), so it must stay the LAST entry.
    r'\b(19|20)\d{2}\b'
]

# Tokens that mark a line as card boilerplate rather than a person's name.
# These are matched as substrings of the upper-cased line.
_BANNED_NAME_SUBSTRINGS = ("DOB", "INDIA", "GOVERNMENT")
# Gender labels are matched as whole words only, so that names which merely
# contain the letters (e.g. "Kamalesh") are not rejected.
_GENDER_LABEL_REGEX = r'\b(?:MALE|FEMALE)\b'


# ------------------------------
# OCR helpers
# ------------------------------
def extract_kyc_fields(file_path: str, force_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts KYC fields from an Aadhaar or PAN image.

    Args:
        file_path: Path of the image handed to the OCR engine.
        force_type: Optional card type ("PAN" / "AADHAAR", any case). When
            given, auto-detection is skipped and this value (upper-cased) is
            used as the card type.

    Returns a dict with keys:
      card_type, name, dob, aadhaar_number, pan_number
      (number keys present only when relevant)
    Fields that could not be located hold the string "Not found". An
    "error" key is added when the document type cannot be identified or
    when OCR processing raises; this function itself never raises.
    """
    try:
        result = _ocr.ocr(file_path, cls=True)

        lines = []
        # PaddleOCR may yield None (or an empty entry) for an image in which
        # no text was detected; treat that as "no lines" instead of letting
        # the iteration fail with a TypeError.
        for block in result or []:
            if not block:
                continue
            for line in block:
                # line[1][0] is the recognised text of one detection;
                # collapse internal whitespace runs to single spaces.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        if force_type:
            card_type = force_type.upper()
        else:
            # PAN is checked first; Aadhaar is only considered if no PAN
            # number pattern is present anywhere in the text.
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        out = {"card_type": card_type}

        if card_type == "PAN":
            out["pan_number"] = _first(PAN_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            out["aadhaar_number"] = _first(AADHAAR_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_aadhaar_name(lines)
        else:
            # Unknown (or unrecognised forced) type: best-effort extraction.
            out["name"] = _extract_generic_name(lines)
            out["dob"] = _extract_dob(lines)
            out["error"] = "Could not identify document as PAN or Aadhaar."

        return out
    except Exception as e:
        # Deliberate catch-all boundary: callers receive an error dict
        # rather than an exception from the OCR stack.
        return {"error": f"OCR processing failed: {str(e)}", "card_type": "UNKNOWN"}


def _first(pattern: str, text: str, flags: int = 0) -> Optional[str]:
    """Return the first full match of *pattern* in *text*, or None."""
    m = re.search(pattern, text, flags)
    return m.group(0) if m else None


def _extract_dob(lines):
    """
    Return the first date-of-birth-like string found in *lines*.

    Full date patterns (every DOB_REGEXES entry except the last) are tried
    on each line first, case-insensitively. Only if none match is the bare
    year pattern tried, and then only on lines that also contain a birth
    label (YOB / YEAR / BIRTH / DOB). Returns "Not found" otherwise.
    """
    for line in lines:
        for pattern in DOB_REGEXES[:-1]:
            m = re.search(pattern, line, re.IGNORECASE)
            if m:
                return m.group(0)

    for line in lines:
        m = re.search(DOB_REGEXES[-1], line)
        if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH", "DOB"]):
            return m.group(0)

    return "Not found"


def _extract_pan_name(lines):
    """
    Return the holder name from PAN card OCR lines, or "Not found".

    Locates the "INCOME TAX DEPARTMENT" header line and returns the first
    later line made up solely of upper-case letters, whitespace and dots
    that does not contain INDIA / GOVT / DEPARTMENT.
    """
    for i, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" in line.upper():
            for j in range(i + 1, len(lines)):
                candidate = lines[j].strip()
                # The character class already excludes digits, so no
                # separate digit check is needed.
                if re.match(r'^[A-Z\s.]+$', candidate):
                    if not any(x in candidate.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"]):
                        return candidate
    return "Not found"


def _extract_aadhaar_name(lines):
    """
    Return the holder name from Aadhaar card OCR lines, or "Not found".

    Preferred source is the line directly above a line matching any DOB
    pattern; if no such line looks like a name, the first name-like line
    anywhere in *lines* is used.
    """
    for i, line in enumerate(lines):
        if any(re.search(p, line, re.IGNORECASE) for p in DOB_REGEXES):
            if i > 0:
                candidate = lines[i - 1].strip()
                if _looks_like_name(candidate):
                    return candidate

    for line in lines:
        if _looks_like_name(line.strip()):
            return line.strip()
    return "Not found"


def _extract_generic_name(lines):
    """Return the first line that looks like a person's name, or "Not found"."""
    for line in lines:
        if _looks_like_name(line.strip()):
            return line.strip()
    return "Not found"


def _looks_like_name(text: str) -> bool:
    """
    Heuristically decide whether *text* could be a person's name.

    A name has no digits, has at least two whitespace-separated words, and
    contains none of the card's boilerplate tokens. Gender labels are only
    rejected as whole words, so names such as "Kamalesh Kumar" (which
    contains the letters "MALE") are still accepted.
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False
    upper = text.upper()
    if re.search(_GENDER_LABEL_REGEX, upper):
        return False
    return not any(b in upper for b in _BANNED_NAME_SUBSTRINGS)


# ------------------------------
# Salesforce helpers
# ------------------------------
def connect_salesforce(
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    domain: Optional[str] = None,
):
    """
    Connect using provided args; fallback to environment variables.

    Environment fallbacks: SF_USERNAME, SF_PASSWORD, SF_TOKEN, SF_DOMAIN.
    The domain defaults to "login" when it is missing or blank.

    Returns a Salesforce client or raises an Exception with the root cause.

    Raises:
        RuntimeError: if username, password or token is still empty after
            the environment fallback.
        Exception: anything raised by the Salesforce constructor is
            propagated unchanged.
    """
    sf_username = username or os.getenv("SF_USERNAME", "")
    sf_password = password or os.getenv("SF_PASSWORD", "")
    sf_token = token or os.getenv("SF_TOKEN", "")
    sf_domain = (domain or os.getenv("SF_DOMAIN", "login")).strip() or "login"

    if not (sf_username and sf_password and sf_token):
        raise RuntimeError(
            "Missing Salesforce credentials. Provide username/password/token (via UI or env vars)."
        )

    return Salesforce(
        username=sf_username,
        password=sf_password,
        security_token=sf_token,
        domain=sf_domain  # "login"=prod, "test"=sandbox
    )


def create_kyc_record(
    sf,
    aadhaar: Optional[Dict[str, Any]],
    pan: Optional[Dict[str, Any]],
    agent_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Create a single KYC_Record__c, filling the Aadhaar_* and Pan_* fields
    from the respective OCR outputs.

    Fields (exact API names):
      Aadhaar_Name__c
      Aadhaar_DOB__c
      Aadhaar_Number__c
      Pan_Name__c
      Pan_DOB__c
      PAN_Number__c

    Args:
        sf: Salesforce client exposing ``KYC_Record__c.create``.
        aadhaar: OCR output dict for the Aadhaar card, or None.
        pan: OCR output dict for the PAN card, or None.
        agent_id: Optional value for ``Agent__c``; the field is only sent
            when this is truthy.

    Returns:
        {"status": "success", "record_id": ..., "payload_sent": {...}} on
        success, or {"status": "error", "message": ...} if anything raises.
        Missing values and the "Not found" placeholder are sent as "".
    """
    try:
        def safe(v):
            # Map missing values and the OCR "Not found" placeholder to "".
            if not v or v == "Not found":
                return ""
            return v

        record = {
            "Aadhaar_Name__c": safe(aadhaar.get("name") if aadhaar else ""),
            "Aadhaar_DOB__c": safe(aadhaar.get("dob") if aadhaar else ""),
            "Aadhaar_Number__c": safe(aadhaar.get("aadhaar_number") if aadhaar else ""),
            "Pan_Name__c": safe(pan.get("name") if pan else ""),
            "Pan_DOB__c": safe(pan.get("dob") if pan else ""),
            "PAN_Number__c": safe(pan.get("pan_number") if pan else ""),
        }

        if agent_id:
            record["Agent__c"] = agent_id  # only if you have this field in org

        resp = sf.KYC_Record__c.create(record)
        return {"status": "success", "record_id": resp.get("id"), "payload_sent": record}
    except Exception as e:
        # Errors are reported in the result dict rather than raised.
        return {"status": "error", "message": str(e)}