Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| from datetime import datetime | |
| from typing import Optional, Dict, Any | |
| from simple_salesforce import Salesforce | |
| from paddleocr import PaddleOCR | |
# ------------------------------
# OCR setup (stable on small CPU)
# ------------------------------
# Default OpenMP to a single thread unless the environment already defines
# OMP_NUM_THREADS (setdefault never overrides an existing value).
# NOTE(review): this runs after `paddleocr` has already been imported above;
# confirm the native libraries still honour the variable at this point.
os.environ.setdefault("OMP_NUM_THREADS", "1")
# Single module-level OCR engine, created at import time and reused by
# extract_kyc_fields(). Configured for English with the angle classifier on.
_ocr = PaddleOCR(use_angle_cls=True, lang='en')
# Regexes
# PAN: five capital letters, four digits, one capital letter (ABCDE1234F).
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'

# Aadhaar: twelve digits in three groups of four, each group optionally
# separated by whitespace or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'

# Building blocks shared by the date-of-birth patterns below.
_DATE_SEP = r'[./-]'
_MONTH_ABBREVIATIONS = 'JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC'

# Ordered from most to least specific. The LAST entry (a bare year) is
# treated separately by the DOB extractor, so keep it at the end.
DOB_REGEXES = [
    # DD/MM/YYYY (also with '.' or '-' separators)
    r'\b\d{2}' + _DATE_SEP + r'\d{2}' + _DATE_SEP + r'\d{4}\b',
    # ISO style YYYY-MM-DD
    r'\b\d{4}-\d{2}-\d{2}\b',
    # DD-MON-YYYY with a three-letter month abbreviation
    r'\b\d{2}' + _DATE_SEP + '(' + _MONTH_ABBREVIATIONS + ')' + _DATE_SEP + r'\d{4}\b',
    # Bare four-digit year starting with 19 or 20
    r'\b(19|20)\d{2}\b',
]
| # ------------------------------ | |
| # OCR helpers | |
| # ------------------------------ | |
def extract_kyc_fields(file_path: str, force_type: Optional[str] = None) -> Dict[str, Any]:
    """
    Run OCR on an Aadhaar or PAN card image and pull out the KYC fields.

    Args:
        file_path: Path of the image handed to the OCR engine.
        force_type: Optional card type ("PAN" or "AADHAAR", any case). When
            given, auto-detection is skipped and the value is used as-is
            (upper-cased and stripped).

    Returns:
        A dict that always contains ``card_type``.
        * PAN: also ``pan_number``, ``dob`` and ``name``.
        * AADHAAR: also ``aadhaar_number``, ``dob`` and ``name``.
        * anything else: ``name`` and ``dob`` on a best-effort basis plus an
          ``error`` message.
        Fields that cannot be read hold the string "Not found". Any exception
        raised while processing is reported as
        ``{"error": ..., "card_type": "UNKNOWN"}`` instead of propagating.
    """
    try:
        result = _ocr.ocr(file_path, cls=True)

        lines = []
        # The engine returns one block per page. A page on which no text was
        # detected can come back as None (or empty); skip it so that a blank
        # image is reported as "unidentified" rather than as an OCR crash.
        for block in result or []:
            if not block:
                continue
            for line in block:
                # line[1][0] is the recognised text; collapse whitespace runs.
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        if force_type:
            card_type = force_type.strip().upper()
        else:
            # PAN is checked first; Aadhaar only if no PAN pattern is present.
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        out = {"card_type": card_type}
        if card_type == "PAN":
            out["pan_number"] = _first(PAN_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_pan_name(lines)
        elif card_type == "AADHAAR":
            out["aadhaar_number"] = _first(AADHAAR_REGEX, full_text) or "Not found"
            out["dob"] = _extract_dob(lines)
            out["name"] = _extract_aadhaar_name(lines)
        else:
            out["name"] = _extract_generic_name(lines)
            out["dob"] = _extract_dob(lines)
            out["error"] = "Could not identify document as PAN or Aadhaar."
        return out
    except Exception as e:
        # Top-level boundary: callers expect a dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}", "card_type": "UNKNOWN"}
def _first(pattern: str, text: str, flags: int = 0) -> Optional[str]:
    """Return the text of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text, flags)
    if match is None:
        return None
    return match.group(0)
def _extract_dob(lines):
    """
    Find a date of birth in the OCR lines.

    First pass: return the first full-date match (every pattern in
    DOB_REGEXES except the last, case-insensitive), scanning line by line.
    Second pass: accept a bare year (last pattern) only when the same line
    also carries a birth-related label. Returns "Not found" otherwise.
    """
    full_date_patterns = DOB_REGEXES[:-1]
    for text in lines:
        for pattern in full_date_patterns:
            hit = re.search(pattern, text, re.IGNORECASE)
            if hit:
                return hit.group(0)

    birth_labels = ("YOB", "YEAR", "BIRTH", "DOB")
    for text in lines:
        hit = re.search(DOB_REGEXES[-1], text)
        if not hit:
            continue
        shouted = text.upper()
        if any(label in shouted for label in birth_labels):
            return hit.group(0)

    return "Not found"
def _extract_pan_name(lines):
    """
    Pick the card holder's name from the OCR lines of a PAN card.

    Looks for the "INCOME TAX DEPARTMENT" header and returns the first later
    line that consists only of capital letters, whitespace and dots, contains
    at least one letter, and is not part of the government header itself.

    Args:
        lines: OCR text lines in reading order.

    Returns:
        The stripped name line, or "Not found".
    """
    header_words = ("INDIA", "GOVT", "DEPARTMENT")
    for i, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for raw in lines[i + 1:]:
            candidate = raw.strip()
            # Capitals / whitespace / dots only (this already rules out digits).
            if not re.match(r'^[A-Z\s.]+$', candidate):
                continue
            # Reject punctuation-only OCR fragments such as "." or ". .".
            if not re.search(r'[A-Z]', candidate):
                continue
            if any(word in candidate.upper() for word in header_words):
                continue
            return candidate
    return "Not found"
def _extract_aadhaar_name(lines):
    """
    Pick the card holder's name from the OCR lines of an Aadhaar card.

    Preferred: the line directly above any line that matches one of
    DOB_REGEXES, provided it looks like a name. Fallback: the first line
    anywhere that looks like a name. Returns "Not found" if neither exists.
    """
    for idx, text in enumerate(lines):
        has_date = any(re.search(p, text, re.IGNORECASE) for p in DOB_REGEXES)
        if not has_date or idx == 0:
            continue
        above = lines[idx - 1].strip()
        if _looks_like_name(above):
            return above

    stripped_lines = (entry.strip() for entry in lines)
    return next((s for s in stripped_lines if _looks_like_name(s)), "Not found")
def _extract_generic_name(lines):
    """Return the first stripped line that looks like a name, else "Not found"."""
    stripped_lines = (entry.strip() for entry in lines)
    return next((s for s in stripped_lines if _looks_like_name(s)), "Not found")
def _looks_like_name(text: str) -> bool:
    """
    Heuristic: does *text* look like a person's name?

    A name has no digits, at least two whitespace-separated words, and none
    of the card-label words DOB / INDIA / MALE / FEMALE / GOVERNMENT.

    Label words are matched as whole words (case-insensitive). Substring
    matching would wrongly reject real names that merely contain a label,
    e.g. "KAMALESH" contains "MALE".
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False
    label_word = r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b'
    return re.search(label_word, text.upper()) is None
| # ------------------------------ | |
| # Salesforce helpers | |
| # ------------------------------ | |
def connect_salesforce(
    username: Optional[str] = None,
    password: Optional[str] = None,
    token: Optional[str] = None,
    domain: Optional[str] = None,
):
    """
    Open a Salesforce session.

    Each argument takes precedence over its environment variable
    (SF_USERNAME, SF_PASSWORD, SF_TOKEN, SF_DOMAIN). A blank or missing
    domain resolves to "login".

    Returns:
        A ``Salesforce`` client.

    Raises:
        RuntimeError: if username, password or security token is missing.
        Errors raised by the ``Salesforce`` constructor are not caught here.
    """
    sf_username = username if username else os.getenv("SF_USERNAME", "")
    sf_password = password if password else os.getenv("SF_PASSWORD", "")
    sf_token = token if token else os.getenv("SF_TOKEN", "")

    raw_domain = domain if domain else os.getenv("SF_DOMAIN", "login")
    sf_domain = raw_domain.strip()
    if not sf_domain:
        sf_domain = "login"

    if not sf_username or not sf_password or not sf_token:
        raise RuntimeError(
            "Missing Salesforce credentials. Provide username/password/token (via UI or env vars)."
        )

    login_kwargs = {
        "username": sf_username,
        "password": sf_password,
        "security_token": sf_token,
        "domain": sf_domain,  # "login"=prod, "test"=sandbox
    }
    return Salesforce(**login_kwargs)
def create_kyc_record(
    sf,
    aadhaar: Optional[Dict[str, Any]],
    pan: Optional[Dict[str, Any]],
    agent_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Insert one KYC_Record__c built from the Aadhaar and PAN OCR results.

    Fields written (exact API names):
        Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c,
        Pan_Name__c, Pan_DOB__c, PAN_Number__c
    plus Agent__c when *agent_id* is given.

    Missing inputs, empty values and the "Not found" placeholder are all
    sent as empty strings.

    Returns:
        ``{"status": "success", "record_id": ..., "payload_sent": ...}`` on
        success, or ``{"status": "error", "message": ...}`` if anything
        raises; exceptions never propagate to the caller.
    """
    def _blank_if_missing(value):
        if value and value != "Not found":
            return value
        return ""

    # (Salesforce field, OCR result it is read from, key inside that result)
    field_sources = (
        ("Aadhaar_Name__c", aadhaar, "name"),
        ("Aadhaar_DOB__c", aadhaar, "dob"),
        ("Aadhaar_Number__c", aadhaar, "aadhaar_number"),
        ("Pan_Name__c", pan, "name"),
        ("Pan_DOB__c", pan, "dob"),
        ("PAN_Number__c", pan, "pan_number"),
    )

    try:
        payload = {}
        for api_name, source, key in field_sources:
            raw_value = source.get(key) if source else ""
            payload[api_name] = _blank_if_missing(raw_value)

        if agent_id:
            payload["Agent__c"] = agent_id  # only if you have this field in org

        created = sf.KYC_Record__c.create(payload)
        return {"status": "success", "record_id": created.get("id"), "payload_sent": payload}
    except Exception as exc:
        return {"status": "error", "message": str(exc)}