gopichandra committed on
Commit
482b26a
·
verified ·
1 Parent(s): 47e36a3

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +80 -102
utils.py CHANGED
@@ -1,16 +1,17 @@
1
  import os
2
  import re
3
  from datetime import datetime
 
4
  from simple_salesforce import Salesforce
5
  from paddleocr import PaddleOCR
6
 
7
- # -----------------------------------
8
- # OCR SETUP
9
- # -----------------------------------
10
- os.environ.setdefault("OMP_NUM_THREADS", "1") # limit threads for stability
11
- ocr = PaddleOCR(use_angle_cls=True, lang='en')
12
 
13
- # Regex patterns
14
  PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
15
  AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
16
  DOB_REGEXES = [
@@ -19,23 +20,20 @@ DOB_REGEXES = [
19
  r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
20
  r'\b(19|20)\d{2}\b'
21
  ]
22
- GENDERS = ["MALE", "FEMALE", "TRANSGENDER"] # kept for completeness (not stored)
23
 
24
- # -----------------------------------
25
- # OCR HELPERS
26
- # -----------------------------------
27
- def extract_kyc_fields(file_path, force_type=None):
28
  """
29
- Returns a dict with:
30
- card_type: PAN | AADHAAR | UNKNOWN
31
- pan_number / aadhaar_number
32
- name (best-guess)
33
- dob (best-guess for the detected card)
34
  """
35
  try:
36
- result = ocr.ocr(file_path, cls=True)
37
  lines = []
38
-
39
  for block in result:
40
  for line in block:
41
  text = re.sub(r'\s+', ' ', line[1][0].strip())
@@ -53,40 +51,35 @@ def extract_kyc_fields(file_path, force_type=None):
53
  elif re.search(AADHAAR_REGEX, full_text):
54
  card_type = "AADHAAR"
55
 
56
- response = {"card_type": card_type}
57
 
58
  if card_type == "PAN":
59
- response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
60
- response["dob"] = _extract_dob(lines)
61
- response["name"] = _extract_pan_name(lines)
62
-
63
  elif card_type == "AADHAAR":
64
- response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
65
- response["dob"] = _extract_dob(lines)
66
- response["name"] = _extract_aadhaar_name(lines)
67
-
68
  else:
69
- response["error"] = "Could not identify document as PAN or Aadhaar."
70
- # best-effort generic fields
71
- response["dob"] = _extract_dob(lines)
72
- response["name"] = _extract_generic_name(lines)
73
 
74
- return response
75
  except Exception as e:
76
- return {"error": f"OCR processing failed: {str(e)}"}
77
 
78
- def _first_match(pattern, text, flags=0):
79
  m = re.search(pattern, text, flags)
80
  return m.group(0) if m else None
81
 
82
  def _extract_dob(lines):
83
- # Try common formats
84
  for line in lines:
85
  for pattern in DOB_REGEXES[:-1]:
86
  m = re.search(pattern, line, re.IGNORECASE)
87
  if m:
88
  return m.group(0)
89
- # Year-only with labels
90
  for line in lines:
91
  m = re.search(DOB_REGEXES[-1], line)
92
  if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH", "DOB"]):
@@ -104,14 +97,12 @@ def _extract_pan_name(lines):
104
  return "Not found"
105
 
106
  def _extract_aadhaar_name(lines):
107
- # Heuristic: Name usually above DOB
108
  for i, line in enumerate(lines):
109
  if any(re.search(p, line, re.IGNORECASE) for p in DOB_REGEXES):
110
  if i > 0:
111
  candidate = lines[i - 1].strip()
112
  if _looks_like_name(candidate):
113
  return candidate
114
- # Fallback
115
  for line in lines:
116
  if _looks_like_name(line.strip()):
117
  return line.strip()
@@ -123,7 +114,7 @@ def _extract_generic_name(lines):
123
  return line.strip()
124
  return "Not found"
125
 
126
- def _looks_like_name(text):
127
  if re.search(r'\d', text):
128
  return False
129
  if len(text.split()) < 2:
@@ -131,73 +122,60 @@ def _looks_like_name(text):
131
  banned = ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"]
132
  return not any(b in text.upper() for b in banned)
133
 
134
- # -----------------------------------
135
- # SALESFORCE HELPERS
136
- # -----------------------------------
137
- SF_USERNAME = os.getenv("SF_USERNAME", "")
138
- SF_PASSWORD = os.getenv("SF_PASSWORD", "")
139
- SF_TOKEN = os.getenv("SF_TOKEN", "")
140
- SF_DOMAIN = os.getenv("SF_DOMAIN", "login") # "login"=prod, "test"=sandbox
141
-
142
- def connect_salesforce():
143
- try:
144
- sf = Salesforce(
145
- username=SF_USERNAME,
146
- password=SF_PASSWORD,
147
- security_token=SF_TOKEN,
148
- domain=SF_DOMAIN
 
 
 
 
 
 
149
  )
150
- print(f"✅ Connected to Salesforce ({SF_DOMAIN})")
151
- return sf
152
- except Exception as e:
153
- print("❌ Salesforce login failed:", e)
154
- return None
155
 
156
- def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
 
 
 
 
 
 
 
 
 
 
 
 
157
  """
158
- Creates a record in KYC_Record__c with the fields:
159
- Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c
160
- Pan_Name__c, Pan_DOB__c, PAN_Number__c
161
- Optionally includes Agent__c if you pass agent_id and that field exists.
 
 
 
 
 
162
  """
163
  try:
164
- if not sf:
165
- return {"status": "error", "message": "Salesforce not connected"}
166
-
167
- # Normalize values
168
- def val_or_blank(key): return (kyc_data.get(key) or "").replace("Not found", "")
169
 
170
  record = {
171
- "Aadhaar_Name__c": "",
172
- "Aadhaar_DOB__c": "",
173
- "Aadhaar_Number__c":"",
174
- "Pan_Name__c": "",
175
- "Pan_DOB__c": "",
176
- "PAN_Number__c": "",
177
- }
178
-
179
- ct = (kyc_data.get("card_type") or "").upper()
180
- if ct == "AADHAAR":
181
- record["Aadhaar_Name__c"] = val_or_blank("name")
182
- record["Aadhaar_DOB__c"] = val_or_blank("dob")
183
- record["Aadhaar_Number__c"] = val_or_blank("aadhaar_number")
184
- elif ct == "PAN":
185
- record["Pan_Name__c"] = val_or_blank("name")
186
- record["Pan_DOB__c"] = val_or_blank("dob")
187
- record["PAN_Number__c"] = val_or_blank("pan_number")
188
- else:
189
- # Unknown: best effort — fill name/dob into Aadhaar side to avoid losing data
190
- record["Aadhaar_Name__c"] = val_or_blank("name")
191
- record["Aadhaar_DOB__c"] = val_or_blank("dob")
192
-
193
- # Optionally include Agent__c if provided (and exists in your org)
194
- if agent_id:
195
- record["Agent__c"] = agent_id
196
-
197
- # Optionally store file name in a text field if you have one (not required by you):
198
- # record["KYC_File_Name__c"] = file_name or ""
199
-
200
- resp = sf.KYC_Record__c.create(record)
201
- return {"status": "success", "record_id": resp.get("id")}
202
- except Exception as e:
203
- return {"status": "error", "message": str(e)}
 
1
  import os
2
  import re
3
  from datetime import datetime
4
+ from typing import Optional, Dict, Any
5
  from simple_salesforce import Salesforce
6
  from paddleocr import PaddleOCR
7
 
8
+ # ------------------------------
9
+ # OCR setup (stable on small CPU)
10
+ # ------------------------------
11
+ os.environ.setdefault("OMP_NUM_THREADS", "1")
12
+ _ocr = PaddleOCR(use_angle_cls=True, lang='en')
13
 
14
+ # Regexes
15
  PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
16
  AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
17
  DOB_REGEXES = [
 
20
  r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
21
  r'\b(19|20)\d{2}\b'
22
  ]
 
23
 
24
+ # ------------------------------
25
+ # OCR helpers
26
+ # ------------------------------
27
+ def extract_kyc_fields(file_path: str, force_type: Optional[str] = None) -> Dict[str, Any]:
28
  """
29
+ Extracts KYC fields from an Aadhaar or PAN image.
30
+ Returns a dict with keys:
31
+ card_type, name, dob, aadhaar_number, pan_number
32
+ (number keys present only when relevant)
 
33
  """
34
  try:
35
+ result = _ocr.ocr(file_path, cls=True)
36
  lines = []
 
37
  for block in result:
38
  for line in block:
39
  text = re.sub(r'\s+', ' ', line[1][0].strip())
 
51
  elif re.search(AADHAAR_REGEX, full_text):
52
  card_type = "AADHAAR"
53
 
54
+ out = {"card_type": card_type}
55
 
56
  if card_type == "PAN":
57
+ out["pan_number"] = _first(PAN_REGEX, full_text) or "Not found"
58
+ out["dob"] = _extract_dob(lines)
59
+ out["name"] = _extract_pan_name(lines)
 
60
  elif card_type == "AADHAAR":
61
+ out["aadhaar_number"] = _first(AADHAAR_REGEX, full_text) or "Not found"
62
+ out["dob"] = _extract_dob(lines)
63
+ out["name"] = _extract_aadhaar_name(lines)
 
64
  else:
65
+ out["name"] = _extract_generic_name(lines)
66
+ out["dob"] = _extract_dob(lines)
67
+ out["error"] = "Could not identify document as PAN or Aadhaar."
68
+ return out
69
 
 
70
  except Exception as e:
71
+ return {"error": f"OCR processing failed: {str(e)}", "card_type": "UNKNOWN"}
72
 
73
+ def _first(pattern: str, text: str, flags: int = 0) -> Optional[str]:
74
  m = re.search(pattern, text, flags)
75
  return m.group(0) if m else None
76
 
77
  def _extract_dob(lines):
 
78
  for line in lines:
79
  for pattern in DOB_REGEXES[:-1]:
80
  m = re.search(pattern, line, re.IGNORECASE)
81
  if m:
82
  return m.group(0)
 
83
  for line in lines:
84
  m = re.search(DOB_REGEXES[-1], line)
85
  if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH", "DOB"]):
 
97
  return "Not found"
98
 
99
  def _extract_aadhaar_name(lines):
 
100
  for i, line in enumerate(lines):
101
  if any(re.search(p, line, re.IGNORECASE) for p in DOB_REGEXES):
102
  if i > 0:
103
  candidate = lines[i - 1].strip()
104
  if _looks_like_name(candidate):
105
  return candidate
 
106
  for line in lines:
107
  if _looks_like_name(line.strip()):
108
  return line.strip()
 
114
  return line.strip()
115
  return "Not found"
116
 
117
+ def _looks_like_name(text: str) -> bool:
118
  if re.search(r'\d', text):
119
  return False
120
  if len(text.split()) < 2:
 
122
  banned = ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"]
123
  return not any(b in text.upper() for b in banned)
124
 
125
+ # ------------------------------
126
+ # Salesforce helpers
127
+ # ------------------------------
128
+ def connect_salesforce(
129
+ username: Optional[str] = None,
130
+ password: Optional[str] = None,
131
+ token: Optional[str] = None,
132
+ domain: Optional[str] = None,
133
+ ):
134
+ """
135
+ Connect using provided args; fallback to environment variables.
136
+ Returns a Salesforce client or raises an Exception with the root cause.
137
+ """
138
+ sf_username = username or os.getenv("SF_USERNAME", "")
139
+ sf_password = password or os.getenv("SF_PASSWORD", "")
140
+ sf_token = token or os.getenv("SF_TOKEN", "")
141
+ sf_domain = (domain or os.getenv("SF_DOMAIN", "login")).strip() or "login"
142
+
143
+ if not (sf_username and sf_password and sf_token):
144
+ raise RuntimeError(
145
+ "Missing Salesforce credentials. Provide username/password/token (via UI or env vars)."
146
  )
 
 
 
 
 
147
 
148
+ return Salesforce(
149
+ username=sf_username,
150
+ password=sf_password,
151
+ security_token=sf_token,
152
+ domain=sf_domain # "login"=prod, "test"=sandbox
153
+ )
154
+
155
+ def create_kyc_record(
156
+ sf,
157
+ aadhaar: Optional[Dict[str, Any]],
158
+ pan: Optional[Dict[str, Any]],
159
+ agent_id: Optional[str] = None
160
+ ) -> Dict[str, Any]:
161
  """
162
+ Create a single KYC_Record__c, filling the Aadhaar_* and Pan_* fields
163
+ from the respective OCR outputs. Fields (exact API names):
164
+
165
+ Aadhaar_Name__c
166
+ Aadhaar_DOB__c
167
+ Aadhaar_Number__c
168
+ Pan_Name__c
169
+ Pan_DOB__c
170
+ PAN_Number__c
171
  """
172
  try:
173
+ def safe(v):
174
+ if not v or v == "Not found":
175
+ return ""
176
+ return v
 
177
 
178
  record = {
179
+ "Aadhaar_Name__c": safe(aadhaar.get("name") if aadhaar else ""),
180
+ "Aadhaar_DOB__c": safe(aadhaar.get("dob") if aadhaar else ""),
181
+ "Aadhaar_Number__c": safe(aadhaar.get("aa_