pavansuresh committed on
Commit
be1c212
·
verified ·
1 Parent(s): d6b15a6

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +65 -84
utils.py CHANGED
@@ -5,8 +5,9 @@ import re
5
  import difflib
6
  from simple_salesforce import Salesforce
7
  from dotenv import load_dotenv
 
8
 
9
- # Load .env variables
10
  load_dotenv()
11
 
12
  def get_salesforce_client():
@@ -15,7 +16,6 @@ def get_salesforce_client():
15
  username=os.getenv("SF_USERNAME"),
16
  password=os.getenv("SF_PASSWORD"),
17
  security_token=os.getenv("SF_SECURITY_TOKEN"),
18
- domain=os.getenv("SF_DOMAIN").replace("https://", "").replace(".salesforce.com", "")
19
  )
20
  return sf, None
21
  except Exception as e:
@@ -24,22 +24,18 @@ def get_salesforce_client():
24
  def get_salesforce_objects(sf):
25
  try:
26
  desc = sf.describe()
27
- object_names = []
28
- for obj in desc['sobjects']:
29
- if not obj['deprecatedAndHidden']: # Skip hidden/deprecated
30
- object_names.append(obj['name'])
31
- object_names.sort()
32
- return object_names, None
33
  except Exception as e:
34
- return [], str(e)
35
 
36
  def get_object_fields(sf, object_name):
37
  try:
38
- metadata = sf.__getattr__(object_name).describe()
39
- fields = [field['name'] for field in metadata['fields']]
40
  return fields, None
41
  except Exception as e:
42
- return [], str(e)
43
 
44
  def extract_text_from_pdf(pdf_path):
45
  try:
@@ -47,120 +43,105 @@ def extract_text_from_pdf(pdf_path):
47
  text = ""
48
  for page in doc:
49
  text += page.get_text()
 
50
  return text.strip(), None
51
  except Exception as e:
52
  return None, str(e)
53
 
54
- # ✅ Extract key-value pairs smartly from the PDF text
55
  def extract_key_value_pairs(pdf_path):
56
  try:
57
  doc = fitz.open(pdf_path)
58
- text = ""
59
  for page in doc:
60
- text += page.get_text()
61
-
62
- keys = []
63
- values = []
64
-
65
- # Pattern 1: Direct "Key: Value"
66
- lines = text.splitlines()
67
- for line in lines:
68
- match = re.match(r"^([A-Z][A-Za-z0-9 ()/_\-]{3,50})\s*[:\-]\s*(.+)$", line.strip())
69
- if match:
70
- key = match.group(1).strip()
71
- value = match.group(2).strip()
72
- if len(value) > 1:
73
- keys.append(key)
74
- values.append(value)
75
-
76
- # Pattern 2: Special extractions
77
-
78
- # Total Agreement Value
79
- match_val = re.search(r"Total Agreement Value[^\$]*\$\s?([\d,]+(?:\.\d{2})?)", text, re.IGNORECASE)
80
- if match_val:
81
- keys.append("Total Agreement Value")
82
- numeric_value = match_val.group(1).replace(",", "")
83
- values.append(numeric_value)
84
-
85
- # Agreement Name
86
- match_name = re.search(r"Agreement\s+(MSA\s+[A-Za-z0-9 _\-]+)", text, re.IGNORECASE)
87
- if match_name:
88
- keys.append("Agreement Name")
89
- values.append(match_name.group(1).strip())
90
-
91
- # Agreement Start and End Dates
92
- match_dates = re.search(
93
- r"effective as of\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4}).*?until\s*[<\(]?([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})",
94
- text, re.IGNORECASE | re.DOTALL
95
- )
96
- if match_dates:
97
- keys.append("Agreement Start Date")
98
- values.append(match_dates.group(1).strip())
99
- keys.append("Agreement End Date")
100
- values.append(match_dates.group(2).strip())
101
-
102
- return [{"keys": keys, "values": values}], None
103
  except Exception as e:
104
  return None, str(e)
105
 
106
- # ✅ FIXED: Use original key as mapping key (not value)
107
- def map_fields(extracted_data, object_fields):
 
 
 
 
 
 
 
 
108
  try:
109
  mappings = {}
110
  confidence_scores = {}
111
- keys = extracted_data[0]["keys"]
112
- values = extracted_data[0]["values"]
113
-
114
- for key, value in zip(keys, values):
115
- if key.lower() in ["name", "email"]:
116
- continue # Skip these
117
-
118
- best_match = difflib.get_close_matches(key, object_fields, n=1, cutoff=0.0)
119
  if best_match:
120
  matched_field = best_match[0]
121
- confidence = difflib.SequenceMatcher(None, key.lower(), matched_field.lower()).ratio()
122
  mappings[key] = matched_field
123
- confidence_scores[key] = round(confidence, 2)
 
124
  else:
125
- mappings[key] = object_fields[0]
126
  confidence_scores[key] = 0.0
127
-
128
  return mappings, confidence_scores, None
129
  except Exception as e:
130
  return None, None, str(e)
131
 
132
  def create_record(sf, object_name, data):
133
  try:
 
 
 
 
 
 
 
 
134
  result = sf.__getattr__(object_name).create(data)
135
- return result.get("id", "Unknown ID"), None
136
  except Exception as e:
137
  return None, str(e)
138
 
139
  def attach_pdf(sf, record_id, pdf_path):
140
  try:
141
  with open(pdf_path, "rb") as f:
142
- body = f.read()
143
- content_version = sf.ContentVersion.create({
144
  "Title": os.path.basename(pdf_path),
145
  "PathOnClient": os.path.basename(pdf_path),
146
- "VersionData": body.encode("base64") if isinstance(body, str) else body
147
- })
 
 
148
 
149
- content_document_id = sf.query(
150
- f"SELECT ContentDocumentId FROM ContentVersion WHERE Id = '{content_version['id']}'"
151
- )["records"][0]["ContentDocumentId"]
 
152
 
 
153
  sf.ContentDocumentLink.create({
154
  "ContentDocumentId": content_document_id,
155
  "LinkedEntityId": record_id,
156
- "ShareType": "V"
 
157
  })
158
-
159
- return "PDF attached successfully", None
160
  except Exception as e:
161
  return None, str(e)
162
 
163
- def log_failure(pdf_path, object_name, error):
164
  with open("failures.json", "a") as f:
165
- json.dump({"pdf": pdf_path, "object": object_name, "error": error}, f)
 
 
 
 
166
  f.write("\n")
 
5
  import difflib
6
  from simple_salesforce import Salesforce
7
  from dotenv import load_dotenv
8
+ from datetime import datetime
9
 
10
+ # Load environment variables
11
  load_dotenv()
12
 
13
  def get_salesforce_client():
 
16
  username=os.getenv("SF_USERNAME"),
17
  password=os.getenv("SF_PASSWORD"),
18
  security_token=os.getenv("SF_SECURITY_TOKEN"),
 
19
  )
20
  return sf, None
21
  except Exception as e:
 
def get_salesforce_objects(sf):
    """List the names of all sObjects returned by the client's global describe.

    Args:
        sf: Client object exposing ``describe()``; the result is indexed
            with ``["sobjects"]`` and each entry with ``["name"]``.

    Returns:
        ``(names, None)`` on success, with names in the order the describe
        call returned them, or ``(None, message)`` if any exception is
        raised while describing or reading the result.
    """
    try:
        global_describe = sf.describe()
        names = []
        for entry in global_describe["sobjects"]:
            names.append(entry["name"])
    except Exception as exc:
        return None, str(exc)
    return names, None
31
 
def get_object_fields(sf, object_name):
    """List the field names of one sObject.

    Args:
        sf: Client object; the per-object handle is obtained through
            ``sf.__getattr__(object_name)``.
        object_name: Name of the object whose fields are wanted.

    Returns:
        ``(field_names, None)`` on success, taken from the ``"name"`` entry
        of each item under ``"fields"`` in the object's describe result, or
        ``(None, message)`` if any exception is raised.
    """
    try:
        sobject = sf.__getattr__(object_name)
        field_names = []
        for field in sobject.describe()["fields"]:
            field_names.append(field["name"])
    except Exception as exc:
        return None, str(exc)
    return field_names, None
39
 
40
  def extract_text_from_pdf(pdf_path):
41
  try:
 
43
  text = ""
44
  for page in doc:
45
  text += page.get_text()
46
+ doc.close()
47
  return text.strip(), None
48
  except Exception as e:
49
  return None, str(e)
50
 
 
def extract_key_value_pairs(pdf_path):
    """Collect ``key: value`` pairs from every text line of a PDF.

    A line contributes a pair when it contains a colon and both the text
    before the first colon and the text after it are non-empty once
    stripped. Anything after the first colon (including further colons)
    belongs to the value.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        ``([{"keys": [...], "values": [...]}], None)`` on success, with the
        two lists aligned by position, or ``(None, message)`` if any
        exception is raised.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            keys = []
            values = []
            for page in doc:
                for line in page.get_text().split("\n"):
                    key, separator, value = line.partition(":")
                    key = key.strip()
                    value = value.strip()
                    # `separator` is empty when the line has no colon.
                    if separator and key and value:
                        keys.append(key)
                        values.append(value)
        finally:
            # Release the document even when reading a page fails.
            doc.close()
        return [{"keys": keys, "values": values}], None
    except Exception as e:
        return None, str(e)
69
 
def normalize_date(value):
    """Convert a ``M/D/YY`` or ``M/D/YYYY`` string to ``YYYY-MM-DD``.

    Args:
        value: Any value. Only strings that look exactly like a
            slash-separated month/day/year date are converted.

    Returns:
        The ISO-formatted date string when ``value`` is a valid date with a
        two- or four-digit year; otherwise ``value`` unchanged (non-strings,
        other text, three-digit years, and impossible dates such as
        ``02/30/2024``).
    """
    if not isinstance(value, str):
        return value

    match = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", value)
    if match is None:
        return value

    # Pick the directive that matches the year width: "%y" only parses
    # two-digit years and rejects four-digit ones.
    year_width = len(match.group(3))
    if year_width == 4:
        date_format = "%m/%d/%Y"
    elif year_width == 2:
        date_format = "%m/%d/%y"
    else:
        return value

    try:
        parsed = datetime.strptime(value, date_format)
    except ValueError:
        # Matches the pattern but is not a real calendar date.
        return value
    return parsed.date().isoformat()
78
+
def map_fields(extracted_data, salesforce_fields):
    """Fuzzy-match each extracted key to the closest Salesforce field name.

    Args:
        extracted_data: List whose first element is a dict with a ``"keys"``
            list (the shape returned by ``extract_key_value_pairs``).
        salesforce_fields: Candidate field names to match against.

    Returns:
        ``(mappings, confidence_scores, None)`` on success. ``mappings``
        maps each key to its best candidate, or to the literal string
        ``"No match"`` when no candidate reaches the 0.3 similarity cutoff;
        ``confidence_scores`` maps each key to the similarity rounded to two
        decimals (``0.0`` when unmatched). On any exception the result is
        ``(None, None, message)``.
    """
    try:
        mappings = {}
        confidence_scores = {}
        for key in extracted_data[0]["keys"]:
            best_match = difflib.get_close_matches(
                key, salesforce_fields, n=1, cutoff=0.3
            )
            if best_match:
                matched_field = best_match[0]
                mappings[key] = matched_field
                # SequenceMatcher.ratio() depends on argument order.
                # get_close_matches scores with the candidate as the first
                # sequence and the word as the second, so use the same order
                # here; otherwise the reported confidence can differ from
                # the score that selected the match (and even fall below
                # the cutoff).
                ratio = difflib.SequenceMatcher(None, matched_field, key).ratio()
                confidence_scores[key] = round(ratio, 2)
            else:
                mappings[key] = "No match"
                confidence_scores[key] = 0.0
        return mappings, confidence_scores, None
    except Exception as e:
        return None, None, str(e)
96
 
def create_record(
    sf,
    object_name,
    data,
    account_id="001dL00001ASyPbQAL",
    owner_id="005dL00000f9B0lQAE",
):
    """Create a record on ``object_name`` from ``data``.

    The payload is a copy of ``data`` (the caller's dict is not modified)
    with ``AccountId`` and ``OwnerId`` set, and every value passed through
    ``normalize_date``.

    Args:
        sf: Client object; the per-object handle is obtained through
            ``sf.__getattr__(object_name)``.
        object_name: Name of the object to create the record on.
        data: Mapping of field name to value.
        account_id: Value forced into ``AccountId``, overriding any value in
            ``data``. Defaults to the ID this function previously
            hard-coded. Pass ``None`` to leave ``AccountId`` as supplied.
        owner_id: Same as ``account_id`` but for ``OwnerId``.

    Returns:
        ``(record_id, None)`` on success, where ``record_id`` is the
        ``"id"`` entry of the create result (``None`` if absent), or
        ``(None, message)`` if any exception is raised.
    """
    try:
        payload = dict(data)
        if account_id is not None:
            payload["AccountId"] = account_id
        if owner_id is not None:
            payload["OwnerId"] = owner_id

        # Normalize every value, including the IDs; non-date values are
        # returned unchanged by normalize_date.
        payload = {field: normalize_date(value) for field, value in payload.items()}

        result = sf.__getattr__(object_name).create(payload)
        return result.get("id"), None
    except Exception as e:
        return None, str(e)
111
 
def attach_pdf(sf, record_id, pdf_path):
    """Upload a PDF as a ContentVersion and link it to a record.

    Steps: create a ``ContentVersion`` holding the file, query it for the
    ``ContentDocumentId``, then create a ``ContentDocumentLink`` from that
    document to ``record_id`` with share type ``"V"`` and visibility
    ``"AllUsers"``.

    Args:
        sf: Client object exposing ``ContentVersion``, ``ContentDocumentLink``
            and ``query``.
        record_id: ID of the record the file is linked to.
        pdf_path: Path of the PDF on disk; its base name is used as both
            the title and the client path.

    Returns:
        ``("Attached successfully", None)`` on success, or
        ``(None, message)`` if any exception is raised (including a missing
        file or an empty query result).
    """
    # Local import so this block does not depend on the file's import list.
    import base64

    try:
        with open(pdf_path, "rb") as f:
            pdf_data = f.read()

        file_name = os.path.basename(pdf_path)
        content_version = {
            "Title": file_name,
            "PathOnClient": file_name,
            # Blob fields travel as base64 text in the JSON request body;
            # raw bytes cannot be JSON-serialized.
            "VersionData": base64.b64encode(pdf_data).decode("ascii"),
        }
        cv_result = sf.ContentVersion.create(content_version)
        content_version_id = cv_result["id"]

        # Get the ContentDocumentId from ContentVersion
        query = f"SELECT ContentDocumentId FROM ContentVersion WHERE Id = '{content_version_id}'"
        result = sf.query(query)
        content_document_id = result["records"][0]["ContentDocumentId"]

        # Link ContentDocument to the record
        sf.ContentDocumentLink.create({
            "ContentDocumentId": content_document_id,
            "LinkedEntityId": record_id,
            "ShareType": "V",
            "Visibility": "AllUsers"
        })
        return "Attached successfully", None
    except Exception as e:
        return None, str(e)
139
 
def log_failure(pdf_path, object_name, error_message):
    """Append one failure entry to ``failures.json`` as a single JSON line.

    Args:
        pdf_path: Stored under the ``"pdf"`` key.
        object_name: Stored under the ``"object"`` key.
        error_message: Stored under the ``"error"`` key.

    The file is opened in append mode in the current working directory, so
    it accumulates one JSON object per line across calls.
    """
    with open("failures.json", "a") as log_file:
        entry = {"pdf": pdf_path, "object": object_name, "error": error_message}
        json.dump(entry, log_file)
        log_file.write("\n")