pavansuresh committed on
Commit
4fc2a49
·
verified ·
1 Parent(s): be1c212

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +84 -65
utils.py CHANGED
@@ -5,9 +5,8 @@ import re
5
  import difflib
6
  from simple_salesforce import Salesforce
7
  from dotenv import load_dotenv
8
- from datetime import datetime
9
 
10
- # Load environment variables
11
  load_dotenv()
12
 
13
  def get_salesforce_client():
@@ -16,6 +15,7 @@ def get_salesforce_client():
16
  username=os.getenv("SF_USERNAME"),
17
  password=os.getenv("SF_PASSWORD"),
18
  security_token=os.getenv("SF_SECURITY_TOKEN"),
 
19
  )
20
  return sf, None
21
  except Exception as e:
@@ -24,18 +24,22 @@ def get_salesforce_client():
24
def get_salesforce_objects(sf):
    """List the name of every sObject reported by the org's global describe.

    Args:
        sf: Client object exposing ``describe()``.

    Returns:
        ``(names, None)`` on success, ``(None, error_message)`` on failure.
    """
    try:
        names = []
        for entry in sf.describe()["sobjects"]:
            names.append(entry["name"])
    except Exception as exc:
        return None, str(exc)
    return names, None
31
 
32
def get_object_fields(sf, object_name):
    """Return the field names of one sObject.

    Args:
        sf: Client object whose attributes are sObject handles.
        object_name: API name of the sObject, e.g. ``"Account"``.

    Returns:
        ``(field_names, None)`` on success, ``(None, error_message)`` on failure.
    """
    try:
        # getattr() goes through normal attribute lookup, so it works with any
        # client object rather than only ones that define __getattr__.
        desc = getattr(sf, object_name).describe()
        return [field["name"] for field in desc["fields"]], None
    except Exception as e:
        return None, str(e)
39
 
40
  def extract_text_from_pdf(pdf_path):
41
  try:
@@ -43,105 +47,120 @@ def extract_text_from_pdf(pdf_path):
43
  text = ""
44
  for page in doc:
45
  text += page.get_text()
46
- doc.close()
47
  return text.strip(), None
48
  except Exception as e:
49
  return None, str(e)
50
 
 
51
def extract_key_value_pairs(pdf_path):
    """Collect ``key: value`` lines from every page of a PDF.

    A line contributes a pair when it contains a colon and both the text
    before the first colon and the text after it are non-empty once stripped.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        ``([{"keys": [...], "values": [...]}], None)`` on success,
        ``(None, error_message)`` on failure.
    """
    try:
        doc = fitz.open(pdf_path)
        keys = []
        values = []
        try:
            for page in doc:
                for line in page.get_text().split("\n"):
                    key, sep, value = line.partition(":")
                    key = key.strip()
                    value = value.strip()
                    if sep and key and value:
                        keys.append(key)
                        values.append(value)
        finally:
            # Release the document even when a page fails to parse.
            doc.close()
        return [{"keys": keys, "values": values}], None
    except Exception as e:
        return None, str(e)
69
 
70
def normalize_date(value):
    """Convert a ``M/D/YY`` or ``M/D/YYYY`` string to ``YYYY-MM-DD``.

    Anything that is not a string, does not look like a slash-separated date,
    or is not a valid calendar date is returned unchanged.

    Args:
        value: Field value of any type.

    Returns:
        The ISO-formatted date string, or ``value`` itself.
    """
    if not isinstance(value, str):
        return value
    if not re.fullmatch(r"\d{1,2}/\d{1,2}/\d{2,4}", value):
        return value
    # Pick the directive from the year width: "%y" rejects four-digit years.
    year = value.rsplit("/", 1)[1]
    fmt = "%m/%d/%Y" if len(year) == 4 else "%m/%d/%y"
    try:
        return datetime.strptime(value, fmt).date().isoformat()
    except ValueError:
        # Not a real calendar date (e.g. month 13) or a three-digit year.
        return value
78
-
79
def map_fields(extracted_data, salesforce_fields):
    """Pair each extracted key with its closest Salesforce field name.

    Args:
        extracted_data: List whose first item is a dict with a ``"keys"`` list.
        salesforce_fields: Candidate field names.

    Returns:
        ``(mappings, confidence_scores, None)`` on success, where a key with
        no candidate at or above the 0.3 similarity cutoff maps to
        ``"No match"`` with score ``0.0``; ``(None, None, error_message)`` on
        failure.
    """
    field_for = {}
    score_for = {}
    try:
        for label in extracted_data[0]["keys"]:
            candidates = difflib.get_close_matches(label, salesforce_fields, n=1, cutoff=0.3)
            if not candidates:
                field_for[label] = "No match"
                score_for[label] = 0.0
                continue
            winner = candidates[0]
            field_for[label] = winner
            similarity = difflib.SequenceMatcher(None, label, winner).ratio()
            score_for[label] = round(similarity, 2)
    except Exception as exc:
        return None, None, str(exc)
    return field_for, score_for, None
96
 
97
def create_record(sf, object_name, data):
    """Create one record of ``object_name`` from ``data``.

    Every value is passed through ``normalize_date`` and the fixed
    ``AccountId`` / ``OwnerId`` are applied to a copy, so the caller's dict is
    left untouched.

    Args:
        sf: Client object whose attributes are sObject handles.
        object_name: API name of the sObject to create.
        data: Mapping of field name to value.

    Returns:
        ``(record_id, None)`` on success, ``(None, error_message)`` on failure.
    """
    try:
        payload = {field: normalize_date(value) for field, value in data.items()}
        # NOTE(review): these IDs are hard-coded and override any value the
        # caller supplied; confirm that is intended for every object type.
        payload["AccountId"] = "001dL00001ASyPbQAL"
        payload["OwnerId"] = "005dL00000f9B0lQAE"
        result = getattr(sf, object_name).create(payload)
        return result.get("id"), None
    except Exception as e:
        return None, str(e)
111
 
112
def attach_pdf(sf, record_id, pdf_path):
    """Upload a PDF as a ContentVersion and link it to a record.

    Args:
        sf: Client exposing ``ContentVersion``, ``ContentDocumentLink`` and
            ``query``.
        record_id: Id of the record the document is linked to.
        pdf_path: Path of the PDF on disk.

    Returns:
        ``("Attached successfully", None)`` on success,
        ``(None, error_message)`` on failure.
    """
    import base64  # local import: the file's import block is not fully visible here

    try:
        with open(pdf_path, "rb") as f:
            pdf_data = f.read()

        file_name = os.path.basename(pdf_path)
        cv_result = sf.ContentVersion.create({
            "Title": file_name,
            "PathOnClient": file_name,
            # The request body is JSON, so the binary payload has to travel
            # as base64 text; raw bytes cannot be serialised.
            "VersionData": base64.b64encode(pdf_data).decode("ascii"),
        })
        content_version_id = cv_result["id"]

        # Look up the ContentDocument that owns the new version.
        query = f"SELECT ContentDocumentId FROM ContentVersion WHERE Id = '{content_version_id}'"
        content_document_id = sf.query(query)["records"][0]["ContentDocumentId"]

        sf.ContentDocumentLink.create({
            "ContentDocumentId": content_document_id,
            "LinkedEntityId": record_id,
            "ShareType": "V",
            "Visibility": "AllUsers",
        })
        return "Attached successfully", None
    except Exception as e:
        return None, str(e)
139
 
140
def log_failure(pdf_path, object_name, error_message):
    """Append one JSON line describing a failed upload to ``failures.json``.

    Args:
        pdf_path: Path of the PDF that failed.
        object_name: Name of the target object.
        error_message: Error text; non-JSON values are stored as ``str(value)``.
    """
    # Serialise before opening the file so a bad value can never leave a
    # half-written line in the log.
    line = json.dumps(
        {"pdf": pdf_path, "object": object_name, "error": error_message},
        default=str,
    )
    with open("failures.json", "a", encoding="utf-8") as f:
        f.write(line + "\n")
 
5
  import difflib
6
  from simple_salesforce import Salesforce
7
  from dotenv import load_dotenv
 
8
 
9
+ # Load .env variables
10
  load_dotenv()
11
 
12
  def get_salesforce_client():
 
15
  username=os.getenv("SF_USERNAME"),
16
  password=os.getenv("SF_PASSWORD"),
17
  security_token=os.getenv("SF_SECURITY_TOKEN"),
18
+ domain=os.getenv("SF_DOMAIN").replace("https://", "").replace(".salesforce.com", "")
19
  )
20
  return sf, None
21
  except Exception as e:
 
24
def get_salesforce_objects(sf):
    """List the sObject names from the global describe, sorted.

    Entries flagged ``deprecatedAndHidden`` are left out; an entry without the
    flag is treated as visible.

    Args:
        sf: Client object exposing ``describe()``.

    Returns:
        ``(sorted_names, None)`` on success, ``([], error_message)`` on failure.
    """
    try:
        desc = sf.describe()
        object_names = sorted(
            obj["name"]
            for obj in desc["sobjects"]
            # .get(): one entry missing the flag should not abort the listing.
            if not obj.get("deprecatedAndHidden", False)
        )
        return object_names, None
    except Exception as e:
        return [], str(e)
35
 
36
def get_object_fields(sf, object_name):
    """Return the field names of one sObject.

    Args:
        sf: Client object whose attributes are sObject handles.
        object_name: API name of the sObject, e.g. ``"Account"``.

    Returns:
        ``(field_names, None)`` on success, ``([], error_message)`` on failure.
    """
    try:
        # getattr() goes through normal attribute lookup, so it works with any
        # client object rather than only ones that define __getattr__.
        metadata = getattr(sf, object_name).describe()
        return [field["name"] for field in metadata["fields"]], None
    except Exception as e:
        return [], str(e)
43
 
44
  def extract_text_from_pdf(pdf_path):
45
  try:
 
47
  text = ""
48
  for page in doc:
49
  text += page.get_text()
 
50
  return text.strip(), None
51
  except Exception as e:
52
  return None, str(e)
53
 
54
+ # ✅ Extract key-value pairs smartly from the PDF text
55
def extract_key_value_pairs(pdf_path):
    """Extract key/value pairs from the text of a PDF.

    Two passes run over the concatenated text of all pages:

    1. Every line shaped like ``Key: Value`` or ``Key - Value`` (key starts
       with an uppercase letter, value longer than one character).
    2. Targeted searches for "Total Agreement Value", "Agreement Name" and
       the agreement start/end dates.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        ``([{"keys": [...], "values": [...]}], None)`` on success,
        ``(None, error_message)`` on failure.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            text = "".join(page.get_text() for page in doc)
        finally:
            # Release the document even when a page fails to parse.
            doc.close()

        keys = []
        values = []

        # Pattern 1: direct "Key: Value" lines.
        line_pattern = re.compile(r"^([A-Z][A-Za-z0-9 ()/_\-]{3,50})\s*[:\-]\s*(.+)$")
        for line in text.splitlines():
            match = line_pattern.match(line.strip())
            if not match:
                continue
            value = match.group(2).strip()
            if len(value) > 1:
                keys.append(match.group(1).strip())
                values.append(value)

        # Pattern 2: special extractions.

        # Total Agreement Value: the first dollar amount after the label,
        # stored without thousands separators.
        match_val = re.search(r"Total Agreement Value[^\$]*\$\s?([\d,]+(?:\.\d{2})?)", text, re.IGNORECASE)
        if match_val:
            keys.append("Total Agreement Value")
            values.append(match_val.group(1).replace(",", ""))

        # Agreement Name
        match_name = re.search(r"Agreement\s+(MSA\s+[A-Za-z0-9 _\-]+)", text, re.IGNORECASE)
        if match_name:
            keys.append("Agreement Name")
            values.append(match_name.group(1).strip())

        # Agreement Start and End Dates
        match_dates = re.search(
            r"effective as of\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4}).*?until\s*[<\(]?([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})",
            text, re.IGNORECASE | re.DOTALL
        )
        if match_dates:
            keys.append("Agreement Start Date")
            values.append(match_dates.group(1).strip())
            keys.append("Agreement End Date")
            values.append(match_dates.group(2).strip())

        return [{"keys": keys, "values": values}], None
    except Exception as e:
        return None, str(e)
105
 
106
+ # ✅ FIXED: Use original key as mapping key (not value)
107
def map_fields(extracted_data, object_fields):
    """Map each extracted key to the most similar object field name.

    Keys equal to ``name`` or ``email`` (case-insensitive) are skipped.
    Matching and scoring are both case-insensitive, so the reported confidence
    describes the same comparison that picked the field.

    Args:
        extracted_data: List whose first item is a dict with a ``"keys"`` list.
        object_fields: Candidate field names.

    Returns:
        ``(mappings, confidence_scores, None)`` on success. With no candidate
        fields every key maps to ``"No match"`` with score ``0.0``.
        ``(None, None, error_message)`` on failure.
    """
    try:
        # Lower-cased name -> original spelling; the first occurrence wins.
        by_lower = {}
        for field in object_fields:
            by_lower.setdefault(field.lower(), field)
        candidates = list(by_lower)

        mappings = {}
        confidence_scores = {}
        for key in extracted_data[0]["keys"]:
            folded = key.lower()
            if folded in ("name", "email"):
                continue  # Skip these

            # cutoff=0.0 always yields a match unless there are no candidates.
            best_match = difflib.get_close_matches(folded, candidates, n=1, cutoff=0.0)
            if best_match:
                confidence = difflib.SequenceMatcher(None, folded, best_match[0]).ratio()
                mappings[key] = by_lower[best_match[0]]
                confidence_scores[key] = round(confidence, 2)
            else:
                # No fields to choose from; indexing object_fields[0] here
                # would raise IndexError.
                mappings[key] = "No match"
                confidence_scores[key] = 0.0

        return mappings, confidence_scores, None
    except Exception as e:
        return None, None, str(e)
131
 
132
def create_record(sf, object_name, data):
    """Create one record of ``object_name`` from ``data``.

    Args:
        sf: Client object whose attributes are sObject handles.
        object_name: API name of the sObject to create.
        data: Mapping of field name to value, passed to ``create`` as is.

    Returns:
        ``(record_id, None)`` on success, ``(None, error_message)`` when the
        call raises or the response carries no id.
    """
    try:
        result = getattr(sf, object_name).create(data)
        record_id = result.get("id")
        if not record_id:
            # A placeholder id would look like success to the caller and then
            # be used as a link target, so report the failure instead.
            details = result.get("errors") or result
            return None, f"Salesforce did not return a record id: {details}"
        return record_id, None
    except Exception as e:
        return None, str(e)
138
 
139
def attach_pdf(sf, record_id, pdf_path):
    """Upload a PDF as a ContentVersion and link it to a record.

    Args:
        sf: Client exposing ``ContentVersion``, ``ContentDocumentLink`` and
            ``query``.
        record_id: Id of the record the document is linked to.
        pdf_path: Path of the PDF on disk.

    Returns:
        ``("PDF attached successfully", None)`` on success,
        ``(None, error_message)`` on failure.
    """
    import base64  # local import: the file's import block is not fully visible here

    try:
        with open(pdf_path, "rb") as f:
            body = f.read()

        file_name = os.path.basename(pdf_path)
        content_version = sf.ContentVersion.create({
            "Title": file_name,
            "PathOnClient": file_name,
            # The file is read in binary mode, so body is always bytes. The
            # request body is JSON, so send the payload as base64 text.
            "VersionData": base64.b64encode(body).decode("ascii"),
        })

        # Look up the ContentDocument that owns the new version.
        content_document_id = sf.query(
            f"SELECT ContentDocumentId FROM ContentVersion WHERE Id = '{content_version['id']}'"
        )["records"][0]["ContentDocumentId"]

        sf.ContentDocumentLink.create({
            "ContentDocumentId": content_document_id,
            "LinkedEntityId": record_id,
            "ShareType": "V",
        })

        return "PDF attached successfully", None
    except Exception as e:
        return None, str(e)
162
 
163
def log_failure(pdf_path, object_name, error):
    """Append one JSON line describing a failed upload to ``failures.json``.

    Args:
        pdf_path: Path of the PDF that failed.
        object_name: Name of the target object.
        error: Error text; non-JSON values are stored as ``str(value)``.
    """
    # Serialise before opening the file so a bad value can never leave a
    # half-written line in the log.
    line = json.dumps({"pdf": pdf_path, "object": object_name, "error": error}, default=str)
    with open("failures.json", "a", encoding="utf-8") as f:
        f.write(line + "\n")