pavansuresh committed on
Commit
6c54356
·
verified ·
1 Parent(s): 3de93fb

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +123 -165
utils.py CHANGED
@@ -1,166 +1,124 @@
 
1
  import os
2
- import fitz # PyMuPDF
3
- import json
4
- import re
5
- import difflib
6
- from simple_salesforce import Salesforce
7
- from dotenv import load_dotenv
8
-
9
- # Load .env variables
10
- load_dotenv()
11
-
12
def get_salesforce_client():
    """Create a Salesforce client from the SF_* environment variables.

    Reads SF_USERNAME, SF_PASSWORD, SF_SECURITY_TOKEN and SF_DOMAIN.
    SF_DOMAIN may be a bare prefix or a full URL such as
    "https://test.salesforce.com"; the scheme and the ".salesforce.com"
    suffix are removed before the value is handed to the client.

    Returns:
        tuple: ``(client, None)`` on success, ``(None, error_message)``
        on any failure.
    """
    try:
        # An unset SF_DOMAIN used to surface as the opaque error
        # "'NoneType' object has no attribute 'replace'"; fall back to
        # "login" instead.
        raw_domain = os.getenv("SF_DOMAIN") or "login"
        domain = (
            raw_domain.strip()
            .replace("https://", "")
            .replace(".salesforce.com", "")
            .rstrip("/")
        ) or "login"
        sf = Salesforce(
            username=os.getenv("SF_USERNAME"),
            password=os.getenv("SF_PASSWORD"),
            security_token=os.getenv("SF_SECURITY_TOKEN"),
            domain=domain,
        )
        return sf, None
    except Exception as e:
        return None, str(e)
23
-
24
def get_salesforce_objects(sf):
    """List the names of all sObjects that are not deprecated-and-hidden.

    Args:
        sf: Client object exposing ``describe()``, whose result has an
            ``'sobjects'`` list of dicts with ``'name'`` and
            ``'deprecatedAndHidden'`` entries.

    Returns:
        tuple: ``(sorted_names, None)`` on success, ``([], error_message)``
        on any failure.
    """
    try:
        description = sf.describe()
        visible_names = [
            entry['name']
            for entry in description['sobjects']
            if not entry['deprecatedAndHidden']
        ]
        return sorted(visible_names), None
    except Exception as exc:
        return [], str(exc)
35
-
36
def get_object_fields(sf, object_name):
    """Return the field names of one Salesforce object.

    Args:
        sf: Client object; the attribute named ``object_name`` must expose
            ``describe()`` returning a dict with a ``'fields'`` list.
        object_name: API name of the object to describe.

    Returns:
        tuple: ``(field_names, None)`` on success, ``([], error_message)``
        on any failure, including a missing ``object_name``.
    """
    if not object_name:
        return [], "No Salesforce object selected"
    try:
        # getattr() is the supported way to trigger attribute lookup;
        # calling the __getattr__ dunder directly skips normal lookup.
        metadata = getattr(sf, object_name).describe()
        return [field['name'] for field in metadata['fields']], None
    except Exception as e:
        return [], str(e)
43
-
44
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated and stripped.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        tuple: ``(text, None)`` on success, ``(None, error_message)`` on
        any failure.
    """
    try:
        # The context manager closes the document even if a page fails;
        # the original never closed it.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)
        return text.strip(), None
    except Exception as e:
        return None, str(e)
53
-
54
# Extract key-value pairs from the PDF text
def extract_key_value_pairs(pdf_path):
    """Pull contract key/value pairs out of a PDF.

    Two passes run over the concatenated page text:

    1. every line shaped like ``Key: Value`` or ``Key - Value`` (key starts
       with a capital letter, value longer than one character);
    2. targeted searches for the total agreement value, the agreement
       name, and the agreement start/end dates.

    Results of pass 2 are appended after pass 1, so a key may occur twice.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        tuple: ``([{"keys": [...], "values": [...]}], None)`` on success,
        where the two lists are parallel, or ``(None, error_message)`` on
        any failure.
    """
    try:
        # Close the document deterministically; the original leaked it.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)

        keys = []
        values = []

        # Pattern 1: direct "Key: Value" lines (compiled once, not per line)
        line_pattern = re.compile(
            r"^([A-Z][A-Za-z0-9 ()/_\-]{3,50})\s*[:\-]\s*(.+)$"
        )
        for line in text.splitlines():
            match = line_pattern.match(line.strip())
            if not match:
                continue
            key = match.group(1).strip()
            value = match.group(2).strip()
            if len(value) > 1:
                keys.append(key)
                values.append(value)

        # Pattern 2: special extractions

        # Total Agreement Value: keep digits only (thousands commas removed)
        match_val = re.search(
            r"Total Agreement Value[^\$]*\$\s?([\d,]+(?:\.\d{2})?)",
            text,
            re.IGNORECASE,
        )
        if match_val:
            keys.append("Total Agreement Value")
            values.append(match_val.group(1).replace(",", ""))

        # Agreement Name
        match_name = re.search(
            r"Agreement\s+(MSA\s+[A-Za-z0-9 _\-]+)", text, re.IGNORECASE
        )
        if match_name:
            keys.append("Agreement Name")
            values.append(match_name.group(1).strip())

        # Agreement Start and End Dates
        match_dates = re.search(
            r"effective as of\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4}).*?until\s*[<\(]?([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})",
            text,
            re.IGNORECASE | re.DOTALL,
        )
        if match_dates:
            keys.append("Agreement Start Date")
            values.append(match_dates.group(1).strip())
            keys.append("Agreement End Date")
            values.append(match_dates.group(2).strip())

        return [{"keys": keys, "values": values}], None
    except Exception as e:
        return None, str(e)
105
-
106
# Mappings are keyed by the extracted key (not by its value)
def map_fields(extracted_data, object_fields):
    """Map each extracted key to the most similar Salesforce field name.

    Similarity is the case-insensitive ``difflib.SequenceMatcher`` ratio.
    The same metric both picks the field and is reported as confidence;
    the original picked case-sensitively but scored case-insensitively.
    Keys equal to "name" or "email" (any case) are skipped.

    Args:
        extracted_data: One-element list holding a dict with parallel
            ``"keys"`` and ``"values"`` lists.
        object_fields: Candidate Salesforce field names.

    Returns:
        tuple: ``(mappings, confidence_scores, None)`` on success, both
        dicts keyed by extracted key; ``(None, None, error_message)`` on
        failure, including when there is a key to map but no candidates.
    """
    try:
        mappings = {}
        confidence_scores = {}
        keys = extracted_data[0]["keys"]
        values = extracted_data[0]["values"]
        candidates = [(field, field.lower()) for field in object_fields]

        # zip() is kept so keys without a matching value are still ignored.
        for key, _value in zip(keys, values):
            key_lower = key.lower()
            if key_lower in ("name", "email"):
                continue  # Skip these

            if not candidates:
                # Previously this fell through to object_fields[0] and
                # reported an opaque "list index out of range".
                return None, None, "No Salesforce fields available to map against"

            scored = [
                (difflib.SequenceMatcher(None, key_lower, lowered).ratio(), field)
                for field, lowered in candidates
            ]
            # On equal scores the earliest field in object_fields wins.
            confidence, matched_field = max(scored, key=lambda item: item[0])
            mappings[key] = matched_field
            confidence_scores[key] = round(confidence, 2)

        return mappings, confidence_scores, None
    except Exception as e:
        return None, None, str(e)
131
-
132
def create_record(sf, object_name, data):
    """Create one record of ``object_name`` and return its id.

    Args:
        sf: Client object; the attribute named ``object_name`` must expose
            ``create(data)`` returning a dict-like result.
        object_name: API name of the object to create.
        data: Field-name to value mapping for the new record.

    Returns:
        tuple: ``(record_id, None)`` on success, ``(None, error_message)``
        when the call raises, reports ``success`` as False, or returns no
        id.
    """
    try:
        # getattr() instead of calling the __getattr__ dunder directly.
        result = getattr(sf, object_name).create(data)
        record_id = result.get("id")
        if not record_id or result.get("success") is False:
            # The original returned the literal "Unknown ID" here, which
            # a caller would then use as if it were a real record id.
            details = result.get("errors") or "Salesforce did not return a record id"
            return None, f"Record creation failed: {details}"
        return record_id, None
    except Exception as e:
        return None, str(e)
138
-
139
def attach_pdf(sf, record_id, pdf_path):
    """Upload a PDF as a ContentVersion and link it to a record.

    Steps: create the ContentVersion, look up its ContentDocumentId, then
    create a ContentDocumentLink (ShareType "V") pointing at ``record_id``.

    Args:
        sf: Client object exposing ``ContentVersion.create``,
            ``ContentDocumentLink.create`` and ``query``.
        record_id: Id of the record the document is linked to.
        pdf_path: Path of the PDF file to upload.

    Returns:
        tuple: ``("PDF attached successfully", None)`` on success,
        ``(None, error_message)`` on any failure.
    """
    # Local import so this function does not depend on a file-level import.
    import base64

    try:
        with open(pdf_path, "rb") as f:
            body = f.read()

        file_name = os.path.basename(pdf_path)
        content_version = sf.ContentVersion.create({
            "Title": file_name,
            "PathOnClient": file_name,
            # The payload is sent as JSON, so the binary body must be a
            # base64 *string*. The old ``body.encode("base64")`` was a
            # Python 2 idiom and never ran for bytes, so raw bytes were
            # passed through and could not be serialised.
            "VersionData": base64.b64encode(body).decode("ascii"),
        })

        content_document_id = sf.query(
            f"SELECT ContentDocumentId FROM ContentVersion WHERE Id = '{content_version['id']}'"
        )["records"][0]["ContentDocumentId"]

        sf.ContentDocumentLink.create({
            "ContentDocumentId": content_document_id,
            "LinkedEntityId": record_id,
            "ShareType": "V",
        })

        return "PDF attached successfully", None
    except Exception as e:
        return None, str(e)
162
-
163
def log_failure(pdf_path, object_name, error):
    """Append one failure record as a JSON line to ``failures.json``.

    Args:
        pdf_path: Stored under the ``"pdf"`` key.
        object_name: Stored under the ``"object"`` key.
        error: Stored under the ``"error"`` key.
    """
    entry = {"pdf": pdf_path, "object": object_name, "error": error}
    serialized = json.dumps(entry)
    with open("failures.json", "a") as log_file:
        log_file.write(serialized)
        log_file.write("\n")
 
1
+ import gradio as gr
2
  import os
3
+ from utils import (
4
+ get_salesforce_client,
5
+ get_salesforce_objects,
6
+ get_object_fields,
7
+ extract_key_value_pairs,
8
+ map_fields,
9
+ create_record,
10
+ attach_pdf,
11
+ log_failure
12
+ )
13
+
14
# Environment setup: make sure the local "temp" folder exists and point
# GRADIO_TEMP_DIR at it.
os.makedirs("temp", exist_ok=True)
os.environ["GRADIO_TEMP_DIR"] = "temp"

# Module-level cache of Salesforce field names, keyed by object name.
object_fields_cache = {}
20
+
21
# Get all Salesforce object names
def get_salesforce_object_names():
    """Return the Salesforce object names offered in the dropdown.

    Returns:
        list[str]: The object names, or a single-item list holding an
        error label when connecting or listing fails.
    """
    sf, error = get_salesforce_client()
    if error:
        return ["Error connecting to Salesforce"]
    object_names, error = get_salesforce_objects(sf)
    if error:
        # This error used to be discarded, leaving an empty dropdown with
        # no hint of what went wrong; report it like a connection failure.
        return ["Error listing Salesforce objects"]
    return object_names
28
+
29
# Extract key-value pairs
def extract_contract_fields(pdf_paths):
    """Extract key/value pairs from the uploaded PDF.

    Args:
        pdf_paths: The upload as delivered by the file input: a single
            path, an object with a ``name`` attribute holding the path, or
            a list/tuple of either (only the first entry is used).

    Returns:
        tuple: ``(extracted, status)`` where ``extracted`` is the dict of
        parallel ``"keys"``/``"values"`` lists (``{}`` on failure) and
        ``status`` is a human-readable message.
    """
    # A single-file upload arrives as one item, not a list; the old
    # list-only check rejected every such upload as "No PDF uploaded".
    upload = pdf_paths
    if isinstance(upload, (list, tuple)):
        upload = upload[0] if upload else None
    pdf_path = getattr(upload, "name", upload)
    if not pdf_path:
        return {}, "No PDF uploaded"

    extracted_data, error = extract_key_value_pairs(pdf_path)
    if error:
        return {}, f"Extraction error: {error}"
    return extracted_data[0], "Extraction successful"
37
+
38
# Map fields between extracted data and Salesforce object fields
def map_contract_fields(pdf_paths, object_name):
    """Describe how the PDF's extracted keys map onto an object's fields.

    Args:
        pdf_paths: The upload as delivered by the file input: a single
            path, an object with a ``name`` attribute holding the path, or
            a list/tuple of either (only the first entry is used).
        object_name: API name of the target Salesforce object.

    Returns:
        str: One ``key -> field (Confidence: score)`` line per mapping, or
        an error message.
    """
    # A single-file upload arrives as one item, not a list; the old
    # list-only check rejected every such upload as "No PDF uploaded".
    upload = pdf_paths
    if isinstance(upload, (list, tuple)):
        upload = upload[0] if upload else None
    pdf_path = getattr(upload, "name", upload)
    if not pdf_path:
        return "No PDF uploaded"
    if not object_name:
        return "Error: No Salesforce object selected"

    extracted_data, error = extract_key_value_pairs(pdf_path)
    if error:
        return f"Error: {error}"

    fields = object_fields_cache.get(object_name)
    if fields is None:
        # Only connect on a cache miss; a cache hit needs no client.
        sf, error = get_salesforce_client()
        if error:
            return f"Error: {error}"
        fields, error = get_object_fields(sf, object_name)
        if error:
            # Do not cache a failed lookup: the old code stored the empty
            # result, so the object stayed broken for the process lifetime.
            return f"Error: {error}"
        object_fields_cache[object_name] = fields

    mappings, scores, error = map_fields(extracted_data, fields)
    if error:
        # Previously ignored, which crashed on ``None.items()`` below.
        return f"Error: {error}"

    return "".join(
        f"{key} -> {sf_field} (Confidence: {scores.get(key, 0)})\n"
        for key, sf_field in mappings.items()
    )
58
+
59
# Migrate to Salesforce
def migrate_to_salesforce(pdf_paths, object_name, account_id=None):
    """Create a Salesforce record from the PDF and attach the PDF to it.

    Every failure after input validation is appended to the failure log
    via ``log_failure`` and returned as an ``"Error: ..."`` string.

    Args:
        pdf_paths: The upload as delivered by the file input: a single
            path, an object with a ``name`` attribute holding the path, or
            a list/tuple of either (only the first entry is used).
        object_name: API name of the Salesforce object to create.
        account_id: Optional value for the record's ``AccountId``. When
            omitted, the ``SF_ACCOUNT_ID`` environment variable is used,
            then the original placeholder id.

    Returns:
        str: A success summary with the record id and attachment status,
        or an error message.
    """
    # A single-file upload arrives as one item, not a list; the old
    # list-only check rejected every such upload.
    upload = pdf_paths
    if isinstance(upload, (list, tuple)):
        upload = upload[0] if upload else None
    pdf_path = getattr(upload, "name", upload)
    if not pdf_path:
        return "Error: No valid PDF file provided"

    def fail(message):
        # Shared failure path: log the failure, then report it.
        log_failure(pdf_path, object_name, message)
        return f"Error: {message}"

    sf, error = get_salesforce_client()
    if error:
        return fail(error)
    extracted_data, error = extract_key_value_pairs(pdf_path)
    if error:
        return fail(error)
    fields, error = get_object_fields(sf, object_name)
    if error:
        return fail(error)
    mappings, _, error = map_fields(extracted_data, fields)
    if error:
        return fail(error)

    data = {}
    for key, value in zip(extracted_data[0]["keys"], extracted_data[0]["values"]):
        sf_field = mappings.get(key)
        if not sf_field:
            continue
        if "value" in key.lower() and isinstance(value, str):
            try:
                value = float(value.replace(",", "").replace("$", ""))
            except ValueError:
                # Not every key containing "value" is numeric; keep the raw
                # text instead of crashing the whole migration.
                pass
        data[sf_field] = value

    # NOTE(review): the last fallback is the original hard-coded
    # placeholder, which is not a real record id - pass account_id or set
    # SF_ACCOUNT_ID.
    data["AccountId"] = (
        account_id or os.getenv("SF_ACCOUNT_ID") or "001XXXXXXXXXXXXXXX"
    )

    record_id, error = create_record(sf, object_name, data)
    if error:
        return fail(error)
    attach_status, error = attach_pdf(sf, record_id, pdf_path)
    if error:
        return fail(error)
    return f" Record Created: {record_id}\n📎 Attachment: {attach_status}"
101
+
102
# Gradio UI: one upload + object picker row, then a button/output pair for
# each of the three steps (extract, map, migrate).
with gr.Blocks(title="Smart Contract Migrator") as demo:
    gr.Markdown("# 📄 Smart Contract Migrator to Salesforce")

    with gr.Row():
        pdf_input = gr.File(
            label="Upload Contract PDF",
            file_types=[".pdf"],
        )
        # The choices are fetched once, while the UI is being built.
        object_dropdown = gr.Dropdown(
            label="Salesforce Object",
            choices=get_salesforce_object_names(),
        )

    extract_btn = gr.Button("Extract Fields")
    extract_output = gr.JSON(label="Extracted Fields")

    map_btn = gr.Button("Map Fields")
    map_output = gr.Textbox(label="Field Mappings")

    migrate_btn = gr.Button("Migrate to Salesforce")
    migrate_output = gr.Textbox(label="Migration Result")

    # Extraction returns (fields, status); the status text is shown in the
    # mappings textbox.
    extract_btn.click(
        fn=extract_contract_fields,
        inputs=[pdf_input],
        outputs=[extract_output, map_output],
    )
    map_btn.click(
        fn=map_contract_fields,
        inputs=[pdf_input, object_dropdown],
        outputs=map_output,
    )
    migrate_btn.click(
        fn=migrate_to_salesforce,
        inputs=[pdf_input, object_dropdown],
        outputs=migrate_output,
    )

if __name__ == "__main__":
    demo.launch()