pavansuresh committed on
Commit
06e182c
·
verified ·
1 Parent(s): 412abe8

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +109 -0
utils.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import base64
3
+ import json
4
+ import os
5
+ from simple_salesforce import Salesforce
6
+ from pdf2image import convert_from_path
7
+ import pytesseract
8
+ from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
9
+ from dotenv import load_dotenv
10
+
11
+ # Load environment variables
12
+ load_dotenv()
13
+
14
+ # Salesforce Authentication
15
def get_salesforce_client():
    """Build a simple-salesforce client from environment variables.

    Reads SALESFORCE_USERNAME, SALESFORCE_PASSWORD, SALESFORCE_SECURITY_TOKEN,
    SALESFORCE_CLIENT_ID, SALESFORCE_CLIENT_SECRET and SALESFORCE_INSTANCE_URL
    via ``os.getenv`` (unset variables are passed through as ``None``).

    Returns:
        tuple: ``(client, None)`` on success, or ``(None, error_message)``
        when constructing the client raises.
    """
    try:
        sf = Salesforce(
            username=os.getenv('SALESFORCE_USERNAME'),
            password=os.getenv('SALESFORCE_PASSWORD'),
            security_token=os.getenv('SALESFORCE_SECURITY_TOKEN'),
            # Connected-app credentials belong in consumer_key/consumer_secret.
            # The previous ``client_secret=`` keyword is not part of the
            # Salesforce() signature, so the call failed before any login.
            # NOTE(review): confirm the installed simple-salesforce version
            # supports consumer_key/consumer_secret.
            consumer_key=os.getenv('SALESFORCE_CLIENT_ID'),
            consumer_secret=os.getenv('SALESFORCE_CLIENT_SECRET'),
            instance_url=os.getenv('SALESFORCE_INSTANCE_URL'),
        )
        return sf, None
    except Exception as e:
        # Errors are reported as a string so callers can display or log them.
        return None, str(e)
28
+
29
+ # Fetch Salesforce Objects
30
def get_salesforce_objects(sf):
    """Return the names of the sObjects whose ``createable`` flag is truthy.

    Args:
        sf: Client exposing ``restful(path)``; the ``'sobjects'`` response is
            expected to hold a ``'sobjects'`` list of dicts with ``'name'``
            and ``'createable'`` keys.

    Returns:
        tuple: ``(names, None)`` on success, or ``([], error_message)`` if
        anything raises.
    """
    try:
        catalog = sf.restful('sobjects')
        createable_names = []
        for entry in catalog['sobjects']:
            if entry['createable']:
                createable_names.append(entry['name'])
    except Exception as exc:
        return [], str(exc)
    return createable_names, None
36
+
37
+ # Fetch Object Fields
38
def get_object_fields(sf, object_name):
    """Return the field names reported by ``describe()`` for one sObject.

    Args:
        sf: Client whose attributes resolve to per-object handles.
        object_name: API name of the object to describe.

    Returns:
        tuple: ``(field_names, None)`` on success, or ``([], error_message)``
        if the lookup or describe call raises.
    """
    try:
        # Use the built-in getattr rather than invoking the dunder directly;
        # this is the same lookup path as writing ``sf.<object_name>``.
        desc = getattr(sf, object_name).describe()
        return [field['name'] for field in desc['fields']], None
    except Exception as e:
        return [], str(e)
44
+
45
+ # OCR for Text Extraction
46
def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and collect the recognised text.

    Args:
        pdf_path: Path handed to ``convert_from_path``.

    Returns:
        tuple: ``({"pages": [text, ...]}, None)`` with one string per rendered
        page, or ``({}, error_message)`` if rendering or OCR raises.
    """
    try:
        page_texts = []
        for page_image in convert_from_path(pdf_path):
            page_texts.append(pytesseract.image_to_string(page_image))
    except Exception as exc:
        return {}, str(exc)
    return {"pages": page_texts}, None
53
+
54
+ # Key-Value Pair Extraction using LayoutLMv3
55
def extract_key_value_pairs(pdf_path):
    """Run LayoutLMv3 over each PDF page and return placeholder key/value data.

    The model's forward pass is executed for every page, but its output is not
    post-processed yet: each page contributes the same hard-coded dummy entry.

    Args:
        pdf_path: Path handed to ``convert_from_path``.

    Returns:
        tuple: ``(per_page_entries, None)`` on success, or
        ``([], error_message)`` if loading, rendering or inference raises.
    """
    try:
        processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
        model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base-finetuned-funsd")
        page_images = convert_from_path(pdf_path)
        per_page_entries = []
        for page_image in page_images:
            model_inputs = processor(page_image, truncation=True, return_tensors="pt")
            # Forward pass only; predictions are discarded until real
            # post-processing is implemented.
            model(**model_inputs)
            # Simplified: dummy key-value pairs stand in for decoded output.
            per_page_entries.append(
                {"keys": ["Contract Number", "Date"], "values": ["12345", "2025-01-01"]}
            )
    except Exception as exc:
        return [], str(exc)
    return per_page_entries, None
69
+
70
+ # Map Extracted Data to Salesforce Fields
71
def map_fields(extracted_data, salesforce_fields):
    """Map extracted keys from the first page onto Salesforce field names.

    A key maps to a field when the lower-cased key is a substring of the
    lower-cased field name. If several fields match, the last one in
    ``salesforce_fields`` wins. Every mapped key gets a fixed dummy
    confidence of 0.9.

    Args:
        extracted_data: List of per-page dicts, each with a ``"keys"`` list.
            Only the first page is used.
        salesforce_fields: Iterable of candidate field names.

    Returns:
        tuple: ``(mappings, confidence_scores, error)``. ``error`` is ``None``
        on success, or a message when ``extracted_data`` is empty.
    """
    # An empty list is what the extraction step returns on failure; report it
    # through the error slot instead of raising IndexError on ``[0]``.
    if not extracted_data:
        return {}, {}, "No extracted data to map"

    lowered_fields = [(field, field.lower()) for field in salesforce_fields]
    mappings = {}
    confidence_scores = {}
    for key in extracted_data[0]["keys"]:  # Simplified: using first page only
        needle = key.lower()
        matches = [field for field, lowered in lowered_fields if needle in lowered]
        if matches:
            mappings[key] = matches[-1]  # last matching field wins
            confidence_scores[key] = 0.9  # Dummy confidence score
    return mappings, confidence_scores, None
80
+
81
+ # Create Salesforce Record
82
def create_record(sf, object_api_name, data):
    """Create one record of the given sObject type.

    Args:
        sf: Client whose attributes resolve to per-object handles.
        object_api_name: API name of the object to create.
        data: Field values passed straight to the handle's ``create``.

    Returns:
        tuple: ``(record_id, None)`` using the ``'id'`` entry of the create
        result, or ``(None, error_message)`` if anything raises.
    """
    try:
        # Use the built-in getattr rather than invoking the dunder directly;
        # this is the same lookup path as writing ``sf.<object_api_name>``.
        result = getattr(sf, object_api_name).create(data)
        return result['id'], None
    except Exception as e:
        return None, str(e)
88
+
89
+ # Attach PDF to Salesforce Record
90
def attach_pdf(sf, record_id, file_path):
    """Upload a file as a base64-encoded Attachment on an existing record.

    Args:
        sf: Client exposing ``Attachment.create``.
        record_id: Id stored as the attachment's ``ParentId``.
        file_path: File to read; its base name becomes the attachment name.

    Returns:
        tuple: ``("PDF Attached", None)`` on success, or
        ``(None, error_message)`` if reading or uploading raises.
    """
    try:
        with open(file_path, "rb") as pdf_file:
            raw_bytes = pdf_file.read()
        payload = {
            "ParentId": record_id,
            "Name": os.path.basename(file_path),
            "Body": base64.b64encode(raw_bytes).decode(),
        }
        sf.Attachment.create(payload)
    except Exception as exc:
        return None, str(exc)
    return "PDF Attached", None
103
+
104
+ # Log Failed Migration
105
def log_failure(pdf_path, object_name, error):
    """Append one failed-migration record to ``failures.json``.

    Each call adds a single JSON object followed by a newline, so the file
    accumulates one record per line.

    Args:
        pdf_path: Stored under the ``"pdf"`` key.
        object_name: Stored under the ``"object"`` key.
        error: Stored under the ``"error"`` key.
    """
    record = dict(pdf=pdf_path, object=object_name, error=error)
    with open("failures.json", "a") as log_file:
        json.dump(record, log_file)
        log_file.write("\n")