Files changed (1) hide show
  1. utils.py +30 -99
utils.py CHANGED
@@ -1,122 +1,53 @@
1
- import requests
2
- import base64
3
  import json
4
  import os
5
- from simple_salesforce import Salesforce
6
- from pdf2image import convert_from_path
7
- import pytesseract
8
- from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
9
- from dotenv import load_dotenv
10
 
11
- # Load environment variables
12
- load_dotenv()
13
-
14
- # Salesforce Authentication
15
def get_salesforce_client():
    """Log in to Salesforce using credentials read from the environment.

    Reads SF_USERNAME, SF_PASSWORD and SF_SECURITY_TOKEN, plus the optional
    SF_DOMAIN host name (defaults to ``login.salesforce.com``).

    Returns:
        ``(client, None)`` on success, or ``(None, error_message)`` when a
        credential is missing or the login raises.
    """
    try:
        username = os.getenv('SF_USERNAME')
        password = os.getenv('SF_PASSWORD')
        security_token = os.getenv('SF_SECURITY_TOKEN')
        # An empty SF_DOMAIN is treated like an unset one.
        host = os.getenv('SF_DOMAIN') or 'login.salesforce.com'

        if not all([username, password, security_token]):
            raise ValueError("Missing required Salesforce credentials or instance URL")

        # For username/password logins simple_salesforce selects the login
        # host through `domain`, given as the part before ".salesforce.com"
        # (e.g. "login" or "test"). `instance_url` only applies to
        # session-id logins, so passing it here left SF_DOMAIN unused.
        suffix = '.salesforce.com'
        domain = host[:-len(suffix)] if host.endswith(suffix) else host

        sf = Salesforce(
            username=username,
            password=password,
            security_token=security_token,
            domain=domain,
        )
        print("Salesforce client connected successfully")
        return sf, None
    except Exception as e:
        print(f"Salesforce connection failed: {str(e)}")
        return None, str(e)
37
-
38
- # Fetch Salesforce Objects
39
def get_salesforce_objects(sf):
    """List the names of the objects the client reports as createable.

    Args:
        sf: Client exposing ``restful(path)``; the ``'sobjects'`` response
            is read as a dict with an ``'sobjects'`` list whose entries
            carry ``'name'`` and ``'createable'``.

    Returns:
        ``(names, None)`` on success, or ``([], error_message)`` if the call
        or the response parsing raises.
    """
    try:
        response = sf.restful('sobjects')
        names = [obj['name'] for obj in response['sobjects'] if obj['createable']]
    except Exception as e:
        return [], str(e)
    return names, None
45
 
46
- # Fetch Object Fields
47
def get_object_fields(sf, object_name):
    """Return the field names of one Salesforce object.

    Args:
        sf: Client whose attribute named ``object_name`` exposes
            ``describe()``.
        object_name: API name of the object to describe.

    Returns:
        ``(field_names, None)`` on success, or ``([], error_message)`` if
        the lookup or describe call raises.
    """
    try:
        # getattr() is the supported way to do a dynamic attribute lookup;
        # calling __getattr__ directly skips normal attribute resolution.
        desc = getattr(sf, object_name).describe()
        return [field['name'] for field in desc['fields']], None
    except Exception as e:
        return [], str(e)
53
 
54
- # OCR for Text Extraction
55
def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF.

    Each page is rendered with ``convert_from_path`` and passed to
    ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        ``({"pages": [text, ...]}, None)`` on success, or
        ``({}, error_message)`` on failure. Errors whose text mentions
        Poppler are replaced with an installation hint.
    """
    try:
        images = convert_from_path(pdf_path)
        text_data = [pytesseract.image_to_string(img) for img in images]
        return {"pages": text_data}, None
    except Exception as e:
        message = str(e)
        if "poppler" in message.lower():
            return {}, "Error: Unable to process PDF. Please ensure Poppler is installed and in PATH (e.g., 'apt-get install poppler-utils' on Ubuntu)."
        return {}, message
64
 
65
- # Key-Value Pair Extraction using LayoutLMv3
66
def extract_key_value_pairs(pdf_path):
    """Run LayoutLMv3 over each page and return placeholder key/value pairs.

    The model output is not decoded yet: every page contributes the same
    hard-coded dummy entry.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        ``([{"keys": [...], "values": [...]}, ...], None)`` with one entry
        per page, or ``([], error_message)`` on failure. Errors whose text
        mentions Poppler are replaced with an installation hint.
    """
    try:
        processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
        model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base-finetuned-funsd")
        images = convert_from_path(pdf_path)
        extracted_data = []
        for img in images:
            encoding = processor(img, truncation=True, return_tensors="pt")
            # The forward pass still runs so failures surface, but its
            # result is discarded until post-processing is implemented.
            model(**encoding)
            # Simplified: Return dummy key-value pairs (real implementation needs post-processing)
            extracted_data.append({"keys": ["Contract Number", "Date"], "values": ["12345", "2025-01-01"]})
        return extracted_data, None
    except Exception as e:
        message = str(e)
        if "poppler" in message.lower():
            return [], "Error: Unable to process PDF. Please ensure Poppler is installed and in PATH (e.g., 'apt-get install poppler-utils' on Ubuntu)."
        return [], message
82
-
83
- # Map Extracted Data to Salesforce Fields
84
def map_fields(extracted_data, salesforce_fields):
    """Map extracted keys to Salesforce field names by substring match.

    Only the first page (``extracted_data[0]``) is considered. A key maps to
    a field when the lower-cased key is contained in the lower-cased field
    name; if several fields match, the last one in ``salesforce_fields``
    wins. Keys without a match are left out.

    Args:
        extracted_data: List of per-page dicts with a ``"keys"`` list.
        salesforce_fields: Candidate Salesforce field names.

    Returns:
        ``(mappings, confidence_scores, None)``, or ``({}, {}, message)``
        when there is no extracted data to map.
    """
    mappings = {}
    confidence_scores = {}
    # Guard the first-page lookup so empty input reports an error instead
    # of raising IndexError.
    if not extracted_data:
        return mappings, confidence_scores, "No extracted data to map"
    for key in extracted_data[0]["keys"]:  # Simplified: Using first page
        needle = key.lower()
        for field in salesforce_fields:
            if needle in field.lower():
                mappings[key] = field
                confidence_scores[key] = 0.9  # Dummy confidence score
    return mappings, confidence_scores, None
93
-
94
- # Create Salesforce Record
95
def create_record(sf, object_api_name, data):
    """Create one record of the given Salesforce object.

    Args:
        sf: Client whose attribute named ``object_api_name`` exposes
            ``create(data)``.
        object_api_name: API name of the object to create.
        data: Field values passed to ``create``.

    Returns:
        ``(record_id, None)`` taken from the result's ``'id'`` entry, or
        ``(None, error_message)`` if the call raises.
    """
    try:
        # getattr() is the supported way to do a dynamic attribute lookup.
        result = getattr(sf, object_api_name).create(data)
        return result['id'], None
    except Exception as e:
        return None, str(e)
101
 
102
- # Attach PDF to Salesforce Record
103
def attach_pdf(sf, record_id, file_path):
    """Upload a file as an Attachment on an existing record.

    Args:
        sf: Client exposing ``Attachment.create(payload)``.
        record_id: Id used as the attachment's ``ParentId``.
        file_path: Path of the file to upload; its base name becomes the
            attachment name.

    Returns:
        ``("PDF Attached", None)`` on success, or ``(None, error_message)``
        if reading the file or the create call raises.
    """
    try:
        with open(file_path, "rb") as f:
            encoded_file = base64.b64encode(f.read()).decode("ascii")
        # The file is closed before the upload call is made.
        sf.Attachment.create({
            "ParentId": record_id,
            "Name": os.path.basename(file_path),
            "Body": encoded_file,
        })
        return "PDF Attached", None
    except Exception as e:
        return None, str(e)
 
 
 
 
 
 
116
 
117
- # Log Failed Migration
118
def log_failure(pdf_path, object_name, error):
    """Append one failed-migration record to ``failures.json``.

    Each call adds a single JSON object followed by a newline, with the
    keys ``"pdf"``, ``"object"`` and ``"error"``.

    Args:
        pdf_path: Path of the PDF that failed.
        object_name: Salesforce object the migration targeted.
        error: JSON-serializable description of the failure.

    Raises:
        TypeError: If a value cannot be serialized to JSON.
    """
    # Serialize before touching the file so a non-serializable value raises
    # without leaving a truncated line in the log.
    log_entry = json.dumps({"pdf": pdf_path, "object": object_name, "error": error})
    with open("failures.json", "a", encoding="utf-8") as f:
        f.write(log_entry + "\n")
 
1
+ import fitz # PyMuPDF
 
2
  import json
3
  import os
 
 
 
 
 
4
 
 
 
 
 
5
def get_salesforce_client():
    """Return a placeholder Salesforce client.

    No authentication is performed: a fixed string stands in for the
    client.

    Returns:
        ``("FakeSFClient", None)`` as a ``(client, error)`` pair.
    """
    # Replace with actual authentication logic
    return "FakeSFClient", None
 
 
 
 
 
 
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
def get_salesforce_objects(sf):
    """Return a fixed list of Salesforce object names.

    Placeholder: ``sf`` is accepted but not used.

    Returns:
        ``(["Account", "Contact", "Opportunity"], None)`` as a
        ``(names, error)`` pair.
    """
    return ["Account", "Contact", "Opportunity"], None
 
 
 
 
11
 
 
12
def get_object_fields(sf, object_name):
    """Return a fixed list of field names.

    Placeholder: neither ``sf`` nor ``object_name`` is used, so every
    object reports the same fields.

    Returns:
        ``(["Name", "Email", "Phone"], None)`` as a ``(fields, error)``
        pair.
    """
    return ["Name", "Email", "Phone"], None
 
 
 
 
14
 
 
15
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF with PyMuPDF.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        ``(text, None)`` with the page texts concatenated and outer
        whitespace stripped, or ``(None, error_message)`` if opening or
        reading the document raises.
    """
    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)
        return text.strip(), None
    except Exception as e:
        return None, str(e)
 
 
24
 
 
25
def extract_key_value_pairs(pdf_path):
    """Return placeholder key/value pairs for a PDF.

    ``pdf_path`` is accepted but not read: the result is a hard-coded
    single-page example.

    Returns:
        ``([{"keys": [...], "values": [...]}], None)`` as a
        ``(pages, error)`` pair.
    """
    # Dummy example; replace with NLP extraction logic.
    # Returning a literal cannot raise, so no error branch is needed yet.
    return [{"keys": ["Name", "Email"], "values": ["John Doe", "john@example.com"]}], None
31
 
32
def map_fields(extracted_data, object_fields):
    """Map each extracted key to a Salesforce field name.

    Only the first page (``extracted_data[0]``) is considered. A key maps to
    the first field whose lower-cased name is contained in the lower-cased
    key; a key with no such field falls back to ``object_fields[0]``. Every
    mapping, fallbacks included, gets the fixed score 0.9.

    Args:
        extracted_data: List of per-page dicts with a ``"keys"`` list.
        object_fields: Candidate Salesforce field names.

    Returns:
        ``(mappings, confidence_scores, None)`` on success, or
        ``(None, None, error_message)`` on failure.
    """
    # Report empty input explicitly instead of as "list index out of range".
    if not extracted_data:
        return None, None, "No extracted data to map"
    try:
        mappings = {}
        confidence_scores = {}
        for k in extracted_data[0]["keys"]:
            lowered = k.lower()
            match = next((f for f in object_fields if f.lower() in lowered), None)
            if match is None:
                if not object_fields:
                    return None, None, "No Salesforce fields available for mapping"
                # No field name occurs in the key: fall back to the first field.
                match = object_fields[0]
            mappings[k] = match
            confidence_scores[k] = 0.9
        return mappings, confidence_scores, None
    except Exception as e:
        return None, None, str(e)
43
+
44
def create_record(sf, object_name, data):
    """Pretend to create a Salesforce record.

    Placeholder: none of the arguments are used and nothing is sent.

    Returns:
        ``("001ABC123XYZ", None)`` as a ``(record_id, error)`` pair.
    """
    return "001ABC123XYZ", None  # Simulated Salesforce ID
46
+
47
def attach_pdf(sf, record_id, pdf_path):
    """Pretend to attach a PDF to a record.

    Placeholder: none of the arguments are used and no file is read.

    Returns:
        ``("PDF attached successfully", None)`` as a ``(status, error)``
        pair.
    """
    return "PDF attached successfully", None
49
 
 
50
def log_failure(pdf_path, object_name, error):
    """Append one failed-migration record to ``failures.json``.

    Each call adds a single JSON object followed by a newline, with the
    keys ``"pdf"``, ``"object"`` and ``"error"``.

    Args:
        pdf_path: Path of the PDF that failed.
        object_name: Salesforce object the migration targeted.
        error: JSON-serializable description of the failure.

    Raises:
        TypeError: If a value cannot be serialized to JSON.
    """
    # Serialize before touching the file so a non-serializable value raises
    # without leaving a truncated line in the log.
    record = json.dumps({"pdf": pdf_path, "object": object_name, "error": error})
    with open("failures.json", "a", encoding="utf-8") as f:
        f.write(record + "\n")