File size: 4,570 Bytes
06e182c
 
3b7cab4
06e182c
3b7cab4
 
 
 
06e182c
 
3b7cab4
06e182c
 
3b7cab4
 
 
3c36447
 
 
 
 
 
 
 
 
3b7cab4
3c36447
 
 
 
3b7cab4
 
 
 
 
 
06e182c
3b7cab4
 
 
 
 
 
 
06e182c
3b7cab4
 
 
 
 
 
 
06e182c
3b7cab4
 
06e182c
3b7cab4
 
 
 
0ae7721
 
3b7cab4
06e182c
3b7cab4
 
 
 
 
 
 
 
 
 
 
 
 
 
0ae7721
 
3b7cab4
06e182c
3b7cab4
 
 
 
 
 
 
 
 
 
a00fb61
3b7cab4
 
 
 
 
 
 
06e182c
3b7cab4
 
06e182c
 
3b7cab4
 
06e182c
 
3b7cab4
06e182c
3b7cab4
 
06e182c
3b7cab4
06e182c
3b7cab4
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import base64
import json
import os
from simple_salesforce import Salesforce
from pdf2image import convert_from_path
import pytesseract
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Salesforce Authentication
def get_salesforce_client():
    """Create an authenticated simple_salesforce client from environment variables.

    Reads ``SF_USERNAME``, ``SF_PASSWORD`` and ``SF_SECURITY_TOKEN`` (all
    required) and ``SF_DOMAIN`` (optional, defaults to
    ``login.salesforce.com``).

    ``SF_DOMAIN`` may be given as a full host (``test.salesforce.com``), a
    URL (``https://test.salesforce.com/``) or just the prefix (``test``).
    simple_salesforce's username/password login takes the host *prefix* via
    its ``domain`` argument and does not use ``instance_url`` for that flow,
    so the value is normalized to a prefix and passed as ``domain``.

    Returns:
        ``(client, None)`` on success, or ``(None, error_message)`` when
        credentials are missing or the login fails.
    """
    try:
        username = os.getenv('SF_USERNAME')
        password = os.getenv('SF_PASSWORD')
        security_token = os.getenv('SF_SECURITY_TOKEN')
        # Treat an empty SF_DOMAIN the same as an unset one.
        host = os.getenv('SF_DOMAIN') or 'login.salesforce.com'

        if not all([username, password, security_token]):
            raise ValueError("Missing required Salesforce credentials or instance URL")

        # Reduce the configured host to the prefix simple_salesforce expects.
        host = host.strip()
        for scheme in ('https://', 'http://'):
            if host.lower().startswith(scheme):
                host = host[len(scheme):]
                break
        host = host.rstrip('/')
        suffix = '.salesforce.com'
        if host.lower().endswith(suffix):
            host = host[:-len(suffix)]
        domain = host or 'login'

        sf = Salesforce(
            username=username,
            password=password,
            security_token=security_token,
            domain=domain
        )
        print("Salesforce client connected successfully")
        return sf, None
    except Exception as e:
        print(f"Salesforce connection failed: {str(e)}")
        return None, str(e)

# Fetch Salesforce Objects
def get_salesforce_objects(sf):
    """List the names of the sObjects flagged as createable.

    Calls ``sf.restful('sobjects')`` and keeps the ``name`` of every entry in
    the response's ``sobjects`` list whose ``createable`` value is truthy.

    Returns:
        ``(names, None)`` on success, or ``([], error_message)`` if any
        exception is raised along the way.
    """
    try:
        catalog = sf.restful('sobjects')
        creatable_names = []
        for entry in catalog['sobjects']:
            if entry['createable']:
                creatable_names.append(entry['name'])
    except Exception as exc:
        return [], str(exc)
    return creatable_names, None

# Fetch Object Fields
def get_object_fields(sf, object_name):
    """Return the field names reported by ``describe()`` for one object.

    Args:
        sf: Client whose ``__getattr__(object_name)`` yields an object
            exposing ``describe()``.
        object_name: Name of the object to describe.

    Returns:
        ``(field_names, None)`` on success, or ``([], error_message)`` if any
        exception is raised.
    """
    try:
        # Resolve the object handle through the client's __getattr__ hook.
        sobject = sf.__getattr__(object_name)
        description = sobject.describe()
        field_names = []
        for field_info in description['fields']:
            field_names.append(field_info['name'])
    except Exception as exc:
        return [], str(exc)
    return field_names, None

# OCR for Text Extraction
def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the recognized text per page.

    The PDF is rendered to images with ``convert_from_path`` and each image
    is passed to ``pytesseract.image_to_string``.

    Returns:
        ``({"pages": [text, ...]}, None)`` on success, or
        ``({}, error_message)`` on failure. When the exception text mentions
        "poppler", a fixed installation hint is returned instead of the raw
        message.
    """
    try:
        page_texts = []
        for page_image in convert_from_path(pdf_path):
            page_texts.append(pytesseract.image_to_string(page_image))
    except Exception as exc:
        message = str(exc)
        if "poppler" not in message.lower():
            return {}, message
        return {}, "Error: Unable to process PDF. Please ensure Poppler is installed and in PATH (e.g., 'apt-get install poppler-utils' on Ubuntu)."
    return {"pages": page_texts}, None

# Key-Value Pair Extraction using LayoutLMv3
def extract_key_value_pairs(pdf_path):
    """Run LayoutLMv3 over each PDF page and return key/value pairs per page.

    NOTE: this is a placeholder. The model is invoked for every page, but its
    output is not post-processed; each page contributes the same hard-coded
    ``{"keys": [...], "values": [...]}`` entry.

    Returns:
        ``(list_of_page_dicts, None)`` on success, or ``([], error_message)``
        on failure. When the exception text mentions "poppler", a fixed
        installation hint is returned instead of the raw message.
    """
    try:
        processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
        model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base-finetuned-funsd")
        page_results = []
        for page_image in convert_from_path(pdf_path):
            model_inputs = processor(page_image, truncation=True, return_tensors="pt")
            # The forward pass result is discarded until post-processing exists.
            model(**model_inputs)
            page_results.append({"keys": ["Contract Number", "Date"], "values": ["12345", "2025-01-01"]})
    except Exception as exc:
        message = str(exc)
        if "poppler" not in message.lower():
            return [], message
        return [], "Error: Unable to process PDF. Please ensure Poppler is installed and in PATH (e.g., 'apt-get install poppler-utils' on Ubuntu)."
    return page_results, None

# Map Extracted Data to Salesforce Fields
def map_fields(extracted_data, salesforce_fields):
    """Map extracted keys to Salesforce field names by substring match.

    Only the keys of the first page (``extracted_data[0]["keys"]``) are
    considered. A key maps to a field when the lower-cased key is a substring
    of the lower-cased field name. If several fields match, the last one in
    ``salesforce_fields`` wins.

    Args:
        extracted_data: List of per-page dicts, each with a ``"keys"`` list.
        salesforce_fields: Iterable of Salesforce field names.

    Returns:
        ``(mappings, confidence_scores, error)``. ``error`` is ``None`` on
        success; when ``extracted_data`` is empty or ``None`` both dicts are
        empty and ``error`` explains why. Confidence is a fixed placeholder
        of ``0.9`` for every mapped key.
    """
    mappings = {}
    confidence_scores = {}

    # Report missing input through the error slot instead of raising IndexError.
    if not extracted_data:
        return mappings, confidence_scores, "No extracted data to map"

    # Lower-case each field once; materializing also supports one-shot iterables.
    lowered_fields = [(field, field.lower()) for field in (salesforce_fields or [])]

    for key in extracted_data[0]["keys"]:  # Simplified: Using first page
        needle = key.lower()
        for field, lowered in lowered_fields:
            if needle in lowered:
                mappings[key] = field
                confidence_scores[key] = 0.9  # Dummy confidence score
    return mappings, confidence_scores, None

# Create Salesforce Record
def create_record(sf, object_api_name, data):
    """Create one record of the given object type and return its id.

    Args:
        sf: Client whose ``__getattr__(object_api_name)`` yields an object
            exposing ``create(data)``.
        object_api_name: API name of the object to create.
        data: Field values passed straight to ``create``.

    Returns:
        ``(record_id, None)`` on success, where the id is read from the
        ``'id'`` key of the create result, or ``(None, error_message)`` if
        any exception is raised.
    """
    try:
        # Resolve the object handle through the client's __getattr__ hook.
        sobject = sf.__getattr__(object_api_name)
        response = sobject.create(data)
        new_id = response['id']
    except Exception as exc:
        return None, str(exc)
    return new_id, None

# Attach PDF to Salesforce Record
def attach_pdf(sf, record_id, file_path):
    """Upload a file as an Attachment linked to an existing record.

    The file is read in binary mode, base64-encoded, and sent through
    ``sf.Attachment.create`` with ``ParentId`` set to ``record_id`` and
    ``Name`` set to the file's base name.

    Returns:
        ``("PDF Attached", None)`` on success, or ``(None, error_message)``
        if reading the file or creating the attachment raises.
    """
    try:
        with open(file_path, "rb") as pdf_file:
            raw_bytes = pdf_file.read()
        body_b64 = base64.b64encode(raw_bytes).decode()
        payload = {
            "ParentId": record_id,
            "Name": os.path.basename(file_path),
            "Body": body_b64,
        }
        sf.Attachment.create(payload)
    except Exception as exc:
        return None, str(exc)
    return "PDF Attached", None

# Log Failed Migration
def log_failure(pdf_path, object_name, error, log_path="failures.json"):
    """Append one failed-migration record to a JSON Lines log file.

    Each call appends a single line holding a JSON object with the keys
    ``pdf``, ``object`` and ``error``.

    Args:
        pdf_path: Path of the PDF whose migration failed.
        object_name: Salesforce object the migration targeted.
        error: Error description. Values that are not JSON-serializable
            (for example exception instances) are stored as ``str(value)``.
        log_path: File to append to. Defaults to ``failures.json`` in the
            current working directory.
    """
    log_entry = {"pdf": pdf_path, "object": object_name, "error": error}
    # Serialize before opening the file so a serialization problem can never
    # leave a partial line behind. default=str is deliberate: the failure
    # logger must not fail because it was handed an exception object.
    line = json.dumps(log_entry, default=str)
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(line + "\n")