Spaces:
Sleeping
Sleeping
File size: 4,570 Bytes
06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 3c36447 3b7cab4 3c36447 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 0ae7721 3b7cab4 06e182c 3b7cab4 0ae7721 3b7cab4 06e182c 3b7cab4 a00fb61 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 06e182c 3b7cab4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import requests
import base64
import json
import os
from simple_salesforce import Salesforce
from pdf2image import convert_from_path
import pytesseract
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from dotenv import load_dotenv
# Load environment variables via python-dotenv at import time, before the
# os.getenv() lookups for the SF_* Salesforce settings used further down.
load_dotenv()
# Salesforce Authentication
def get_salesforce_client():
    """Create an authenticated Salesforce client from environment variables.

    Reads ``SF_USERNAME``, ``SF_PASSWORD``, ``SF_SECURITY_TOKEN`` and the
    optional ``SF_DOMAIN`` (defaults to ``login.salesforce.com``).
    ``SF_DOMAIN`` may be a full host such as ``test.salesforce.com``, a URL
    such as ``https://test.salesforce.com/`` or a bare prefix such as ``test``.

    Returns:
        ``(client, None)`` on success, or ``(None, error_message)`` when a
        setting is missing or the login fails.
    """
    try:
        username = os.getenv('SF_USERNAME')
        password = os.getenv('SF_PASSWORD')
        security_token = os.getenv('SF_SECURITY_TOKEN')

        # An unset or empty SF_DOMAIN falls back to the production login host.
        host = (os.getenv('SF_DOMAIN') or 'login.salesforce.com').strip()
        if '://' in host:
            host = host.split('://', 1)[1]
        host = host.rstrip('/')

        # simple_salesforce expects only the part before ".salesforce.com"
        # ("login", "test", or a My Domain prefix) in its ``domain`` argument.
        suffix = '.salesforce.com'
        if host.lower().endswith(suffix):
            domain = host[:-len(suffix)]
        else:
            domain = host

        if not all([username, password, security_token, domain]):
            raise ValueError("Missing required Salesforce credentials or instance URL")

        # With username/password/token authentication the login host is chosen
        # through ``domain``; ``instance_url`` only applies to session-id
        # logins, so passing it here left SF_DOMAIN without any effect.
        sf = Salesforce(
            username=username,
            password=password,
            security_token=security_token,
            domain=domain,
        )
        print("Salesforce client connected successfully")
        return sf, None
    except Exception as e:
        print(f"Salesforce connection failed: {str(e)}")
        return None, str(e)
# Fetch Salesforce Objects
def get_salesforce_objects(sf):
    """List the names of the sObjects flagged as createable.

    Queries the ``sobjects`` REST resource through ``sf.restful``.

    Returns:
        ``(names, None)`` on success, or ``([], error_message)`` if the
        request or the parsing of its response raises.
    """
    try:
        listing = sf.restful('sobjects')
        createable_names = []
        for sobject in listing['sobjects']:
            if sobject['createable']:
                createable_names.append(sobject['name'])
    except Exception as exc:
        return [], str(exc)
    return createable_names, None
# Fetch Object Fields
def get_object_fields(sf, object_name):
    """Return the field names reported by ``describe()`` for one sObject.

    Args:
        sf: Salesforce client; the object handle is obtained with an explicit
            ``sf.__getattr__(object_name)`` call.
        object_name: API name of the sObject to describe.

    Returns:
        ``(field_names, None)`` on success, or ``([], error_message)`` if the
        lookup, the describe call or the parsing raises.
    """
    try:
        sobject = sf.__getattr__(object_name)
        description = sobject.describe()
        field_names = [entry['name'] for entry in description['fields']]
    except Exception as exc:
        return [], str(exc)
    return field_names, None
# OCR for Text Extraction
def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and collect the text page by page.

    The PDF is rendered with ``convert_from_path`` and each resulting image
    is passed to ``pytesseract.image_to_string``.

    Returns:
        ``({"pages": [text, ...]}, None)`` on success, or
        ``({}, error_message)`` on failure. Errors whose text mentions
        "poppler" are replaced by an installation hint.
    """
    try:
        page_texts = []
        for page_image in convert_from_path(pdf_path):
            page_texts.append(pytesseract.image_to_string(page_image))
        return {"pages": page_texts}, None
    except Exception as exc:
        message = str(exc)
        if "poppler" not in message.lower():
            return {}, message
        return {}, "Error: Unable to process PDF. Please ensure Poppler is installed and in PATH (e.g., 'apt-get install poppler-utils' on Ubuntu)."
# Key-Value Pair Extraction using LayoutLMv3
def extract_key_value_pairs(pdf_path):
    """Run LayoutLMv3 over each PDF page and return per-page key/value data.

    NOTE: the model output is not decoded yet; every page currently yields
    the same placeholder keys and values.

    Returns:
        ``(pages, None)`` on success, where ``pages`` holds one
        ``{"keys": [...], "values": [...]}`` dict per page, or
        ``([], error_message)`` on failure. Errors whose text mentions
        "poppler" are replaced by an installation hint.
    """
    try:
        processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
        model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base-finetuned-funsd")
        page_images = convert_from_path(pdf_path)
        page_results = []
        for page_image in page_images:
            encoding = processor(page_image, truncation=True, return_tensors="pt")
            # The forward pass is executed, but its result still needs
            # post-processing before real pairs can be produced.
            outputs = model(**encoding)
            page_results.append(
                {"keys": ["Contract Number", "Date"], "values": ["12345", "2025-01-01"]}
            )
        return page_results, None
    except Exception as exc:
        message = str(exc)
        if "poppler" not in message.lower():
            return [], message
        return [], "Error: Unable to process PDF. Please ensure Poppler is installed and in PATH (e.g., 'apt-get install poppler-utils' on Ubuntu)."
# Map Extracted Data to Salesforce Fields
def map_fields(extracted_data, salesforce_fields):
    """Map extracted keys onto Salesforce field names.

    Only the keys of the first page (``extracted_data[0]["keys"]``) are used.
    A key maps to the field whose name equals it case-insensitively; if no
    such field exists, it maps to the last field whose name contains the key
    (case-insensitive substring match). Keys without any match are omitted.

    Args:
        extracted_data: List of per-page dicts, each with a ``"keys"`` list.
        salesforce_fields: Iterable of Salesforce field names.

    Returns:
        ``(mappings, confidence_scores, error)``. ``error`` is ``None`` on
        success and a message when ``extracted_data`` is empty, in which case
        both dicts are empty.
    """
    mappings = {}
    confidence_scores = {}

    # An empty extraction result used to raise IndexError; report it through
    # the error slot like the other helpers in this module do.
    if not extracted_data:
        return mappings, confidence_scores, "No extracted data to map"

    # Lower-case the field names once instead of once per extracted key.
    candidates = [(field, field.lower()) for field in salesforce_fields]

    for key in extracted_data[0]["keys"]:  # Simplified: using first page only
        needle = key.lower()
        match = None
        for field, lowered in candidates:
            if lowered == needle:
                # An exact name match must not be overwritten by a later
                # field that merely contains the key.
                match = field
                break
            if needle in lowered:
                match = field
        if match is not None:
            mappings[key] = match
            confidence_scores[key] = 0.9  # Dummy confidence score
    return mappings, confidence_scores, None
# Create Salesforce Record
def create_record(sf, object_api_name, data):
    """Create one record of the given sObject type.

    Args:
        sf: Salesforce client; the object handle is obtained with an explicit
            ``sf.__getattr__(object_api_name)`` call.
        object_api_name: API name of the sObject to create.
        data: Field values passed unchanged to ``create``.

    Returns:
        ``(record_id, None)`` on success, where ``record_id`` is the ``'id'``
        entry of the create result, or ``(None, error_message)`` on failure.
    """
    try:
        sobject = sf.__getattr__(object_api_name)
        created = sobject.create(data)
        record_id = created['id']
    except Exception as exc:
        return None, str(exc)
    return record_id, None
# Attach PDF to Salesforce Record
def attach_pdf(sf, record_id, file_path):
    """Upload a file as an Attachment on an existing record.

    The file is read in binary mode, base64-encoded and sent through
    ``sf.Attachment.create`` with the file's base name as the attachment name.

    Returns:
        ``("PDF Attached", None)`` on success, or ``(None, error_message)``
        if reading the file or creating the attachment raises.
    """
    try:
        with open(file_path, "rb") as pdf_file:
            raw_bytes = pdf_file.read()
        payload = {
            "ParentId": record_id,
            "Name": os.path.basename(file_path),
            "Body": base64.b64encode(raw_bytes).decode(),
        }
        sf.Attachment.create(payload)
    except Exception as exc:
        return None, str(exc)
    return "PDF Attached", None
# Log Failed Migration
def log_failure(pdf_path, object_name, error):
    """Append one failed-migration record to ``failures.json``.

    The file is written in JSON Lines form: one JSON object per line with the
    keys ``"pdf"``, ``"object"`` and ``"error"``.

    Args:
        pdf_path: Path of the PDF that failed.
        object_name: Name of the target Salesforce object.
        error: JSON-serializable description of the failure.

    Raises:
        TypeError: If a value cannot be serialized to JSON. Nothing is
            written to the log in that case.
    """
    log_entry = {"pdf": pdf_path, "object": object_name, "error": error}
    # Serialize before touching the file: json.dump() streams chunks, so a
    # serialization error used to leave a truncated record in the log.
    line = json.dumps(log_entry) + "\n"
    with open("failures.json", "a", encoding="utf-8") as f:
        f.write(line)