Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 5, 2025

Commit

2b51034

verified ·

1 Parent(s): 1c6f1bf

Update ai_mapping.py

Browse files

Files changed (1) hide show

ai_mapping.py +79 -25

ai_mapping.py CHANGED Viewed

@@ -1,39 +1,93 @@
-from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
-import os
-def run_ai_mapping(text_data, pdf_path, object_fields):
     """
-    Map extracted PDF text to Salesforce fields using LayoutLMv3.
-    Returns mappings with confidence scores and flags unmapped fields.
     """
     try:
-        # Placeholder for LayoutLMv3-based key-value pair extraction
-        # In a real implementation, load the model and processor
-        # processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
-        # model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base-finetuned-funsd")
-        # Process pdf_path, extract key-value pairs, and map to object_fields
-        # Mock implementation for demonstration
-        mappings = {
-            "Customer_Name__c": {"value": "Acme Corp", "confidence": 0.95},
-            "Start_Date__c": {"value": "2023-01-01", "confidence": 0.90},
-            "End_Date__c": {"value": "2024-01-01", "confidence": 0.90},
-            "Amount__c": {"value": "50000", "confidence": 0.85}
-        }
-        # Flag unmapped fields
-        unmapped_fields = [field for field in object_fields if field not in mappings]
-        result = {
             "mappings": mappings,
             "unmapped_fields": unmapped_fields,
-            "status": "success" if not unmapped_fields else "partial",
             "error": None
         }
-        return result
     except Exception as e:
         return {
-            "mappings": {},
-            "unmapped_fields": object_fields,
             "status": "failed",
-            "error": f"AI mapping failed: {str(e)}"
         }

+from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification, LayoutLMv3FeatureExtractor
+import torch
+from PIL import Image
+import pdf2image
+from typing import Dict, List
# Load the pre-trained LayoutLMv3 components once, when this module is imported,
# so the functions below share them (adjust model names based on your fine-tuned models).
# NOTE(review): the model path below is a placeholder string — presumably loading
# fails until it is replaced with a real checkpoint; confirm before deploying.
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)  # Set to True if OCR is needed
model = LayoutLMv3ForTokenClassification.from_pretrained("path_to_finetuned_funsd_model")  # Replace with your fine-tuned model
def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str, str]:
    """
    Tag the tokens of each PDF page with the module-level LayoutLMv3 model and pair keys with values.

    Args:
        text_data (str): Text already extracted from the PDF. The same lines
            are handed to the feature extractor for every page.
        pdf_path (str): Path to the PDF file, which is rendered to page images.

    Returns:
        dict: Key token -> space-joined value tokens. A key that shows up
            again (on the same or a later page) overwrites the earlier entry.
            If any step raises, a dict holding "status" ("failed"), "error"
            and an empty "key_values" entry is returned instead.
    """
    try:
        # Render the PDF into images (one per page).
        page_images = pdf2image.convert_from_path(pdf_path)
        pairs = {}
        for page_image in page_images:
            # NOTE(review): the extractor is built with apply_ocr=False and the
            # model is called without bbox / pixel_values — confirm against the
            # LayoutLMv3 documentation that this call yields "input_ids".
            features = feature_extractor(page_image, text_data.splitlines(), return_tensors="pt")
            ids = features["input_ids"]
            mask = features["attention_mask"]
            if "token_type_ids" in features:
                type_ids = features["token_type_ids"]
            else:
                type_ids = None

            # Inference only, so gradient tracking is switched off.
            with torch.no_grad():
                logits = model(input_ids=ids, attention_mask=mask, token_type_ids=type_ids).logits
                predicted = torch.argmax(logits, dim=2)

            # Only the first sequence of the batch is decoded.
            page_tokens = tokenizer.convert_ids_to_tokens(ids[0])
            page_labels = predicted[0].tolist()

            # Simplified pairing: label 1 is assumed to open a key and label 2
            # to mark a value token — TODO confirm against the model's label map.
            pending_key = None
            pending_value = []
            for piece, tag in zip(page_tokens, page_labels):
                if tag == 2:
                    # Value tokens seen before any key are dropped.
                    if pending_key:
                        pending_value.append(piece)
                    continue
                if tag != 1:
                    continue
                # A new key begins: store the finished pair, then start over.
                if pending_key and pending_value:
                    pairs[pending_key] = " ".join(pending_value).strip()
                pending_key, pending_value = piece, []

            # Store the pair still open when the page ends.
            if pending_key and pending_value:
                pairs[pending_key] = " ".join(pending_value).strip()
        return pairs
    except Exception as exc:
        return {"status": "failed", "error": str(exc), "key_values": {}}
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
    """
    Map extracted key-value pairs onto Salesforce field names by substring matching.

    A field is mapped to the value of the first extracted key (in dict order)
    whose lower-cased text contains the lower-cased field name. The matching
    is a placeholder for a custom-trained Transformer.

    Args:
        key_values (dict): Extracted key-value pairs. The failure payload that
            extract_key_values_with_layoutlm returns ("status" == "failed"
            together with "error" and a dict under "key_values") is recognised
            and reported as a failure instead of being matched as document data.
        object_field_names (list): Salesforce field names to fill. Any
            sequence is accepted; it is never mutated.
        pdf_path (str): Path to the PDF file. Currently unused; kept for context.

    Returns:
        dict: "status" ("success" or "failed"), "mappings" (field -> value),
            "unmapped_fields" (fields without a match, in input order) and
            "error" (None on success, otherwise the error message). "success"
            is reported even when some or all fields stay unmapped.
    """
    try:
        # An upstream extraction failure arrives as a status dict rather than
        # as key-value data; propagate it instead of mapping its bookkeeping
        # keys ("status", "error") onto Salesforce fields.
        if (
            isinstance(key_values, dict)
            and key_values.get("status") == "failed"
            and "error" in key_values
            and isinstance(key_values.get("key_values"), dict)
        ):
            return {
                "status": "failed",
                "error": str(key_values["error"]),
                "mappings": {},
                "unmapped_fields": object_field_names
            }

        # Snapshot the names so tuples and other sequences work and the
        # caller's list is left alone.
        fields = list(object_field_names)

        # Placeholder for custom-trained Transformer logic (replace with your model)
        mappings = {}
        for field in fields:
            for key, value in key_values.items():
                if field.lower() in key.lower():  # Simple string matching (replace with model prediction)
                    mappings[field] = value
                    break

        # One pass over the fields instead of a list.remove() per match.
        unmapped_fields = [field for field in fields if field not in mappings]
        return {
            "status": "success",
            "mappings": mappings,
            "unmapped_fields": unmapped_fields,
            "error": None
        }
    except Exception as e:
        return {
            "status": "failed",
            "error": str(e),
            "mappings": {},
            "unmapped_fields": object_field_names
        }