Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 5, 2025

Commit

5c08ef5

verified ·

1 Parent(s): 2b51034

Update ai_mapping.py

Browse files

Files changed (1) hide show

ai_mapping.py +19 -14

ai_mapping.py CHANGED Viewed

@@ -1,17 +1,23 @@
-from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification, LayoutLMv3FeatureExtractor
 import torch
 from PIL import Image
 import pdf2image
 from typing import Dict, List
-# Load pre-trained LayoutLMv3 models (adjust model names based on your fine-tuned models)
 tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
-feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)  # Set to True if OCR is needed
-model = LayoutLMv3ForTokenClassification.from_pretrained("path_to_finetuned_funsd_model")  # Replace with your fine-tuned model
 def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str, str]:
     """
-    Extract key-value pairs from PDF text using LayoutLMv3-finetuned-funsd.
     Args:
         text_data (str): Extracted text from PDF.
         pdf_path (str): Path to the PDF file.
@@ -26,14 +32,14 @@ def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str,
         key_values = {}
         for i, image in enumerate(images):
             # Preprocess image and text
-            encoding = feature_extractor(image, text_data.splitlines(), return_tensors="pt")
             input_ids = encoding["input_ids"]
             attention_mask = encoding["attention_mask"]
-            token_type_ids = encoding["token_type_ids"] if "token_type_ids" in encoding else None
             # Get model predictions
             with torch.no_grad():
-                outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                 predictions = torch.argmax(outputs.logits, dim=2)
             # Post-process predictions to extract key-value pairs (simplified logic)
@@ -42,23 +48,23 @@ def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str,
             current_key = None
             current_value = []
             for token, label in zip(tokens, labels):
-                if label == 1:  # Assuming label 1 indicates a key start
                     if current_key and current_value:
                         key_values[current_key] = " ".join(current_value).strip()
                     current_key = token
                     current_value = []
-                elif label == 2 and current_key:  # Assuming label 2 indicates a value
                     current_value.append(token)
             if current_key and current_value:
                 key_values[current_key] = " ".join(current_value).strip()
-        return key_values
     except Exception as e:
         return {"status": "failed", "error": str(e), "key_values": {}}
 def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
     """
-    Map extracted key-values to Salesforce fields using a custom-trained Transformer.
     Args:
         key_values (dict): Extracted key-value pairs.
         object_field_names (list): List of Salesforce field names.
@@ -67,13 +73,12 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
         dict: Mapping results with status, mappings, unmapped fields, and error (if any).
     """
     try:
-        # Placeholder for custom-trained Transformer logic (replace with your model)
         mappings = {}
         unmapped_fields = object_field_names.copy()
         for field in object_field_names:
             for key, value in key_values.items():
-                if field.lower() in key.lower():  # Simple string matching (replace with model prediction)
                     mappings[field] = value
                     unmapped_fields.remove(field)
                     break

+from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification, LayoutLMv3ImageProcessor
 import torch
 from PIL import Image
 import pdf2image
 from typing import Dict, List
+import os
+from huggingface_hub import login
+# Optional: Log in to Hugging Face if using a private model
+# Uncomment and replace with your token if needed
+# login(token="your_hf_token")
+# Load pre-trained LayoutLMv3 models
 tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
+feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)  # Updated to ImageProcessor
+model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")  # Public base model
 def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str, str]:
     """
+    Extract key-value pairs from PDF text using LayoutLMv3-base.
     Args:
         text_data (str): Extracted text from PDF.
         pdf_path (str): Path to the PDF file.
         key_values = {}
         for i, image in enumerate(images):
             # Preprocess image and text
+            encoding = feature_extractor(images=[image], text=text_data.splitlines(), return_tensors="pt")
             input_ids = encoding["input_ids"]
             attention_mask = encoding["attention_mask"]
+            # token_type_ids not needed for LayoutLMv3-base
             # Get model predictions
             with torch.no_grad():
+                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                 predictions = torch.argmax(outputs.logits, dim=2)
             # Post-process predictions to extract key-value pairs (simplified logic)
             current_key = None
             current_value = []
             for token, label in zip(tokens, labels):
+                if label == 1:  # Assuming label 1 indicates a key start (adjust based on training)
                     if current_key and current_value:
                         key_values[current_key] = " ".join(current_value).strip()
                     current_key = token
                     current_value = []
+                elif label == 2 and current_key:  # Assuming label 2 indicates a value (adjust based on training)
                     current_value.append(token)
             if current_key and current_value:
                 key_values[current_key] = " ".join(current_value).strip()
+        return key_values if key_values else {"status": "failed", "error": "No key-value pairs extracted", "key_values": {}}
     except Exception as e:
         return {"status": "failed", "error": str(e), "key_values": {}}
 def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
     """
+    Map extracted key-values to Salesforce fields using LayoutLMv3-base (simplified).
     Args:
         key_values (dict): Extracted key-value pairs.
         object_field_names (list): List of Salesforce field names.
         dict: Mapping results with status, mappings, unmapped fields, and error (if any).
     """
     try:
         mappings = {}
         unmapped_fields = object_field_names.copy()
         for field in object_field_names:
             for key, value in key_values.items():
+                if field.lower() in key.lower():  # Simple string matching
                     mappings[field] = value
                     unmapped_fields.remove(field)
                     break