Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 5, 2025

Commit

b9ae2ff

verified ·

1 Parent(s): b2e3ca0

Update ai_mapping.py

Browse files

Files changed (1) hide show

ai_mapping.py +17 -17

ai_mapping.py CHANGED Viewed

@@ -5,19 +5,19 @@ import pdf2image
 from typing import Dict, List
 import os
 from huggingface_hub import login
 # Optional: Log in to Hugging Face if using a private model
-# Uncomment and replace with your token if needed
 # login(token="your_hf_token")
 # Load pre-trained LayoutLMv3 models
 tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
-feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)  # Updated to ImageProcessor
-model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")  # Public base model
 def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str, str]:
     """
-    Extract key-value pairs from PDF text using LayoutLMv3-base.
     Args:
         text_data (str): Extracted text from PDF.
         pdf_path (str): Path to the PDF file.
@@ -25,35 +25,35 @@ def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str,
         dict: Key-value pairs extracted from the document.
     """
     try:
-        # Convert PDF to images (one per page)
-        images = pdf2image.convert_from_path(pdf_path)
-        # Process each page
         key_values = {}
         for i, image in enumerate(images):
-            # Preprocess image and text
             encoding = feature_extractor(images=[image], text=text_data.splitlines(), return_tensors="pt")
             input_ids = encoding["input_ids"]
             attention_mask = encoding["attention_mask"]
-            # token_type_ids not needed for LayoutLMv3-base
-            # Get model predictions
             with torch.no_grad():
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                 predictions = torch.argmax(outputs.logits, dim=2)
-            # Post-process predictions to extract key-value pairs (simplified logic)
             tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
             labels = predictions[0].tolist()
             current_key = None
             current_value = []
             for token, label in zip(tokens, labels):
-                if label == 1:  # Assuming label 1 indicates a key start (adjust based on training)
                     if current_key and current_value:
                         key_values[current_key] = " ".join(current_value).strip()
                     current_key = token
                     current_value = []
-                elif label == 2 and current_key:  # Assuming label 2 indicates a value (adjust based on training)
                     current_value.append(token)
             if current_key and current_value:
                 key_values[current_key] = " ".join(current_value).strip()
@@ -64,10 +64,10 @@ def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str,
 def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
     """
-    Map extracted key-values to Salesforce fields using LayoutLMv3-base (simplified).
     Args:
         key_values (dict): Extracted key-value pairs.
-        object_field_names (list): List of Salesforce field names.
         pdf_path (str): Path to the PDF file (for context if needed).
     Returns:
         dict: Mapping results with status, mappings, unmapped fields, and error (if any).
@@ -78,7 +78,7 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
         for field in object_field_names:
             for key, value in key_values.items():
-                if field.lower() in key.lower():  # Simple string matching
                     mappings[field] = value
                     unmapped_fields.remove(field)
                     break

 from typing import Dict, List
 import os
 from huggingface_hub import login
+import re
 # Optional: Log in to Hugging Face if using a private model
 # login(token="your_hf_token")
 # Load pre-trained LayoutLMv3 models
 tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
+feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
+model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
 def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str, str]:
     """
+    Extract key-value pairs from PDF text using LayoutLMv3-base or fallback to regex.
     Args:
         text_data (str): Extracted text from PDF.
         pdf_path (str): Path to the PDF file.
         dict: Key-value pairs extracted from the document.
     """
     try:
+        # Fallback to regex if model is untrained
         key_values = {}
+        dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text_data)
+        amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
+        if dates or amounts:
+            key_values.update({"Date": dates[0] if dates else "", "Amount": amounts[0] if amounts else ""})
+        # Attempt LayoutLMv3 processing
+        images = pdf2image.convert_from_path(pdf_path)
         for i, image in enumerate(images):
             encoding = feature_extractor(images=[image], text=text_data.splitlines(), return_tensors="pt")
             input_ids = encoding["input_ids"]
             attention_mask = encoding["attention_mask"]
             with torch.no_grad():
                 outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                 predictions = torch.argmax(outputs.logits, dim=2)
             tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
             labels = predictions[0].tolist()
             current_key = None
             current_value = []
             for token, label in zip(tokens, labels):
+                if label == 1:  # Key start (adjust based on training)
                     if current_key and current_value:
                         key_values[current_key] = " ".join(current_value).strip()
                     current_key = token
                     current_value = []
+                elif label == 2 and current_key:  # Value (adjust based on training)
                     current_value.append(token)
             if current_key and current_value:
                 key_values[current_key] = " ".join(current_value).strip()
 def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
     """
+    Map extracted key-values to object fields using LayoutLMv3-base (simplified).
     Args:
         key_values (dict): Extracted key-value pairs.
+        object_field_names (list): List of object field names.
         pdf_path (str): Path to the PDF file (for context if needed).
     Returns:
         dict: Mapping results with status, mappings, unmapped fields, and error (if any).
         for field in object_field_names:
             for key, value in key_values.items():
+                if field.lower() in key.lower() or any(k.lower() in field.lower() for k in key_values.keys()):
                     mappings[field] = value
                     unmapped_fields.remove(field)
                     break