Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 24, 2025

Commit

6e4819e

verified ·

1 Parent(s): af4bac8

Update ai_mapping.py

Browse files

Files changed (1) hide show

ai_mapping.py +57 -39

ai_mapping.py CHANGED Viewed

@@ -4,12 +4,8 @@ from PIL import Image
 import fitz  # PyMuPDF
 from typing import Dict, List
 import os
-from huggingface_hub import login
 import re
-# Optional: Log in to Hugging Face if using a private model
-# login(token="your_hf_token")
 # Load pre-trained LayoutLMv3 models
 tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
 feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
@@ -17,41 +13,47 @@ model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-b
 def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str, str]:
     """
-    Extract key-value pairs from PDF text using LayoutLMv3-base or fallback to regex.
     Args:
         page_data (list): List of dictionaries with 'text' (str), 'words' (list of str),
                           'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) per page.
         pdf_path (str): Path to the PDF file.
     Returns:
-        dict: Key-value pairs extracted from the document.
     """
     try:
         # Fallback to regex using concatenated text from all pages
         text_data = " ".join([page["text"] for page in page_data])
-        key_values = {}
-        # Enhanced regex patterns with flexibility
-        dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
-        # Targeted date capture with OCR tolerance
-        date_context = re.findall(r'(?:executed\s+as\s+of|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})|(?:Start\s+Date|End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
-        amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
-        # Refined Agreement Name to stop at Exhibit or clear break
-        name_context = re.findall(r'(?:Order\s+Form|Contract)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
         if name_context:
             key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
-        # Update key_values with matched fields
-        for key, value in dates:
-            key_values[key] = value
-        if date_context:
-            start_date = next((d[0] for d in date_context if d[0]), "")
-            if start_date and not key_values.get("Agreement Start Date"):
-                key_values["Agreement Start Date"] = start_date
-            end_date = next((d[1] for d in date_context if d[1]), "")
-            if end_date and not key_values.get("Agreement End Date"):
-                key_values["Agreement End Date"] = end_date
         if amounts:
-            key_values["Amount"] = amounts[0]
-        # Attempt LayoutLMv3 processing
         doc = fitz.open(pdf_path)
         for page_num, page_info in enumerate(page_data):
             if not page_info["text"].strip() or "No text detected" in page_info["text"]:
@@ -100,13 +102,17 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
             for token, label in zip(tokens, labels):
                 if label == 1:  # Key start (adjust based on training)
                     if current_key and current_value:
-                        key_values[current_key] = " ".join(current_value).strip()
                     current_key = token
                     current_value = []
                 elif label == 2 and current_key:  # Value (adjust based on training)
                     current_value.append(token)
             if current_key and current_value:
-                key_values[current_key] = " ".join(current_value).strip()
             # Clean up temporary image
             if os.path.exists(img_path):
@@ -119,7 +125,7 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
 def extract_clauses(page_data: list) -> Dict[str, str]:
     """
-    Extract clauses from PDF text based on keywords.
     Args:
         page_data (list): List of dictionaries with 'text' (str) per page.
     Returns:
@@ -127,18 +133,26 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
     """
     clauses = {}
     text_data = "\n".join([page["text"] for page in page_data])  # Use newlines for better segmentation
-    # Broader search for "NO WAIVER" with fallback
     no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
     if no_waiver_match:
         clause_text = no_waiver_match.group(1).strip()
-        clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER"
-    elif "NO WAIVER" in text_data:
-        clauses["NO WAIVER"] = "NO WAIVER"
-    return clauses if clauses else {}
 def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
     """
-    Map extracted key-values to object fields using LayoutLMv3-base (simplified).
     Args:
         key_values (dict): Extracted key-value pairs.
         object_field_names (list): List of object field names.
@@ -150,23 +164,27 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
         mappings = {}
         unmapped_fields = object_field_names.copy()
         for field in object_field_names:
             for key, value in key_values.items():
-                if field.lower() in key.lower() or any(k.lower() in field.lower() for k in key_values.keys()):
                     mappings[field] = value
-                    unmapped_fields.remove(field)
                     break
         return {
             "status": "success",
             "mappings": mappings,
             "unmapped_fields": unmapped_fields,
-            "error": None
         }
     except Exception as e:
         return {
             "status": "failed",
             "error": str(e),
             "mappings": {},
-            "unmapped_fields": object_field_names
         }

 import fitz  # PyMuPDF
 from typing import Dict, List
 import os
 import re
 # Load pre-trained LayoutLMv3 models
 tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
 feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
 def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str, str]:
     """
+    Extract key-value pairs from PDF text using LayoutLMv3-base with focus on Agreement Name,
+    Agreement Start Date, Agreement End Date, and Total Agreement Value, with regex fallback.
     Args:
         page_data (list): List of dictionaries with 'text' (str), 'words' (list of str),
                           'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) per page.
         pdf_path (str): Path to the PDF file.
     Returns:
+        dict: Key-value pairs extracted from the document focusing on specified fields.
     """
     try:
+        # Initialize key-value dictionary for required fields
+        key_values = {
+            "Agreement Name": "Unknown",
+            "Agreement Start Date": "",
+            "Agreement End Date": "",
+            "Total Agreement Value": ""
+        }
         # Fallback to regex using concatenated text from all pages
         text_data = " ".join([page["text"] for page in page_data])
+        # Refined regex patterns for required fields
+        name_context = re.findall(r'(?:Order\s+Form|Contract|Agreement\s+Name)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
         if name_context:
             key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
+        date_patterns = [
+            r'(Agreement\s+Start\s+Date|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
+            r'(Agreement\s+End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
+        ]
+        for pattern in date_patterns:
+            matches = re.findall(pattern, text_data, re.IGNORECASE)
+            for key, value in matches:
+                key_values[key] = value
+        amount_pattern = r'(?:Total\s+Agreement\s+Value|Amount|Total\s+Cost)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
+        amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
         if amounts:
+            key_values["Total Agreement Value"] = amounts[0].split(":")[-1].strip() if ":" in amounts[0] else amounts[0].strip()
+        # Attempt LayoutLMv3 processing for enhanced extraction
         doc = fitz.open(pdf_path)
         for page_num, page_info in enumerate(page_data):
             if not page_info["text"].strip() or "No text detected" in page_info["text"]:
             for token, label in zip(tokens, labels):
                 if label == 1:  # Key start (adjust based on training)
                     if current_key and current_value:
+                        key = " ".join(current_value).strip()
+                        if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
+                            key_values[current_key] = key
                     current_key = token
                     current_value = []
                 elif label == 2 and current_key:  # Value (adjust based on training)
                     current_value.append(token)
             if current_key and current_value:
+                key = " ".join(current_value).strip()
+                if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
+                    key_values[current_key] = key
             # Clean up temporary image
             if os.path.exists(img_path):
 def extract_clauses(page_data: list) -> Dict[str, str]:
     """
+    Extract clauses from PDF text based on keywords, focusing on key clauses like NO WAIVER.
     Args:
         page_data (list): List of dictionaries with 'text' (str) per page.
     Returns:
     """
     clauses = {}
     text_data = "\n".join([page["text"] for page in page_data])  # Use newlines for better segmentation
+    # Search for NO WAIVER clause
     no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
     if no_waiver_match:
         clause_text = no_waiver_match.group(1).strip()
+        clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER clause found but no content extracted"
+    elif "NO WAIVER" in text_data.upper():
+        clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
+    # Add more clause extractions as needed (e.g., Termination, Indemnity)
+    termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
+    if termination_match:
+        clauses["Termination"] = termination_match.group(1).strip()
+    return clauses if clauses else {"No clauses extracted": "No relevant clauses found in the document"}
 def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
     """
+    Map extracted key-values to object fields, prioritizing Agreement Name, Agreement Start Date,
+    Agreement End Date, and Total Agreement Value.
     Args:
         key_values (dict): Extracted key-value pairs.
         object_field_names (list): List of object field names.
         mappings = {}
         unmapped_fields = object_field_names.copy()
+        # Prioritize mapping for required fields
         for field in object_field_names:
             for key, value in key_values.items():
+                if field.lower() in key.lower():
                     mappings[field] = value
+                    if field in unmapped_fields:
+                        unmapped_fields.remove(field)
                     break
         return {
             "status": "success",
             "mappings": mappings,
             "unmapped_fields": unmapped_fields,
+            "error": None,
+            "clauses": extract_clauses(page_data)  # Include clauses in the output
         }
     except Exception as e:
         return {
             "status": "failed",
             "error": str(e),
             "mappings": {},
+            "unmapped_fields": object_field_names,
+            "clauses": {}
         }