Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Community

pavansuresh committed on Jul 24, 2025

Commit

dde2ff7

verified ·

1 Parent(s): 6e4819e

Update ai_mapping.py

Browse files

Files changed (1) hide show

ai_mapping.py +31 -16

ai_mapping.py CHANGED Viewed

@@ -34,24 +34,27 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
         # Fallback to regex using concatenated text from all pages
         text_data = " ".join([page["text"] for page in page_data])
-        # Refined regex patterns for required fields
-        name_context = re.findall(r'(?:Order\s+Form|Contract|Agreement\s+Name)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
         if name_context:
-            key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
         date_patterns = [
-            r'(Agreement\s+Start\s+Date|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
-            r'(Agreement\s+End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
         ]
         for pattern in date_patterns:
             matches = re.findall(pattern, text_data, re.IGNORECASE)
-            for key, value in matches:
-                key_values[key] = value
-        amount_pattern = r'(?:Total\s+Agreement\s+Value|Amount|Total\s+Cost)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
         amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
         if amounts:
-            key_values["Total Agreement Value"] = amounts[0].split(":")[-1].strip() if ":" in amounts[0] else amounts[0].strip()
         # Attempt LayoutLMv3 processing for enhanced extraction
         doc = fitz.open(pdf_path)
@@ -103,23 +106,35 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
                 if label == 1:  # Key start (adjust based on training)
                     if current_key and current_value:
                         key = " ".join(current_value).strip()
-                        if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
-                            key_values[current_key] = key
                     current_key = token
                     current_value = []
                 elif label == 2 and current_key:  # Value (adjust based on training)
                     current_value.append(token)
             if current_key and current_value:
                 key = " ".join(current_value).strip()
-                if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
-                    key_values[current_key] = key
             # Clean up temporary image
             if os.path.exists(img_path):
                 os.unlink(img_path)
         doc.close()
-        return key_values if key_values else {"status": "failed", "error": "No key-value pairs extracted", "key_values": {}}
     except Exception as e:
         return {"status": "failed", "error": str(e), "key_values": {}}
@@ -142,7 +157,7 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
     elif "NO WAIVER" in text_data.upper():
         clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
-    # Add more clause extractions as needed (e.g., Termination, Indemnity)
     termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
     if termination_match:
         clauses["Termination"] = termination_match.group(1).strip()
@@ -167,7 +182,7 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
         # Prioritize mapping for required fields
         for field in object_field_names:
             for key, value in key_values.items():
-                if field.lower() in key.lower():
                     mappings[field] = value
                     if field in unmapped_fields:
                         unmapped_fields.remove(field)

         # Fallback to regex using concatenated text from all pages
         text_data = " ".join([page["text"] for page in page_data])
+        # Refined regex patterns for required fields, avoiding record type as Agreement Name
+        name_context = re.findall(r'(?:Agreement\s+Name|Contract\s+Title)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
         if name_context:
+            key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and "MASTER SUBSCRIPTION AGREEMENT" not in name.upper()), "Unknown")
+        # Enhanced date patterns to capture context like "executed as of" or specific date labels
         date_patterns = [
+            r'(?:Agreement\s+Start\s+Date|Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
+            r'(?:Agreement\s+End\s+Date|Termination\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
         ]
         for pattern in date_patterns:
             matches = re.findall(pattern, text_data, re.IGNORECASE)
+            for key, value in [("Agreement Start Date", matches[0][0]) if "start" in pattern.lower() or "effective" in pattern.lower() else ("Agreement End Date", matches[0][0]) for matches in [m for m in [re.findall(pattern, text_data, re.IGNORECASE)] if m]]:
+                if value and not key_values.get(key):
+                    key_values[key] = value
+        # Improved amount pattern to capture total value context
+        amount_pattern = r'(?:Total\s+Agreement\s+Value|Total\s+Amount|Contract\s+Value)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
         amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
         if amounts:
+            key_values["Total Agreement Value"] = next((amt.split(":")[-1].strip() if ":" in amt else amt.strip() for amt in amounts if any(k.lower() in amt.lower() for k in ["total", "value"])), "")
         # Attempt LayoutLMv3 processing for enhanced extraction
         doc = fitz.open(pdf_path)
                 if label == 1:  # Key start (adjust based on training)
                     if current_key and current_value:
                         key = " ".join(current_value).strip()
+                        if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
+                            key_values["Agreement Name"] = key
+                        elif "start date" in current_key.lower() or "effective date" in current_key.lower():
+                            key_values["Agreement Start Date"] = key
+                        elif "end date" in current_key.lower() or "termination date" in current_key.lower():
+                            key_values["Agreement End Date"] = key
+                        elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
+                            key_values["Total Agreement Value"] = key
                     current_key = token
                     current_value = []
                 elif label == 2 and current_key:  # Value (adjust based on training)
                     current_value.append(token)
             if current_key and current_value:
                 key = " ".join(current_value).strip()
+                if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
+                    key_values["Agreement Name"] = key
+                elif "start date" in current_key.lower() or "effective date" in current_key.lower():
+                    key_values["Agreement Start Date"] = key
+                elif "end date" in current_key.lower() or "termination date" in current_key.lower():
+                    key_values["Agreement End Date"] = key
+                elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
+                    key_values["Total Agreement Value"] = key
             # Clean up temporary image
             if os.path.exists(img_path):
                 os.unlink(img_path)
         doc.close()
+        return key_values if any(key_values.values()) else {"status": "failed", "error": "No key-value pairs extracted", "key_values": {}}
     except Exception as e:
         return {"status": "failed", "error": str(e), "key_values": {}}
     elif "NO WAIVER" in text_data.upper():
         clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
+    # Add Termination clause
     termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
     if termination_match:
         clauses["Termination"] = termination_match.group(1).strip()
         # Prioritize mapping for required fields
         for field in object_field_names:
             for key, value in key_values.items():
+                if field.lower() in key.lower() and value:
                     mappings[field] = value
                     if field in unmapped_fields:
                         unmapped_fields.remove(field)