Spaces:
Sleeping
Sleeping
Update ai_mapping.py
Browse files- ai_mapping.py +21 -15
ai_mapping.py
CHANGED
|
@@ -35,26 +35,32 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 35 |
text_data = " ".join([page["text"] for page in page_data])
|
| 36 |
|
| 37 |
# Refined regex patterns for required fields, avoiding record type as Agreement Name
|
| 38 |
-
name_context = re.findall(r'(?:Agreement\s+Name|Contract\s+Title)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
|
| 39 |
if name_context:
|
| 40 |
key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and "MASTER SUBSCRIPTION AGREEMENT" not in name.upper()), "Unknown")
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
date_patterns = [
|
| 44 |
-
r'(?:Agreement\s+Start\s+Date|Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}
|
| 45 |
-
r'(?:Agreement\s+End\s+Date|Termination\s+Date)\s*[:\s]*(\d{1,2}
|
| 46 |
]
|
| 47 |
for pattern in date_patterns:
|
| 48 |
matches = re.findall(pattern, text_data, re.IGNORECASE)
|
| 49 |
-
|
|
|
|
| 50 |
if value and not key_values.get(key):
|
| 51 |
key_values[key] = value
|
| 52 |
|
| 53 |
# Improved amount pattern to capture total value context
|
| 54 |
-
amount_pattern = r'(?:Total\s+Agreement\s+Value|Total\s+Amount|Contract\s+Value)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
|
| 55 |
amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
|
| 56 |
if amounts:
|
| 57 |
-
key_values["Total Agreement Value"] = next((amt.split(":")[-1].strip() if ":" in amt else amt.strip() for amt in amounts if any(k.lower() in amt.lower() for k in ["total", "value"])), "")
|
| 58 |
|
| 59 |
# Attempt LayoutLMv3 processing for enhanced extraction
|
| 60 |
doc = fitz.open(pdf_path)
|
|
@@ -108,11 +114,11 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 108 |
key = " ".join(current_value).strip()
|
| 109 |
if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
|
| 110 |
key_values["Agreement Name"] = key
|
| 111 |
-
elif "start date" in current_key.lower() or "effective date" in current_key.lower():
|
| 112 |
key_values["Agreement Start Date"] = key
|
| 113 |
elif "end date" in current_key.lower() or "termination date" in current_key.lower():
|
| 114 |
key_values["Agreement End Date"] = key
|
| 115 |
-
elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
|
| 116 |
key_values["Total Agreement Value"] = key
|
| 117 |
current_key = token
|
| 118 |
current_value = []
|
|
@@ -122,11 +128,11 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 122 |
key = " ".join(current_value).strip()
|
| 123 |
if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
|
| 124 |
key_values["Agreement Name"] = key
|
| 125 |
-
elif "start date" in current_key.lower() or "effective date" in current_key.lower():
|
| 126 |
key_values["Agreement Start Date"] = key
|
| 127 |
elif "end date" in current_key.lower() or "termination date" in current_key.lower():
|
| 128 |
key_values["Agreement End Date"] = key
|
| 129 |
-
elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
|
| 130 |
key_values["Total Agreement Value"] = key
|
| 131 |
|
| 132 |
# Clean up temporary image
|
|
@@ -140,7 +146,7 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 140 |
|
| 141 |
def extract_clauses(page_data: list) -> Dict[str, str]:
|
| 142 |
"""
|
| 143 |
-
Extract clauses from PDF text based on keywords, focusing on key clauses like NO WAIVER.
|
| 144 |
Args:
|
| 145 |
page_data (list): List of dictionaries with 'text' (str) per page.
|
| 146 |
Returns:
|
|
@@ -155,9 +161,9 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
|
|
| 155 |
clause_text = no_waiver_match.group(1).strip()
|
| 156 |
clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER clause found but no content extracted"
|
| 157 |
elif "NO WAIVER" in text_data.upper():
|
| 158 |
-
clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
|
| 159 |
|
| 160 |
-
#
|
| 161 |
termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
|
| 162 |
if termination_match:
|
| 163 |
clauses["Termination"] = termination_match.group(1).strip()
|
|
|
|
| 35 |
text_data = " ".join([page["text"] for page in page_data])
|
| 36 |
|
| 37 |
# Refined regex patterns for required fields, avoiding record type as Agreement Name
|
| 38 |
+
name_context = re.findall(r'(?:Agreement\s+Name|Contract\s+Title|Agreement\s+Title)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
|
| 39 |
if name_context:
|
| 40 |
key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and "MASTER SUBSCRIPTION AGREEMENT" not in name.upper()), "Unknown")
|
| 41 |
+
else:
|
| 42 |
+
# Fallback to infer name from context if no explicit title
|
| 43 |
+
party_match = re.search(r'(?:between\s+([A-Za-z\s]+)\s+and)', text_data, re.IGNORECASE)
|
| 44 |
+
if party_match:
|
| 45 |
+
key_values["Agreement Name"] = party_match.group(1).strip() or "Unknown"
|
| 46 |
+
|
| 47 |
+
# Enhanced date patterns to capture "executed as of" and other date contexts
|
| 48 |
date_patterns = [
|
| 49 |
+
r'(?:Agreement\s+Start\s+Date|Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}/\d{1,2}/\d{2,4})',
|
| 50 |
+
r'(?:Agreement\s+End\s+Date|Termination\s+Date)\s*[:\s]*(\d{1,2}/\d{1,2}/\d{2,4})'
|
| 51 |
]
|
| 52 |
for pattern in date_patterns:
|
| 53 |
matches = re.findall(pattern, text_data, re.IGNORECASE)
|
| 54 |
+
if matches:
|
| 55 |
+
key, value = ("Agreement Start Date", matches[0]) if "start" in pattern.lower() or "effective" in pattern.lower() or "executed" in pattern.lower() else ("Agreement End Date", matches[0])
|
| 56 |
if value and not key_values.get(key):
|
| 57 |
key_values[key] = value
|
| 58 |
|
| 59 |
# Improved amount pattern to capture total value context
|
| 60 |
+
amount_pattern = r'(?:Total\s+Agreement\s+Value|Total\s+Amount|Contract\s+Value|List\s+Price)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
|
| 61 |
amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
|
| 62 |
if amounts:
|
| 63 |
+
key_values["Total Agreement Value"] = next((amt.split(":")[-1].strip() if ":" in amt else amt.strip() for amt in amounts if any(k.lower() in amt.lower() for k in ["total", "value", "price"])), "")
|
| 64 |
|
| 65 |
# Attempt LayoutLMv3 processing for enhanced extraction
|
| 66 |
doc = fitz.open(pdf_path)
|
|
|
|
| 114 |
key = " ".join(current_value).strip()
|
| 115 |
if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
|
| 116 |
key_values["Agreement Name"] = key
|
| 117 |
+
elif "start date" in current_key.lower() or "effective date" in current_key.lower() or "executed as of" in current_key.lower():
|
| 118 |
key_values["Agreement Start Date"] = key
|
| 119 |
elif "end date" in current_key.lower() or "termination date" in current_key.lower():
|
| 120 |
key_values["Agreement End Date"] = key
|
| 121 |
+
elif "total agreement value" in current_key.lower() or "amount" in current_key.lower() or "price" in current_key.lower():
|
| 122 |
key_values["Total Agreement Value"] = key
|
| 123 |
current_key = token
|
| 124 |
current_value = []
|
|
|
|
| 128 |
key = " ".join(current_value).strip()
|
| 129 |
if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
|
| 130 |
key_values["Agreement Name"] = key
|
| 131 |
+
elif "start date" in current_key.lower() or "effective date" in current_key.lower() or "executed as of" in current_key.lower():
|
| 132 |
key_values["Agreement Start Date"] = key
|
| 133 |
elif "end date" in current_key.lower() or "termination date" in current_key.lower():
|
| 134 |
key_values["Agreement End Date"] = key
|
| 135 |
+
elif "total agreement value" in current_key.lower() or "amount" in current_key.lower() or "price" in current_key.lower():
|
| 136 |
key_values["Total Agreement Value"] = key
|
| 137 |
|
| 138 |
# Clean up temporary image
|
|
|
|
| 146 |
|
| 147 |
def extract_clauses(page_data: list) -> Dict[str, str]:
|
| 148 |
"""
|
| 149 |
+
Extract clauses from PDF text based on keywords, focusing on key clauses like NO WAIVER and Termination.
|
| 150 |
Args:
|
| 151 |
page_data (list): List of dictionaries with 'text' (str) per page.
|
| 152 |
Returns:
|
|
|
|
| 161 |
clause_text = no_waiver_match.group(1).strip()
|
| 162 |
clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER clause found but no content extracted"
|
| 163 |
elif "NO WAIVER" in text_data.upper():
|
| 164 |
+
clauses["NO WAIVER"] = re.search(r'(NO\s+WAIVER\s*[:\s]*[\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE).group(1).strip() if re.search(r'(NO\s+WAIVER\s*[:\s]*[\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE) else "NO WAIVER clause identified but no detailed content extracted"
|
| 165 |
|
| 166 |
+
# Search for Termination clause
|
| 167 |
termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
|
| 168 |
if termination_match:
|
| 169 |
clauses["Termination"] = termination_match.group(1).strip()
|