Spaces: Sleeping
Update ai_mapping.py
Browse files
- ai_mapping.py +31 -16
ai_mapping.py
CHANGED
|
@@ -34,24 +34,27 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 34 |
# Fallback to regex using concatenated text from all pages
|
| 35 |
text_data = " ".join([page["text"] for page in page_data])
|
| 36 |
|
| 37 |
-
# Refined regex patterns for required fields
|
| 38 |
-
name_context = re.findall(r'(?:
|
| 39 |
if name_context:
|
| 40 |
-
key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.
|
| 41 |
|
|
|
|
| 42 |
date_patterns = [
|
| 43 |
-
r'(Agreement\s+Start\s+Date|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
|
| 44 |
-
r'(Agreement\s+End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
|
| 45 |
]
|
| 46 |
for pattern in date_patterns:
|
| 47 |
matches = re.findall(pattern, text_data, re.IGNORECASE)
|
| 48 |
-
for key, value in matches:
|
| 49 |
-
key_values
|
|
|
|
| 50 |
|
| 51 |
-
|
|
|
|
| 52 |
amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
|
| 53 |
if amounts:
|
| 54 |
-
key_values["Total Agreement Value"] =
|
| 55 |
|
| 56 |
# Attempt LayoutLMv3 processing for enhanced extraction
|
| 57 |
doc = fitz.open(pdf_path)
|
|
@@ -103,23 +106,35 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 103 |
if label == 1: # Key start (adjust based on training)
|
| 104 |
if current_key and current_value:
|
| 105 |
key = " ".join(current_value).strip()
|
| 106 |
-
if
|
| 107 |
-
key_values[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
current_key = token
|
| 109 |
current_value = []
|
| 110 |
elif label == 2 and current_key: # Value (adjust based on training)
|
| 111 |
current_value.append(token)
|
| 112 |
if current_key and current_value:
|
| 113 |
key = " ".join(current_value).strip()
|
| 114 |
-
if
|
| 115 |
-
key_values[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
# Clean up temporary image
|
| 118 |
if os.path.exists(img_path):
|
| 119 |
os.unlink(img_path)
|
| 120 |
|
| 121 |
doc.close()
|
| 122 |
-
return key_values if key_values else {"status": "failed", "error": "No key-value pairs extracted", "key_values": {}}
|
| 123 |
except Exception as e:
|
| 124 |
return {"status": "failed", "error": str(e), "key_values": {}}
|
| 125 |
|
|
@@ -142,7 +157,7 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
|
|
| 142 |
elif "NO WAIVER" in text_data.upper():
|
| 143 |
clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
|
| 144 |
|
| 145 |
-
# Add
|
| 146 |
termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
|
| 147 |
if termination_match:
|
| 148 |
clauses["Termination"] = termination_match.group(1).strip()
|
|
@@ -167,7 +182,7 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
|
|
| 167 |
# Prioritize mapping for required fields
|
| 168 |
for field in object_field_names:
|
| 169 |
for key, value in key_values.items():
|
| 170 |
-
if field.lower() in key.lower():
|
| 171 |
mappings[field] = value
|
| 172 |
if field in unmapped_fields:
|
| 173 |
unmapped_fields.remove(field)
|
|
|
|
| 34 |
# Fallback to regex using concatenated text from all pages
|
| 35 |
text_data = " ".join([page["text"] for page in page_data])
|
| 36 |
|
| 37 |
+
# Refined regex patterns for required fields, avoiding record type as Agreement Name
|
| 38 |
+
name_context = re.findall(r'(?:Agreement\s+Name|Contract\s+Title)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
|
| 39 |
if name_context:
|
| 40 |
+
key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and "MASTER SUBSCRIPTION AGREEMENT" not in name.upper()), "Unknown")
|
| 41 |
|
| 42 |
+
# Enhanced date patterns to capture context like "executed as of" or specific date labels
|
| 43 |
date_patterns = [
|
| 44 |
+
r'(?:Agreement\s+Start\s+Date|Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
|
| 45 |
+
r'(?:Agreement\s+End\s+Date|Termination\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
|
| 46 |
]
|
| 47 |
for pattern in date_patterns:
|
| 48 |
matches = re.findall(pattern, text_data, re.IGNORECASE)
|
| 49 |
+
for key, value in [("Agreement Start Date", matches[0][0]) if "start" in pattern.lower() or "effective" in pattern.lower() else ("Agreement End Date", matches[0][0]) for matches in [m for m in [re.findall(pattern, text_data, re.IGNORECASE)] if m]]:
|
| 50 |
+
if value and not key_values.get(key):
|
| 51 |
+
key_values[key] = value
|
| 52 |
|
| 53 |
+
# Improved amount pattern to capture total value context
|
| 54 |
+
amount_pattern = r'(?:Total\s+Agreement\s+Value|Total\s+Amount|Contract\s+Value)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
|
| 55 |
amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
|
| 56 |
if amounts:
|
| 57 |
+
key_values["Total Agreement Value"] = next((amt.split(":")[-1].strip() if ":" in amt else amt.strip() for amt in amounts if any(k.lower() in amt.lower() for k in ["total", "value"])), "")
|
| 58 |
|
| 59 |
# Attempt LayoutLMv3 processing for enhanced extraction
|
| 60 |
doc = fitz.open(pdf_path)
|
|
|
|
| 106 |
if label == 1: # Key start (adjust based on training)
|
| 107 |
if current_key and current_value:
|
| 108 |
key = " ".join(current_value).strip()
|
| 109 |
+
if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
|
| 110 |
+
key_values["Agreement Name"] = key
|
| 111 |
+
elif "start date" in current_key.lower() or "effective date" in current_key.lower():
|
| 112 |
+
key_values["Agreement Start Date"] = key
|
| 113 |
+
elif "end date" in current_key.lower() or "termination date" in current_key.lower():
|
| 114 |
+
key_values["Agreement End Date"] = key
|
| 115 |
+
elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
|
| 116 |
+
key_values["Total Agreement Value"] = key
|
| 117 |
current_key = token
|
| 118 |
current_value = []
|
| 119 |
elif label == 2 and current_key: # Value (adjust based on training)
|
| 120 |
current_value.append(token)
|
| 121 |
if current_key and current_value:
|
| 122 |
key = " ".join(current_value).strip()
|
| 123 |
+
if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
|
| 124 |
+
key_values["Agreement Name"] = key
|
| 125 |
+
elif "start date" in current_key.lower() or "effective date" in current_key.lower():
|
| 126 |
+
key_values["Agreement Start Date"] = key
|
| 127 |
+
elif "end date" in current_key.lower() or "termination date" in current_key.lower():
|
| 128 |
+
key_values["Agreement End Date"] = key
|
| 129 |
+
elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
|
| 130 |
+
key_values["Total Agreement Value"] = key
|
| 131 |
|
| 132 |
# Clean up temporary image
|
| 133 |
if os.path.exists(img_path):
|
| 134 |
os.unlink(img_path)
|
| 135 |
|
| 136 |
doc.close()
|
| 137 |
+
return key_values if any(key_values.values()) else {"status": "failed", "error": "No key-value pairs extracted", "key_values": {}}
|
| 138 |
except Exception as e:
|
| 139 |
return {"status": "failed", "error": str(e), "key_values": {}}
|
| 140 |
|
|
|
|
| 157 |
elif "NO WAIVER" in text_data.upper():
|
| 158 |
clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
|
| 159 |
|
| 160 |
+
# Add Termination clause
|
| 161 |
termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
|
| 162 |
if termination_match:
|
| 163 |
clauses["Termination"] = termination_match.group(1).strip()
|
|
|
|
| 182 |
# Prioritize mapping for required fields
|
| 183 |
for field in object_field_names:
|
| 184 |
for key, value in key_values.items():
|
| 185 |
+
if field.lower() in key.lower() and value:
|
| 186 |
mappings[field] = value
|
| 187 |
if field in unmapped_fields:
|
| 188 |
unmapped_fields.remove(field)
|