Spaces:
Sleeping
Sleeping
Update ai_mapping.py
Browse files- ai_mapping.py +57 -39
ai_mapping.py
CHANGED
|
@@ -4,12 +4,8 @@ from PIL import Image
|
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
from typing import Dict, List
|
| 6 |
import os
|
| 7 |
-
from huggingface_hub import login
|
| 8 |
import re
|
| 9 |
|
| 10 |
-
# Optional: Log in to Hugging Face if using a private model
|
| 11 |
-
# login(token="your_hf_token")
|
| 12 |
-
|
| 13 |
# Load pre-trained LayoutLMv3 models
|
| 14 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 15 |
feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
|
|
@@ -17,41 +13,47 @@ model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-b
|
|
| 17 |
|
| 18 |
def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str, str]:
|
| 19 |
"""
|
| 20 |
-
Extract key-value pairs from PDF text using LayoutLMv3-base
|
|
|
|
| 21 |
Args:
|
| 22 |
page_data (list): List of dictionaries with 'text' (str), 'words' (list of str),
|
| 23 |
'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) per page.
|
| 24 |
pdf_path (str): Path to the PDF file.
|
| 25 |
Returns:
|
| 26 |
-
dict: Key-value pairs extracted from the document.
|
| 27 |
"""
|
| 28 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# Fallback to regex using concatenated text from all pages
|
| 30 |
text_data = " ".join([page["text"] for page in page_data])
|
| 31 |
-
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
# Targeted date capture with OCR tolerance
|
| 35 |
-
date_context = re.findall(r'(?:executed\s+as\s+of|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})|(?:Start\s+Date|End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
|
| 36 |
-
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
|
| 37 |
-
# Refined Agreement Name to stop at Exhibit or clear break
|
| 38 |
-
name_context = re.findall(r'(?:Order\s+Form|Contract)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
|
| 39 |
if name_context:
|
| 40 |
key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
if amounts:
|
| 52 |
-
key_values["
|
| 53 |
|
| 54 |
-
# Attempt LayoutLMv3 processing
|
| 55 |
doc = fitz.open(pdf_path)
|
| 56 |
for page_num, page_info in enumerate(page_data):
|
| 57 |
if not page_info["text"].strip() or "No text detected" in page_info["text"]:
|
|
@@ -100,13 +102,17 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 100 |
for token, label in zip(tokens, labels):
|
| 101 |
if label == 1: # Key start (adjust based on training)
|
| 102 |
if current_key and current_value:
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
current_key = token
|
| 105 |
current_value = []
|
| 106 |
elif label == 2 and current_key: # Value (adjust based on training)
|
| 107 |
current_value.append(token)
|
| 108 |
if current_key and current_value:
|
| 109 |
-
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# Clean up temporary image
|
| 112 |
if os.path.exists(img_path):
|
|
@@ -119,7 +125,7 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 119 |
|
| 120 |
def extract_clauses(page_data: list) -> Dict[str, str]:
|
| 121 |
"""
|
| 122 |
-
Extract clauses from PDF text based on keywords.
|
| 123 |
Args:
|
| 124 |
page_data (list): List of dictionaries with 'text' (str) per page.
|
| 125 |
Returns:
|
|
@@ -127,18 +133,26 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
|
|
| 127 |
"""
|
| 128 |
clauses = {}
|
| 129 |
text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
|
| 130 |
-
|
|
|
|
| 131 |
no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
|
| 132 |
if no_waiver_match:
|
| 133 |
clause_text = no_waiver_match.group(1).strip()
|
| 134 |
-
clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER"
|
| 135 |
-
elif "NO WAIVER" in text_data:
|
| 136 |
-
clauses["NO WAIVER"] = "NO WAIVER"
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
|
| 140 |
"""
|
| 141 |
-
Map extracted key-values to object fields
|
|
|
|
| 142 |
Args:
|
| 143 |
key_values (dict): Extracted key-value pairs.
|
| 144 |
object_field_names (list): List of object field names.
|
|
@@ -150,23 +164,27 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
|
|
| 150 |
mappings = {}
|
| 151 |
unmapped_fields = object_field_names.copy()
|
| 152 |
|
|
|
|
| 153 |
for field in object_field_names:
|
| 154 |
for key, value in key_values.items():
|
| 155 |
-
if field.lower() in key.lower()
|
| 156 |
mappings[field] = value
|
| 157 |
-
|
|
|
|
| 158 |
break
|
| 159 |
|
| 160 |
return {
|
| 161 |
"status": "success",
|
| 162 |
"mappings": mappings,
|
| 163 |
"unmapped_fields": unmapped_fields,
|
| 164 |
-
"error": None
|
|
|
|
| 165 |
}
|
| 166 |
except Exception as e:
|
| 167 |
return {
|
| 168 |
"status": "failed",
|
| 169 |
"error": str(e),
|
| 170 |
"mappings": {},
|
| 171 |
-
"unmapped_fields": object_field_names
|
|
|
|
| 172 |
}
|
|
|
|
| 4 |
import fitz # PyMuPDF
|
| 5 |
from typing import Dict, List
|
| 6 |
import os
|
|
|
|
| 7 |
import re
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
# Load pre-trained LayoutLMv3 models
|
| 10 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 11 |
feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
|
|
|
|
| 13 |
|
| 14 |
def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str, str]:
|
| 15 |
"""
|
| 16 |
+
Extract key-value pairs from PDF text using LayoutLMv3-base with focus on Agreement Name,
|
| 17 |
+
Agreement Start Date, Agreement End Date, and Total Agreement Value, with regex fallback.
|
| 18 |
Args:
|
| 19 |
page_data (list): List of dictionaries with 'text' (str), 'words' (list of str),
|
| 20 |
'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) per page.
|
| 21 |
pdf_path (str): Path to the PDF file.
|
| 22 |
Returns:
|
| 23 |
+
dict: Key-value pairs extracted from the document focusing on specified fields.
|
| 24 |
"""
|
| 25 |
try:
|
| 26 |
+
# Initialize key-value dictionary for required fields
|
| 27 |
+
key_values = {
|
| 28 |
+
"Agreement Name": "Unknown",
|
| 29 |
+
"Agreement Start Date": "",
|
| 30 |
+
"Agreement End Date": "",
|
| 31 |
+
"Total Agreement Value": ""
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
# Fallback to regex using concatenated text from all pages
|
| 35 |
text_data = " ".join([page["text"] for page in page_data])
|
| 36 |
+
|
| 37 |
+
# Refined regex patterns for required fields
|
| 38 |
+
name_context = re.findall(r'(?:Order\s+Form|Contract|Agreement\s+Name)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if name_context:
|
| 40 |
key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
|
| 41 |
+
|
| 42 |
+
date_patterns = [
|
| 43 |
+
r'(Agreement\s+Start\s+Date|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
|
| 44 |
+
r'(Agreement\s+End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
|
| 45 |
+
]
|
| 46 |
+
for pattern in date_patterns:
|
| 47 |
+
matches = re.findall(pattern, text_data, re.IGNORECASE)
|
| 48 |
+
for key, value in matches:
|
| 49 |
+
key_values[key] = value
|
| 50 |
+
|
| 51 |
+
amount_pattern = r'(?:Total\s+Agreement\s+Value|Amount|Total\s+Cost)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
|
| 52 |
+
amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
|
| 53 |
if amounts:
|
| 54 |
+
key_values["Total Agreement Value"] = amounts[0].split(":")[-1].strip() if ":" in amounts[0] else amounts[0].strip()
|
| 55 |
|
| 56 |
+
# Attempt LayoutLMv3 processing for enhanced extraction
|
| 57 |
doc = fitz.open(pdf_path)
|
| 58 |
for page_num, page_info in enumerate(page_data):
|
| 59 |
if not page_info["text"].strip() or "No text detected" in page_info["text"]:
|
|
|
|
| 102 |
for token, label in zip(tokens, labels):
|
| 103 |
if label == 1: # Key start (adjust based on training)
|
| 104 |
if current_key and current_value:
|
| 105 |
+
key = " ".join(current_value).strip()
|
| 106 |
+
if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
|
| 107 |
+
key_values[current_key] = key
|
| 108 |
current_key = token
|
| 109 |
current_value = []
|
| 110 |
elif label == 2 and current_key: # Value (adjust based on training)
|
| 111 |
current_value.append(token)
|
| 112 |
if current_key and current_value:
|
| 113 |
+
key = " ".join(current_value).strip()
|
| 114 |
+
if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
|
| 115 |
+
key_values[current_key] = key
|
| 116 |
|
| 117 |
# Clean up temporary image
|
| 118 |
if os.path.exists(img_path):
|
|
|
|
| 125 |
|
| 126 |
def extract_clauses(page_data: list) -> Dict[str, str]:
    """
    Extract clauses from PDF text based on keywords, focusing on key clauses like NO WAIVER.

    Args:
        page_data (list): List of dictionaries with 'text' (str) per page. Pages whose
            'text' entry is missing or None are treated as blank instead of raising.
    Returns:
        dict: Clause name -> clause text. Holds a "NO WAIVER" and/or "Termination" entry
            when the corresponding heading is found; a heading found without any body
            text maps to a short placeholder message. When nothing is found, a single
            "No clauses extracted" entry is returned instead of an empty dict.
    """
    clauses = {}
    # Join pages with newlines so a blank line can act as the end-of-clause marker.
    text_data = "\n".join((page.get("text") or "") for page in (page_data or []))

    # Search for NO WAIVER clause (only when it follows a "General Provisions" heading).
    no_waiver_match = re.search(
        r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)',
        text_data,
        re.IGNORECASE,
    )
    if no_waiver_match:
        clause_text = no_waiver_match.group(1).strip()
        clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER clause found but no content extracted"
    elif re.search(r'NO\s+WAIVER', text_data, re.IGNORECASE):
        # Same whitespace tolerance as the main pattern, so "NO  WAIVER" or a heading
        # split across a line break is still reported.
        clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"

    # Termination clause. The word boundaries stop the case-insensitive pattern from
    # matching inside longer words such as "Determination".
    termination_match = re.search(
        r'(?:\bTermination\b\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)',
        text_data,
        re.IGNORECASE,
    )
    if termination_match:
        termination_text = termination_match.group(1).strip()
        # Mirror the NO WAIVER branch: never store an empty string as the clause body.
        clauses["Termination"] = (
            termination_text if termination_text else "Termination clause found but no content extracted"
        )

    return clauses if clauses else {"No clauses extracted": "No relevant clauses found in the document"}
|
| 151 |
|
| 152 |
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str, page_data=None) -> Dict:
    """
    Map extracted key-values to object fields, prioritizing Agreement Name, Agreement Start Date,
    Agreement End Date, and Total Agreement Value.

    Args:
        key_values (dict): Extracted key-value pairs.
        object_field_names (list): List of object field names.
        pdf_path (str): Path to the PDF file. Not read by this function; kept for
            interface compatibility.
        page_data (list, optional): Per-page dictionaries with a 'text' entry. When
            given, clauses are extracted from it and returned under "clauses";
            when omitted, "clauses" is an empty dict.
    Returns:
        dict: "status" ("success" or "failed"), "mappings" (field name -> value),
            "unmapped_fields" (fields with no matching key), "error" (None or the
            exception message) and "clauses".
    """
    try:
        mappings = {}

        # A field maps to the first extracted key that contains the field name
        # (case-insensitive substring match).
        for field in object_field_names:
            field_lower = field.lower()
            for key, value in key_values.items():
                if field_lower in key.lower():
                    mappings[field] = value
                    break

        # Everything that did not receive a mapping, in the caller's original order.
        unmapped_fields = [field for field in object_field_names if field not in mappings]

        # page_data is an explicit, optional input: referencing an undefined name here
        # would raise NameError and turn every call into a "failed" result.
        clauses = extract_clauses(page_data) if page_data is not None else {}

        return {
            "status": "success",
            "mappings": mappings,
            "unmapped_fields": unmapped_fields,
            "error": None,
            "clauses": clauses  # Include clauses in the output
        }
    except Exception as e:
        # Boundary handler: callers receive a status dict rather than an exception.
        return {
            "status": "failed",
            "error": str(e),
            "mappings": {},
            "unmapped_fields": object_field_names,
            "clauses": {}
        }
|