Spaces:
Sleeping
Sleeping
Update ai_mapping.py
Browse files - ai_mapping.py +10 -12
ai_mapping.py
CHANGED
|
@@ -31,10 +31,11 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 31 |
key_values = {}
|
| 32 |
# Enhanced regex patterns with flexibility
|
| 33 |
dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
|
| 34 |
-
# Capture date near "Effective Date" or "
|
| 35 |
-
date_context = re.findall(r'(?:Effective\s+Date|
|
| 36 |
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
|
| 37 |
-
|
|
|
|
| 38 |
# Update key_values with matched fields
|
| 39 |
for key, value in dates:
|
| 40 |
key_values[key] = value
|
|
@@ -42,8 +43,8 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 42 |
key_values["Agreement Start Date"] = date_context[0]
|
| 43 |
if amounts:
|
| 44 |
key_values["Amount"] = amounts[0]
|
| 45 |
-
|
| 46 |
-
key_values[
|
| 47 |
|
| 48 |
# Attempt LayoutLMv3 processing
|
| 49 |
doc = fitz.open(pdf_path)
|
|
@@ -121,13 +122,10 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
|
|
| 121 |
"""
|
| 122 |
clauses = {}
|
| 123 |
text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
clause_text = match.group(1).strip()
|
| 129 |
-
if clause_text and len(clause_text.split()) > 5: # Minimum length to ensure meaningful clause
|
| 130 |
-
clauses[f"{keyword} {len([k for k in clauses.keys() if k.startswith(keyword)]) + 1}"] = clause_text
|
| 131 |
return clauses if clauses else {}
|
| 132 |
|
| 133 |
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
|
|
|
|
| 31 |
key_values = {}
|
| 32 |
# Enhanced regex patterns with flexibility
|
| 33 |
dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
|
| 34 |
+
# Capture date near "Effective Date" or "executed as of"
|
| 35 |
+
date_context = re.findall(r'(?:Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
|
| 36 |
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
|
| 37 |
+
# Heuristic for Agreement Name near Order Form or titles
|
| 38 |
+
name_context = re.findall(r'(?:Order\s+Form|Agreement)\s*[:\s]*([A-Za-z0-9\s-]+)(?=\s*(?:Product|Quantity|List|Net))', text_data, re.IGNORECASE)
|
| 39 |
# Update key_values with matched fields
|
| 40 |
for key, value in dates:
|
| 41 |
key_values[key] = value
|
|
|
|
| 43 |
key_values["Agreement Start Date"] = date_context[0]
|
| 44 |
if amounts:
|
| 45 |
key_values["Amount"] = amounts[0]
|
| 46 |
+
if name_context:
|
| 47 |
+
key_values["Agreement Name"] = name_context[0].strip()
|
| 48 |
|
| 49 |
# Attempt LayoutLMv3 processing
|
| 50 |
doc = fitz.open(pdf_path)
|
|
|
|
| 122 |
"""
|
| 123 |
clauses = {}
|
| 124 |
text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
|
| 125 |
+
# Target "NO WAIVER" specifically under "General Provisions"
|
| 126 |
+
no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=(?:General\s+Provisions|\Z))', text_data, re.IGNORECASE)
|
| 127 |
+
if no_waiver_match and len(no_waiver_match.group(1).strip().split()) > 5:
|
| 128 |
+
clauses["NO WAIVER"] = no_waiver_match.group(1).strip()
|
|
|
|
|
|
|
|
|
|
| 129 |
return clauses if clauses else {}
|
| 130 |
|
| 131 |
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
|