Spaces:
Sleeping
Sleeping
Update ai_mapping.py
Browse files- ai_mapping.py +12 -9
ai_mapping.py
CHANGED
|
@@ -31,20 +31,23 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 31 |
key_values = {}
|
| 32 |
# Enhanced regex patterns with flexibility
|
| 33 |
dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
|
| 34 |
-
# Capture date
|
| 35 |
-
date_context = re.findall(r'(?:
|
| 36 |
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
|
| 37 |
-
#
|
| 38 |
-
name_context = re.findall(r'(?:Order\s+Form|Agreement)\s*[:\s]*([A-Za-z0-9\s-]+)(?=\s*(?:Product|Quantity|List|Net))', text_data, re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
| 39 |
# Update key_values with matched fields
|
| 40 |
for key, value in dates:
|
| 41 |
key_values[key] = value
|
| 42 |
if date_context and not key_values.get("Agreement Start Date"):
|
| 43 |
key_values["Agreement Start Date"] = date_context[0]
|
|
|
|
|
|
|
| 44 |
if amounts:
|
| 45 |
key_values["Amount"] = amounts[0]
|
| 46 |
-
if name_context:
|
| 47 |
-
key_values["Agreement Name"] = name_context[0].strip()
|
| 48 |
|
| 49 |
# Attempt LayoutLMv3 processing
|
| 50 |
doc = fitz.open(pdf_path)
|
|
@@ -122,9 +125,9 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
|
|
| 122 |
"""
|
| 123 |
clauses = {}
|
| 124 |
text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
|
| 125 |
-
# Target "NO WAIVER"
|
| 126 |
-
no_waiver_match = re.search(r'
|
| 127 |
-
if no_waiver_match
|
| 128 |
clauses["NO WAIVER"] = no_waiver_match.group(1).strip()
|
| 129 |
return clauses if clauses else {}
|
| 130 |
|
|
|
|
| 31 |
key_values = {}
|
| 32 |
# Enhanced regex patterns with flexibility
|
| 33 |
dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
|
| 34 |
+
# Capture date anywhere with context like "executed as of" or "Effective Date"
|
| 35 |
+
date_context = re.findall(r'(?:executed\s+as\s+of|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
|
| 36 |
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
|
| 37 |
+
# Refined Agreement Name near Order Form or document start
|
| 38 |
+
name_context = re.findall(r'(?:Order\s+Form|Agreement)\s*[:\s]*([A-Za-z0-9\s-]+)(?=\s*(?:Product|Quantity|List|Net|\Z))', text_data, re.IGNORECASE)
|
| 39 |
+
# Prioritize first meaningful name, avoiding procedural text
|
| 40 |
+
if name_context:
|
| 41 |
+
key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("no")), "Unknown")
|
| 42 |
# Update key_values with matched fields
|
| 43 |
for key, value in dates:
|
| 44 |
key_values[key] = value
|
| 45 |
if date_context and not key_values.get("Agreement Start Date"):
|
| 46 |
key_values["Agreement Start Date"] = date_context[0]
|
| 47 |
+
if not key_values.get("Agreement End Date") and len(date_context) > 1:
|
| 48 |
+
key_values["Agreement End Date"] = date_context[1] if date_context[1:] else ""
|
| 49 |
if amounts:
|
| 50 |
key_values["Amount"] = amounts[0]
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# Attempt LayoutLMv3 processing
|
| 53 |
doc = fitz.open(pdf_path)
|
|
|
|
| 125 |
"""
|
| 126 |
clauses = {}
|
| 127 |
text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
|
| 128 |
+
# Target exact "NO WAIVER" text only
|
| 129 |
+
no_waiver_match = re.search(r'NO\s+WAIVER\s*[:\s]*(.*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
|
| 130 |
+
if no_waiver_match:
|
| 131 |
clauses["NO WAIVER"] = no_waiver_match.group(1).strip()
|
| 132 |
return clauses if clauses else {}
|
| 133 |
|