Spaces:
Sleeping
Sleeping
Update ai_mapping.py
Browse files- ai_mapping.py +29 -3
ai_mapping.py
CHANGED
|
@@ -29,10 +29,17 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 29 |
# Fallback to regex using concatenated text from all pages
|
| 30 |
text_data = " ".join([page["text"] for page in page_data])
|
| 31 |
key_values = {}
|
| 32 |
-
|
|
|
|
| 33 |
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
# Attempt LayoutLMv3 processing
|
| 38 |
doc = fitz.open(pdf_path)
|
|
@@ -100,6 +107,25 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
|
|
| 100 |
except Exception as e:
|
| 101 |
return {"status": "failed", "error": str(e), "key_values": {}}
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
|
| 104 |
"""
|
| 105 |
Map extracted key-values to object fields using LayoutLMv3-base (simplified).
|
|
|
|
| 29 |
# Fallback to regex using concatenated text from all pages
|
| 30 |
text_data = " ".join([page["text"] for page in page_data])
|
| 31 |
key_values = {}
|
| 32 |
+
# Enhanced regex patterns
|
| 33 |
+
dates = re.findall(r'(Agreement Start Date|Agreement End Date):\s*(\d{1,2}/\d{1,2}/\d{4})', text_data)
|
| 34 |
amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
|
| 35 |
+
names = re.findall(r'(Agreement Name):\s*([A-Za-z0-9\s-]+)', text_data)
|
| 36 |
+
# Update key_values with matched fields
|
| 37 |
+
for key, value in dates:
|
| 38 |
+
key_values[key] = value
|
| 39 |
+
if amounts:
|
| 40 |
+
key_values["Amount"] = amounts[0]
|
| 41 |
+
for key, value in names:
|
| 42 |
+
key_values[key] = value
|
| 43 |
|
| 44 |
# Attempt LayoutLMv3 processing
|
| 45 |
doc = fitz.open(pdf_path)
|
|
|
|
| 107 |
except Exception as e:
|
| 108 |
return {"status": "failed", "error": str(e), "key_values": {}}
|
| 109 |
|
| 110 |
+
def extract_clauses(page_data: list) -> Dict[str, str]:
    """
    Extract clauses from PDF text based on keywords.

    The text of every page is joined with single spaces and scanned once per
    keyword ("Clause", "Section", "Terms", "Condition", "Provision"). A clause
    starts at a ``<keyword> <number>`` heading (optionally followed by a dot)
    and runs until the next heading of the *same* keyword or the end of the
    text. Candidates of five words or fewer are discarded.

    Args:
        page_data (list): List of dictionaries with 'text' (str) per page.
            Pages whose 'text' is missing or None contribute no text
            instead of raising.

    Returns:
        dict: Mapping of clause names to their text content. Names have the
            form "<keyword> <k>", where k is the running count of clauses
            kept so far across all keywords (which keeps names unique); it
            is not the number printed in the document. Empty when nothing
            qualifies.
    """
    clauses: Dict[str, str] = {}
    # `or ""` covers both a missing key and an explicit None (e.g. a page
    # with no extractable text), which would otherwise break the join.
    text_data = " ".join((page.get("text") or "") for page in (page_data or []))
    if not text_data.strip():
        return clauses

    min_words = 5  # a clause must have MORE than this many words to be kept
    clause_keywords = ["Clause", "Section", "Terms", "Condition", "Provision"]
    for keyword in clause_keywords:
        # Heading such as "Clause 3" or "Clause 3."
        heading = rf'{re.escape(keyword)}\s+\d+\.?'
        # Lazily capture the body up to the next same-keyword heading or the
        # end of the text; DOTALL lets a clause span line breaks.
        pattern = re.compile(
            rf'{heading}\s*(.+?)(?=(?:\s*{heading}|\Z))', re.DOTALL
        )
        for match in pattern.finditer(text_data):
            clause_text = match.group(1).strip()
            if len(clause_text.split()) > min_words:
                clauses[f"{keyword} {len(clauses) + 1}"] = clause_text
    return clauses
|
| 128 |
+
|
| 129 |
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
|
| 130 |
"""
|
| 131 |
Map extracted key-values to object fields using LayoutLMv3-base (simplified).
|