pavansuresh committed on
Commit
dde2ff7
·
verified ·
1 Parent(s): 6e4819e

Update ai_mapping.py

Browse files
Files changed (1) hide show
  1. ai_mapping.py +31 -16
ai_mapping.py CHANGED
@@ -34,24 +34,27 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
34
  # Fallback to regex using concatenated text from all pages
35
  text_data = " ".join([page["text"] for page in page_data])
36
 
37
- # Refined regex patterns for required fields
38
- name_context = re.findall(r'(?:Order\s+Form|Contract|Agreement\s+Name)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
39
  if name_context:
40
- key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
41
 
 
42
  date_patterns = [
43
- r'(Agreement\s+Start\s+Date|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
44
- r'(Agreement\s+End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
45
  ]
46
  for pattern in date_patterns:
47
  matches = re.findall(pattern, text_data, re.IGNORECASE)
48
- for key, value in matches:
49
- key_values[key] = value
 
50
 
51
- amount_pattern = r'(?:Total\s+Agreement\s+Value|Amount|Total\s+Cost)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
 
52
  amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
53
  if amounts:
54
- key_values["Total Agreement Value"] = amounts[0].split(":")[-1].strip() if ":" in amounts[0] else amounts[0].strip()
55
 
56
  # Attempt LayoutLMv3 processing for enhanced extraction
57
  doc = fitz.open(pdf_path)
@@ -103,23 +106,35 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
103
  if label == 1: # Key start (adjust based on training)
104
  if current_key and current_value:
105
  key = " ".join(current_value).strip()
106
- if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
107
- key_values[current_key] = key
 
 
 
 
 
 
108
  current_key = token
109
  current_value = []
110
  elif label == 2 and current_key: # Value (adjust based on training)
111
  current_value.append(token)
112
  if current_key and current_value:
113
  key = " ".join(current_value).strip()
114
- if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
115
- key_values[current_key] = key
 
 
 
 
 
 
116
 
117
  # Clean up temporary image
118
  if os.path.exists(img_path):
119
  os.unlink(img_path)
120
 
121
  doc.close()
122
- return key_values if key_values else {"status": "failed", "error": "No key-value pairs extracted", "key_values": {}}
123
  except Exception as e:
124
  return {"status": "failed", "error": str(e), "key_values": {}}
125
 
@@ -142,7 +157,7 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
142
  elif "NO WAIVER" in text_data.upper():
143
  clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
144
 
145
- # Add more clause extractions as needed (e.g., Termination, Indemnity)
146
  termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
147
  if termination_match:
148
  clauses["Termination"] = termination_match.group(1).strip()
@@ -167,7 +182,7 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
167
  # Prioritize mapping for required fields
168
  for field in object_field_names:
169
  for key, value in key_values.items():
170
- if field.lower() in key.lower():
171
  mappings[field] = value
172
  if field in unmapped_fields:
173
  unmapped_fields.remove(field)
 
34
  # Fallback to regex using concatenated text from all pages
35
  text_data = " ".join([page["text"] for page in page_data])
36
 
37
+ # Refined regex patterns for required fields, avoiding record type as Agreement Name
38
+ name_context = re.findall(r'(?:Agreement\s+Name|Contract\s+Title)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
39
  if name_context:
40
+ key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and "MASTER SUBSCRIPTION AGREEMENT" not in name.upper()), "Unknown")
41
 
42
+ # Enhanced date patterns to capture context like "executed as of" or specific date labels
43
  date_patterns = [
44
+ r'(?:Agreement\s+Start\s+Date|Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
45
+ r'(?:Agreement\s+End\s+Date|Termination\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
46
  ]
47
  for pattern in date_patterns:
48
  matches = re.findall(pattern, text_data, re.IGNORECASE)
49
+ for key, value in [("Agreement Start Date", matches[0][0]) if "start" in pattern.lower() or "effective" in pattern.lower() else ("Agreement End Date", matches[0][0]) for matches in [m for m in [re.findall(pattern, text_data, re.IGNORECASE)] if m]]:
50
+ if value and not key_values.get(key):
51
+ key_values[key] = value
52
 
53
+ # Improved amount pattern to capture total value context
54
+ amount_pattern = r'(?:Total\s+Agreement\s+Value|Total\s+Amount|Contract\s+Value)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
55
  amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
56
  if amounts:
57
+ key_values["Total Agreement Value"] = next((amt.split(":")[-1].strip() if ":" in amt else amt.strip() for amt in amounts if any(k.lower() in amt.lower() for k in ["total", "value"])), "")
58
 
59
  # Attempt LayoutLMv3 processing for enhanced extraction
60
  doc = fitz.open(pdf_path)
 
106
  if label == 1: # Key start (adjust based on training)
107
  if current_key and current_value:
108
  key = " ".join(current_value).strip()
109
+ if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
110
+ key_values["Agreement Name"] = key
111
+ elif "start date" in current_key.lower() or "effective date" in current_key.lower():
112
+ key_values["Agreement Start Date"] = key
113
+ elif "end date" in current_key.lower() or "termination date" in current_key.lower():
114
+ key_values["Agreement End Date"] = key
115
+ elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
116
+ key_values["Total Agreement Value"] = key
117
  current_key = token
118
  current_value = []
119
  elif label == 2 and current_key: # Value (adjust based on training)
120
  current_value.append(token)
121
  if current_key and current_value:
122
  key = " ".join(current_value).strip()
123
+ if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
124
+ key_values["Agreement Name"] = key
125
+ elif "start date" in current_key.lower() or "effective date" in current_key.lower():
126
+ key_values["Agreement Start Date"] = key
127
+ elif "end date" in current_key.lower() or "termination date" in current_key.lower():
128
+ key_values["Agreement End Date"] = key
129
+ elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
130
+ key_values["Total Agreement Value"] = key
131
 
132
  # Clean up temporary image
133
  if os.path.exists(img_path):
134
  os.unlink(img_path)
135
 
136
  doc.close()
137
+ return key_values if any(key_values.values()) else {"status": "failed", "error": "No key-value pairs extracted", "key_values": {}}
138
  except Exception as e:
139
  return {"status": "failed", "error": str(e), "key_values": {}}
140
 
 
157
  elif "NO WAIVER" in text_data.upper():
158
  clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
159
 
160
+ # Add Termination clause
161
  termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
162
  if termination_match:
163
  clauses["Termination"] = termination_match.group(1).strip()
 
182
  # Prioritize mapping for required fields
183
  for field in object_field_names:
184
  for key, value in key_values.items():
185
+ if field.lower() in key.lower() and value:
186
  mappings[field] = value
187
  if field in unmapped_fields:
188
  unmapped_fields.remove(field)