pavansuresh committed on
Commit
a40fdc8
·
verified ·
1 Parent(s): dde2ff7

Update ai_mapping.py

Browse files
Files changed (1) hide show
  1. ai_mapping.py +21 -15
ai_mapping.py CHANGED
@@ -35,26 +35,32 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
35
  text_data = " ".join([page["text"] for page in page_data])
36
 
37
  # Refined regex patterns for required fields, avoiding record type as Agreement Name
38
- name_context = re.findall(r'(?:Agreement\s+Name|Contract\s+Title)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
39
  if name_context:
40
  key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and "MASTER SUBSCRIPTION AGREEMENT" not in name.upper()), "Unknown")
41
-
42
- # Enhanced date patterns to capture context like "executed as of" or specific date labels
 
 
 
 
 
43
  date_patterns = [
44
- r'(?:Agreement\s+Start\s+Date|Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
45
- r'(?:Agreement\s+End\s+Date|Termination\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
46
  ]
47
  for pattern in date_patterns:
48
  matches = re.findall(pattern, text_data, re.IGNORECASE)
49
- for key, value in [("Agreement Start Date", matches[0][0]) if "start" in pattern.lower() or "effective" in pattern.lower() else ("Agreement End Date", matches[0][0]) for matches in [m for m in [re.findall(pattern, text_data, re.IGNORECASE)] if m]]:
 
50
  if value and not key_values.get(key):
51
  key_values[key] = value
52
 
53
  # Improved amount pattern to capture total value context
54
- amount_pattern = r'(?:Total\s+Agreement\s+Value|Total\s+Amount|Contract\s+Value)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
55
  amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
56
  if amounts:
57
- key_values["Total Agreement Value"] = next((amt.split(":")[-1].strip() if ":" in amt else amt.strip() for amt in amounts if any(k.lower() in amt.lower() for k in ["total", "value"])), "")
58
 
59
  # Attempt LayoutLMv3 processing for enhanced extraction
60
  doc = fitz.open(pdf_path)
@@ -108,11 +114,11 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
108
  key = " ".join(current_value).strip()
109
  if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
110
  key_values["Agreement Name"] = key
111
- elif "start date" in current_key.lower() or "effective date" in current_key.lower():
112
  key_values["Agreement Start Date"] = key
113
  elif "end date" in current_key.lower() or "termination date" in current_key.lower():
114
  key_values["Agreement End Date"] = key
115
- elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
116
  key_values["Total Agreement Value"] = key
117
  current_key = token
118
  current_value = []
@@ -122,11 +128,11 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
122
  key = " ".join(current_value).strip()
123
  if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
124
  key_values["Agreement Name"] = key
125
- elif "start date" in current_key.lower() or "effective date" in current_key.lower():
126
  key_values["Agreement Start Date"] = key
127
  elif "end date" in current_key.lower() or "termination date" in current_key.lower():
128
  key_values["Agreement End Date"] = key
129
- elif "total agreement value" in current_key.lower() or "amount" in current_key.lower():
130
  key_values["Total Agreement Value"] = key
131
 
132
  # Clean up temporary image
@@ -140,7 +146,7 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
140
 
141
  def extract_clauses(page_data: list) -> Dict[str, str]:
142
  """
143
- Extract clauses from PDF text based on keywords, focusing on key clauses like NO WAIVER.
144
  Args:
145
  page_data (list): List of dictionaries with 'text' (str) per page.
146
  Returns:
@@ -155,9 +161,9 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
155
  clause_text = no_waiver_match.group(1).strip()
156
  clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER clause found but no content extracted"
157
  elif "NO WAIVER" in text_data.upper():
158
- clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
159
 
160
- # Add Termination clause
161
  termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
162
  if termination_match:
163
  clauses["Termination"] = termination_match.group(1).strip()
 
35
  text_data = " ".join([page["text"] for page in page_data])
36
 
37
  # Refined regex patterns for required fields, avoiding record type as Agreement Name
38
+ name_context = re.findall(r'(?:Agreement\s+Name|Contract\s+Title|Agreement\s+Title)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
39
  if name_context:
40
  key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and "MASTER SUBSCRIPTION AGREEMENT" not in name.upper()), "Unknown")
41
+ else:
42
+ # Fallback to infer name from context if no explicit title
43
+ party_match = re.search(r'(?:between\s+([A-Za-z\s]+)\s+and)', text_data, re.IGNORECASE)
44
+ if party_match:
45
+ key_values["Agreement Name"] = party_match.group(1).strip() or "Unknown"
46
+
47
+ # Enhanced date patterns to capture "executed as of" and other date contexts
48
  date_patterns = [
49
+ r'(?:Agreement\s+Start\s+Date|Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}/\d{1,2}/\d{2,4})',
50
+ r'(?:Agreement\s+End\s+Date|Termination\s+Date)\s*[:\s]*(\d{1,2}/\d{1,2}/\d{2,4})'
51
  ]
52
  for pattern in date_patterns:
53
  matches = re.findall(pattern, text_data, re.IGNORECASE)
54
+ if matches:
55
+ key, value = ("Agreement Start Date", matches[0]) if "start" in pattern.lower() or "effective" in pattern.lower() or "executed" in pattern.lower() else ("Agreement End Date", matches[0])
56
  if value and not key_values.get(key):
57
  key_values[key] = value
58
 
59
  # Improved amount pattern to capture total value context
60
+ amount_pattern = r'(?:Total\s+Agreement\s+Value|Total\s+Amount|Contract\s+Value|List\s+Price)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
61
  amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
62
  if amounts:
63
+ key_values["Total Agreement Value"] = next((amt.split(":")[-1].strip() if ":" in amt else amt.strip() for amt in amounts if any(k.lower() in amt.lower() for k in ["total", "value", "price"])), "")
64
 
65
  # Attempt LayoutLMv3 processing for enhanced extraction
66
  doc = fitz.open(pdf_path)
 
114
  key = " ".join(current_value).strip()
115
  if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
116
  key_values["Agreement Name"] = key
117
+ elif "start date" in current_key.lower() or "effective date" in current_key.lower() or "executed as of" in current_key.lower():
118
  key_values["Agreement Start Date"] = key
119
  elif "end date" in current_key.lower() or "termination date" in current_key.lower():
120
  key_values["Agreement End Date"] = key
121
+ elif "total agreement value" in current_key.lower() or "amount" in current_key.lower() or "price" in current_key.lower():
122
  key_values["Total Agreement Value"] = key
123
  current_key = token
124
  current_value = []
 
128
  key = " ".join(current_value).strip()
129
  if "agreement name" in current_key.lower() and "MASTER SUBSCRIPTION AGREEMENT" not in key.upper():
130
  key_values["Agreement Name"] = key
131
+ elif "start date" in current_key.lower() or "effective date" in current_key.lower() or "executed as of" in current_key.lower():
132
  key_values["Agreement Start Date"] = key
133
  elif "end date" in current_key.lower() or "termination date" in current_key.lower():
134
  key_values["Agreement End Date"] = key
135
+ elif "total agreement value" in current_key.lower() or "amount" in current_key.lower() or "price" in current_key.lower():
136
  key_values["Total Agreement Value"] = key
137
 
138
  # Clean up temporary image
 
146
 
147
  def extract_clauses(page_data: list) -> Dict[str, str]:
148
  """
149
+ Extract clauses from PDF text based on keywords, focusing on key clauses like NO WAIVER and Termination.
150
  Args:
151
  page_data (list): List of dictionaries with 'text' (str) per page.
152
  Returns:
 
161
  clause_text = no_waiver_match.group(1).strip()
162
  clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER clause found but no content extracted"
163
  elif "NO WAIVER" in text_data.upper():
164
+ clauses["NO WAIVER"] = re.search(r'(NO\s+WAIVER\s*[:\s]*[\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE).group(1).strip() if re.search(r'(NO\s+WAIVER\s*[:\s]*[\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE) else "NO WAIVER clause identified but no detailed content extracted"
165
 
166
+ # Search for Termination clause
167
  termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
168
  if termination_match:
169
  clauses["Termination"] = termination_match.group(1).strip()