pavansuresh committed on
Commit
6e4819e
·
verified ·
1 Parent(s): af4bac8

Update ai_mapping.py

Browse files
Files changed (1) hide show
  1. ai_mapping.py +57 -39
ai_mapping.py CHANGED
@@ -4,12 +4,8 @@ from PIL import Image
4
  import fitz # PyMuPDF
5
  from typing import Dict, List
6
  import os
7
- from huggingface_hub import login
8
  import re
9
 
10
- # Optional: Log in to Hugging Face if using a private model
11
- # login(token="your_hf_token")
12
-
13
  # Load pre-trained LayoutLMv3 models
14
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
15
  feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
@@ -17,41 +13,47 @@ model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-b
17
 
18
  def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str, str]:
19
  """
20
- Extract key-value pairs from PDF text using LayoutLMv3-base or fallback to regex.
 
21
  Args:
22
  page_data (list): List of dictionaries with 'text' (str), 'words' (list of str),
23
  'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) per page.
24
  pdf_path (str): Path to the PDF file.
25
  Returns:
26
- dict: Key-value pairs extracted from the document.
27
  """
28
  try:
 
 
 
 
 
 
 
 
29
  # Fallback to regex using concatenated text from all pages
30
  text_data = " ".join([page["text"] for page in page_data])
31
- key_values = {}
32
- # Enhanced regex patterns with flexibility
33
- dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
34
- # Targeted date capture with OCR tolerance
35
- date_context = re.findall(r'(?:executed\s+as\s+of|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})|(?:Start\s+Date|End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
36
- amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
37
- # Refined Agreement Name to stop at Exhibit or clear break
38
- name_context = re.findall(r'(?:Order\s+Form|Contract)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
39
  if name_context:
40
  key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
41
- # Update key_values with matched fields
42
- for key, value in dates:
43
- key_values[key] = value
44
- if date_context:
45
- start_date = next((d[0] for d in date_context if d[0]), "")
46
- if start_date and not key_values.get("Agreement Start Date"):
47
- key_values["Agreement Start Date"] = start_date
48
- end_date = next((d[1] for d in date_context if d[1]), "")
49
- if end_date and not key_values.get("Agreement End Date"):
50
- key_values["Agreement End Date"] = end_date
 
 
51
  if amounts:
52
- key_values["Amount"] = amounts[0]
53
 
54
- # Attempt LayoutLMv3 processing
55
  doc = fitz.open(pdf_path)
56
  for page_num, page_info in enumerate(page_data):
57
  if not page_info["text"].strip() or "No text detected" in page_info["text"]:
@@ -100,13 +102,17 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
100
  for token, label in zip(tokens, labels):
101
  if label == 1: # Key start (adjust based on training)
102
  if current_key and current_value:
103
- key_values[current_key] = " ".join(current_value).strip()
 
 
104
  current_key = token
105
  current_value = []
106
  elif label == 2 and current_key: # Value (adjust based on training)
107
  current_value.append(token)
108
  if current_key and current_value:
109
- key_values[current_key] = " ".join(current_value).strip()
 
 
110
 
111
  # Clean up temporary image
112
  if os.path.exists(img_path):
@@ -119,7 +125,7 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
119
 
120
  def extract_clauses(page_data: list) -> Dict[str, str]:
121
  """
122
- Extract clauses from PDF text based on keywords.
123
  Args:
124
  page_data (list): List of dictionaries with 'text' (str) per page.
125
  Returns:
@@ -127,18 +133,26 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
127
  """
128
  clauses = {}
129
  text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
130
- # Broader search for "NO WAIVER" with fallback
 
131
  no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
132
  if no_waiver_match:
133
  clause_text = no_waiver_match.group(1).strip()
134
- clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER"
135
- elif "NO WAIVER" in text_data:
136
- clauses["NO WAIVER"] = "NO WAIVER"
137
- return clauses if clauses else {}
 
 
 
 
 
 
138
 
139
  def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
140
  """
141
- Map extracted key-values to object fields using LayoutLMv3-base (simplified).
 
142
  Args:
143
  key_values (dict): Extracted key-value pairs.
144
  object_field_names (list): List of object field names.
@@ -150,23 +164,27 @@ def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names:
150
  mappings = {}
151
  unmapped_fields = object_field_names.copy()
152
 
 
153
  for field in object_field_names:
154
  for key, value in key_values.items():
155
- if field.lower() in key.lower() or any(k.lower() in field.lower() for k in key_values.keys()):
156
  mappings[field] = value
157
- unmapped_fields.remove(field)
 
158
  break
159
 
160
  return {
161
  "status": "success",
162
  "mappings": mappings,
163
  "unmapped_fields": unmapped_fields,
164
- "error": None
 
165
  }
166
  except Exception as e:
167
  return {
168
  "status": "failed",
169
  "error": str(e),
170
  "mappings": {},
171
- "unmapped_fields": object_field_names
 
172
  }
 
4
  import fitz # PyMuPDF
5
  from typing import Dict, List
6
  import os
 
7
  import re
8
 
 
 
 
9
  # Load pre-trained LayoutLMv3 models
10
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
11
  feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
 
13
 
14
  def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str, str]:
15
  """
16
+ Extract key-value pairs from PDF text using LayoutLMv3-base with focus on Agreement Name,
17
+ Agreement Start Date, Agreement End Date, and Total Agreement Value, with regex fallback.
18
  Args:
19
  page_data (list): List of dictionaries with 'text' (str), 'words' (list of str),
20
  'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) per page.
21
  pdf_path (str): Path to the PDF file.
22
  Returns:
23
+ dict: Key-value pairs extracted from the document focusing on specified fields.
24
  """
25
  try:
26
+ # Initialize key-value dictionary for required fields
27
+ key_values = {
28
+ "Agreement Name": "Unknown",
29
+ "Agreement Start Date": "",
30
+ "Agreement End Date": "",
31
+ "Total Agreement Value": ""
32
+ }
33
+
34
  # Fallback to regex using concatenated text from all pages
35
  text_data = " ".join([page["text"] for page in page_data])
36
+
37
+ # Refined regex patterns for required fields
38
+ name_context = re.findall(r'(?:Order\s+Form|Contract|Agreement\s+Name)\s*[:\s]*([A-Za-z0-9\s]+?)(?=\s*(?:Exhibit|\n\n|\Z))', text_data, re.IGNORECASE)
 
 
 
 
 
39
  if name_context:
40
  key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("the below")), "Unknown")
41
+
42
+ date_patterns = [
43
+ r'(Agreement\s+Start\s+Date|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
44
+ r'(Agreement\s+End\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})'
45
+ ]
46
+ for pattern in date_patterns:
47
+ matches = re.findall(pattern, text_data, re.IGNORECASE)
48
+ for key, value in matches:
49
+ key_values[key] = value
50
+
51
+ amount_pattern = r'(?:Total\s+Agreement\s+Value|Amount|Total\s+Cost)\s*[:\s]*\$?\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
52
+ amounts = re.findall(amount_pattern, text_data, re.IGNORECASE)
53
  if amounts:
54
+ key_values["Total Agreement Value"] = amounts[0].split(":")[-1].strip() if ":" in amounts[0] else amounts[0].strip()
55
 
56
+ # Attempt LayoutLMv3 processing for enhanced extraction
57
  doc = fitz.open(pdf_path)
58
  for page_num, page_info in enumerate(page_data):
59
  if not page_info["text"].strip() or "No text detected" in page_info["text"]:
 
102
  for token, label in zip(tokens, labels):
103
  if label == 1: # Key start (adjust based on training)
104
  if current_key and current_value:
105
+ key = " ".join(current_value).strip()
106
+ if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
107
+ key_values[current_key] = key
108
  current_key = token
109
  current_value = []
110
  elif label == 2 and current_key: # Value (adjust based on training)
111
  current_value.append(token)
112
  if current_key and current_value:
113
+ key = " ".join(current_value).strip()
114
+ if any(f.lower() in current_key.lower() for f in ["agreement name", "start date", "end date", "total agreement value"]):
115
+ key_values[current_key] = key
116
 
117
  # Clean up temporary image
118
  if os.path.exists(img_path):
 
125
 
126
  def extract_clauses(page_data: list) -> Dict[str, str]:
127
  """
128
+ Extract clauses from PDF text based on keywords, focusing on key clauses like NO WAIVER.
129
  Args:
130
  page_data (list): List of dictionaries with 'text' (str) per page.
131
  Returns:
 
133
  """
134
  clauses = {}
135
  text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
136
+
137
+ # Search for NO WAIVER clause
138
  no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
139
  if no_waiver_match:
140
  clause_text = no_waiver_match.group(1).strip()
141
+ clauses["NO WAIVER"] = clause_text if clause_text else "NO WAIVER clause found but no content extracted"
142
+ elif "NO WAIVER" in text_data.upper():
143
+ clauses["NO WAIVER"] = "NO WAIVER clause identified but no detailed content extracted"
144
+
145
+ # Extract the Termination clause; add further clause extractions as needed (e.g., Indemnity)
146
+ termination_match = re.search(r'(?:Termination\s*[:\s]*)([\s\S]*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
147
+ if termination_match:
148
+ clauses["Termination"] = termination_match.group(1).strip()
149
+
150
+ return clauses if clauses else {"No clauses extracted": "No relevant clauses found in the document"}
151
 
152
  def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
153
  """
154
+ Map extracted key-values to object fields, prioritizing Agreement Name, Agreement Start Date,
155
+ Agreement End Date, and Total Agreement Value.
156
  Args:
157
  key_values (dict): Extracted key-value pairs.
158
  object_field_names (list): List of object field names.
 
164
  mappings = {}
165
  unmapped_fields = object_field_names.copy()
166
 
167
+ # Prioritize mapping for required fields
168
  for field in object_field_names:
169
  for key, value in key_values.items():
170
+ if field.lower() in key.lower():
171
  mappings[field] = value
172
+ if field in unmapped_fields:
173
+ unmapped_fields.remove(field)
174
  break
175
 
176
  return {
177
  "status": "success",
178
  "mappings": mappings,
179
  "unmapped_fields": unmapped_fields,
180
+ "error": None,
181
+ "clauses": extract_clauses(page_data) # Include clauses in the output
182
  }
183
  except Exception as e:
184
  return {
185
  "status": "failed",
186
  "error": str(e),
187
  "mappings": {},
188
+ "unmapped_fields": object_field_names,
189
+ "clauses": {}
190
  }