pavansuresh committed on
Commit
1d36c4c
·
verified ·
1 Parent(s): a709a2b

Update ai_mapping.py

Browse files
Files changed (1) hide show
  1. ai_mapping.py +10 -12
ai_mapping.py CHANGED
@@ -31,10 +31,11 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
31
  key_values = {}
32
  # Enhanced regex patterns with flexibility
33
  dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
34
- # Capture date near "Effective Date" or "Signed"
35
- date_context = re.findall(r'(?:Effective\s+Date|Signed)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
36
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
37
- names = re.findall(r'(Agreement\s+Name)\s*[:\s]*([A-Za-z0-9\s-]+)', text_data, re.IGNORECASE)
 
38
  # Update key_values with matched fields
39
  for key, value in dates:
40
  key_values[key] = value
@@ -42,8 +43,8 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
42
  key_values["Agreement Start Date"] = date_context[0]
43
  if amounts:
44
  key_values["Amount"] = amounts[0]
45
- for key, value in names:
46
- key_values[key] = value
47
 
48
  # Attempt LayoutLMv3 processing
49
  doc = fitz.open(pdf_path)
@@ -121,13 +122,10 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
121
  """
122
  clauses = {}
123
  text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
124
- clause_keywords = ["Clause", "Section", "Terms", "Condition", "Provision", "Exhibit"]
125
- for keyword in clause_keywords:
126
- matches = re.finditer(rf'(?:{keyword}\s+\d+\.?\s*|\b{keyword}\b)\s*(.+?)(?=(?:{keyword}\s+\d+\.?|\Z))', text_data, re.IGNORECASE | re.DOTALL)
127
- for match in matches:
128
- clause_text = match.group(1).strip()
129
- if clause_text and len(clause_text.split()) > 5: # Minimum length to ensure meaningful clause
130
- clauses[f"{keyword} {len([k for k in clauses.keys() if k.startswith(keyword)]) + 1}"] = clause_text
131
  return clauses if clauses else {}
132
 
133
  def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
 
31
  key_values = {}
32
  # Enhanced regex patterns with flexibility
33
  dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
34
+ # Capture date near "Effective Date" or "executed as of"
35
+ date_context = re.findall(r'(?:Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
36
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
37
+ # Heuristic for Agreement Name near Order Form or titles
38
+ name_context = re.findall(r'(?:Order\s+Form|Agreement)\s*[:\s]*([A-Za-z0-9\s-]+)(?=\s*(?:Product|Quantity|List|Net))', text_data, re.IGNORECASE)
39
  # Update key_values with matched fields
40
  for key, value in dates:
41
  key_values[key] = value
 
43
  key_values["Agreement Start Date"] = date_context[0]
44
  if amounts:
45
  key_values["Amount"] = amounts[0]
46
+ if name_context:
47
+ key_values["Agreement Name"] = name_context[0].strip()
48
 
49
  # Attempt LayoutLMv3 processing
50
  doc = fitz.open(pdf_path)
 
122
  """
123
  clauses = {}
124
  text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
125
+ # Target "NO WAIVER" specifically under "General Provisions"
126
+ no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=(?:General\s+Provisions|\Z))', text_data, re.IGNORECASE)
127
+ if no_waiver_match and len(no_waiver_match.group(1).strip().split()) > 5:
128
+ clauses["NO WAIVER"] = no_waiver_match.group(1).strip()
 
 
 
129
  return clauses if clauses else {}
130
 
131
  def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict: