pavansuresh committed on
Commit
17db344
·
verified ·
1 Parent(s): 1d36c4c

Update ai_mapping.py

Browse files
Files changed (1) hide show
  1. ai_mapping.py +12 -9
ai_mapping.py CHANGED
@@ -31,20 +31,23 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
31
  key_values = {}
32
  # Enhanced regex patterns with flexibility
33
  dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
34
- # Capture date near "Effective Date" or "executed as of"
35
- date_context = re.findall(r'(?:Effective\s+Date|executed\s+as\s+of)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
36
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
37
- # Heuristic for Agreement Name near Order Form or titles
38
- name_context = re.findall(r'(?:Order\s+Form|Agreement)\s*[:\s]*([A-Za-z0-9\s-]+)(?=\s*(?:Product|Quantity|List|Net))', text_data, re.IGNORECASE)
 
 
 
39
  # Update key_values with matched fields
40
  for key, value in dates:
41
  key_values[key] = value
42
  if date_context and not key_values.get("Agreement Start Date"):
43
  key_values["Agreement Start Date"] = date_context[0]
 
 
44
  if amounts:
45
  key_values["Amount"] = amounts[0]
46
- if name_context:
47
- key_values["Agreement Name"] = name_context[0].strip()
48
 
49
  # Attempt LayoutLMv3 processing
50
  doc = fitz.open(pdf_path)
@@ -122,9 +125,9 @@ def extract_clauses(page_data: list) -> Dict[str, str]:
122
  """
123
  clauses = {}
124
  text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
125
- # Target "NO WAIVER" specifically under "General Provisions"
126
- no_waiver_match = re.search(r'(?:General\s+Provisions\s*[\s\S]*?NO\s+WAIVER\s*[:\s]*)([\s\S]*?)(?=(?:General\s+Provisions|\Z))', text_data, re.IGNORECASE)
127
- if no_waiver_match and len(no_waiver_match.group(1).strip().split()) > 5:
128
  clauses["NO WAIVER"] = no_waiver_match.group(1).strip()
129
  return clauses if clauses else {}
130
 
 
31
  key_values = {}
32
  # Enhanced regex patterns with flexibility
33
  dates = re.findall(r'(Agreement\s+(?:Start|End)\s+Date(?:s)?)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
34
+ # Capture date anywhere with context like "executed as of" or "Effective Date"
35
+ date_context = re.findall(r'(?:executed\s+as\s+of|Effective\s+Date)\s*[:\s]*(\d{1,2}[/-]\d{1,2}[/-]\d{4})', text_data, re.IGNORECASE)
36
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
37
+ # Refined Agreement Name near Order Form or document start
38
+ name_context = re.findall(r'(?:Order\s+Form|Agreement)\s*[:\s]*([A-Za-z0-9\s-]+)(?=\s*(?:Product|Quantity|List|Net|\Z))', text_data, re.IGNORECASE)
39
+ # Prioritize first meaningful name, avoiding procedural text
40
+ if name_context:
41
+ key_values["Agreement Name"] = next((name.strip() for name in name_context if len(name.split()) > 1 and not name.lower().startswith("no")), "Unknown")
42
  # Update key_values with matched fields
43
  for key, value in dates:
44
  key_values[key] = value
45
  if date_context and not key_values.get("Agreement Start Date"):
46
  key_values["Agreement Start Date"] = date_context[0]
47
+ if not key_values.get("Agreement End Date") and len(date_context) > 1:
48
+ key_values["Agreement End Date"] = date_context[1] if date_context[1:] else ""
49
  if amounts:
50
  key_values["Amount"] = amounts[0]
 
 
51
 
52
  # Attempt LayoutLMv3 processing
53
  doc = fitz.open(pdf_path)
 
125
  """
126
  clauses = {}
127
  text_data = "\n".join([page["text"] for page in page_data]) # Use newlines for better segmentation
128
+ # Target exact "NO WAIVER" text only
129
+ no_waiver_match = re.search(r'NO\s+WAIVER\s*[:\s]*(.*?)(?=\n\n|\Z)', text_data, re.IGNORECASE)
130
+ if no_waiver_match:
131
  clauses["NO WAIVER"] = no_waiver_match.group(1).strip()
132
  return clauses if clauses else {}
133