pavansuresh committed on
Commit
91704ec
·
verified ·
1 Parent(s): 428bcb4

Update ai_mapping.py

Browse files
Files changed (1) hide show
  1. ai_mapping.py +29 -3
ai_mapping.py CHANGED
@@ -29,10 +29,17 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
29
  # Fallback to regex using concatenated text from all pages
30
  text_data = " ".join([page["text"] for page in page_data])
31
  key_values = {}
32
- dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text_data)
 
33
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
34
- if dates or amounts:
35
- key_values.update({"Date": dates[0] if dates else "", "Amount": amounts[0] if amounts else ""})
 
 
 
 
 
 
36
 
37
  # Attempt LayoutLMv3 processing
38
  doc = fitz.open(pdf_path)
@@ -100,6 +107,25 @@ def extract_key_values_with_layoutlm(page_data: list, pdf_path: str) -> Dict[str
100
  except Exception as e:
101
  return {"status": "failed", "error": str(e), "key_values": {}}
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
104
  """
105
  Map extracted key-values to object fields using LayoutLMv3-base (simplified).
 
29
  # Fallback to regex using concatenated text from all pages
30
  text_data = " ".join([page["text"] for page in page_data])
31
  key_values = {}
32
+ # Enhanced regex patterns
33
+ dates = re.findall(r'(Agreement Start Date|Agreement End Date):\s*(\d{1,2}/\d{1,2}/\d{4})', text_data)
34
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
35
+ names = re.findall(r'(Agreement Name):\s*([A-Za-z0-9\s-]+)', text_data)
36
+ # Update key_values with matched fields
37
+ for key, value in dates:
38
+ key_values[key] = value
39
+ if amounts:
40
+ key_values["Amount"] = amounts[0]
41
+ for key, value in names:
42
+ key_values[key] = value
43
 
44
  # Attempt LayoutLMv3 processing
45
  doc = fitz.open(pdf_path)
 
107
  except Exception as e:
108
  return {"status": "failed", "error": str(e), "key_values": {}}
109
 
110
def extract_clauses(page_data: list) -> Dict[str, str]:
    """
    Extract clauses from PDF text based on heading keywords.

    The text of all pages is concatenated (space-separated) and scanned for
    headings of the form ``<keyword> <number>[.]`` where ``<keyword>`` is one
    of "Clause", "Section", "Terms", "Condition" or "Provision". The text that
    follows a heading, up to the next heading of *any* of those keywords (or
    the end of the text), is treated as the clause body.

    Args:
        page_data (list): List of dictionaries with 'text' (str) per page.
            Pages whose 'text' is missing or None are ignored.

    Returns:
        dict: Mapping of clause names to their text content. Names have the
        form ``"<keyword> <k>"`` where ``k`` is the running count of clauses
        collected so far (not the number printed in the document). Bodies of
        five words or fewer are discarded. Empty dict when nothing matches.
    """
    clause_keywords = ["Clause", "Section", "Terms", "Condition", "Provision"]

    # Tolerate pages without usable text instead of raising KeyError/TypeError.
    text_data = " ".join((page.get("text") or "") for page in (page_data or []))
    if not text_data.strip():
        return {}

    # A clause body must end at the next heading of ANY keyword; stopping only
    # at the same keyword lets e.g. a "Clause" swallow the following "Section".
    any_heading = "|".join(re.escape(word) for word in clause_keywords)

    clauses: Dict[str, str] = {}
    for keyword in clause_keywords:
        pattern = re.compile(
            rf'{re.escape(keyword)}\s+\d+\.?\s*(.+?)'
            rf'(?=\s*(?:{any_heading})\s+\d+\.?|\Z)',
            re.DOTALL,  # clause bodies may span line breaks
        )
        for match in pattern.finditer(text_data):
            clause_text = match.group(1).strip()
            # Minimum length to ensure a meaningful clause
            if len(clause_text.split()) > 5:
                clauses[f"{keyword} {len(clauses) + 1}"] = clause_text
    return clauses
128
+
129
  def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
130
  """
131
  Map extracted key-values to object fields using LayoutLMv3-base (simplified).