Abhisesh7 committed on
Commit
5b964d1
·
verified ·
1 Parent(s): 15ee88d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -58
app.py CHANGED
@@ -59,30 +59,40 @@ def extract_text_from_pdf(pdf_file):
59
  return f"Error extracting text: {str(e)}"
60
 
61
  def extract_items(text):
62
- """Extract items from the invoice table with a simplified approach."""
63
  items = []
64
- # Replace escaped dollar signs
65
- text = text.replace(r'\$', '$')
66
 
67
  # Split text into lines
68
  lines = text.split('\n')
69
  print("Text split into lines:", lines) # Debug
70
 
71
- # Find the table header
 
 
 
 
 
72
  table_start = -1
 
73
  for i, line in enumerate(lines):
74
- if "Item Description" in line and "Quantity" in line and "Unit Price" in line and "Total Price" in line:
75
- table_start = i + 1 # Table data starts after the header
 
 
 
 
76
  break
77
 
78
  if table_start == -1:
79
  print("Table header not found.")
80
  return items
81
 
82
- # Find the end of the table (before "Total Amount", "Promo Code", or end of text)
83
  table_end = len(lines)
84
  for i in range(table_start, len(lines)):
85
- if "Total Amount" in lines[i] or "Total Due" in lines[i] or "Promo Code" in lines[i]:
86
  table_end = i
87
  break
88
 
@@ -90,8 +100,13 @@ def extract_items(text):
90
  table_lines = lines[table_start:table_end]
91
  print("Table lines:", table_lines) # Debug
92
 
93
- # Pattern to match table rows
94
- table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
 
 
 
 
 
95
 
96
  for line in table_lines:
97
  line = line.strip()
@@ -101,22 +116,19 @@ def extract_items(text):
101
  if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
102
  print(f"Skipping alignment row: {line}")
103
  continue
104
- # Replace alignment markers in the row (e.g., "|---|") with "|"
105
- line = re.sub(r'\|\s*---\s*\|', '|', line)
106
  print(f"Processing table row: {line}") # Debug
107
  match = re.match(table_row_pattern, line)
108
  if match:
109
- description = match.group(1).strip()
110
- # Clean the description to remove any trailing quantity or price data
111
- description = re.sub(r'\s*\d+\s*$', '', description).strip() # Remove trailing numbers
112
- description = re.sub(r'\s*\$?\d+\.\d+\s*$', '', description).strip() # Remove trailing prices
113
- # Skip lines that look like promo codes
114
- if "Promo Code" in description:
115
- print(f"Skipping promo code line: {line}")
116
- continue
117
- quantity = int(match.group(2))
118
- unit_price = float(match.group(3))
119
- total_price = float(match.group(4))
120
  items.append({
121
  "description": description,
122
  "quantity": quantity,
@@ -130,32 +142,38 @@ def extract_items(text):
130
  return items
131
 
132
  def extract_entities(text):
133
- """Extract structured invoice details using flexible regex patterns."""
134
- invoice_number = "Unknown"
135
  vendor_name = "Unknown"
136
  invoice_date = datetime.now().date()
137
  total_amount = 0.0
 
138
 
139
  # Extract items first to use as a filter for NER
140
  items = extract_items(text)
141
  item_descriptions = [item["description"].lower() for item in items]
142
 
143
  # Flexible regex patterns to handle various invoice formats
144
- invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
145
- vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
146
  invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
147
- total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
 
148
 
149
- # Invoice Number
150
- invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
151
- if invoice_num_match:
152
- invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
153
  print(f"Matched Invoice Number: {invoice_number}") # Debug
 
154
 
155
  # Vendor Name
156
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
157
  if vendor_match:
158
  vendor_name = vendor_match.group(1).strip()
 
 
 
159
  print(f"Matched Vendor Name (Regex): {vendor_name}") # Debug
160
  else:
161
  # Enhanced NER fallback for multi-word organization names
@@ -172,8 +190,16 @@ def extract_entities(text):
172
  vendor_name = candidate_vendor_name
173
  print(f"NER Matched Vendor Name: {vendor_name}") # Debug
174
 
175
- # Invoice Date
176
- invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
 
 
 
 
 
 
 
 
177
  if invoice_date_match:
178
  date_str = invoice_date_match.group(1)
179
  try:
@@ -190,15 +216,29 @@ def extract_entities(text):
190
  except ValueError as e:
191
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
192
 
193
- # Total Amount
194
- total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
195
- if total_amount_match:
196
- total_amount = float(total_amount_match.group(1).replace(",", ""))
197
- print(f"Matched Total Amount: {total_amount}") # Debug
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
- return invoice_number, vendor_name, invoice_date, total_amount
200
 
201
- def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
202
  """Fetch historical invoices for the vendor from Salesforce."""
203
  if sf is None:
204
  return pd.DataFrame()
@@ -225,14 +265,15 @@ def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
225
  print(f"Failed to fetch vendor history: {str(e)}")
226
  return pd.DataFrame()
227
 
228
- def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
229
  """Check for data consistency issues like duplicates."""
230
  consistency_issues = []
231
 
232
  if not history_df.empty:
233
- duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
234
- if not duplicate_invoices.empty:
235
- consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
 
236
 
237
  return consistency_issues
238
 
@@ -334,16 +375,16 @@ def process_invoice(pdf_file):
334
  if "Error" in text:
335
  return f"**Error**: {text}"
336
 
337
- invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
338
  items = extract_items(text)
339
  text_length = len(text)
340
 
341
- history_df = fetch_vendor_history(vendor_name, invoice_number)
342
- consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
343
 
344
  data = {
345
  "invoice_id": str(uuid.uuid4()),
346
- "invoice_number": invoice_number,
347
  "vendor_name": vendor_name,
348
  "amount": total_amount,
349
  "invoice_date": invoice_date,
@@ -369,31 +410,33 @@ def process_invoice(pdf_file):
369
  desc = item['description']
370
  # Additional cleaning to ensure no quantity or price data
371
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
372
- desc = re.sub(r'\s*Unit\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
373
- desc = re.sub(r'\s*Total\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
374
  cleaned_items.append(desc)
375
  items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
376
  print(f"Items string for Salesforce (after cleaning): {items_str}") # Debug
377
 
378
  # Validate items_str to ensure it contains no quantity or price data
379
- if re.search(r'Quantity|Unit Price|Total Price|\$\d+\.\d+', items_str, re.IGNORECASE):
380
  print(f"ERROR: items_str contains unexpected quantity or price data: {items_str}")
381
  items_str = "; ".join(item['description'] for item in items) # Fallback to raw descriptions
382
  print(f"Fallback items_str: {items_str}")
383
 
384
  output = [
385
  "## Fraud Detection Summary",
386
- f"- **Invoice Number**: {invoice_number}",
 
387
  f"- **Vendor Name**: {vendor_name}",
388
  f"- **Invoice Date**: {invoice_date}",
389
- f"- **Invoice Amount**: ${total_amount:,.2f}",
390
- "- **Items Selected**:",
391
  ]
392
 
 
 
393
  if items:
394
  for item in items:
395
- clean_description = re.sub(r'\s*\d+\s*\d*$', '', item['description']).strip()
396
- output.append(f" - {clean_description}")
397
  else:
398
  output.append(" - No items found")
399
 
@@ -413,7 +456,7 @@ def process_invoice(pdf_file):
413
  if sf is not None:
414
  try:
415
  record_data = {
416
- "Invoice_Number__c": invoice_number,
417
  "Vendor_Name__c": vendor_name,
418
  "Invoice_Amount__c": total_amount,
419
  "Invoice_Date__c": str(invoice_date),
 
59
  return f"Error extracting text: {str(e)}"
60
 
61
  def extract_items(text):
62
+ """Extract items from the invoice table with support for multiple table formats."""
63
  items = []
64
+ # Replace escaped dollar signs and other symbols
65
+ text = text.replace(r'\$', '$').replace('₹', '₹')
66
 
67
  # Split text into lines
68
  lines = text.split('\n')
69
  print("Text split into lines:", lines) # Debug
70
 
71
+ # Define possible table headers
72
+ table_headers = [
73
+ ("Item Description", "Quantity", "Unit Price", "Total Price"), # Format 1 (e.g., invoice_4.pdf)
74
+ ("Particulars", "Gross value", "Discount", "Net value", "Total"), # Format 2 (e.g., Invoice_6164752968.pdf)
75
+ ]
76
+
77
  table_start = -1
78
+ table_format = None
79
  for i, line in enumerate(lines):
80
+ for headers in table_headers:
81
+ if all(header in line for header in headers):
82
+ table_start = i + 1 # Table data starts after the header
83
+ table_format = headers
84
+ break
85
+ if table_start != -1:
86
  break
87
 
88
  if table_start == -1:
89
  print("Table header not found.")
90
  return items
91
 
92
+ # Find the end of the table (before "Total Amount", "Total Value", or end of text)
93
  table_end = len(lines)
94
  for i in range(table_start, len(lines)):
95
+ if "Total Amount" in lines[i] or "Total Value" in lines[i] or "Total Due" in lines[i] or "Item(s) Total" in lines[i]:
96
  table_end = i
97
  break
98
 
 
100
  table_lines = lines[table_start:table_end]
101
  print("Table lines:", table_lines) # Debug
102
 
103
+ # Define patterns based on table format
104
+ if table_format[0] == "Item Description":
105
+ # Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
106
+ table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
107
+ else:
108
+ # Pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
109
+ table_row_pattern = r"\|?\s*(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+(?:\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+)?\s*\|?\s*([\d.]+)\s*\|?"
110
 
111
  for line in table_lines:
112
  line = line.strip()
 
116
  if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
117
  print(f"Skipping alignment row: {line}")
118
  continue
 
 
119
  print(f"Processing table row: {line}") # Debug
120
  match = re.match(table_row_pattern, line)
121
  if match:
122
+ if table_format[0] == "Item Description":
123
+ description = match.group(1).strip()
124
+ quantity = int(match.group(2))
125
+ unit_price = float(match.group(3))
126
+ total_price = float(match.group(4))
127
+ else:
128
+ description = match.group(1).strip()
129
+ quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
130
+ unit_price = float(match.group(2)) # Gross value
131
+ total_price = float(match.group(5)) # Total after taxes
 
132
  items.append({
133
  "description": description,
134
  "quantity": quantity,
 
142
  return items
143
 
144
  def extract_entities(text):
145
+ """Extract structured invoice details including recipient name using flexible regex patterns."""
146
+ invoice_numbers = []
147
  vendor_name = "Unknown"
148
  invoice_date = datetime.now().date()
149
  total_amount = 0.0
150
+ recipient_name = "Unknown"
151
 
152
  # Extract items first to use as a filter for NER
153
  items = extract_items(text)
154
  item_descriptions = [item["description"].lower() for item in items]
155
 
156
  # Flexible regex patterns to handle various invoice formats
157
+ invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
158
+ vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
159
  invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
160
+ total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))?[^:\n]*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
161
+ recipient_pattern = r"(?:Customer\s*Name|Recipient|Bill\s*To)\s*[:\-\s]*([A-Za-z\s]+)(?=\s*(?:Address|Phone|Email|\n|$))"
162
 
163
+ # Invoice Numbers (capture multiple if present)
164
+ for match in re.finditer(invoice_num_pattern, text, re.IGNORECASE):
165
+ invoice_number = match.group(1) if match.group(1) else match.group(2)
166
+ invoice_numbers.append(invoice_number)
167
  print(f"Matched Invoice Number: {invoice_number}") # Debug
168
+ invoice_numbers = invoice_numbers if invoice_numbers else ["Unknown"]
169
 
170
  # Vendor Name
171
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
172
  if vendor_match:
173
  vendor_name = vendor_match.group(1).strip()
174
+ # Ensure vendor name is not an item description
175
+ if vendor_name.lower() in item_descriptions:
176
+ vendor_name = "Unknown"
177
  print(f"Matched Vendor Name (Regex): {vendor_name}") # Debug
178
  else:
179
  # Enhanced NER fallback for multi-word organization names
 
190
  vendor_name = candidate_vendor_name
191
  print(f"NER Matched Vendor Name: {vendor_name}") # Debug
192
 
193
+ # Invoice Date (prioritize "Invoice Date")
194
+ invoice_date_match = None
195
+ for line in text.split('\n'):
196
+ if "Invoice Date" in line:
197
+ match = re.search(invoice_date_pattern, line, re.IGNORECASE)
198
+ if match:
199
+ invoice_date_match = match
200
+ break
201
+ if not invoice_date_match:
202
+ invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
203
  if invoice_date_match:
204
  date_str = invoice_date_match.group(1)
205
  try:
 
216
  except ValueError as e:
217
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
218
 
219
+ # Total Amount (sum all "Total Value" entries)
220
+ total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
221
+ total_amounts = []
222
+ for match in total_amount_matches:
223
+ amount_str = match.group(1).replace(",", "")
224
+ try:
225
+ amount = float(amount_str)
226
+ total_amounts.append(amount)
227
+ print(f"Matched Amount: {amount}") # Debug
228
+ except ValueError:
229
+ continue
230
+ total_amount = sum(total_amounts) if total_amounts else 0.0
231
+ print(f"Calculated Total Amount: {total_amount}") # Debug
232
+
233
+ # Recipient Name
234
+ recipient_match = re.search(recipient_pattern, text, re.IGNORECASE)
235
+ if recipient_match:
236
+ recipient_name = recipient_match.group(1).strip()
237
+ print(f"Matched Recipient Name: {recipient_name}") # Debug
238
 
239
+ return invoice_numbers, vendor_name, invoice_date, total_amount, recipient_name
240
 
241
+ def fetch_vendor_history(vendor_name, invoice_numbers, time_window_days=30):
242
  """Fetch historical invoices for the vendor from Salesforce."""
243
  if sf is None:
244
  return pd.DataFrame()
 
265
  print(f"Failed to fetch vendor history: {str(e)}")
266
  return pd.DataFrame()
267
 
268
+ def check_data_consistency(invoice_numbers, vendor_name, invoice_date, history_df):
269
  """Check for data consistency issues like duplicates."""
270
  consistency_issues = []
271
 
272
  if not history_df.empty:
273
+ for invoice_number in invoice_numbers:
274
+ duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
275
+ if not duplicate_invoices.empty:
276
+ consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
277
 
278
  return consistency_issues
279
 
 
375
  if "Error" in text:
376
  return f"**Error**: {text}"
377
 
378
+ invoice_numbers, vendor_name, invoice_date, total_amount, recipient_name = extract_entities(text)
379
  items = extract_items(text)
380
  text_length = len(text)
381
 
382
+ history_df = fetch_vendor_history(vendor_name, invoice_numbers)
383
+ consistency_issues = check_data_consistency(invoice_numbers, vendor_name, invoice_date, history_df)
384
 
385
  data = {
386
  "invoice_id": str(uuid.uuid4()),
387
+ "invoice_number": "; ".join(invoice_numbers),
388
  "vendor_name": vendor_name,
389
  "amount": total_amount,
390
  "invoice_date": invoice_date,
 
410
  desc = item['description']
411
  # Additional cleaning to ensure no quantity or price data
412
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
413
+ desc = re.sub(r'\s*Unit\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
414
+ desc = re.sub(r'\s*Total\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
415
  cleaned_items.append(desc)
416
  items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
417
  print(f"Items string for Salesforce (after cleaning): {items_str}") # Debug
418
 
419
  # Validate items_str to ensure it contains no quantity or price data
420
+ if re.search(r'Quantity|Unit Price|Total Price|[₹$]\d+\.\d+', items_str, re.IGNORECASE):
421
  print(f"ERROR: items_str contains unexpected quantity or price data: {items_str}")
422
  items_str = "; ".join(item['description'] for item in items) # Fallback to raw descriptions
423
  print(f"Fallback items_str: {items_str}")
424
 
425
  output = [
426
  "## Fraud Detection Summary",
427
+ f"- **Invoice Number**: {'; '.join(invoice_numbers)}",
428
+ f"- **Recipient Name**: {recipient_name}",
429
  f"- **Vendor Name**: {vendor_name}",
430
  f"- **Invoice Date**: {invoice_date}",
431
+ f"- **Invoice Amount**: {total_amount:,.2f}", # Assuming INR for this PDF
 
432
  ]
433
 
434
+ # Add items section
435
+ output.append("- **Items Selected**:")
436
  if items:
437
  for item in items:
438
+ clean_description = re.sub(r'\s*\d+\s*x\s*', '', item['description']).strip() # Remove "1 x "
439
+ output.append(f" - {clean_description}: ₹{item['total_price']:.2f}")
440
  else:
441
  output.append(" - No items found")
442
 
 
456
  if sf is not None:
457
  try:
458
  record_data = {
459
+ "Invoice_Number__c": "; ".join(invoice_numbers),
460
  "Vendor_Name__c": vendor_name,
461
  "Invoice_Amount__c": total_amount,
462
  "Invoice_Date__c": str(invoice_date),