Abhisesh7 committed on
Commit
e03706a
·
verified ·
1 Parent(s): 5b964d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -22
app.py CHANGED
@@ -87,6 +87,36 @@ def extract_items(text):
87
 
88
  if table_start == -1:
89
  print("Table header not found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  return items
91
 
92
  # Find the end of the table (before "Total Amount", "Total Value", or end of text)
@@ -139,11 +169,43 @@ def extract_items(text):
139
  else:
140
  print(f"Failed to match row: {line}")
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  return items
143
 
144
  def extract_entities(text):
145
  """Extract structured invoice details including recipient name using flexible regex patterns."""
146
  invoice_numbers = []
 
147
  vendor_name = "Unknown"
148
  invoice_date = datetime.now().date()
149
  total_amount = 0.0
@@ -157,15 +219,27 @@ def extract_entities(text):
157
  invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
158
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
159
  invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
160
- total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))?[^:\n]*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
161
- recipient_pattern = r"(?:Customer\s*Name|Recipient|Bill\s*To)\s*[:\-\s]*([A-Za-z\s]+)(?=\s*(?:Address|Phone|Email|\n|$))"
162
 
163
- # Invoice Numbers (capture multiple if present)
164
- for match in re.finditer(invoice_num_pattern, text, re.IGNORECASE):
 
165
  invoice_number = match.group(1) if match.group(1) else match.group(2)
166
  invoice_numbers.append(invoice_number)
167
  print(f"Matched Invoice Number: {invoice_number}") # Debug
168
- invoice_numbers = invoice_numbers if invoice_numbers else ["Unknown"]
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  # Vendor Name
171
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
@@ -193,7 +267,7 @@ def extract_entities(text):
193
  # Invoice Date (prioritize "Invoice Date")
194
  invoice_date_match = None
195
  for line in text.split('\n'):
196
- if "Invoice Date" in line:
197
  match = re.search(invoice_date_pattern, line, re.IGNORECASE)
198
  if match:
199
  invoice_date_match = match
@@ -216,14 +290,16 @@ def extract_entities(text):
216
  except ValueError as e:
217
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
218
 
219
- # Total Amount (sum all "Total Value" entries)
220
  total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
221
  total_amounts = []
222
  for match in total_amount_matches:
223
  amount_str = match.group(1).replace(",", "")
224
  try:
225
  amount = float(amount_str)
226
- total_amounts.append(amount)
 
 
227
  print(f"Matched Amount: {amount}") # Debug
228
  except ValueError:
229
  continue
@@ -236,9 +312,9 @@ def extract_entities(text):
236
  recipient_name = recipient_match.group(1).strip()
237
  print(f"Matched Recipient Name: {recipient_name}") # Debug
238
 
239
- return invoice_numbers, vendor_name, invoice_date, total_amount, recipient_name
240
 
241
- def fetch_vendor_history(vendor_name, invoice_numbers, time_window_days=30):
242
  """Fetch historical invoices for the vendor from Salesforce."""
243
  if sf is None:
244
  return pd.DataFrame()
@@ -265,15 +341,14 @@ def fetch_vendor_history(vendor_name, invoice_numbers, time_window_days=30):
265
  print(f"Failed to fetch vendor history: {str(e)}")
266
  return pd.DataFrame()
267
 
268
- def check_data_consistency(invoice_numbers, vendor_name, invoice_date, history_df):
269
  """Check for data consistency issues like duplicates."""
270
  consistency_issues = []
271
 
272
  if not history_df.empty:
273
- for invoice_number in invoice_numbers:
274
- duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
275
- if not duplicate_invoices.empty:
276
- consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
277
 
278
  return consistency_issues
279
 
@@ -375,16 +450,16 @@ def process_invoice(pdf_file):
375
  if "Error" in text:
376
  return f"**Error**: {text}"
377
 
378
- invoice_numbers, vendor_name, invoice_date, total_amount, recipient_name = extract_entities(text)
379
  items = extract_items(text)
380
  text_length = len(text)
381
 
382
- history_df = fetch_vendor_history(vendor_name, invoice_numbers)
383
- consistency_issues = check_data_consistency(invoice_numbers, vendor_name, invoice_date, history_df)
384
 
385
  data = {
386
  "invoice_id": str(uuid.uuid4()),
387
- "invoice_number": "; ".join(invoice_numbers),
388
  "vendor_name": vendor_name,
389
  "amount": total_amount,
390
  "invoice_date": invoice_date,
@@ -424,8 +499,7 @@ def process_invoice(pdf_file):
424
 
425
  output = [
426
  "## Fraud Detection Summary",
427
- f"- **Invoice Number**: {'; '.join(invoice_numbers)}",
428
- f"- **Recipient Name**: {recipient_name}",
429
  f"- **Vendor Name**: {vendor_name}",
430
  f"- **Invoice Date**: {invoice_date}",
431
  f"- **Invoice Amount**: ₹{total_amount:,.2f}", # Assuming INR for this PDF
@@ -456,7 +530,7 @@ def process_invoice(pdf_file):
456
  if sf is not None:
457
  try:
458
  record_data = {
459
- "Invoice_Number__c": "; ".join(invoice_numbers),
460
  "Vendor_Name__c": vendor_name,
461
  "Invoice_Amount__c": total_amount,
462
  "Invoice_Date__c": str(invoice_date),
 
87
 
88
  if table_start == -1:
89
  print("Table header not found.")
90
+ # Look for platform fee as a separate table
91
+ platform_fee_start = -1
92
+ for i, line in enumerate(lines):
93
+ if "Sr.No Particulars" in line:
94
+ platform_fee_start = i + 1
95
+ break
96
+ if platform_fee_start != -1:
97
+ platform_fee_end = len(lines)
98
+ for i in range(platform_fee_start, len(lines)):
99
+ if "Total" in lines[i] and not "Sr.No" in lines[i]:
100
+ platform_fee_end = i + 1
101
+ break
102
+ platform_fee_lines = lines[platform_fee_start:platform_fee_end]
103
+ print("Platform fee lines:", platform_fee_lines) # Debug
104
+ platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
105
+ for line in platform_fee_lines:
106
+ line = line.strip()
107
+ if not line or "Total" in line:
108
+ continue
109
+ match = re.match(platform_fee_pattern, line)
110
+ if match:
111
+ description = match.group(1).strip()
112
+ total_price = float(match.group(5))
113
+ items.append({
114
+ "description": description,
115
+ "quantity": 1, # Platform fee is a single item
116
+ "unit_price": float(match.group(2)), # Taxable amount
117
+ "total_price": total_price
118
+ })
119
+ print(f"Extracted Platform Fee: {description}, Total Price: {total_price}") # Debug
120
  return items
121
 
122
  # Find the end of the table (before "Total Amount", "Total Value", or end of text)
 
169
  else:
170
  print(f"Failed to match row: {line}")
171
 
172
+ # Look for platform fee as a separate table
173
+ platform_fee_start = -1
174
+ for i, line in enumerate(lines):
175
+ if "Sr.No Particulars" in line:
176
+ platform_fee_start = i + 1
177
+ break
178
+ if platform_fee_start != -1:
179
+ platform_fee_end = len(lines)
180
+ for i in range(platform_fee_start, len(lines)):
181
+ if "Total" in lines[i] and not "Sr.No" in lines[i]:
182
+ platform_fee_end = i + 1
183
+ break
184
+ platform_fee_lines = lines[platform_fee_start:platform_fee_end]
185
+ print("Platform fee lines:", platform_fee_lines) # Debug
186
+ platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
187
+ for line in platform_fee_lines:
188
+ line = line.strip()
189
+ if not line or "Total" in line:
190
+ continue
191
+ match = re.match(platform_fee_pattern, line)
192
+ if match:
193
+ description = match.group(1).strip()
194
+ total_price = float(match.group(5))
195
+ items.append({
196
+ "description": description,
197
+ "quantity": 1, # Platform fee is a single item
198
+ "unit_price": float(match.group(2)), # Taxable amount
199
+ "total_price": total_price
200
+ })
201
+ print(f"Extracted Platform Fee: {description}, Total Price: {total_price}") # Debug
202
+
203
  return items
204
 
205
  def extract_entities(text):
206
  """Extract structured invoice details including recipient name using flexible regex patterns."""
207
  invoice_numbers = []
208
+ primary_invoice_number = "Unknown"
209
  vendor_name = "Unknown"
210
  invoice_date = datetime.now().date()
211
  total_amount = 0.0
 
219
  invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
220
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
221
  invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
222
+ total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))\s*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
223
+ recipient_pattern = r"(?:Customer\s*Name|Recipient|Bill\s*To)\s*[:\-\s]*([A-Za-z]+)\s*(?=\s*(?:Address|Phone|Email|\n|$))"
224
 
225
+ # Invoice Numbers (capture all, then prioritize)
226
+ invoice_num_matches = list(re.finditer(invoice_num_pattern, text, re.IGNORECASE))
227
+ for match in invoice_num_matches:
228
  invoice_number = match.group(1) if match.group(1) else match.group(2)
229
  invoice_numbers.append(invoice_number)
230
  print(f"Matched Invoice Number: {invoice_number}") # Debug
231
+ if invoice_numbers:
232
+ # Prioritize the invoice number associated with "Restaurant Service" (HSN Code: 996331)
233
+ for i, num in enumerate(invoice_numbers):
234
+ # Find the context of this invoice number in the text
235
+ start_idx = text.find(num)
236
+ context = text[max(0, start_idx-100):start_idx+100]
237
+ if "996331" in context: # HSN Code for Restaurant Service
238
+ primary_invoice_number = num
239
+ break
240
+ if primary_invoice_number == "Unknown":
241
+ primary_invoice_number = invoice_numbers[0] # Fallback to the first invoice number
242
+ print(f"Primary Invoice Number: {primary_invoice_number}") # Debug
243
 
244
  # Vendor Name
245
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
 
267
  # Invoice Date (prioritize "Invoice Date")
268
  invoice_date_match = None
269
  for line in text.split('\n'):
270
+ if "Invoice Date" in line and not "Order Date" in line:
271
  match = re.search(invoice_date_pattern, line, re.IGNORECASE)
272
  if match:
273
  invoice_date_match = match
 
290
  except ValueError as e:
291
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
292
 
293
+ # Total Amount (sum all "Total Value" entries, fix parsing)
294
  total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
295
  total_amounts = []
296
  for match in total_amount_matches:
297
  amount_str = match.group(1).replace(",", "")
298
  try:
299
  amount = float(amount_str)
300
+ # Ignore amounts that are unrealistically large (likely parsing errors)
301
+ if amount < 1000000: # Arbitrary threshold to exclude erroneous large numbers
302
+ total_amounts.append(amount)
303
  print(f"Matched Amount: {amount}") # Debug
304
  except ValueError:
305
  continue
 
312
  recipient_name = recipient_match.group(1).strip()
313
  print(f"Matched Recipient Name: {recipient_name}") # Debug
314
 
315
+ return primary_invoice_number, vendor_name, invoice_date, total_amount, recipient_name
316
 
317
+ def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
318
  """Fetch historical invoices for the vendor from Salesforce."""
319
  if sf is None:
320
  return pd.DataFrame()
 
341
  print(f"Failed to fetch vendor history: {str(e)}")
342
  return pd.DataFrame()
343
 
344
+ def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
345
  """Check for data consistency issues like duplicates."""
346
  consistency_issues = []
347
 
348
  if not history_df.empty:
349
+ duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
350
+ if not duplicate_invoices.empty:
351
+ consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
 
352
 
353
  return consistency_issues
354
 
 
450
  if "Error" in text:
451
  return f"**Error**: {text}"
452
 
453
+ invoice_number, vendor_name, invoice_date, total_amount, recipient_name = extract_entities(text)
454
  items = extract_items(text)
455
  text_length = len(text)
456
 
457
+ history_df = fetch_vendor_history(vendor_name, invoice_number)
458
+ consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
459
 
460
  data = {
461
  "invoice_id": str(uuid.uuid4()),
462
+ "invoice_number": invoice_number,
463
  "vendor_name": vendor_name,
464
  "amount": total_amount,
465
  "invoice_date": invoice_date,
 
499
 
500
  output = [
501
  "## Fraud Detection Summary",
502
+ f"- **Invoice Number**: {invoice_number}",
 
503
  f"- **Vendor Name**: {vendor_name}",
504
  f"- **Invoice Date**: {invoice_date}",
505
  f"- **Invoice Amount**: ₹{total_amount:,.2f}", # Assuming INR for this PDF
 
530
  if sf is not None:
531
  try:
532
  record_data = {
533
+ "Invoice_Number__c": invoice_number,
534
  "Vendor_Name__c": vendor_name,
535
  "Invoice_Amount__c": total_amount,
536
  "Invoice_Date__c": str(invoice_date),