Abhisesh7 committed on
Commit
dc082e0
·
verified ·
1 Parent(s): e03706a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -103
app.py CHANGED
@@ -74,6 +74,7 @@ def extract_items(text):
74
  ("Particulars", "Gross value", "Discount", "Net value", "Total"), # Format 2 (e.g., Invoice_6164752968.pdf)
75
  ]
76
 
 
77
  table_start = -1
78
  table_format = None
79
  for i, line in enumerate(lines):
@@ -85,96 +86,64 @@ def extract_items(text):
85
  if table_start != -1:
86
  break
87
 
88
- if table_start == -1:
89
- print("Table header not found.")
90
- # Look for platform fee as a separate table
91
- platform_fee_start = -1
92
- for i, line in enumerate(lines):
93
- if "Sr.No Particulars" in line:
94
- platform_fee_start = i + 1
95
  break
96
- if platform_fee_start != -1:
97
- platform_fee_end = len(lines)
98
- for i in range(platform_fee_start, len(lines)):
99
- if "Total" in lines[i] and not "Sr.No" in lines[i]:
100
- platform_fee_end = i + 1
101
- break
102
- platform_fee_lines = lines[platform_fee_start:platform_fee_end]
103
- print("Platform fee lines:", platform_fee_lines) # Debug
104
- platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
105
- for line in platform_fee_lines:
106
- line = line.strip()
107
- if not line or "Total" in line:
108
- continue
109
- match = re.match(platform_fee_pattern, line)
110
- if match:
111
- description = match.group(1).strip()
112
- total_price = float(match.group(5))
113
- items.append({
114
- "description": description,
115
- "quantity": 1, # Platform fee is a single item
116
- "unit_price": float(match.group(2)), # Taxable amount
117
- "total_price": total_price
118
- })
119
- print(f"Extracted Platform Fee: {description}, Total Price: {total_price}") # Debug
120
- return items
121
-
122
- # Find the end of the table (before "Total Amount", "Total Value", or end of text)
123
- table_end = len(lines)
124
- for i in range(table_start, len(lines)):
125
- if "Total Amount" in lines[i] or "Total Value" in lines[i] or "Total Due" in lines[i] or "Item(s) Total" in lines[i]:
126
- table_end = i
127
- break
128
-
129
- print(f"Table section: lines {table_start} to {table_end-1}") # Debug
130
- table_lines = lines[table_start:table_end]
131
- print("Table lines:", table_lines) # Debug
132
-
133
- # Define patterns based on table format
134
- if table_format[0] == "Item Description":
135
- # Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
136
- table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
137
- else:
138
- # Pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
139
- table_row_pattern = r"\|?\s*(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+(?:\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+)?\s*\|?\s*([\d.]+)\s*\|?"
140
 
141
- for line in table_lines:
142
- line = line.strip()
143
- if not line:
144
- continue
145
- # Skip alignment rows (e.g., "|---|---|")
146
- if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
147
- print(f"Skipping alignment row: {line}")
148
- continue
149
- print(f"Processing table row: {line}") # Debug
150
- match = re.match(table_row_pattern, line)
151
- if match:
152
- if table_format[0] == "Item Description":
153
- description = match.group(1).strip()
154
- quantity = int(match.group(2))
155
- unit_price = float(match.group(3))
156
- total_price = float(match.group(4))
 
 
 
 
 
 
 
 
 
 
 
 
157
  else:
158
- description = match.group(1).strip()
159
- quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
160
- unit_price = float(match.group(2)) # Gross value
161
- total_price = float(match.group(5)) # Total after taxes
162
- items.append({
163
- "description": description,
164
- "quantity": quantity,
165
- "unit_price": unit_price,
166
- "total_price": total_price
167
- })
168
- print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
169
- else:
170
- print(f"Failed to match row: {line}")
171
 
172
- # Look for platform fee as a separate table
173
  platform_fee_start = -1
174
  for i, line in enumerate(lines):
175
  if "Sr.No Particulars" in line:
176
  platform_fee_start = i + 1
177
  break
 
178
  if platform_fee_start != -1:
179
  platform_fee_end = len(lines)
180
  for i in range(platform_fee_start, len(lines)):
@@ -199,17 +168,18 @@ def extract_items(text):
199
  "total_price": total_price
200
  })
201
  print(f"Extracted Platform Fee: {description}, Total Price: {total_price}") # Debug
 
 
202
 
203
  return items
204
 
205
  def extract_entities(text):
206
- """Extract structured invoice details including recipient name using flexible regex patterns."""
207
  invoice_numbers = []
208
  primary_invoice_number = "Unknown"
209
  vendor_name = "Unknown"
210
  invoice_date = datetime.now().date()
211
  total_amount = 0.0
212
- recipient_name = "Unknown"
213
 
214
  # Extract items first to use as a filter for NER
215
  items = extract_items(text)
@@ -220,7 +190,6 @@ def extract_entities(text):
220
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
221
  invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
222
  total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))\s*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
223
- recipient_pattern = r"(?:Customer\s*Name|Recipient|Bill\s*To)\s*[:\-\s]*([A-Za-z]+)\s*(?=\s*(?:Address|Phone|Email|\n|$))"
224
 
225
  # Invoice Numbers (capture all, then prioritize)
226
  invoice_num_matches = list(re.finditer(invoice_num_pattern, text, re.IGNORECASE))
@@ -231,7 +200,6 @@ def extract_entities(text):
231
  if invoice_numbers:
232
  # Prioritize the invoice number associated with "Restaurant Service" (HSN Code: 996331)
233
  for i, num in enumerate(invoice_numbers):
234
- # Find the context of this invoice number in the text
235
  start_idx = text.find(num)
236
  context = text[max(0, start_idx-100):start_idx+100]
237
  if "996331" in context: # HSN Code for Restaurant Service
@@ -245,12 +213,10 @@ def extract_entities(text):
245
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
246
  if vendor_match:
247
  vendor_name = vendor_match.group(1).strip()
248
- # Ensure vendor name is not an item description
249
  if vendor_name.lower() in item_descriptions:
250
  vendor_name = "Unknown"
251
  print(f"Matched Vendor Name (Regex): {vendor_name}") # Debug
252
  else:
253
- # Enhanced NER fallback for multi-word organization names
254
  ner_results = ner_pipeline(text)
255
  org_name_parts = []
256
  for i, entity in enumerate(ner_results):
@@ -264,10 +230,10 @@ def extract_entities(text):
264
  vendor_name = candidate_vendor_name
265
  print(f"NER Matched Vendor Name: {vendor_name}") # Debug
266
 
267
- # Invoice Date (prioritize "Invoice Date")
268
  invoice_date_match = None
269
  for line in text.split('\n'):
270
- if "Invoice Date" in line and not "Order Date" in line:
271
  match = re.search(invoice_date_pattern, line, re.IGNORECASE)
272
  if match:
273
  invoice_date_match = match
@@ -290,29 +256,28 @@ def extract_entities(text):
290
  except ValueError as e:
291
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
292
 
293
- # Total Amount (sum all "Total Value" entries, fix parsing)
294
  total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
295
  total_amounts = []
296
  for match in total_amount_matches:
297
  amount_str = match.group(1).replace(",", "")
298
  try:
299
  amount = float(amount_str)
300
- # Ignore amounts that are unrealistically large (likely parsing errors)
301
- if amount < 1000000: # Arbitrary threshold to exclude erroneous large numbers
302
  total_amounts.append(amount)
303
  print(f"Matched Amount: {amount}") # Debug
304
  except ValueError:
305
  continue
306
- total_amount = sum(total_amounts) if total_amounts else 0.0
 
 
 
 
 
 
307
  print(f"Calculated Total Amount: {total_amount}") # Debug
308
 
309
- # Recipient Name
310
- recipient_match = re.search(recipient_pattern, text, re.IGNORECASE)
311
- if recipient_match:
312
- recipient_name = recipient_match.group(1).strip()
313
- print(f"Matched Recipient Name: {recipient_name}") # Debug
314
-
315
- return primary_invoice_number, vendor_name, invoice_date, total_amount, recipient_name
316
 
317
  def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
318
  """Fetch historical invoices for the vendor from Salesforce."""
@@ -450,7 +415,7 @@ def process_invoice(pdf_file):
450
  if "Error" in text:
451
  return f"**Error**: {text}"
452
 
453
- invoice_number, vendor_name, invoice_date, total_amount, recipient_name = extract_entities(text)
454
  items = extract_items(text)
455
  text_length = len(text)
456
 
@@ -483,7 +448,6 @@ def process_invoice(pdf_file):
483
  cleaned_items = []
484
  for item in items:
485
  desc = item['description']
486
- # Additional cleaning to ensure no quantity or price data
487
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
488
  desc = re.sub(r'\s*Unit\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
489
  desc = re.sub(r'\s*Total\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
@@ -502,7 +466,7 @@ def process_invoice(pdf_file):
502
  f"- **Invoice Number**: {invoice_number}",
503
  f"- **Vendor Name**: {vendor_name}",
504
  f"- **Invoice Date**: {invoice_date}",
505
- f"- **Invoice Amount**: ₹{total_amount:,.2f}", # Assuming INR for this PDF
506
  ]
507
 
508
  # Add items section
 
74
  ("Particulars", "Gross value", "Discount", "Net value", "Total"), # Format 2 (e.g., Invoice_6164752968.pdf)
75
  ]
76
 
77
+ # Extract main table (e.g., Particulars | Gross value | Discount | Net value | Total)
78
  table_start = -1
79
  table_format = None
80
  for i, line in enumerate(lines):
 
86
  if table_start != -1:
87
  break
88
 
89
+ if table_start != -1:
90
+ # Find the end of the main table
91
+ table_end = len(lines)
92
+ for i in range(table_start, len(lines)):
93
+ if "Item(s) Total" in lines[i] or "Total Value" in lines[i]:
94
+ table_end = i
 
95
  break
96
+
97
+ print(f"Main table section: lines {table_start} to {table_end-1}") # Debug
98
+ table_lines = lines[table_start:table_end]
99
+ print("Main table lines:", table_lines) # Debug
100
+
101
+ # Define patterns based on table format
102
+ if table_format[0] == "Item Description":
103
+ # Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
104
+ table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
105
+ else:
106
+ # Pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
107
+ table_row_pattern = r"\|?\s*(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+(?:\s*\|?\s*[\d.%]+\s*\|?\s*[\d.]+)?\s*\|?\s*([\d.]+)\s*\|?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
+ for line in table_lines:
110
+ line = line.strip()
111
+ if not line:
112
+ continue
113
+ # Skip alignment rows (e.g., "|---|---|")
114
+ if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
115
+ print(f"Skipping alignment row: {line}")
116
+ continue
117
+ print(f"Processing main table row: {line}") # Debug
118
+ match = re.match(table_row_pattern, line)
119
+ if match:
120
+ if table_format[0] == "Item Description":
121
+ description = match.group(1).strip()
122
+ quantity = int(match.group(2))
123
+ unit_price = float(match.group(3))
124
+ total_price = float(match.group(4))
125
+ else:
126
+ description = match.group(1).strip()
127
+ quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
128
+ unit_price = float(match.group(2)) # Gross value
129
+ total_price = float(match.group(5)) # Total after taxes
130
+ items.append({
131
+ "description": description,
132
+ "quantity": quantity,
133
+ "unit_price": unit_price,
134
+ "total_price": total_price
135
+ })
136
+ print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
137
  else:
138
+ print(f"Failed to match main table row: {line}")
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
+ # Extract platform fee table (e.g., Sr.No Particulars)
141
  platform_fee_start = -1
142
  for i, line in enumerate(lines):
143
  if "Sr.No Particulars" in line:
144
  platform_fee_start = i + 1
145
  break
146
+
147
  if platform_fee_start != -1:
148
  platform_fee_end = len(lines)
149
  for i in range(platform_fee_start, len(lines)):
 
168
  "total_price": total_price
169
  })
170
  print(f"Extracted Platform Fee: {description}, Total Price: {total_price}") # Debug
171
+ else:
172
+ print(f"Failed to match platform fee row: {line}")
173
 
174
  return items
175
 
176
  def extract_entities(text):
177
+ """Extract structured invoice details using flexible regex patterns."""
178
  invoice_numbers = []
179
  primary_invoice_number = "Unknown"
180
  vendor_name = "Unknown"
181
  invoice_date = datetime.now().date()
182
  total_amount = 0.0
 
183
 
184
  # Extract items first to use as a filter for NER
185
  items = extract_items(text)
 
190
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
191
  invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
192
  total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))\s*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
 
193
 
194
  # Invoice Numbers (capture all, then prioritize)
195
  invoice_num_matches = list(re.finditer(invoice_num_pattern, text, re.IGNORECASE))
 
200
  if invoice_numbers:
201
  # Prioritize the invoice number associated with "Restaurant Service" (HSN Code: 996331)
202
  for i, num in enumerate(invoice_numbers):
 
203
  start_idx = text.find(num)
204
  context = text[max(0, start_idx-100):start_idx+100]
205
  if "996331" in context: # HSN Code for Restaurant Service
 
213
  vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
214
  if vendor_match:
215
  vendor_name = vendor_match.group(1).strip()
 
216
  if vendor_name.lower() in item_descriptions:
217
  vendor_name = "Unknown"
218
  print(f"Matched Vendor Name (Regex): {vendor_name}") # Debug
219
  else:
 
220
  ner_results = ner_pipeline(text)
221
  org_name_parts = []
222
  for i, entity in enumerate(ner_results):
 
230
  vendor_name = candidate_vendor_name
231
  print(f"NER Matched Vendor Name: {vendor_name}") # Debug
232
 
233
+ # Invoice Date (prioritize "Invoice Date" and exclude "Order Date")
234
  invoice_date_match = None
235
  for line in text.split('\n'):
236
+ if "Invoice Date" in line and "Order Date" not in line:
237
  match = re.search(invoice_date_pattern, line, re.IGNORECASE)
238
  if match:
239
  invoice_date_match = match
 
256
  except ValueError as e:
257
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
258
 
259
+ # Total Amount (sum final totals, including taxes)
260
  total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
261
  total_amounts = []
262
  for match in total_amount_matches:
263
  amount_str = match.group(1).replace(",", "")
264
  try:
265
  amount = float(amount_str)
266
+ if amount < 1000000: # Exclude unrealistically large amounts
 
267
  total_amounts.append(amount)
268
  print(f"Matched Amount: {amount}") # Debug
269
  except ValueError:
270
  continue
271
+ # Sum only the final totals (e.g., after taxes)
272
+ if total_amounts:
273
+ # In this invoice, "Total Value" appears twice: ₹184.5 (net value before taxes) and ₹193.726 (after taxes)
274
+ # We want the final total after taxes for the main items, plus the platform fee
275
+ main_total = max([amt for amt in total_amounts if amt > 100], default=0.0) # ₹193.726
276
+ platform_fee = min([amt for amt in total_amounts if amt < 10], default=0.0) # ₹3.54
277
+ total_amount = main_total + platform_fee
278
  print(f"Calculated Total Amount: {total_amount}") # Debug
279
 
280
+ return primary_invoice_number, vendor_name, invoice_date, total_amount
 
 
 
 
 
 
281
 
282
  def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
283
  """Fetch historical invoices for the vendor from Salesforce."""
 
415
  if "Error" in text:
416
  return f"**Error**: {text}"
417
 
418
+ invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
419
  items = extract_items(text)
420
  text_length = len(text)
421
 
 
448
  cleaned_items = []
449
  for item in items:
450
  desc = item['description']
 
451
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
452
  desc = re.sub(r'\s*Unit\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
453
  desc = re.sub(r'\s*Total\s*Price\s*[₹$]\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
 
466
  f"- **Invoice Number**: {invoice_number}",
467
  f"- **Vendor Name**: {vendor_name}",
468
  f"- **Invoice Date**: {invoice_date}",
469
+ f"- **Invoice Amount**: ₹{total_amount:,.2f}",
470
  ]
471
 
472
  # Add items section