Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -87,6 +87,36 @@ def extract_items(text):
|
|
| 87 |
|
| 88 |
if table_start == -1:
|
| 89 |
print("Table header not found.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
return items
|
| 91 |
|
| 92 |
# Find the end of the table (before "Total Amount", "Total Value", or end of text)
|
|
@@ -139,11 +169,43 @@ def extract_items(text):
|
|
| 139 |
else:
|
| 140 |
print(f"Failed to match row: {line}")
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
return items
|
| 143 |
|
| 144 |
def extract_entities(text):
|
| 145 |
"""Extract structured invoice details including recipient name using flexible regex patterns."""
|
| 146 |
invoice_numbers = []
|
|
|
|
| 147 |
vendor_name = "Unknown"
|
| 148 |
invoice_date = datetime.now().date()
|
| 149 |
total_amount = 0.0
|
|
@@ -157,15 +219,27 @@ def extract_entities(text):
|
|
| 157 |
invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
|
| 158 |
vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
|
| 159 |
invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
|
| 160 |
-
total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))
|
| 161 |
-
recipient_pattern = r"(?:Customer\s*Name|Recipient|Bill\s*To)\s*[:\-\s]*([A-Za-z
|
| 162 |
|
| 163 |
-
# Invoice Numbers (capture
|
| 164 |
-
|
|
|
|
| 165 |
invoice_number = match.group(1) if match.group(1) else match.group(2)
|
| 166 |
invoice_numbers.append(invoice_number)
|
| 167 |
print(f"Matched Invoice Number: {invoice_number}") # Debug
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
# Vendor Name
|
| 171 |
vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
|
|
@@ -193,7 +267,7 @@ def extract_entities(text):
|
|
| 193 |
# Invoice Date (prioritize "Invoice Date")
|
| 194 |
invoice_date_match = None
|
| 195 |
for line in text.split('\n'):
|
| 196 |
-
if "Invoice Date" in line:
|
| 197 |
match = re.search(invoice_date_pattern, line, re.IGNORECASE)
|
| 198 |
if match:
|
| 199 |
invoice_date_match = match
|
|
@@ -216,14 +290,16 @@ def extract_entities(text):
|
|
| 216 |
except ValueError as e:
|
| 217 |
print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
|
| 218 |
|
| 219 |
-
# Total Amount (sum all "Total Value" entries)
|
| 220 |
total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
|
| 221 |
total_amounts = []
|
| 222 |
for match in total_amount_matches:
|
| 223 |
amount_str = match.group(1).replace(",", "")
|
| 224 |
try:
|
| 225 |
amount = float(amount_str)
|
| 226 |
-
|
|
|
|
|
|
|
| 227 |
print(f"Matched Amount: {amount}") # Debug
|
| 228 |
except ValueError:
|
| 229 |
continue
|
|
@@ -236,9 +312,9 @@ def extract_entities(text):
|
|
| 236 |
recipient_name = recipient_match.group(1).strip()
|
| 237 |
print(f"Matched Recipient Name: {recipient_name}") # Debug
|
| 238 |
|
| 239 |
-
return
|
| 240 |
|
| 241 |
-
def fetch_vendor_history(vendor_name, invoice_numbers, time_window_days=30):
|
| 242 |
"""Fetch historical invoices for the vendor from Salesforce."""
|
| 243 |
if sf is None:
|
| 244 |
return pd.DataFrame()
|
|
@@ -265,15 +341,14 @@ def fetch_vendor_history(vendor_name, invoice_numbers, time_window_days=30):
|
|
| 265 |
print(f"Failed to fetch vendor history: {str(e)}")
|
| 266 |
return pd.DataFrame()
|
| 267 |
|
| 268 |
-
def check_data_consistency(
|
| 269 |
"""Check for data consistency issues like duplicates."""
|
| 270 |
consistency_issues = []
|
| 271 |
|
| 272 |
if not history_df.empty:
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
|
| 277 |
|
| 278 |
return consistency_issues
|
| 279 |
|
|
@@ -375,16 +450,16 @@ def process_invoice(pdf_file):
|
|
| 375 |
if "Error" in text:
|
| 376 |
return f"**Error**: {text}"
|
| 377 |
|
| 378 |
-
|
| 379 |
items = extract_items(text)
|
| 380 |
text_length = len(text)
|
| 381 |
|
| 382 |
-
history_df = fetch_vendor_history(vendor_name,
|
| 383 |
-
consistency_issues = check_data_consistency(
|
| 384 |
|
| 385 |
data = {
|
| 386 |
"invoice_id": str(uuid.uuid4()),
|
| 387 |
-
"invoice_number":
|
| 388 |
"vendor_name": vendor_name,
|
| 389 |
"amount": total_amount,
|
| 390 |
"invoice_date": invoice_date,
|
|
@@ -424,8 +499,7 @@ def process_invoice(pdf_file):
|
|
| 424 |
|
| 425 |
output = [
|
| 426 |
"## Fraud Detection Summary",
|
| 427 |
-
f"- **Invoice Number**: {
|
| 428 |
-
f"- **Recipient Name**: {recipient_name}",
|
| 429 |
f"- **Vendor Name**: {vendor_name}",
|
| 430 |
f"- **Invoice Date**: {invoice_date}",
|
| 431 |
f"- **Invoice Amount**: ₹{total_amount:,.2f}", # Assuming INR for this PDF
|
|
@@ -456,7 +530,7 @@ def process_invoice(pdf_file):
|
|
| 456 |
if sf is not None:
|
| 457 |
try:
|
| 458 |
record_data = {
|
| 459 |
-
"Invoice_Number__c":
|
| 460 |
"Vendor_Name__c": vendor_name,
|
| 461 |
"Invoice_Amount__c": total_amount,
|
| 462 |
"Invoice_Date__c": str(invoice_date),
|
|
|
|
| 87 |
|
| 88 |
if table_start == -1:
|
| 89 |
print("Table header not found.")
|
| 90 |
+
# Look for platform fee as a separate table
|
| 91 |
+
platform_fee_start = -1
|
| 92 |
+
for i, line in enumerate(lines):
|
| 93 |
+
if "Sr.No Particulars" in line:
|
| 94 |
+
platform_fee_start = i + 1
|
| 95 |
+
break
|
| 96 |
+
if platform_fee_start != -1:
|
| 97 |
+
platform_fee_end = len(lines)
|
| 98 |
+
for i in range(platform_fee_start, len(lines)):
|
| 99 |
+
if "Total" in lines[i] and not "Sr.No" in lines[i]:
|
| 100 |
+
platform_fee_end = i + 1
|
| 101 |
+
break
|
| 102 |
+
platform_fee_lines = lines[platform_fee_start:platform_fee_end]
|
| 103 |
+
print("Platform fee lines:", platform_fee_lines) # Debug
|
| 104 |
+
platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
|
| 105 |
+
for line in platform_fee_lines:
|
| 106 |
+
line = line.strip()
|
| 107 |
+
if not line or "Total" in line:
|
| 108 |
+
continue
|
| 109 |
+
match = re.match(platform_fee_pattern, line)
|
| 110 |
+
if match:
|
| 111 |
+
description = match.group(1).strip()
|
| 112 |
+
total_price = float(match.group(5))
|
| 113 |
+
items.append({
|
| 114 |
+
"description": description,
|
| 115 |
+
"quantity": 1, # Platform fee is a single item
|
| 116 |
+
"unit_price": float(match.group(2)), # Taxable amount
|
| 117 |
+
"total_price": total_price
|
| 118 |
+
})
|
| 119 |
+
print(f"Extracted Platform Fee: {description}, Total Price: {total_price}") # Debug
|
| 120 |
return items
|
| 121 |
|
| 122 |
# Find the end of the table (before "Total Amount", "Total Value", or end of text)
|
|
|
|
| 169 |
else:
|
| 170 |
print(f"Failed to match row: {line}")
|
| 171 |
|
| 172 |
+
# Look for platform fee as a separate table
|
| 173 |
+
platform_fee_start = -1
|
| 174 |
+
for i, line in enumerate(lines):
|
| 175 |
+
if "Sr.No Particulars" in line:
|
| 176 |
+
platform_fee_start = i + 1
|
| 177 |
+
break
|
| 178 |
+
if platform_fee_start != -1:
|
| 179 |
+
platform_fee_end = len(lines)
|
| 180 |
+
for i in range(platform_fee_start, len(lines)):
|
| 181 |
+
if "Total" in lines[i] and not "Sr.No" in lines[i]:
|
| 182 |
+
platform_fee_end = i + 1
|
| 183 |
+
break
|
| 184 |
+
platform_fee_lines = lines[platform_fee_start:platform_fee_end]
|
| 185 |
+
print("Platform fee lines:", platform_fee_lines) # Debug
|
| 186 |
+
platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
|
| 187 |
+
for line in platform_fee_lines:
|
| 188 |
+
line = line.strip()
|
| 189 |
+
if not line or "Total" in line:
|
| 190 |
+
continue
|
| 191 |
+
match = re.match(platform_fee_pattern, line)
|
| 192 |
+
if match:
|
| 193 |
+
description = match.group(1).strip()
|
| 194 |
+
total_price = float(match.group(5))
|
| 195 |
+
items.append({
|
| 196 |
+
"description": description,
|
| 197 |
+
"quantity": 1, # Platform fee is a single item
|
| 198 |
+
"unit_price": float(match.group(2)), # Taxable amount
|
| 199 |
+
"total_price": total_price
|
| 200 |
+
})
|
| 201 |
+
print(f"Extracted Platform Fee: {description}, Total Price: {total_price}") # Debug
|
| 202 |
+
|
| 203 |
return items
|
| 204 |
|
| 205 |
def extract_entities(text):
|
| 206 |
"""Extract structured invoice details including recipient name using flexible regex patterns."""
|
| 207 |
invoice_numbers = []
|
| 208 |
+
primary_invoice_number = "Unknown"
|
| 209 |
vendor_name = "Unknown"
|
| 210 |
invoice_date = datetime.now().date()
|
| 211 |
total_amount = 0.0
|
|
|
|
| 219 |
invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-|Z\d{2}APOT\d{9})([\w-]+)"
|
| 220 |
vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By|Restaurant\s*Name)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
|
| 221 |
invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
|
| 222 |
+
total_amount_pattern = r"(?:Total\s*(?:Amount|Due|Value))\s*[:\-\s]*[₹$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
|
| 223 |
+
recipient_pattern = r"(?:Customer\s*Name|Recipient|Bill\s*To)\s*[:\-\s]*([A-Za-z]+)\s*(?=\s*(?:Address|Phone|Email|\n|$))"
|
| 224 |
|
| 225 |
+
# Invoice Numbers (capture all, then prioritize)
|
| 226 |
+
invoice_num_matches = list(re.finditer(invoice_num_pattern, text, re.IGNORECASE))
|
| 227 |
+
for match in invoice_num_matches:
|
| 228 |
invoice_number = match.group(1) if match.group(1) else match.group(2)
|
| 229 |
invoice_numbers.append(invoice_number)
|
| 230 |
print(f"Matched Invoice Number: {invoice_number}") # Debug
|
| 231 |
+
if invoice_numbers:
|
| 232 |
+
# Prioritize the invoice number associated with "Restaurant Service" (HSN Code: 996331)
|
| 233 |
+
for i, num in enumerate(invoice_numbers):
|
| 234 |
+
# Find the context of this invoice number in the text
|
| 235 |
+
start_idx = text.find(num)
|
| 236 |
+
context = text[max(0, start_idx-100):start_idx+100]
|
| 237 |
+
if "996331" in context: # HSN Code for Restaurant Service
|
| 238 |
+
primary_invoice_number = num
|
| 239 |
+
break
|
| 240 |
+
if primary_invoice_number == "Unknown":
|
| 241 |
+
primary_invoice_number = invoice_numbers[0] # Fallback to the first invoice number
|
| 242 |
+
print(f"Primary Invoice Number: {primary_invoice_number}") # Debug
|
| 243 |
|
| 244 |
# Vendor Name
|
| 245 |
vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
|
|
|
|
| 267 |
# Invoice Date (prioritize "Invoice Date")
|
| 268 |
invoice_date_match = None
|
| 269 |
for line in text.split('\n'):
|
| 270 |
+
if "Invoice Date" in line and not "Order Date" in line:
|
| 271 |
match = re.search(invoice_date_pattern, line, re.IGNORECASE)
|
| 272 |
if match:
|
| 273 |
invoice_date_match = match
|
|
|
|
| 290 |
except ValueError as e:
|
| 291 |
print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
|
| 292 |
|
| 293 |
+
# Total Amount (sum all "Total Value" entries, fix parsing)
|
| 294 |
total_amount_matches = re.finditer(total_amount_pattern, text, re.IGNORECASE)
|
| 295 |
total_amounts = []
|
| 296 |
for match in total_amount_matches:
|
| 297 |
amount_str = match.group(1).replace(",", "")
|
| 298 |
try:
|
| 299 |
amount = float(amount_str)
|
| 300 |
+
# Ignore amounts that are unrealistically large (likely parsing errors)
|
| 301 |
+
if amount < 1000000: # Arbitrary threshold to exclude erroneous large numbers
|
| 302 |
+
total_amounts.append(amount)
|
| 303 |
print(f"Matched Amount: {amount}") # Debug
|
| 304 |
except ValueError:
|
| 305 |
continue
|
|
|
|
| 312 |
recipient_name = recipient_match.group(1).strip()
|
| 313 |
print(f"Matched Recipient Name: {recipient_name}") # Debug
|
| 314 |
|
| 315 |
+
return primary_invoice_number, vendor_name, invoice_date, total_amount, recipient_name
|
| 316 |
|
| 317 |
+
def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
|
| 318 |
"""Fetch historical invoices for the vendor from Salesforce."""
|
| 319 |
if sf is None:
|
| 320 |
return pd.DataFrame()
|
|
|
|
| 341 |
print(f"Failed to fetch vendor history: {str(e)}")
|
| 342 |
return pd.DataFrame()
|
| 343 |
|
| 344 |
+
def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
|
| 345 |
"""Check for data consistency issues like duplicates."""
|
| 346 |
consistency_issues = []
|
| 347 |
|
| 348 |
if not history_df.empty:
|
| 349 |
+
duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
|
| 350 |
+
if not duplicate_invoices.empty:
|
| 351 |
+
consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
|
|
|
|
| 352 |
|
| 353 |
return consistency_issues
|
| 354 |
|
|
|
|
| 450 |
if "Error" in text:
|
| 451 |
return f"**Error**: {text}"
|
| 452 |
|
| 453 |
+
invoice_number, vendor_name, invoice_date, total_amount, recipient_name = extract_entities(text)
|
| 454 |
items = extract_items(text)
|
| 455 |
text_length = len(text)
|
| 456 |
|
| 457 |
+
history_df = fetch_vendor_history(vendor_name, invoice_number)
|
| 458 |
+
consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
|
| 459 |
|
| 460 |
data = {
|
| 461 |
"invoice_id": str(uuid.uuid4()),
|
| 462 |
+
"invoice_number": invoice_number,
|
| 463 |
"vendor_name": vendor_name,
|
| 464 |
"amount": total_amount,
|
| 465 |
"invoice_date": invoice_date,
|
|
|
|
| 499 |
|
| 500 |
output = [
|
| 501 |
"## Fraud Detection Summary",
|
| 502 |
+
f"- **Invoice Number**: {invoice_number}",
|
|
|
|
| 503 |
f"- **Vendor Name**: {vendor_name}",
|
| 504 |
f"- **Invoice Date**: {invoice_date}",
|
| 505 |
f"- **Invoice Amount**: ₹{total_amount:,.2f}", # Assuming INR for this PDF
|
|
|
|
| 530 |
if sf is not None:
|
| 531 |
try:
|
| 532 |
record_data = {
|
| 533 |
+
"Invoice_Number__c": invoice_number,
|
| 534 |
"Vendor_Name__c": vendor_name,
|
| 535 |
"Invoice_Amount__c": total_amount,
|
| 536 |
"Invoice_Date__c": str(invoice_date),
|