Abhisesh7 committed on
Commit
db037c1
·
verified ·
1 Parent(s): 6156d09

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -21
app.py CHANGED
@@ -62,17 +62,18 @@ def extract_text_from_pdf(pdf_file):
62
  def extract_items(text):
63
  """Extract items from the invoice table with a simplified approach."""
64
  items = []
65
- # Replace escaped dollar signs
66
- text = text.replace(r'\$', '$')
67
 
68
  # Split text into lines
69
  lines = text.split('\n')
70
  print("Text split into lines:", lines) # Debug
71
 
72
- # Find the table header
73
  table_start = -1
74
  for i, line in enumerate(lines):
75
- if "Item Description" in line and "Quantity" in line and "Unit Price" in line and "Total Price" in line:
 
76
  table_start = i + 1 # Table data starts after the header
77
  break
78
 
@@ -80,10 +81,10 @@ def extract_items(text):
80
  print("Table header not found.")
81
  return items
82
 
83
- # Find the end of the table (before "Total Amount", "Promo Code", or end of text)
84
  table_end = len(lines)
85
  for i in range(table_start, len(lines)):
86
- if "Total Amount" in lines[i] or "Total Due" in lines[i] or "Promo Code" in lines[i]:
87
  table_end = i
88
  break
89
 
@@ -92,8 +93,8 @@ def extract_items(text):
92
  print("Table lines:", table_lines) # Debug
93
 
94
  # Updated pattern to match table rows more accurately
95
- # Captures: Description (non-greedy), Quantity (digits), Unit Price (decimal), Total Price (decimal)
96
- table_row_pattern = r"^(.*?)\s+(\d+)\s+([\d.]+)\s+([\d.]+)$"
97
 
98
  for line in table_lines:
99
  line = line.strip()
@@ -105,14 +106,14 @@ def extract_items(text):
105
  description = match.group(1).strip()
106
  # Clean the description to remove any trailing quantity or price data
107
  description = re.sub(r'\s*\d+\s*$', '', description).strip() # Remove trailing numbers
108
- description = re.sub(r'\s*\$?[\d.]+\s*$', '', description).strip() # Remove trailing prices
109
  # Skip lines that look like promo codes
110
  if "Promo Code" in description:
111
  print(f"Skipping promo code line: {line}")
112
  continue
113
  quantity = int(match.group(2))
114
- unit_price = float(match.group(3))
115
- total_price = float(match.group(4))
116
  items.append({
117
  "description": description,
118
  "quantity": quantity,
@@ -130,7 +131,7 @@ def extract_entities(text):
130
  invoice_number = "Unknown"
131
  vendor_name = "Unknown"
132
  invoice_date = datetime.now().date()
133
- due_date = None # Default to None instead of today's date
134
  total_amount = 0.0
135
 
136
  # Extract items first to use as a filter for NER
@@ -138,16 +139,16 @@ def extract_entities(text):
138
  item_descriptions = [item["description"].lower() for item in items]
139
 
140
  # Flexible regex patterns to handle various invoice formats
141
- invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)"
142
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
143
- invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
144
- due_date_pattern = r"(?:Due\s*Date|Payment\s*Due\s*Date|Due\s*By)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4})"
145
- total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total)\s*[:\-\s]*[$£€]?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR)?"
146
 
147
  # Invoice Number
148
  invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
149
  if invoice_num_match:
150
- invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
151
  print(f"Matched Invoice Number: {invoice_number}") # Debug
152
 
153
  # Vendor Name
@@ -184,6 +185,8 @@ def extract_entities(text):
184
  invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
185
  except ValueError:
186
  invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
 
 
187
  print(f"Matched Invoice Date: {invoice_date}") # Debug
188
  except ValueError as e:
189
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
@@ -201,7 +204,9 @@ def extract_entities(text):
201
  try:
202
  due_date = datetime.strptime(date_str, "%Y-%m-%d").date()
203
  except ValueError:
204
- due_date = datetime.strptime(date_str, "%d-%m-%Y").date()
 
 
205
  print(f"Matched Due Date: {due_date}") # Debug
206
  except ValueError as e:
207
  print(f"Failed to parse Due Date '{date_str}': {str(e)}") # Debug
@@ -401,8 +406,8 @@ def process_invoice(file_path):
401
  desc = item['description']
402
  # Additional cleaning to ensure no quantity or price data
403
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
404
- desc = re.sub(r'\s*Unit\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
405
- desc = re.sub(r'\s*Total\s*Price\s*\$\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
406
  cleaned_items.append(desc)
407
  items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
408
  print(f"Items string for Salesforce (after cleaning): {items_str}") # Debug
@@ -427,7 +432,7 @@ def process_invoice(file_path):
427
  output.append(f"- **Due Date**: Not specified")
428
 
429
  output.extend([
430
- f"- **Invoice Amount**: ${total_amount:,.2f}",
431
  "- **Items Selected**:",
432
  ])
433
 
 
62
  def extract_items(text):
63
  """Extract items from the invoice table with a simplified approach."""
64
  items = []
65
+ # Replace escaped dollar signs and other currency symbols
66
+ text = text.replace(r'\$', '$').replace('₹', '₹')
67
 
68
  # Split text into lines
69
  lines = text.split('\n')
70
  print("Text split into lines:", lines) # Debug
71
 
72
+ # Find the table header (more flexible matching)
73
  table_start = -1
74
  for i, line in enumerate(lines):
75
+ # Match variations of table headers like "Item Quantity Rate Amount"
76
+ if re.search(r'Item.*Quantity.*(Rate|Unit\s*Price).*(Amount|Total\s*Price)', line, re.IGNORECASE):
77
  table_start = i + 1 # Table data starts after the header
78
  break
79
 
 
81
  print("Table header not found.")
82
  return items
83
 
84
+ # Find the end of the table (before "Subtotal", "Total", "Tax", "Balance Due", "Promo Code", or end of text)
85
  table_end = len(lines)
86
  for i in range(table_start, len(lines)):
87
+ if any(keyword in lines[i] for keyword in ["Subtotal", "Total", "Tax", "Balance Due", "Promo Code"]):
88
  table_end = i
89
  break
90
 
 
93
  print("Table lines:", table_lines) # Debug
94
 
95
  # Updated pattern to match table rows more accurately
96
+ # Captures: Description (non-greedy), Quantity (digits), Rate/Unit Price (decimal with optional currency), Amount/Total Price (decimal with optional currency)
97
+ table_row_pattern = r"^(.*?)\s+(\d+)\s+(?:₹|[$£€]?\s*)([\d,]+\.?\d*)\s+(?:₹|[$£€]?\s*)([\d,]+\.?\d*)$"
98
 
99
  for line in table_lines:
100
  line = line.strip()
 
106
  description = match.group(1).strip()
107
  # Clean the description to remove any trailing quantity or price data
108
  description = re.sub(r'\s*\d+\s*$', '', description).strip() # Remove trailing numbers
109
+ description = re.sub(r'\s*(?:₹|[$£€]?)[\d,]+\.?\d*\s*$', '', description).strip() # Remove trailing prices
110
  # Skip lines that look like promo codes
111
  if "Promo Code" in description:
112
  print(f"Skipping promo code line: {line}")
113
  continue
114
  quantity = int(match.group(2))
115
+ unit_price = float(match.group(3).replace(",", ""))
116
+ total_price = float(match.group(4).replace(",", ""))
117
  items.append({
118
  "description": description,
119
  "quantity": quantity,
 
131
  invoice_number = "Unknown"
132
  vendor_name = "Unknown"
133
  invoice_date = datetime.now().date()
134
+ due_date = None # Default to None
135
  total_amount = 0.0
136
 
137
  # Extract items first to use as a filter for NER
 
139
  item_descriptions = [item["description"].lower() for item in items]
140
 
141
  # Flexible regex patterns to handle various invoice formats
142
+ invoice_num_pattern = r"(?:Invoice\s*(?:Number|No\.?|#)|Order\s*(?:Number|No\.?))\s*[:\-\s#]*([\w-]+)|(?:INV-|ORD-)([\w-]+)|#?\s*(\d+)"
143
  vendor_pattern = r"(?:Vendor\s*(?:Name|Company)?|Supplier|Company\s*Name|From|Sold\s*By)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:Address|Invoice\s*(?:No|Number)|Date|Phone|Email|\n|$))"
144
+ invoice_date_pattern = r"(?:Invoice\s*Date|Date|Issue\s*Date)\s*[:\-\s]*((\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|[A-Za-z]+\s*\d{1,2}\s*\d{4}))"
145
+ due_date_pattern = r"(?:Due\s*Date|Payment\s*Due\s*Date|Due\s*By)\s*[:\-\s]*((\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|[A-Za-z]+\s*\d{1,2},\s*\d{4}|[A-Za-z]+\s*\d{1,2}\s*\d{4}))"
146
+ total_amount_pattern = r"(?:Total\s*(?:Amount|Due)?|Amount\s*Due|Total|Balance\s*Due)\s*[:\-\s]*(?:₹|[$£€])?\s*([\d,]+\.?\d*)\s*(?:USD|GBP|EUR|INR)?"
147
 
148
  # Invoice Number
149
  invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
150
  if invoice_num_match:
151
+ invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else (invoice_num_match.group(2) if invoice_num_match.group(2) else invoice_num_match.group(3))
152
  print(f"Matched Invoice Number: {invoice_number}") # Debug
153
 
154
  # Vendor Name
 
185
  invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
186
  except ValueError:
187
  invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
188
+ elif re.match(r"[A-Za-z]+\s*\d{1,2}\s*\d{4}", date_str):
189
+ invoice_date = datetime.strptime(date_str, "%B %d %Y").date()
190
  print(f"Matched Invoice Date: {invoice_date}") # Debug
191
  except ValueError as e:
192
  print(f"Failed to parse Invoice Date '{date_str}': {str(e)}") # Debug
 
204
  try:
205
  due_date = datetime.strptime(date_str, "%Y-%m-%d").date()
206
  except ValueError:
207
+ invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
208
+ elif re.match(r"[A-Za-z]+\s*\d{1,2}\s*\d{4}", date_str):
209
+ due_date = datetime.strptime(date_str, "%B %d %Y").date()
210
  print(f"Matched Due Date: {due_date}") # Debug
211
  except ValueError as e:
212
  print(f"Failed to parse Due Date '{date_str}': {str(e)}") # Debug
 
406
  desc = item['description']
407
  # Additional cleaning to ensure no quantity or price data
408
  desc = re.sub(r'\s*Quantity\s*\d+', '', desc, flags=re.IGNORECASE).strip()
409
+ desc = re.sub(r'\s*(?:Rate|Unit\s*Price)\s*(?:₹|[$£€])\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
410
+ desc = re.sub(r'\s*(?:Amount|Total\s*Price)\s*(?:₹|[$£€])\d+\.\d+', '', desc, flags=re.IGNORECASE).strip()
411
  cleaned_items.append(desc)
412
  items_str = "; ".join(cleaned_items) if cleaned_items else "No items found"
413
  print(f"Items string for Salesforce (after cleaning): {items_str}") # Debug
 
432
  output.append(f"- **Due Date**: Not specified")
433
 
434
  output.extend([
435
+ f"- **Invoice Amount**: {total_amount:,.2f}",
436
  "- **Items Selected**:",
437
  ])
438