Abhisesh7 committed on
Commit
49cc078
·
verified ·
1 Parent(s): 855d3bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -124
app.py CHANGED
@@ -58,137 +58,180 @@ def extract_text_from_pdf(pdf_file):
58
  except Exception as e:
59
  return f"Error extracting text: {str(e)}"
60
 
61
def extract_items(text):
    """Extract line items from invoice text, supporting two table layouts.

    Layout 1 has the header ``Item Description | Quantity | Unit Price |
    Total Price`` and rows such as ``Monitor 24 inch | 7 | 150.00 | 1050.00``.
    Layout 2 has the header ``Particulars | Gross value | Discount |
    Net value | ... | Total`` and rows such as
    ``1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7``.
    A trailing ``Sr.No Particulars`` table (platform fee) is parsed as well.

    Args:
        text: Raw invoice text, one table row per line.

    Returns:
        A list of dicts with the keys ``description``, ``quantity``,
        ``unit_price`` and ``total_price``. For layout 2, ``unit_price``
        holds the row's "Gross value" column and ``total_price`` the last
        column (total after taxes).
    """
    items = []
    # Upstream text may contain markdown-escaped dollar signs.
    text = text.replace(r'\$', '$')
    lines = text.split('\n')

    table_headers = [
        ("Item Description", "Quantity", "Unit Price", "Total Price"),
        ("Particulars", "Gross value", "Discount", "Net value", "Total"),
    ]

    # Locate the header row of the main table and remember which layout it is.
    table_start = -1
    table_format = None
    for i, line in enumerate(lines):
        table_format = next(
            (headers for headers in table_headers
             if all(header in line for header in headers)),
            None,
        )
        if table_format is not None:
            table_start = i + 1  # data starts on the line after the header
            break

    if table_start != -1:
        # The main table ends at the first summary line or at the fee table.
        table_end = len(lines)
        for i in range(table_start, len(lines)):
            if "Item(s) Total" in lines[i] or "Total Value" in lines[i] or "Sr.No Particulars" in lines[i]:
                table_end = i
                break

        simple_layout = table_format[0] == "Item Description"
        if simple_layout:
            # Four capture groups: description, quantity, unit price, total.
            table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
        else:
            # Five capture groups: description, gross, discount, net, total.
            table_row_pattern = r"(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*[0-9.%]+\s*\|?\s*[\d.]+){2}\s*(?:\|\s*)?([\d.]+)"

        for line in lines[table_start:table_end]:
            line = line.strip()
            if not line or "HSN Code" in line or "Total" in line:
                continue
            # Skip markdown alignment rows (e.g., "|---|---|").
            if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
                continue

            match = re.match(table_row_pattern, line)
            try:
                if match and simple_layout:
                    # Each layout has its own group mapping; layout 1 has no
                    # group 5, so reading it used to raise IndexError.
                    items.append({
                        "description": match.group(1).strip(),
                        "quantity": int(match.group(2)),
                        "unit_price": float(match.group(3)),
                        "total_price": float(match.group(4)),
                    })
                elif match:
                    description = match.group(1).strip()
                    quantity_prefix = re.match(r"(\d+)\s*x", description)
                    items.append({
                        "description": description,
                        "quantity": int(quantity_prefix.group(1)) if quantity_prefix else 1,
                        "unit_price": float(match.group(2)),  # Gross value
                        "total_price": float(match.group(5)),  # Total after taxes
                    })
                else:
                    # Fallback: split on "|" and validate the fields manually.
                    # Expect description, gross, discount, net, CGST %,
                    # CGST amount, SGST %, SGST amount, total.
                    fields = [f.strip() for f in line.split('|')]
                    if len(fields) < 9:
                        continue
                    # Accept any "<qty> x " prefix, not just the literal "1 x".
                    quantity_prefix = re.match(r"(\d+)\s+x\s", fields[0])
                    if quantity_prefix is None:
                        continue  # not an item row
                    items.append({
                        "description": fields[0],
                        "quantity": int(quantity_prefix.group(1)),
                        "unit_price": float(fields[1]),  # Gross value
                        "total_price": float(fields[-1]),  # Total after taxes
                    })
            except ValueError as e:
                # A malformed number (e.g. "1.2.3") skips the row, not the invoice.
                print(f"Failed parsing for line '{line}': {str(e)}")
                continue

    # Platform fee table (header contains "Sr.No Particulars").
    platform_fee_start = -1
    for i, line in enumerate(lines):
        if "Sr.No Particulars" in line:
            platform_fee_start = i + 1
            break

    if platform_fee_start != -1:
        platform_fee_end = len(lines)
        for i in range(platform_fee_start, len(lines)):
            if "Total" in lines[i] and "Sr.No" not in lines[i]:
                platform_fee_end = i + 1
                break
        platform_fee_pattern = r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
        for line in lines[platform_fee_start:platform_fee_end]:
            line = line.strip()
            if not line or "Total" in line:
                continue
            match = re.match(platform_fee_pattern, line)
            if not match:
                print(f"Failed to match platform fee row: {line}")
                continue
            try:
                items.append({
                    "description": match.group(1).strip(),
                    "quantity": 1,  # Platform fee is a single item
                    "unit_price": float(match.group(2)),  # Taxable amount
                    "total_price": float(match.group(5)),
                })
            except ValueError as e:
                print(f"Failed parsing platform fee row '{line}': {str(e)}")

    return items
190
 
191
- def extract_entities(text):
192
  """Extract structured invoice details using flexible regex patterns."""
193
  invoice_numbers = []
194
  primary_invoice_number = "Unknown"
@@ -197,7 +240,7 @@ def extract_entities(text):
197
  total_amount = 0.0
198
 
199
  # Extract items first to use as a filter for NER
200
- items = extract_items(text)
201
  item_descriptions = [item["description"].lower() for item in items]
202
 
203
  # Flexible regex patterns to handle various invoice formats
@@ -426,8 +469,8 @@ def process_invoice(pdf_file):
426
  if "Error" in text:
427
  return f"**Error**: {text}"
428
 
429
- invoice_number, vendor_name, invoice_date, total_amount = extract_entities(text)
430
- items = extract_items(text)
431
  text_length = len(text)
432
 
433
  history_df = fetch_vendor_history(vendor_name, invoice_number)
 
58
  except Exception as e:
59
  return f"Error extracting text: {str(e)}"
60
 
61
# Leading "<qty> x " marker that identifies an item row, e.g. "2 x Veg Roll".
_ITEM_QTY_PREFIX = re.compile(r"(\d+)\s+x\s")

# Text-layout row: description, gross, discount, net, two tax pairs, total.
_MAIN_ROW_PATTERN = re.compile(
    r"(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*[0-9.%]+\s*\|?\s*[\d.]+){2}\s*(?:\|\s*)?([\d.]+)"
)

# Text-layout platform fee row: serial no, description, four numeric columns.
_PLATFORM_FEE_PATTERN = re.compile(
    r"\|?\s*\d+\s*\|?\s*([A-Za-z\s]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
)

# Markdown alignment row such as "|---|---|".
_ALIGNMENT_ROW_PATTERN = re.compile(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?")

# Below this many table-derived items the text parser is consulted as well
# (the sample invoices carry two main items plus a platform fee).
_MIN_ITEMS_BEFORE_TEXT_FALLBACK = 3


def _cell_text(cell):
    """Return a table cell as stripped text; empty cells (None) become ''."""
    return "" if cell is None else str(cell).strip()


def _parse_amount(cell):
    """Convert a table cell to float, ignoring currency symbols and commas."""
    cleaned = _cell_text(cell).replace(",", "").replace("₹", "").replace("$", "")
    return float(cleaned.strip())


def _item_key(item):
    """Key used to recognise the same item coming from tables and from text."""
    return (" ".join(item["description"].split()).casefold(), item["total_price"])


def _items_from_table(table):
    """Parse one extracted table (a list of rows of cells) into item dicts."""
    if not table or not table[0]:
        return []
    header_cells = [_cell_text(cell) for cell in table[0]]
    found = []

    # Main table (Particulars | Gross value | Discount | Net value | ... | Total).
    if any("Particulars" in cell for cell in header_cells):
        for row in table[1:]:
            if not row or len(row) < 9:  # expecting at least 9 columns
                continue
            description = _cell_text(row[0])
            if not description or "Total" in description or "HSN Code" in description:
                continue
            quantity_prefix = _ITEM_QTY_PREFIX.match(description)
            if quantity_prefix is None:
                continue  # not an item row
            try:
                found.append({
                    "description": description,
                    "quantity": int(quantity_prefix.group(1)),
                    "unit_price": _parse_amount(row[1]),  # Gross value
                    "total_price": _parse_amount(row[-1]),  # Total after taxes
                })
            except ValueError as e:
                print(f"Failed to parse table row {row}: {str(e)}")

    # Platform fee table. "Sr.No" and "Particulars" may be one merged header
    # cell or two adjacent cells, so test the joined header text.
    header_text = " ".join(header_cells)
    if "Sr.No" in header_text and "Particulars" in header_text:
        for row in table[1:]:
            if not row or len(row) < 5:
                continue
            description = _cell_text(row[1])
            if not description or "Total" in description:
                continue
            try:
                found.append({
                    "description": description,
                    "quantity": 1,
                    "unit_price": _parse_amount(row[2]),  # Taxable amount
                    "total_price": _parse_amount(row[-1]),
                })
            except ValueError as e:
                print(f"Failed to parse platform fee row {row}: {str(e)}")
    return found


def _items_from_text(text):
    """Parse item rows and platform fee rows out of plain invoice text."""
    found = []
    # Upstream text may contain markdown-escaped dollar signs.
    lines = (text or "").replace(r'\$', '$').split('\n')
    main_headers = ("Particulars", "Gross value", "Discount", "Net value", "Total")

    table_start = next(
        (i + 1 for i, line in enumerate(lines)
         if all(header in line for header in main_headers)),
        -1,
    )
    if table_start != -1:
        # The main table ends at the first summary line or at the fee table.
        table_end = len(lines)
        for i in range(table_start, len(lines)):
            if "Item(s) Total" in lines[i] or "Total Value" in lines[i] or "Sr.No Particulars" in lines[i]:
                table_end = i
                break

        for line in lines[table_start:table_end]:
            line = line.strip()
            if not line or "HSN Code" in line or "Total" in line:
                continue
            if _ALIGNMENT_ROW_PATTERN.match(line):
                continue
            try:
                match = _MAIN_ROW_PATTERN.match(line)
                if match:
                    description = match.group(1).strip()
                    quantity_prefix = re.match(r"(\d+)\s*x", description)
                    found.append({
                        "description": description,
                        "quantity": int(quantity_prefix.group(1)) if quantity_prefix else 1,
                        "unit_price": float(match.group(2)),
                        "total_price": float(match.group(5)),
                    })
                    continue
                # The strict pattern missed; split on "|" and validate fields.
                fields = [f.strip() for f in line.split('|')]
                if len(fields) < 9:
                    continue
                quantity_prefix = _ITEM_QTY_PREFIX.match(fields[0])
                if quantity_prefix is None:
                    continue
                found.append({
                    "description": fields[0],
                    "quantity": int(quantity_prefix.group(1)),
                    "unit_price": float(fields[1]),
                    "total_price": float(fields[-1]),
                })
            except ValueError as e:
                print(f"Failed fallback parsing for line '{line}': {str(e)}")

    platform_fee_start = next(
        (i + 1 for i, line in enumerate(lines) if "Sr.No Particulars" in line),
        -1,
    )
    if platform_fee_start != -1:
        platform_fee_end = len(lines)
        for i in range(platform_fee_start, len(lines)):
            if "Total" in lines[i] and "Sr.No" not in lines[i]:
                platform_fee_end = i + 1
                break
        for line in lines[platform_fee_start:platform_fee_end]:
            line = line.strip()
            if not line or "Total" in line:
                continue
            match = _PLATFORM_FEE_PATTERN.match(line)
            if not match:
                print(f"Failed to match platform fee row: {line}")
                continue
            try:
                found.append({
                    "description": match.group(1).strip(),
                    "quantity": 1,
                    "unit_price": float(match.group(2)),
                    "total_price": float(match.group(5)),
                })
            except ValueError as e:
                print(f"Failed fallback parsing for line '{line}': {str(e)}")
    return found


def extract_items(pdf_file, text):
    """Extract invoice line items from PDF tables, with a text fallback.

    Tables are read with pdfplumber first. If that fails, or yields fewer
    than three items, the plain text is parsed too and only items not already
    found in the tables are added, so no item is reported twice.

    Args:
        pdf_file: Path or file object accepted by ``pdfplumber.open``.
        text: Text already extracted from the same PDF.

    Returns:
        A list of dicts with the keys ``description``, ``quantity``,
        ``unit_price`` (the row's gross/taxable amount) and ``total_price``
        (the row's last column).
    """
    items = []

    try:
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                for table in page.extract_tables():
                    items.extend(_items_from_table(table))
    except Exception as e:
        # Deliberately broad: table extraction is best-effort and any failure
        # must fall through to the text parser instead of aborting.
        print(f"Table extraction failed: {str(e)}. Falling back to text-based extraction.")

    if len(items) < _MIN_ITEMS_BEFORE_TEXT_FALLBACK:
        print("Falling back to text-based item extraction.")
        # Keys are frozen before merging so that genuine repeated rows inside
        # the text itself are kept.
        already_found = {_item_key(item) for item in items}
        items.extend(
            item for item in _items_from_text(text)
            if _item_key(item) not in already_found
        )

    return items
233
 
234
+ def extract_entities(pdf_file, text):
235
  """Extract structured invoice details using flexible regex patterns."""
236
  invoice_numbers = []
237
  primary_invoice_number = "Unknown"
 
240
  total_amount = 0.0
241
 
242
  # Extract items first to use as a filter for NER
243
+ items = extract_items(pdf_file, text)
244
  item_descriptions = [item["description"].lower() for item in items]
245
 
246
  # Flexible regex patterns to handle various invoice formats
 
469
  if "Error" in text:
470
  return f"**Error**: {text}"
471
 
472
+ invoice_number, vendor_name, invoice_date, total_amount = extract_entities(pdf_file, text)
473
+ items = extract_items(pdf_file, text)
474
  text_length = len(text)
475
 
476
  history_df = fetch_vendor_history(vendor_name, invoice_number)