Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -103,13 +103,13 @@ def extract_items(text):
|
|
| 103 |
# Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
|
| 104 |
table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
|
| 105 |
else:
|
| 106 |
-
#
|
| 107 |
-
|
| 108 |
-
table_row_pattern = r"\|?\s*(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*(?:\|?\s*[\d.%]+\s*\|?\s*[\d.]+){2}\s*\|?\s*([\d.]+)\s*\|?"
|
| 109 |
|
| 110 |
for line in table_lines:
|
| 111 |
line = line.strip()
|
| 112 |
if not line or "HSN Code" in line or "Total" in line:
|
|
|
|
| 113 |
continue
|
| 114 |
# Skip alignment rows (e.g., "|---|---|")
|
| 115 |
if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
|
|
@@ -118,16 +118,10 @@ def extract_items(text):
|
|
| 118 |
print(f"Processing main table row: {line}") # Debug
|
| 119 |
match = re.match(table_row_pattern, line)
|
| 120 |
if match:
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
total_price = float(match.group(4))
|
| 126 |
-
else:
|
| 127 |
-
description = match.group(1).strip()
|
| 128 |
-
quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
|
| 129 |
-
unit_price = float(match.group(2)) # Gross value
|
| 130 |
-
total_price = float(match.group(5)) # Total after taxes
|
| 131 |
items.append({
|
| 132 |
"description": description,
|
| 133 |
"quantity": quantity,
|
|
@@ -136,7 +130,27 @@ def extract_items(text):
|
|
| 136 |
})
|
| 137 |
print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
|
| 138 |
else:
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
# Extract platform fee table (e.g., Sr.No Particulars)
|
| 142 |
platform_fee_start = -1
|
|
|
|
| 103 |
# Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
|
| 104 |
table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
|
| 105 |
else:
|
| 106 |
+
# Simplified pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
|
| 107 |
+
table_row_pattern = r"(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*[0-9.%]+\s*\|?\s*[\d.]+){2}\s*(?:\|\s*)?([\d.]+)"
|
|
|
|
| 108 |
|
| 109 |
for line in table_lines:
|
| 110 |
line = line.strip()
|
| 111 |
if not line or "HSN Code" in line or "Total" in line:
|
| 112 |
+
print(f"Skipping irrelevant line: {line}")
|
| 113 |
continue
|
| 114 |
# Skip alignment rows (e.g., "|---|---|")
|
| 115 |
if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
|
|
|
|
| 118 |
print(f"Processing main table row: {line}") # Debug
|
| 119 |
match = re.match(table_row_pattern, line)
|
| 120 |
if match:
|
| 121 |
+
description = match.group(1).strip()
|
| 122 |
+
quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
|
| 123 |
+
unit_price = float(match.group(2)) # Gross value
|
| 124 |
+
total_price = float(match.group(5)) # Total after taxes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
items.append({
|
| 126 |
"description": description,
|
| 127 |
"quantity": quantity,
|
|
|
|
| 130 |
})
|
| 131 |
print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
|
| 132 |
else:
|
| 133 |
+
# Fallback: Split by | and validate fields manually
|
| 134 |
+
fields = [f.strip() for f in line.split('|')]
|
| 135 |
+
print(f"Fallback processing: {fields}") # Debug
|
| 136 |
+
if len(fields) >= 9: # Expecting at least 9 fields (description, gross value, discount, net value, CGST %, CGST amount, SGST %, SGST amount, total)
|
| 137 |
+
try:
|
| 138 |
+
description = fields[0].strip()
|
| 139 |
+
if not description.startswith('1 x'):
|
| 140 |
+
continue # Skip if not an item row
|
| 141 |
+
quantity = int(description.split(' x ')[0].strip())
|
| 142 |
+
unit_price = float(fields[1].strip()) # Gross value
|
| 143 |
+
total_price = float(fields[-1].strip()) # Total after taxes
|
| 144 |
+
items.append({
|
| 145 |
+
"description": description,
|
| 146 |
+
"quantity": quantity,
|
| 147 |
+
"unit_price": unit_price,
|
| 148 |
+
"total_price": total_price
|
| 149 |
+
})
|
| 150 |
+
print(f"Fallback Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
|
| 151 |
+
except (ValueError, IndexError) as e:
|
| 152 |
+
print(f"Failed fallback parsing for line '{line}': {str(e)}")
|
| 153 |
+
continue
|
| 154 |
|
| 155 |
# Extract platform fee table (e.g., Sr.No Particulars)
|
| 156 |
platform_fee_start = -1
|