Abhisesh7 committed on
Commit
855d3bc
·
verified ·
1 Parent(s): f6c5876

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -14
app.py CHANGED
@@ -103,13 +103,13 @@ def extract_items(text):
103
  # Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
104
  table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
105
  else:
106
- # Pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
107
- # Adjusted to be more flexible for potential spacing or formatting issues
108
- table_row_pattern = r"\|?\s*(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*(?:\|?\s*[\d.%]+\s*\|?\s*[\d.]+){2}\s*\|?\s*([\d.]+)\s*\|?"
109
 
110
  for line in table_lines:
111
  line = line.strip()
112
  if not line or "HSN Code" in line or "Total" in line:
 
113
  continue
114
  # Skip alignment rows (e.g., "|---|---|")
115
  if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
@@ -118,16 +118,10 @@ def extract_items(text):
118
  print(f"Processing main table row: {line}") # Debug
119
  match = re.match(table_row_pattern, line)
120
  if match:
121
- if table_format[0] == "Item Description":
122
- description = match.group(1).strip()
123
- quantity = int(match.group(2))
124
- unit_price = float(match.group(3))
125
- total_price = float(match.group(4))
126
- else:
127
- description = match.group(1).strip()
128
- quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
129
- unit_price = float(match.group(2)) # Gross value
130
- total_price = float(match.group(5)) # Total after taxes
131
  items.append({
132
  "description": description,
133
  "quantity": quantity,
@@ -136,7 +130,27 @@ def extract_items(text):
136
  })
137
  print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
138
  else:
139
- print(f"Failed to match main table row: {line}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  # Extract platform fee table (e.g., Sr.No Particulars)
142
  platform_fee_start = -1
 
103
  # Pattern for invoice_4.pdf: "Monitor 24 inch | 7 | 150.00 | 1050.00"
104
  table_row_pattern = r"\|?\s*([A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*\|?\s*(\d+)\s*\|?\s*([\d.]+)\s*\|?\s*([\d.]+)\s*\|?"
105
  else:
106
+ # Simplified pattern for Invoice_6164752968.pdf: "1 x Chicken Frankie | 60 | 6 | 54 | 2.5% | 1.35 | 2.5% | 1.35 | 56.7"
107
+ table_row_pattern = r"(\d+\s*x\s*[A-Za-z\s\d-]+(?:\s[A-Za-z\s\d-]+)*?)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*)?([\d.]+)\s*(?:\|\s*[0-9.%]+\s*\|?\s*[\d.]+){2}\s*(?:\|\s*)?([\d.]+)"
 
108
 
109
  for line in table_lines:
110
  line = line.strip()
111
  if not line or "HSN Code" in line or "Total" in line:
112
+ print(f"Skipping irrelevant line: {line}")
113
  continue
114
  # Skip alignment rows (e.g., "|---|---|")
115
  if re.match(r"\|?\s*[-:]+(\s*\|\s*[-:]+)*\s*\|?", line):
 
118
  print(f"Processing main table row: {line}") # Debug
119
  match = re.match(table_row_pattern, line)
120
  if match:
121
+ description = match.group(1).strip()
122
+ quantity = int(description.split(' x ')[0].strip()) if ' x ' in description else 1
123
+ unit_price = float(match.group(2)) # Gross value
124
+ total_price = float(match.group(5)) # Total after taxes
 
 
 
 
 
 
125
  items.append({
126
  "description": description,
127
  "quantity": quantity,
 
130
  })
131
  print(f"Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
132
  else:
133
+ # Fallback: Split by | and validate fields manually
134
+ fields = [f.strip() for f in line.split('|')]
135
+ print(f"Fallback processing: {fields}") # Debug
136
+ if len(fields) >= 9: # Expecting at least 9 fields (description, gross value, discount, net value, CGST %, CGST amount, SGST %, SGST amount, total)
137
+ try:
138
+ description = fields[0].strip()
139
+ if not description.startswith('1 x'):
140
+ continue # Skip if not an item row
141
+ quantity = int(description.split(' x ')[0].strip())
142
+ unit_price = float(fields[1].strip()) # Gross value
143
+ total_price = float(fields[-1].strip()) # Total after taxes
144
+ items.append({
145
+ "description": description,
146
+ "quantity": quantity,
147
+ "unit_price": unit_price,
148
+ "total_price": total_price
149
+ })
150
+ print(f"Fallback Extracted Item: {description}, Qty: {quantity}, Unit Price: {unit_price}, Total Price: {total_price}") # Debug
151
+ except (ValueError, IndexError) as e:
152
+ print(f"Failed fallback parsing for line '{line}': {str(e)}")
153
+ continue
154
 
155
  # Extract platform fee table (e.g., Sr.No Particulars)
156
  platform_fee_start = -1