valentynliubchenko committed on
Commit
75a9c52
·
1 Parent(s): 4a10a29

merging new version

Browse files
algorithm/receipt_calculation.py CHANGED
@@ -87,6 +87,11 @@ def second_algorithm(_products_total, receipt_total):
87
  return _products_total_rounded, _final_total
88
 
89
 
 
 
 
 
 
90
  def calculate_tips_and_taxes(items_table, total_amount, tax, tips):
91
  products = []
92
 
@@ -106,7 +111,7 @@ def calculate_tips_and_taxes(items_table, total_amount, tax, tips):
106
  price = item[5]
107
  if price == "Not specified" or price == "unknown":
108
  price = "0.0"
109
- item_value = float(str(price).replace(",", ".")) if item[5] is not None else 0.0
110
  products.append(Product(item[0], item_value))
111
 
112
  sum_of_product_prices = 0
 
87
  return _products_total_rounded, _final_total
88
 
89
 
90
def clean_and_convert_to_float(price):
    """Convert a loosely formatted price into a float.

    Every character other than a decimal digit, ``,`` or ``.`` is discarded
    (currency symbols, spaces, letters and also a minus sign), and ``,`` is
    treated the same as ``.``. When several separators remain, only the last
    one is kept as the decimal point and the earlier ones are dropped as
    grouping separators, so ``"1,234.56"`` parses to ``1234.56`` instead of
    raising ``ValueError``.

    Args:
        price: Any value; it is passed through ``str()`` before cleaning.

    Returns:
        The parsed number, or ``0.0`` when no digit is left after cleaning
        (for example ``""`` or ``"not available"``).
    """
    # isdecimal() keeps exactly the digit characters float() can parse.
    kept = "".join(ch for ch in str(price) if ch.isdecimal() or ch in ",.")
    normalized = kept.replace(",", ".")
    if not any(ch.isdecimal() for ch in normalized):
        # float("") / float(".") would raise; report "no price" as zero.
        return 0.0
    # Only the right-most separator can be the decimal point.
    whole, point, fraction = normalized.rpartition(".")
    return float(whole.replace(".", "") + point + fraction)
93
+
94
+
95
  def calculate_tips_and_taxes(items_table, total_amount, tax, tips):
96
  products = []
97
 
 
111
  price = item[5]
112
  if price == "Not specified" or price == "unknown":
113
  price = "0.0"
114
+ item_value = clean_and_convert_to_float(price) if item[5] is not None else 0.0
115
  products.append(Product(item[0], item_value))
116
 
117
  sum_of_product_prices = 0
examples_canada/photo_2024-10-10_15-23-22.webp ADDED
prompt_v3.txt CHANGED
@@ -1,116 +1,64 @@
1
- #Your Task: Receipt Recognition and Data Extraction
2
-
3
- You are tasked with extracting structured information from receipts. Receipts will come from various countries, in different languages, and can have different layouts and formats. Your goal is to parse the receipt text, identify the receipt type (store, cafe/restaurant, or payment for services), and return the data in JSON format with the required fields. Follow the specific instructions below to ensure accurate extraction.
4
-
5
- #Required Fields:
6
-
7
- 1. Receipt Type: Identify the type of receipt. It could be from:
8
- * Store: Typically involves grocery or retail items.
9
- * Cafe/Restaurant: Typically involves food and beverage items, table numbers, or tipping sections.
10
- * Payment for Services: This type of receipt may involve service fees or professional services.
11
- 2. Receipt Number: Extract the unique receipt number, typically found at the top of the receipt.
12
- 3. Store/Business Name: Extract the name of the store, cafe, restaurant, or service provider.
13
- 4. Store Address: Extract the address of the store, including city and country if available.
14
- 5. Date: Extract the date of the transaction and format it as YYYY-MM-DD HH:MM.
15
- 6. Currency: Extract the currency if explicitly mentioned (e.g., EUR, USD). If the currency is not specified, detect the language of the receipt and infer the currency based on the country where the language is predominantly used. For example, if the receipt is in Ukrainian, set the currency to UAH (Ukrainian Hryvnia).
16
- 7. Payment Method: Identify whether the payment was made by "card" or "cash."
17
- 8. Total Amount: Extract the total amount of the transaction. This is typically located at the end of the receipt, often highlighted in bold or a larger font.
18
- 9. Total Discount: Extract the total discount if explicitly mentioned. If not, calculate the total discount by summing up the discounts for individual items.
19
- 10. Tax: Extract the total tax amount if it is listed on the receipt.
20
-
21
- #Item-Level Details:
22
-
23
- For each item on the receipt, extract the following details:
24
-
25
- 1. Item Name: Extract the full name of each item. Some items may have names split across multiple lines; in this case, concatenate the lines until you encounter a quantity or unit of measurement (e.g., "2ks"), which marks the end of the item name or on the next line. You should extract full name till statements as, for example, "1 ks" or "1 ks * 2"
26
- 2. Unit Price: Extract the price per unit for each item.
27
- 3. Quantity: Extract the quantity of each item, including the unit of measurement (e.g., "ks" for pieces, "kg" for kilograms).
28
- 4. Price: Extract the final price for each item.
29
- 5. Discount: Extract any discount applied to the item. If no discount is provided, set it to 0.
30
- 6. Category: Automatically assign a category based on the item name. For groceries, assign relevant subcategories such as Dairy, Bakery, Fruits, etc. If this receipt was from restaurant - you should put category only from this list: Food, Drinks.
31
-
32
- #Special Cases:
33
-
34
- 1. Cafe/Restaurant Receipts: If the receipt is from a cafe or restaurant, handle additional fields like:
35
- * Table Number: Extract the table number if available, often printed near the top of the receipt.
36
- * Tips: Extract any tip amounts explicitly listed or infer from the total paid amount minus the original bill amount.
37
- * Service Charges: Some restaurants may include an automatic service charge, which should be listed separately from the tax or tips.
38
- * Order Type: Identify whether the order was "dine-in" or "takeaway."
39
- 2. Missing Currency: If no currency is mentioned on the receipt, infer the local currency by detecting the language and country of origin. For example, a receipt in French would use EUR, while one in Ukrainian would use UAH.
40
- 3. Multi-line Item Names: If an item name spans multiple lines, merge the lines to form the complete name. Stop merging when a quantity or unit of measurement is encountered.
41
- 4. Total Amount: The total amount is often larger than other numbers or displayed in bold at the bottom of the receipt. Make sure to capture this accurately.
42
- 5. Total Discount: If no total discount is listed, sum the discounts for each individual item.
43
- 6. Rounding Adjustments: Some receipts may include a "rounding" line item, where the total amount is adjusted (typically for cash payments) to avoid dealing with fractions of currency (e.g., rounding to the nearest 0.05 in some countries). If a rounding adjustment is present, extract the value of the rounding adjustment and reflect it in the total amount. For example:
44
- * Total Before Rounding: 19.97
45
- * Rounding: -0.02
46
- * Final Total: 19.95 If the rounding adjustment is found, include it as a separate field in the JSON output under "rounding_adjustment", and ensure that the "total_amount" reflects the final adjusted total.
47
- 7. Taxes: Receipts can handle taxes in various ways, and the system should be prepared to capture these scenarios:
48
- * Tax-Inclusive Pricing: In some countries or for certain receipts, taxes are already included in the item price and not listed separately. If the receipt mentions that taxes are included in prices, record the "tax" field as 0 and note that taxes are included in the item prices.
49
- * Multiple Tax Rates: Some receipts may include multiple tax rates (e.g., different VAT rates for different items). In this case, extract each tax rate and the corresponding tax amounts, and store them in a separate list of tax breakdowns. For example, the receipt might show "5% VAT" and "15% VAT" for different categories of goods:
50
- ** "taxes": [{"rate": "5%", "amount": 1.00}, {"rate": "15%", "amount": 3.50}]
51
- * Missing Tax Information: In some cases, the receipt might not clearly mention taxes, but you may infer them based on standard rates in the country of origin. If no explicit tax amount is listed and you are unable to infer it, set the tax to "unknown" or null in the JSON output.
52
- * Tax-Exempt Items: Some items on the receipt may be tax-exempt. If this is indicated, ensure that these items are excluded from any tax calculations. Note these in the item-level details with "tax_exempt": true and make sure the "tax" field reflects the correct amount for taxable items only.
53
- * Service Charges vs. Taxes: Sometimes service charges may be listed separately from taxes (common in restaurants). Ensure that service charges are not included in the tax amount, and store them under the "service_charge" field.
54
- * Tax Breakdown and Total: If both individual item taxes and a total tax amount are listed, the system should ensure consistency between the sum of item-level taxes and the total tax listed at the bottom of the receipt.
55
- 8. In certain receipt formats, the quantity and unit price may appear before the item name. When processing such receipts, the goal is to correctly extract the quantity, unit price, and item name in their proper order. For example, if one line of the receipt shows "5 * 23.00 = 115.0" and the next line displays "Milk," the system should interpret this as:
56
- * Quantity: 5 units
57
- * Unit Price: 23.00
58
- * Item Name: Milk
59
- * Total Price: 115.0 This approach should be applied consistently throughout the entire receipt to extract data accurately.
60
-
61
- #JSON Output Format:
62
-
63
  {
64
- "receipt_type": "string",
65
- "receipt_number": "string",
66
- "store_name": "string",
67
- "store_address": "string",
68
- "date_time": "string",
69
- "currency": "string",
70
- "payment_method": "string",
71
- "total_amount": "number",
72
- "total_discount": "number",
73
- "tax": "number",
74
- "taxes": [
75
- {
76
- "rate": "string",
77
- "amount": "number"
78
- }
79
- ],
80
- "rounding_adjustment": "number",
81
- "rounded_total_aount": "number",
82
  "items": [
83
  {
84
- "name": "string",
85
- "unit_price": "number",
86
- "quantity": {
87
- "amount": "number",
88
- "unit_of_measurement": "string"
89
- },
90
- "price": "number",
91
- "discount": "number",
92
- "category": "string",
93
- "tax_exempt": "boolean"
94
  }
95
  ],
96
- "cafe_additional_info": {
97
- "table_number": "string",
98
- "tips": "number",
99
- "service_charge": "number",
100
- "order_type": "string"
101
- }
102
  }
103
-
104
- #Additional Notes:
105
-
106
- 1. You should handle receipts in various languages and from different countries.
107
- 2. Pay special attention to formatting differences and edge cases, such as multi-line item names, missing currency symbols, or cafe/restaurant-specific information.
108
- 3. Always ensure the output is well-structured and follows the JSON format provided.
109
- 4. The "rounding_adjustment" field should reflect the value by which the total was adjusted due to rounding. If no rounding adjustment is present, it can be set to 0 or omitted from the output.
110
- 5. Ensure that the final "total_amount" field reflects the total after any rounding adjustment has been applied.
111
- 6. Inclusive Taxes: If taxes are included in the item prices, set the "tax" field to 0 and adjust the item prices accordingly.
112
- 7. Multiple Tax Rates: The "taxes" field provides a detailed breakdown for receipts with different tax rates. This field is optional and can be excluded if only a single tax amount is listed.
113
- 8. Tax-Exempt Items: Mark tax-exempt items with the "tax_exempt": true field.
114
- 9. Service Charges vs. Taxes: Ensure that service charges are captured separately from taxes in the "service_charge" field.
115
- 10. Return the full JSON object with all available information. If any information is unclear or missing, include it as "unknown" or "not available" in the output.
116
- 11. Your final response should be in valid JSON format with no additional text.
 
1
+ Task: Extract structured information from receipt images.
2
+ Receipts: May be in various languages, including non-Latin scripts, and have diverse formats.
3
+ Information to Extract:
4
+ - Store name
5
+ - Store address
6
+ - Currency (e.g., USD, EUR)
7
+ - date (in format "YYYY.MM.DD HH:MM:SS")
8
+ - tax
9
+ - tips (if tips are included in the Total, write the tips amount; if tips are not included in the Total, write 0.00)
10
+ - Purchased items:
11
+ - Name
12
+ - Price per unit (format: 0.00)
13
+ - Quantity or weight
14
+ - unit of measurement (string)
15
+ - Total price (format: 0.00)
16
+ - Discount applied (if any, format: 0.00)
17
+ - Category (from the following list):
18
+ * Groceries
19
+ * Produce
20
+ * Meat
21
+ * Seafood
22
+ * Dairy
23
+ * Bakery
24
+ * Canned goods
25
+ * Frozen foods
26
+ * Beverages
27
+ * Snacks
28
+ * Cleaning supplies
29
+ * Personal care products
30
+ * Electronics
31
+ * Clothing
32
+ * Dining
33
+ * Home goods
34
+ * Other (specify if not in the list)
35
+ - Total amount (format: 0.00)
36
+ - Total discount (format: 0.00)
37
+ Output Format: JSON object with the following structure:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  {
39
+ "store_name": "Store Name",
40
+ "store_address": "Store Address",
41
+ "currency": "Currency",
42
+ "date_time": "YYYY.MM.DD HH:MM:SS",
43
+ "payment_method": "card" or "cash",
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  "items": [
45
  {
46
+ "name": "Item Name",
47
+ "unit_price": 0.00,
48
+ "quantity": 0,
49
+ "unit_of_measurement": "Unit of Measurement",
50
+ "total_price": 0.00,
51
+ "discount": 0.00,
52
+ "category": "Category Name"
 
 
 
53
  }
54
  ],
55
+ "total_amount": 0.00,
56
+ "total_discount": 0.00,
57
+ "tax": 0.00,
58
+ "tips": 0.00
 
 
59
  }
60
+ If no receipt is detected: Return "Receipt not found."
61
+ Additional Notes:
62
+ 1. If the receipt is in a non-Latin script, extract the information in its original form unless translation is required.
63
+ 2. If any information is unclear or missing, include it as "unknown" or "not available" in the output.
64
+ Write the whole JSON with information about all products.
 
 
 
 
 
 
 
 
 
utils.py CHANGED
@@ -1,10 +1,11 @@
1
- import numpy as np
2
- import json
3
- import pandas as pd
4
-
5
  import base64
 
6
  from io import BytesIO
 
 
 
7
  from PIL import Image
 
8
  from algorithm import receipt_calculation
9
 
10
 
@@ -130,21 +131,67 @@ def process_receipt_json(json_input):
130
  item.get("unit_of_measurement", "Not specified"),
131
  item.get("total_price", "Not specified"),
132
  item.get("discount", "Not specified")]
133
- for item in items]
134
  else:
135
  items_table = [["No items"]]
136
 
137
  total_product_prices, total_sum = receipt_calculation.calculate_tips_and_taxes(items_table, total_amount, tax, tips)
138
- message = "Everything is okay!"
139
  if items_table[0][0] != "No items":
140
  for i in range(len(items_table)):
141
  items_table[i].append(total_product_prices[i].price)
142
 
143
- if total_sum == "Not specified" or total_sum is None or total_sum != round(float(str(total_amount).replace(",", ".")), 2):
144
- message = "Recognized total sum and products total sum is not equal. Check if AI model correctly created a JSON"
145
  return store_info, items_table, message
146
 
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  def save_to_excel(json_output, excel_file_path):
149
  store_info, items_table, _ = process_receipt_json(json_output)
150
  if isinstance(store_info, str) and store_info.startswith("Error:"):
 
 
 
 
 
1
  import base64
2
+ import json
3
  from io import BytesIO
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
  from PIL import Image
8
+
9
  from algorithm import receipt_calculation
10
 
11
 
 
131
  item.get("unit_of_measurement", "Not specified"),
132
  item.get("total_price", "Not specified"),
133
  item.get("discount", "Not specified")]
134
+ for item in items]
135
  else:
136
  items_table = [["No items"]]
137
 
138
  total_product_prices, total_sum = receipt_calculation.calculate_tips_and_taxes(items_table, total_amount, tax, tips)
 
139
  if items_table[0][0] != "No items":
140
  for i in range(len(items_table)):
141
  items_table[i].append(total_product_prices[i].price)
142
 
143
+ message = create_message(total_sum, total_amount, items_table)
144
+
145
  return store_info, items_table, message
146
 
147
 
148
def clean_and_convert_to_float(value, return_value):
    """Parse *value* as a number, or hand back *return_value* when it is missing.

    Args:
        value: Raw cell content; ``None`` and the placeholder strings
            recognised by ``validate_is_unknown`` count as missing.
        return_value: Fallback returned unchanged for a missing value.

    Returns:
        ``return_value`` for a missing value, otherwise ``clean_value(value)``.
    """
    if value is None or validate_is_unknown(value):
        return return_value
    return clean_value(value)
150
+
151
+
152
def validate_is_unknown(value):
    """Tell whether *value* is one of the placeholder strings for missing data.

    The match is exact and case-sensitive.
    """
    placeholders = ("unknown", "not available", "Not specified")
    return value in placeholders
154
+
155
+
156
def clean_value(value):
    """Convert a loosely formatted numeric value into a float.

    Every character other than a decimal digit, ``,`` or ``.`` is discarded
    (currency symbols, spaces, letters and also a minus sign), and ``,`` is
    treated the same as ``.``. When several separators remain, only the last
    one is kept as the decimal point and the earlier ones are dropped as
    grouping separators, so ``"1,234.56"`` parses to ``1234.56`` instead of
    raising ``ValueError``.

    Args:
        value: Any value; it is passed through ``str()`` before cleaning.

    Returns:
        The parsed number, or ``0.0`` when no digit is left after cleaning
        (for example ``""`` or ``"n/a"``).
    """
    # isdecimal() keeps exactly the digit characters float() can parse.
    kept = "".join(ch for ch in str(value) if ch.isdecimal() or ch in ",.")
    normalized = kept.replace(",", ".")
    if not any(ch.isdecimal() for ch in normalized):
        # float("") / float(".") would raise; report "no number" as zero.
        return 0.0
    # Only the right-most separator can be the decimal point.
    whole, point, fraction = normalized.rpartition(".")
    return float(whole.replace(".", "") + point + fraction)
159
+
160
+
161
def create_message(total_sum, total_amount, table):
    """Build a human-readable validation report for a recognised receipt.

    Two checks are performed:

    1. The calculated total (``total_sum``) must equal the recognised total
       (``total_amount``) after both are rounded to two decimals. A total
       that is missing or cannot be parsed counts as a mismatch.
    2. For every item row, ``unit price * quantity - discount`` must equal
       the recognised total price of that row.

    Args:
        total_sum: Total calculated from the items (number or string).
        total_amount: Total recognised on the receipt (number, string,
            placeholder string or ``None``).
        table: List of item rows, or ``[["No items"]]``. Each row is read at
            index 0 (label used in messages), 2 (unit price), 3 (quantity),
            5 (total price) and 6 (discount).

    Returns:
        ``"Everything is okay!"`` when no problem was found, otherwise the
        concatenated problem descriptions (each ending with a newline).
    """
    problems = []

    calculated_total = _rounded_total_or_none(total_sum)
    recognized_total = _rounded_total_or_none(total_amount)
    if (
        calculated_total is None
        or recognized_total is None
        or calculated_total != recognized_total
    ):
        problems.append(
            "The recognized total sum and product total sum are not equal. "
            "Check if the AI model correctly created a JSON.\n"
            f"Recognized total sum: {total_amount},\n"
            f"Calculated total sum: {total_sum}.\n"
        )

    # An empty table is treated like the explicit "No items" marker instead
    # of failing on table[0].
    if not table or table[0][0] == "No items":
        problems.append("Receipt hasn't items!\n")
    else:
        for row in table:
            problem = _item_problem(row)
            if problem:
                problems.append(problem)

    return "".join(problems) if problems else "Everything is okay!"


def _rounded_total_or_none(value):
    """Return *value* as a float rounded to 2 decimals, or None if unparseable."""
    try:
        return round(float(str(value).replace(",", ".")), 2)
    except ValueError:
        # Covers None ("None"), placeholder strings and any other free text.
        return None


def _number_or(value, fallback):
    """Parse a table cell, returning *fallback* when it is missing or unparseable."""
    try:
        return clean_and_convert_to_float(value, fallback)
    except ValueError:
        return fallback


def _item_problem(row):
    """Return the problem text for one item row, or "" when the row is consistent."""
    name = row[0]
    raw_unit_price = row[2]
    raw_total_price = row[5]

    if validate_is_unknown(raw_unit_price):
        return f" {name} have {raw_unit_price} price! Please retry!\n"

    unit_price = _number_or(raw_unit_price, None)
    quantity = _number_or(row[3], 1)
    # The discount cell itself must be inspected; a missing discount means 0.
    discount = _number_or(row[6], 0)

    # A missing (None) unit price contributes 0 to the expected total.
    expected = round(0 if unit_price is None else unit_price * quantity, 2)
    expected = round(expected - discount, 2)

    total_price = _number_or(raw_total_price, None)
    if total_price is None:
        return f"{name} has an unrecognized 'Total Price': {raw_total_price}. Please retry!\n"
    if total_price != expected:
        return (
            f"{name} has incorrect 'Total Price'. Recognized total: {total_price} "
            f"VS Calculated total: {expected}. \n"
        )
    return ""
193
+
194
+
195
  def save_to_excel(json_output, excel_file_path):
196
  store_info, items_table, _ = process_receipt_json(json_output)
197
  if isinstance(store_info, str) and store_info.startswith("Error:"):