Spaces:
Sleeping
Sleeping
valentynliubchenko
commited on
Commit
·
75a9c52
1
Parent(s):
4a10a29
merging new version
Browse files- algorithm/receipt_calculation.py +6 -1
- examples_canada/photo_2024-10-10_15-23-22.webp +0 -0
- prompt_v3.txt +58 -110
- utils.py +55 -8
algorithm/receipt_calculation.py
CHANGED
|
@@ -87,6 +87,11 @@ def second_algorithm(_products_total, receipt_total):
|
|
| 87 |
return _products_total_rounded, _final_total
|
| 88 |
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
def calculate_tips_and_taxes(items_table, total_amount, tax, tips):
|
| 91 |
products = []
|
| 92 |
|
|
@@ -106,7 +111,7 @@ def calculate_tips_and_taxes(items_table, total_amount, tax, tips):
|
|
| 106 |
price = item[5]
|
| 107 |
if price == "Not specified" or price == "unknown":
|
| 108 |
price = "0.0"
|
| 109 |
-
item_value =
|
| 110 |
products.append(Product(item[0], item_value))
|
| 111 |
|
| 112 |
sum_of_product_prices = 0
|
|
|
|
| 87 |
return _products_total_rounded, _final_total
|
| 88 |
|
| 89 |
|
| 90 |
+
def clean_and_convert_to_float(price):
    """Extract a float from a price that may carry currency symbols or text.

    Only digits, commas and dots are kept. A comma is treated as a decimal
    separator; when several separators remain, the last one is taken as the
    decimal point and the earlier ones as thousands separators.

    Args:
        price: Any value; it is converted with ``str`` before parsing.

    Returns:
        The parsed number, or 0.0 when the value contains no digits.
    """
    kept = "".join(ch for ch in str(price) if ch.isdigit() or ch in ",.")
    # A trailing separator usually comes from an abbreviation such as "грн."
    # and would otherwise make float() fail.
    kept = kept.rstrip(",.").replace(",", ".")
    if not any(ch.isdigit() for ch in kept):
        # Nothing numeric was found (e.g. "not available").
        return 0.0
    whole, dot, fraction = kept.rpartition(".")
    if dot:
        # "1.234.56" -> "1234.56": only the last separator is the decimal point.
        kept = whole.replace(".", "") + "." + fraction
    return float(kept)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
def calculate_tips_and_taxes(items_table, total_amount, tax, tips):
|
| 96 |
products = []
|
| 97 |
|
|
|
|
| 111 |
price = item[5]
|
| 112 |
if price == "Not specified" or price == "unknown":
|
| 113 |
price = "0.0"
|
| 114 |
+
item_value = clean_and_convert_to_float(price) if item[5] is not None else 0.0
|
| 115 |
products.append(Product(item[0], item_value))
|
| 116 |
|
| 117 |
sum_of_product_prices = 0
|
examples_canada/photo_2024-10-10_15-23-22.webp
ADDED
|
prompt_v3.txt
CHANGED
|
@@ -1,116 +1,64 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
* Order Type: Identify whether the order was "dine-in" or "takeaway."
|
| 39 |
-
2. Missing Currency: If no currency is mentioned on the receipt, infer the local currency by detecting the language and country of origin. For example, a receipt in French would use EUR, while one in Ukrainian would use UAH.
|
| 40 |
-
3. Multi-line Item Names: If an item name spans multiple lines, merge the lines to form the complete name. Stop merging when a quantity or unit of measurement is encountered.
|
| 41 |
-
4. Total Amount: The total amount is often larger than other numbers or displayed in bold at the bottom of the receipt. Make sure to capture this accurately.
|
| 42 |
-
5. Total Discount: If no total discount is listed, sum the discounts for each individual item.
|
| 43 |
-
6. Rounding Adjustments: Some receipts may include a "rounding" line item, where the total amount is adjusted (typically for cash payments) to avoid dealing with fractions of currency (e.g., rounding to the nearest 0.05 in some countries). If a rounding adjustment is present, extract the value of the rounding adjustment and reflect it in the total amount. For example:
|
| 44 |
-
* Total Before Rounding: 19.97
|
| 45 |
-
* Rounding: -0.02
|
| 46 |
-
* Final Total: 19.95 If the rounding adjustment is found, include it as a separate field in the JSON output under "rounding_adjustment", and ensure that the "total_amount" reflects the final adjusted total.
|
| 47 |
-
7. Taxes: Receipts can handle taxes in various ways, and the system should be prepared to capture these scenarios:
|
| 48 |
-
* Tax-Inclusive Pricing: In some countries or for certain receipts, taxes are already included in the item price and not listed separately. If the receipt mentions that taxes are included in prices, record the "tax" field as 0 and note that taxes are included in the item prices.
|
| 49 |
-
* Multiple Tax Rates: Some receipts may include multiple tax rates (e.g., different VAT rates for different items). In this case, extract each tax rate and the corresponding tax amounts, and store them in a separate list of tax breakdowns. For example, the receipt might show "5% VAT" and "15% VAT" for different categories of goods:
|
| 50 |
-
** "taxes": [{"rate": "5%", "amount": 1.00}, {"rate": "15%", "amount": 3.50}]
|
| 51 |
-
* Missing Tax Information: In some cases, the receipt might not clearly mention taxes, but you may infer them based on standard rates in the country of origin. If no explicit tax amount is listed and you are unable to infer it, set the tax to "unknown" or null in the JSON output.
|
| 52 |
-
* Tax-Exempt Items: Some items on the receipt may be tax-exempt. If this is indicated, ensure that these items are excluded from any tax calculations. Note these in the item-level details with "tax_exempt": true and make sure the "tax" field reflects the correct amount for taxable items only.
|
| 53 |
-
* Service Charges vs. Taxes: Sometimes service charges may be listed separately from taxes (common in restaurants). Ensure that service charges are not included in the tax amount, and store them under the "service_charge" field.
|
| 54 |
-
* Tax Breakdown and Total: If both individual item taxes and a total tax amount are listed, the system should ensure consistency between the sum of item-level taxes and the total tax listed at the bottom of the receipt.
|
| 55 |
-
8. In certain receipt formats, the quantity and unit price may appear before the item name. When processing such receipts, the goal is to correctly extract the quantity, unit price, and item name in their proper order. For example, if one line of the receipt shows "5 * 23.00 = 115.0" and the next line displays "Milk," the system should interpret this as:
|
| 56 |
-
* Quantity: 5 units
|
| 57 |
-
* Unit Price: 23.00
|
| 58 |
-
* Item Name: Milk
|
| 59 |
-
* Total Price: 115.0 This approach should be applied consistently throughout the entire receipt to extract data accurately.
|
| 60 |
-
|
| 61 |
-
#JSON Output Format:
|
| 62 |
-
|
| 63 |
{
|
| 64 |
-
"
|
| 65 |
-
"
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
-
"
|
| 69 |
-
"currency": "string",
|
| 70 |
-
"payment_method": "string",
|
| 71 |
-
"total_amount": "number",
|
| 72 |
-
"total_discount": "number",
|
| 73 |
-
"tax": "number",
|
| 74 |
-
"taxes": [
|
| 75 |
-
{
|
| 76 |
-
"rate": "string",
|
| 77 |
-
"amount": "number"
|
| 78 |
-
}
|
| 79 |
-
],
|
| 80 |
-
"rounding_adjustment": "number",
|
| 81 |
-
"rounded_total_aount": "number",
|
| 82 |
"items": [
|
| 83 |
{
|
| 84 |
-
"name": "
|
| 85 |
-
"unit_price":
|
| 86 |
-
"quantity":
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
"
|
| 91 |
-
"discount": "number",
|
| 92 |
-
"category": "string",
|
| 93 |
-
"tax_exempt": "boolean"
|
| 94 |
}
|
| 95 |
],
|
| 96 |
-
"
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
"order_type": "string"
|
| 101 |
-
}
|
| 102 |
}
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
3. Always ensure the output is well-structured and follows the JSON format provided.
|
| 109 |
-
4. The "rounding_adjustment" field should reflect the value by which the total was adjusted due to rounding. If no rounding adjustment is present, it can be set to 0 or omitted from the output.
|
| 110 |
-
5. Ensure that the final "total_amount" field reflects the total after any rounding adjustment has been applied.
|
| 111 |
-
6. Inclusive Taxes: If taxes are included in the item prices, set the "tax" field to 0 and adjust the item prices accordingly.
|
| 112 |
-
7. Multiple Tax Rates: The "taxes" field provides a detailed breakdown for receipts with different tax rates. This field is optional and can be excluded if only a single tax amount is listed.
|
| 113 |
-
8. Tax-Exempt Items: Mark tax-exempt items with the "tax_exempt": true field.
|
| 114 |
-
9. Service Charges vs. Taxes: Ensure that service charges are captured separately from taxes in the "service_charge" field.
|
| 115 |
-
10. Return the full JSON object with all available information. If any information is unclear or missing, include it as "unknown" or "not available" in the output.
|
| 116 |
-
11. Your final response should be in valid JSON format with no additional text.
|
|
|
|
| 1 |
+
Task: Extract structured information from receipt images.
|
| 2 |
+
Receipts: May be in various languages, including non-Latin scripts, and have diverse formats.
|
| 3 |
+
Information to Extract:
|
| 4 |
+
- Store name
|
| 5 |
+
- Store address
|
| 6 |
+
- Currency (e.g., USD, EUR)
|
| 7 |
+
- date (in format "YYYY.MM.DD HH:MM:SS")
|
| 8 |
+
- tax
|
| 9 |
+
- tips (if tips are included in the Total, write the tips amount; if tips are not included in the Total, write their amount as 0.00)
|
| 10 |
+
- Purchased items:
|
| 11 |
+
- Name
|
| 12 |
+
- Price per unit (format: 0.00)
|
| 13 |
+
- Quantity or weight
|
| 14 |
+
- unit of measurement (string)
|
| 15 |
+
- Total price (format: 0.00)
|
| 16 |
+
- Discount applied (if any, format: 0.00)
|
| 17 |
+
- Category (from the following list):
|
| 18 |
+
* Groceries
|
| 19 |
+
* Produce
|
| 20 |
+
* Meat
|
| 21 |
+
* Seafood
|
| 22 |
+
* Dairy
|
| 23 |
+
* Bakery
|
| 24 |
+
* Canned goods
|
| 25 |
+
* Frozen foods
|
| 26 |
+
* Beverages
|
| 27 |
+
* Snacks
|
| 28 |
+
* Cleaning supplies
|
| 29 |
+
* Personal care products
|
| 30 |
+
* Electronics
|
| 31 |
+
* Clothing
|
| 32 |
+
* Dining
|
| 33 |
+
* Home goods
|
| 34 |
+
* Other (specify if not in the list)
|
| 35 |
+
- Total amount (format: 0.00)
|
| 36 |
+
- Total discount (format: 0.00)
|
| 37 |
+
Output Format: JSON object with the following structure:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
{
|
| 39 |
+
"store_name": "Store Name",
|
| 40 |
+
"store_address": "Store Address",
|
| 41 |
+
"currency": "Currency",
|
| 42 |
+
"date_time": "YYYY.MM.DD HH:MM:SS",
|
| 43 |
+
"payment_method": "card" or "cash",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"items": [
|
| 45 |
{
|
| 46 |
+
"name": "Item Name",
|
| 47 |
+
"unit_price": 0.00,
|
| 48 |
+
"quantity": 0,
|
| 49 |
+
"unit_of_measurement":
|
| 50 |
+
"total_price": 0.00,
|
| 51 |
+
"discount": 0.00,
|
| 52 |
+
"category": "Category Name"
|
|
|
|
|
|
|
|
|
|
| 53 |
}
|
| 54 |
],
|
| 55 |
+
"total_amount": 0.00,
|
| 56 |
+
"total_discount": 0.00,
|
| 57 |
+
"tax": 0.00,
|
| 58 |
+
"tips": 0.00
|
|
|
|
|
|
|
| 59 |
}
|
| 60 |
+
If no receipt is detected: Return "Receipt not found."
|
| 61 |
+
Additional Notes:
|
| 62 |
+
1. If the receipt is in a non-Latin script, extract the information in its original form unless translation is required.
|
| 63 |
+
2. If any information is unclear or missing, include it as "unknown" or "not available" in the output.
|
| 64 |
+
Write the whole JSON with information about all products.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
import json
|
| 3 |
-
import pandas as pd
|
| 4 |
-
|
| 5 |
import base64
|
|
|
|
| 6 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
| 7 |
from PIL import Image
|
|
|
|
| 8 |
from algorithm import receipt_calculation
|
| 9 |
|
| 10 |
|
|
@@ -130,21 +131,67 @@ def process_receipt_json(json_input):
|
|
| 130 |
item.get("unit_of_measurement", "Not specified"),
|
| 131 |
item.get("total_price", "Not specified"),
|
| 132 |
item.get("discount", "Not specified")]
|
| 133 |
-
|
| 134 |
else:
|
| 135 |
items_table = [["No items"]]
|
| 136 |
|
| 137 |
total_product_prices, total_sum = receipt_calculation.calculate_tips_and_taxes(items_table, total_amount, tax, tips)
|
| 138 |
-
message = "Everything is okay!"
|
| 139 |
if items_table[0][0] != "No items":
|
| 140 |
for i in range(len(items_table)):
|
| 141 |
items_table[i].append(total_product_prices[i].price)
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
return store_info, items_table, message
|
| 146 |
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
def save_to_excel(json_output, excel_file_path):
|
| 149 |
store_info, items_table, _ = process_receipt_json(json_output)
|
| 150 |
if isinstance(store_info, str) and store_info.startswith("Error:"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import base64
|
| 2 |
+
import json
|
| 3 |
from io import BytesIO
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
from PIL import Image
|
| 8 |
+
|
| 9 |
from algorithm import receipt_calculation
|
| 10 |
|
| 11 |
|
|
|
|
| 131 |
item.get("unit_of_measurement", "Not specified"),
|
| 132 |
item.get("total_price", "Not specified"),
|
| 133 |
item.get("discount", "Not specified")]
|
| 134 |
+
for item in items]
|
| 135 |
else:
|
| 136 |
items_table = [["No items"]]
|
| 137 |
|
| 138 |
total_product_prices, total_sum = receipt_calculation.calculate_tips_and_taxes(items_table, total_amount, tax, tips)
|
|
|
|
| 139 |
if items_table[0][0] != "No items":
|
| 140 |
for i in range(len(items_table)):
|
| 141 |
items_table[i].append(total_product_prices[i].price)
|
| 142 |
|
| 143 |
+
message = create_message(total_sum, total_amount, items_table)
|
| 144 |
+
|
| 145 |
return store_info, items_table, message
|
| 146 |
|
| 147 |
|
| 148 |
+
def clean_and_convert_to_float(value, return_value):
    """Parse ``value`` as a float, or fall back to ``return_value``.

    The fallback is returned when ``value`` is ``None`` or is one of the
    placeholder strings recognised by ``validate_is_unknown``; every other
    value is handed to ``clean_value``.
    """
    if value is None or validate_is_unknown(value):
        return return_value
    return clean_value(value)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def validate_is_unknown(value):
    """Tell whether ``value`` is one of the placeholder strings for missing data.

    Only the exact, case-sensitive strings "unknown", "not available" and
    "Not specified" count as placeholders.
    """
    placeholders = ("unknown", "not available", "Not specified")
    return value in placeholders
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def clean_value(value):
    """Extract a float from a value that may carry currency symbols or text.

    Only digits, commas and dots are kept. A comma is treated as a decimal
    separator; when several separators remain, the last one is taken as the
    decimal point and the earlier ones as thousands separators.

    Args:
        value: Any value; it is converted with ``str`` before parsing.

    Returns:
        The parsed number, or 0.0 when the value contains no digits.
    """
    kept = "".join(ch for ch in str(value) if ch.isdigit() or ch in ",.")
    # A trailing separator usually comes from an abbreviation such as "грн."
    # and would otherwise make float() fail.
    kept = kept.rstrip(",.").replace(",", ".")
    if not any(ch.isdigit() for ch in kept):
        # Nothing numeric was found (e.g. "Not specified").
        return 0.0
    whole, dot, fraction = kept.rpartition(".")
    if dot:
        # "1.234.56" -> "1234.56": only the last separator is the decimal point.
        kept = whole.replace(".", "") + "." + fraction
    return float(kept)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def create_message(total_sum, total_amount, table):
    """Build a human-readable validation report for a parsed receipt.

    Args:
        total_sum: Total calculated from the items; a number or a numeric
            string (a decimal comma is accepted).
        total_amount: Total recognised on the receipt; may be ``None`` or one
            of the placeholder strings accepted by ``validate_is_unknown``.
        table: Item rows, or ``[["No items"]]``. Each row is read at index 0
            (shown as the item name), 2 (unit price), 3 (quantity), 5
            (recognised total price) and 6 (discount).

    Returns:
        "Everything is okay!" when no problem was found, otherwise the
        concatenated problem descriptions, one per line.
    """

    def as_money(number):
        # Totals may arrive as strings with a decimal comma.
        return round(float(str(number).replace(",", ".")), 2)

    problems = []

    # An unrecognised total can never match, so report it without parsing it.
    total_missing = total_amount is None or validate_is_unknown(total_amount)
    if total_missing or as_money(total_sum) != as_money(total_amount):
        problems.append(
            "The recognized total sum and product total sum are not equal. Check if the AI model correctly created a JSON.\n"
            f"Recognized total sum: {total_amount},\n"
            f"Calculated total sum: {total_sum}.\n"
        )

    if not table or table[0][0] == "No items":
        problems.append("Receipt hasn't items!\n")
        return "".join(problems)

    for row in table:
        name, raw_price, raw_total = row[0], row[2], row[5]

        # Without a unit price there is nothing to check the row against.
        if raw_price is None or validate_is_unknown(raw_price):
            problems.append(f" {name} have {raw_price} price! Please retry!\n")
            continue

        quantity = clean_and_convert_to_float(row[3], 1)
        # A missing or placeholder discount counts as no discount.
        discount = clean_and_convert_to_float(row[6], 0)
        expected = round(round(clean_value(raw_price) * quantity, 2) - discount, 2)

        # None marks a total price that was not recognised; it never matches.
        recognized = clean_and_convert_to_float(raw_total, None)
        if recognized != expected:
            shown = raw_total if recognized is None else recognized
            problems.append(
                f"{name} has incorrect 'Total Price'. Recognized total: {shown} VS Calculated total: {expected}. \n"
            )

    return "".join(problems) if problems else "Everything is okay!"
|
| 193 |
+
|
| 194 |
+
|
| 195 |
def save_to_excel(json_output, excel_file_path):
|
| 196 |
store_info, items_table, _ = process_receipt_json(json_output)
|
| 197 |
if isinstance(store_info, str) and store_info.startswith("Error:"):
|