TEST-FRANKO / receipt_processor /parsers /kaufland_parser.py
Kristijan Nincevic
Fixed spar and Kaufland parsers, raised rate limiter
1d22489
import re
from collections import deque
from datetime import datetime

from .base import BaseParser
class KauflandParser(BaseParser):
    """Extract store, date, address and line items from Kaufland receipt text.

    The text is processed line by line.  Product names and prices are
    expected on separate lines: every line that looks like a product name is
    queued, and every line that ends in a price consumes the oldest queued
    name (first in, first out).
    """

    # "DATUM" label followed by three two-digit groups separated by slashes.
    _DATE_RE = re.compile(r'DATUM[:]?\s*(\d{2})/(\d{2})/(\d{2})', re.IGNORECASE)
    # Receipt number format like 22043/4430/30.
    _RECEIPT_NO_RE = re.compile(r'^\d{3,6}/\d{3,6}/?\d*$')
    # Characters allowed in a product-name line (note: no comma, so a line
    # carrying a decimal price can never be taken for a name).
    _NAME_RE = re.compile(r'^[A-Za-zČĆŽŠĐčćžšđ\s\-\.\d/]+$')
    # At least one letter is required for a line to count as a product name.
    _LETTER_RE = re.compile(r'[A-Za-zČĆŽŠĐčćžšđ]')
    # Decimal-comma amount at the end of the line, optionally followed by a
    # currency/tax marker.
    _PRICE_RE = re.compile(r'(\d+,\d{2})\s*(?:E|C|EUR)?$', re.IGNORECASE)

    # Lower-case fragments that mark header, footer and payment lines.
    # NOTE: matching is by substring, so short entries such as "tc" or "pan"
    # also suppress product names that merely contain them (e.g. "Ketchup").
    _SKIP_KEYWORDS = (
        "račun", "cijena", "oib", "zagreb", "kartica", "odobreno", "vrijeme", "br.",
        "pdv", "kontakt", "iznos", "rrn", "aid", "mid", "pan", "za platiti", "za vratiti",
        "neto", "bruto", "prodaja", "terminal", "potvrde", "jir", "zkir", "tc", "datum",
        "kaufland", "www.", "središće", "knifera",
    )

    def parse(self, text: str) -> dict:
        """Parse receipt text into a receipt dictionary.

        Args:
            text: Full receipt text, one receipt line per text line.

        Returns:
            A dict with the keys ``store`` (always ``"Kaufland"``), ``date``
            (``"YYYY-MM-DD"`` string or ``None``), ``address`` (string or
            ``None``), ``items`` (list of dicts with ``name``, ``quantity``
            and ``price``) and ``parser_used``.
        """
        lines = text.splitlines()
        return {
            "store": "Kaufland",
            "date": self._find_date(text),
            "address": self._find_address(lines),
            "items": self._pair_items(lines),
            "parser_used": "KauflandParser",
        }

    def _find_address(self, lines):
        """Return the first line containing "Zagreb" and a comma, stripped, or None."""
        for line in lines:
            if "Zagreb" in line and "," in line:
                return line.strip()
        return None

    def _find_date(self, text):
        """Return the first ``DATUM`` date as ``YYYY-MM-DD``, or None.

        The three two-digit groups are read as year/month/day and the year is
        placed in the 2000s.  A combination that is not a real calendar date
        yields None instead of a malformed date string.
        """
        match = self._DATE_RE.search(text)
        if match is None:
            return None
        year, month, day = (int(group) for group in match.groups())
        try:
            return datetime(2000 + year, month, day).date().isoformat()
        except ValueError:
            # e.g. month 13 or day 31 in a 30-day month.
            return None

    def _pair_items(self, lines):
        """Pair queued product-name lines with the price lines that follow them."""
        pending_names = deque()
        items = []
        for raw_line in lines:
            line = raw_line.strip()
            if not line:
                continue

            lowered = line.lower()
            if any(keyword in lowered for keyword in self._SKIP_KEYWORDS):
                continue

            if self._RECEIPT_NO_RE.match(line):
                continue

            # A product name is longer than four characters, uses only the
            # allowed characters and contains at least one letter.  The letter
            # requirement keeps separator rows ("-----") and bare numeric
            # codes out of the queue, where they would shift every later
            # name/price pairing by one.
            if (
                len(line) > 4
                and self._NAME_RE.match(line)
                and self._LETTER_RE.search(line)
            ):
                pending_names.append(line)
                continue

            price_match = self._PRICE_RE.search(line)
            if price_match and pending_names:
                items.append({
                    "name": pending_names.popleft(),
                    # Quantity is not parsed from the text; it is always 1.0.
                    "quantity": 1.0,
                    "price": float(price_match.group(1).replace(',', '.')),
                })
        return items