TEST-FRANKO / receipt_processor /parsers /plodine_parser.py
Kristijan Nincevic
Updated Konzum, Kaufland and Studenac parsers, added a folder for locally testing the parsers
5e55f4d
from .base import BaseParser
import re
from datetime import datetime
class PlodineParser(BaseParser):
    """Parser for plain-text Plodine receipts.

    ``parse`` returns a dict with the keys ``store``, ``date``, ``address``,
    ``items`` and ``parser_used``.  Every extraction step is best-effort:
    a field that cannot be recognised is left as ``None`` (or an empty list
    for ``items``) instead of raising.
    """

    # Patterns are compiled once at class creation rather than on every call.
    _STORE_KEYWORDS = ("HIPERMARKET", "SUPERMARKET")
    # Digits followed by words, e.g. a house number and a place name.
    _HOUSE_LINE_RE = re.compile(r'^\d+\s+[A-Za-zČĆŽŠĐčćžšđ ]+$')
    # Five-digit postal code followed by a single-word city name.
    _POSTAL_LINE_RE = re.compile(r'^\d{5}\s+[A-Za-zČĆŽŠĐčćžšđ]+$')
    _DATE_RE = re.compile(r'(\d{2}\.\d{2}\.\d{4})')
    # Everything between the column-header lines and the "ZA PLATITI" line.
    _ITEM_SECTION_RE = re.compile(
        r'(?<=Artikal\nKol\nCijena\nIznos €\n)(.*?)(?=\nZA PLATITI)',
        re.DOTALL,
    )
    _ITEM_RE = re.compile(
        r'^([^\n]+)\n'                        # Item name
        r'(\d+,\d+|\d+)\s*x\s*([\d,]+)?\n?'   # Quantity x UnitPrice
        r'([\d,]+)?',                         # Total price
        re.MULTILINE,
    )

    def parse(self, text: str) -> dict:
        """Extract store, date, address and items from receipt text.

        Args:
            text: Raw text of the receipt.

        Returns:
            A dict with:
              * ``store`` - always ``"Plodine"``.
              * ``date`` - ISO ``YYYY-MM-DD`` string, or ``None``.
              * ``address`` - ``"<street>, <postal code city>"``, or ``None``.
              * ``items`` - list of ``{"name", "quantity", "price"}`` dicts.
              * ``parser_used`` - always ``"PlodineParser"``.
        """
        # The item-section pattern is anchored on literal "\n" characters, so
        # Windows line endings would otherwise hide every item.
        text = text.replace('\r\n', '\n')
        return {
            "store": "Plodine",
            "date": self._parse_date(text),
            "address": self._parse_address(text),
            "items": self._parse_items(text),
            "parser_used": "PlodineParser",
        }

    @classmethod
    def _parse_address(cls, text: str):
        """Return ``"<street>, <postal code city>"`` or ``None``.

        All three kinds of address line (store line, house-number line and
        postal line) must be present; when a kind occurs more than once the
        last occurrence wins.
        """
        street_name = None
        has_house_line = False
        postal_city = None

        for raw_line in text.splitlines():
            line = raw_line.strip()
            upper_line = line.upper()

            if any(keyword in upper_line for keyword in cls._STORE_KEYWORDS):
                # Expected shape: "HIPERMARKET ZAGREB Karla Metikosa" - the
                # first two words are skipped and the rest is the street.
                parts = line.split()
                if len(parts) >= 3:
                    street_name = " ".join(parts[2:])
            if cls._HOUSE_LINE_RE.match(line):
                has_house_line = True
            if cls._POSTAL_LINE_RE.match(line):
                postal_city = line

        # NOTE(review): the house-number line only gates the result and is
        # not part of the returned string - confirm whether that is intended.
        if street_name and has_house_line and postal_city:
            return f"{street_name}, {postal_city}"
        return None

    @classmethod
    def _parse_date(cls, text: str):
        """Return the first ``DD.MM.YYYY`` date in *text* as ``YYYY-MM-DD``.

        Returns ``None`` when no such token exists or when the first token is
        not a valid calendar date (e.g. ``31.02.2024``).
        """
        date_match = cls._DATE_RE.search(text)
        if date_match is None:
            return None
        try:
            parsed = datetime.strptime(date_match.group(1), '%d.%m.%Y')
        except ValueError:
            # Digits in the right shape but not a real date: keep best-effort
            # behaviour and report no date.
            return None
        return parsed.date().isoformat()

    @classmethod
    def _parse_items(cls, text: str) -> list:
        """Return the purchased items found in the item section of *text*.

        Each item is ``{"name": str, "quantity": float, "price": float}``
        where ``price`` is the unit price when one was captured and the line
        total otherwise.  Entries with no price, or with a price token that
        is not a valid decimal number, are skipped.
        """
        item_section = cls._ITEM_SECTION_RE.search(text)
        if item_section is None:
            return []

        items = []
        for match in cls._ITEM_RE.finditer(item_section.group(1)):
            name, quantity_str, unit_price_str, total_str = match.groups()
            price_str = unit_price_str or total_str
            if not price_str:
                continue
            try:
                quantity = float(quantity_str.replace(',', '.'))
                price = float(price_str.replace(',', '.'))
            except ValueError:
                # "[\d,]+" also accepts tokens such as "," or "1,2,3"; skip
                # the entry instead of aborting the whole receipt.
                continue
            items.append({
                "name": name.strip(),
                "quantity": quantity,
                "price": price,
            })
        return items