Spaces:

dal4933
/

TEST-FRANKO

Runtime error

TEST-FRANKO / receipt_processor /parsers /plodine_parser.py

Kristijan Nincevic

Updated Konzum, Kaufland and Studenac parsers, added a folder for locally testing the parsers

5e55f4d about 1 year ago

2.67 kB

	from .base import BaseParser
	import re
	from datetime import datetime

	class PlodineParser(BaseParser):
	    """Parse the plain text of a Plodine receipt into a structured dict.

	    The parser extracts three things from the receipt text: the store
	    address, the purchase date and the list of purchased items. Every
	    extraction is best-effort: anything that cannot be recognised is left
	    at its empty default instead of raising.
	    """

	    # A line naming the store type is expected to look like
	    # "HIPERMARKET ZAGREB Karla Metikosa": keyword, city, street name.
	    _STORE_KEYWORDS = ("HIPERMARKET", "SUPERMARKET")

	    # "<5-digit postal code> <single-word city>", e.g. "10000 Zagreb".
	    _POSTAL_CITY_RE = re.compile(r'^\d{5}\s+[A-Za-zČĆŽŠĐčćžšđ]+$')

	    # First dd.mm.yyyy occurrence anywhere in the receipt.
	    _DATE_RE = re.compile(r'(\d{2}\.\d{2}\.\d{4})')

	    # Everything between the item table header and the "ZA PLATITI" line.
	    _ITEM_SECTION_RE = re.compile(
	        r'(?<=Artikal\nKol\nCijena\nIznos €\n)(.*?)(?=\nZA PLATITI)',
	        re.DOTALL,
	    )

	    # One item: a name line, then "<qty> x <unit price>", then the total.
	    # The quantity is an integer or a comma-decimal number. The pipe must
	    # NOT be escaped here: an escaped "\|" demands a literal "|" in the
	    # receipt, which never matches a real quantity.
	    _ITEM_RE = re.compile(
	        r'^([^\n]+)\n'                       # Item name
	        r'(\d+(?:,\d+)?)\sx\s([\d,]+)?\n?'   # Quantity x UnitPrice
	        r'([\d,]+)?',                        # Total price
	        re.MULTILINE,
	    )

	    def parse(self, text: str) -> dict:
	        """Return the structured representation of a Plodine receipt.

	        Args:
	            text: Full receipt text, one receipt field per line.

	        Returns:
	            A dict with the keys ``store`` (always ``"Plodine"``), ``date``
	            (ISO ``YYYY-MM-DD`` string or ``None``), ``address`` (string or
	            ``None``), ``items`` (list of ``{"name", "quantity", "price"}``
	            dicts, possibly empty) and ``parser_used``.
	        """
	        return {
	            "store": "Plodine",
	            "date": self._extract_date(text),
	            "address": self._extract_address(text),
	            "items": self._extract_items(text),
	            "parser_used": "PlodineParser",
	        }

	    @classmethod
	    def _extract_address(cls, text):
	        """Return "<street>, <postal code> <city>" or None if either is missing."""
	        street_name = None
	        postal_city = None

	        for raw_line in text.splitlines():
	            line = raw_line.strip()
	            upper_line = line.upper()

	            if any(keyword in upper_line for keyword in cls._STORE_KEYWORDS):
	                parts = line.split()
	                # Skip the keyword and the city; the rest is the street name.
	                if len(parts) >= 3:
	                    street_name = " ".join(parts[2:])

	            if cls._POSTAL_CITY_RE.match(line):
	                postal_city = line

	        # NOTE(review): the house number is not part of the returned address.
	        # A separate house-number line is not tracked because it never
	        # influenced the output format "<street>, <postal city>"; confirm
	        # with consumers before extending the format.
	        if street_name and postal_city:
	            return f"{street_name}, {postal_city}"
	        return None

	    @classmethod
	    def _extract_date(cls, text):
	        """Return the first dd.mm.yyyy date as YYYY-MM-DD, or None."""
	        date_match = cls._DATE_RE.search(text)
	        if date_match is None:
	            return None
	        try:
	            parsed = datetime.strptime(date_match.group(1), '%d.%m.%Y')
	        except ValueError:
	            # Looks like a date but is not a real one (e.g. "99.99.2024").
	            return None
	        return parsed.date().isoformat()  # Only YYYY-MM-DD, no time

	    @classmethod
	    def _extract_items(cls, text):
	        """Return the list of items found in the receipt's item table."""
	        item_section = cls._ITEM_SECTION_RE.search(text)
	        if item_section is None:
	            return []

	        items = []
	        for match in cls._ITEM_RE.finditer(item_section.group(1)):
	            # Prefer the unit price; fall back to the line total.
	            price_str = match.group(3) or match.group(4)
	            if not price_str:
	                continue
	            try:
	                quantity = float(match.group(2).replace(',', '.'))
	                price = float(price_str.replace(',', '.'))
	            except ValueError:
	                # Malformed number such as "1,2,3": skip this item rather
	                # than abort the whole receipt.
	                continue
	            items.append({
	                "name": match.group(1).strip(),
	                "quantity": quantity,
	                "price": price,
	            })
	        return items