Spaces:
Runtime error
Runtime error
Denny Lulak committed on
Commit ·
7965fc0
1
Parent(s): 92f5cd9
Receipt-Implementations
Browse files
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/inference.cpython-312.pyc +0 -0
- app.py +14 -14
- index.html +103 -0
- parsed_receipts/lidl_receipt.json +43 -0
- parsed_receipts/plodine_receipt.json +18 -0
- parsed_receipts/studenac_receipt.json +13 -0
- receipt_processor/__pycache__/google_ocr.cpython-312.pyc +0 -0
- receipt_processor/__pycache__/receipt_parser.cpython-312.pyc +0 -0
- receipt_processor/google_ocr.py +15 -5
- receipt_processor/parsers/__pycache__/base.cpython-312.pyc +0 -0
- receipt_processor/parsers/__pycache__/konzum_parser.cpython-312.pyc +0 -0
- receipt_processor/parsers/__pycache__/lidl_parser.cpython-312.pyc +0 -0
- receipt_processor/parsers/__pycache__/parser_selector.cpython-312.pyc +0 -0
- receipt_processor/parsers/__pycache__/plodine_parser.cpython-312.pyc +0 -0
- receipt_processor/parsers/__pycache__/studenac_parser.cpython-312.pyc +0 -0
- receipt_processor/parsers/base.py +10 -0
- receipt_processor/parsers/konzum_parser.py +44 -0
- receipt_processor/parsers/lidl_parser.py +61 -0
- receipt_processor/parsers/parser_selector.py +29 -0
- receipt_processor/parsers/plodine_parser.py +65 -0
- receipt_processor/parsers/studenac_parser.py +47 -0
- receipt_processor/receipt_parser.py +0 -58
__pycache__/app.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
|
|
|
__pycache__/inference.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/inference.cpython-312.pyc and b/__pycache__/inference.cpython-312.pyc differ
|
|
|
app.py
CHANGED
|
@@ -4,7 +4,7 @@ from inference import ObjectDetector
|
|
| 4 |
import numpy as np
|
| 5 |
import cv2
|
| 6 |
from receipt_processor.google_ocr import GoogleVisionOCR
|
| 7 |
-
from receipt_processor.
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
MODEL_ONNX_PATH = "model.onnx"
|
|
@@ -44,8 +44,7 @@ detector = ObjectDetector(
|
|
| 44 |
input_size=INPUT_SIZE
|
| 45 |
)
|
| 46 |
ocr_processor = GoogleVisionOCR()
|
| 47 |
-
|
| 48 |
-
# Initialize FastAPI
|
| 49 |
app = FastAPI()
|
| 50 |
|
| 51 |
# Enhanced CORS configuration
|
|
@@ -95,33 +94,34 @@ async def detect_objects(file: UploadFile = File(...)):
|
|
| 95 |
except Exception as e:
|
| 96 |
raise HTTPException(500, f"Processing error: {str(e)}")
|
| 97 |
|
| 98 |
-
# Add new endpoint
|
| 99 |
@app.post("/receipt-ocr")
|
| 100 |
async def process_receipt(file: UploadFile = File(...)):
|
| 101 |
try:
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
content = await file.read()
|
|
|
|
|
|
|
| 106 |
extracted_text = ocr_processor.extract_text(content)
|
|
|
|
| 107 |
|
| 108 |
if not extracted_text:
|
| 109 |
raise HTTPException(400, "No text extracted from image")
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
return {
|
| 114 |
"status": "success",
|
| 115 |
"receipt": parsed_receipt
|
| 116 |
}
|
| 117 |
|
| 118 |
-
except HTTPException:
|
| 119 |
-
raise
|
| 120 |
except Exception as e:
|
|
|
|
| 121 |
raise HTTPException(500, f"Receipt processing error: {str(e)}")
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
if __name__ == "__main__":
|
| 126 |
import uvicorn
|
| 127 |
-
uvicorn.run(app, host="0.0.0.0", port=7860) # Hugging Face requires port 7860
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
import cv2
|
| 6 |
from receipt_processor.google_ocr import GoogleVisionOCR
|
| 7 |
+
from receipt_processor.parsers.parser_selector import ParserSelector
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
MODEL_ONNX_PATH = "model.onnx"
|
|
|
|
| 44 |
input_size=INPUT_SIZE
|
| 45 |
)
|
| 46 |
ocr_processor = GoogleVisionOCR()
|
| 47 |
+
parser_selector = ParserSelector() # Initialize FastAPI
|
|
|
|
| 48 |
app = FastAPI()
|
| 49 |
|
| 50 |
# Enhanced CORS configuration
|
|
|
|
| 94 |
except Exception as e:
|
| 95 |
raise HTTPException(500, f"Processing error: {str(e)}")
|
| 96 |
|
|
|
|
| 97 |
@app.post("/receipt-ocr")
|
| 98 |
async def process_receipt(file: UploadFile = File(...)):
|
| 99 |
try:
|
| 100 |
+
print(f"Received file: {file.filename} ({file.content_type})")
|
| 101 |
+
|
|
|
|
| 102 |
content = await file.read()
|
| 103 |
+
print(f"File size: {len(content)} bytes")
|
| 104 |
+
|
| 105 |
extracted_text = ocr_processor.extract_text(content)
|
| 106 |
+
print(f"Extracted text length: {len(extracted_text)} chars")
|
| 107 |
|
| 108 |
if not extracted_text:
|
| 109 |
raise HTTPException(400, "No text extracted from image")
|
| 110 |
|
| 111 |
+
parser = parser_selector.get_store_parser(extracted_text)
|
| 112 |
+
print(f"Using parser: {parser.__class__.__name__}")
|
| 113 |
+
|
| 114 |
+
parsed_receipt = parser.parse(extracted_text)
|
| 115 |
+
print("Parsing completed successfully")
|
| 116 |
|
| 117 |
return {
|
| 118 |
"status": "success",
|
| 119 |
"receipt": parsed_receipt
|
| 120 |
}
|
| 121 |
|
|
|
|
|
|
|
| 122 |
except Exception as e:
|
| 123 |
+
print(f"ERROR: {str(e)}")
|
| 124 |
raise HTTPException(500, f"Receipt processing error: {str(e)}")
|
|
|
|
|
|
|
|
|
|
| 125 |
if __name__ == "__main__":
|
| 126 |
import uvicorn
|
| 127 |
+
uvicorn.run(app, host="0.0.0.0", port=7860) # Hugging Face requires port 7860
|
index.html
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Receipt Parser</title>
|
| 7 |
+
<style>
|
| 8 |
+
body {
|
| 9 |
+
font-family: Arial, sans-serif;
|
| 10 |
+
max-width: 800px;
|
| 11 |
+
margin: 20px auto;
|
| 12 |
+
padding: 0 20px;
|
| 13 |
+
}
|
| 14 |
+
.upload-box {
|
| 15 |
+
border: 2px dashed #ccc;
|
| 16 |
+
padding: 20px;
|
| 17 |
+
text-align: center;
|
| 18 |
+
margin-bottom: 20px;
|
| 19 |
+
}
|
| 20 |
+
#preview {
|
| 21 |
+
max-width: 100%;
|
| 22 |
+
margin-top: 10px;
|
| 23 |
+
display: none;
|
| 24 |
+
}
|
| 25 |
+
pre {
|
| 26 |
+
background: #f0f0f0;
|
| 27 |
+
padding: 15px;
|
| 28 |
+
border-radius: 5px;
|
| 29 |
+
white-space: pre-wrap;
|
| 30 |
+
word-wrap: break-word;
|
| 31 |
+
}
|
| 32 |
+
.loading {
|
| 33 |
+
display: none;
|
| 34 |
+
color: #666;
|
| 35 |
+
margin: 10px 0;
|
| 36 |
+
}
|
| 37 |
+
</style>
|
| 38 |
+
</head>
|
| 39 |
+
<body>
|
| 40 |
+
<h1>Receipt Parser</h1>
|
| 41 |
+
|
| 42 |
+
<div class="upload-box">
|
| 43 |
+
<input type="file" id="fileInput" accept="image/*">
|
| 44 |
+
<p>Drag and drop or click to upload receipt</p>
|
| 45 |
+
<img id="preview">
|
| 46 |
+
</div>
|
| 47 |
+
|
| 48 |
+
<div class="loading" id="loading">
|
| 49 |
+
Processing receipt...
|
| 50 |
+
</div>
|
| 51 |
+
|
| 52 |
+
<pre id="results" style="display: none;"></pre>
|
| 53 |
+
|
| 54 |
+
<script>
|
| 55 |
+
const fileInput = document.getElementById('fileInput');
|
| 56 |
+
const preview = document.getElementById('preview');
|
| 57 |
+
const loading = document.getElementById('loading');
|
| 58 |
+
const results = document.getElementById('results');
|
| 59 |
+
|
| 60 |
+
fileInput.addEventListener('change', handleFile);
|
| 61 |
+
|
| 62 |
+
document.addEventListener('dragover', e => e.preventDefault());
|
| 63 |
+
document.addEventListener('drop', e => {
|
| 64 |
+
e.preventDefault();
|
| 65 |
+
if (e.dataTransfer.files[0]) {
|
| 66 |
+
fileInput.files = e.dataTransfer.files;
|
| 67 |
+
handleFile();
|
| 68 |
+
}
|
| 69 |
+
});
|
| 70 |
+
|
| 71 |
+
async function handleFile() {
|
| 72 |
+
const file = fileInput.files[0];
|
| 73 |
+
if (!file) return;
|
| 74 |
+
|
| 75 |
+
preview.src = URL.createObjectURL(file);
|
| 76 |
+
preview.style.display = 'block';
|
| 77 |
+
|
| 78 |
+
loading.style.display = 'block';
|
| 79 |
+
results.style.display = 'none';
|
| 80 |
+
|
| 81 |
+
try {
|
| 82 |
+
const formData = new FormData();
|
| 83 |
+
formData.append('file', file);
|
| 84 |
+
|
| 85 |
+
const response = await fetch('http://localhost:7860/receipt-ocr', {
|
| 86 |
+
method: 'POST',
|
| 87 |
+
body: formData
|
| 88 |
+
});
|
| 89 |
+
|
| 90 |
+
const data = await response.json();
|
| 91 |
+
results.textContent = JSON.stringify(data, null, 2);
|
| 92 |
+
results.style.display = 'block';
|
| 93 |
+
|
| 94 |
+
} catch (error) {
|
| 95 |
+
results.textContent = `Error: ${error.message}`;
|
| 96 |
+
results.style.display = 'block';
|
| 97 |
+
} finally {
|
| 98 |
+
loading.style.display = 'none';
|
| 99 |
+
}
|
| 100 |
+
}
|
| 101 |
+
</script>
|
| 102 |
+
</body>
|
| 103 |
+
</html>
|
parsed_receipts/lidl_receipt.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"store": "Lidl",
|
| 3 |
+
"date": "2025-04-06",
|
| 4 |
+
"address": "Lastovska ulica 42, Zagreb",
|
| 5 |
+
"items": [
|
| 6 |
+
{
|
| 7 |
+
"name": "Vrećica mala",
|
| 8 |
+
"quantity": 1,
|
| 9 |
+
"price": 0.1
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"name": "Violeta Pr. toal p",
|
| 13 |
+
"quantity": 1,
|
| 14 |
+
"price": 4.89
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"name": "Toast bijeli",
|
| 18 |
+
"quantity": 1,
|
| 19 |
+
"price": 0.99
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"name": "Franck kava, 400",
|
| 23 |
+
"quantity": 1,
|
| 24 |
+
"price": 8.29
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "Cascaval listići",
|
| 28 |
+
"quantity": 1,
|
| 29 |
+
"price": 2.09
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"name": "Kulenova seka",
|
| 33 |
+
"quantity": 1,
|
| 34 |
+
"price": 1.99
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"name": "Trajno mlijeko 2.8%",
|
| 38 |
+
"quantity": 1,
|
| 39 |
+
"price": 0.79
|
| 40 |
+
}
|
| 41 |
+
],
|
| 42 |
+
"parser_used": "LidlParser"
|
| 43 |
+
}
|
parsed_receipts/plodine_receipt.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"store": "Plodine",
|
| 3 |
+
"date": "2025-04-04T13:05:57",
|
| 4 |
+
"address": "HIPERMARKET ZAGREB Karla Metikosa, Karla Metikosa 4",
|
| 5 |
+
"items": [
|
| 6 |
+
{
|
| 7 |
+
"name": "JAJA PLODINE M 18/1",
|
| 8 |
+
"quantity": 1.0,
|
| 9 |
+
"price": 3.99
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"name": "BANANA SORTA CAVENDISH",
|
| 13 |
+
"quantity": 0.348,
|
| 14 |
+
"price": 1.44
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"parser_used": "PlodineParser"
|
| 18 |
+
}
|
parsed_receipts/studenac_receipt.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"store": "Studenac",
|
| 3 |
+
"date": "2025-04-01",
|
| 4 |
+
"address": "Zagreb, Karla Metikoša 2",
|
| 5 |
+
"items": [
|
| 6 |
+
{
|
| 7 |
+
"name": "TEREA SILVER",
|
| 8 |
+
"quantity": 3,
|
| 9 |
+
"price": 4.1
|
| 10 |
+
}
|
| 11 |
+
],
|
| 12 |
+
"parser_used": "StudenacParser"
|
| 13 |
+
}
|
receipt_processor/__pycache__/google_ocr.cpython-312.pyc
CHANGED
|
Binary files a/receipt_processor/__pycache__/google_ocr.cpython-312.pyc and b/receipt_processor/__pycache__/google_ocr.cpython-312.pyc differ
|
|
|
receipt_processor/__pycache__/receipt_parser.cpython-312.pyc
DELETED
|
Binary file (3.37 kB)
|
|
|
receipt_processor/google_ocr.py
CHANGED
|
@@ -1,12 +1,22 @@
|
|
| 1 |
from google.cloud import vision
|
| 2 |
import os
|
|
|
|
| 3 |
|
| 4 |
class GoogleVisionOCR:
|
| 5 |
def __init__(self):
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
| 7 |
self.client = vision.ImageAnnotatorClient()
|
| 8 |
|
| 9 |
-
def extract_text(self, image_content
|
| 10 |
-
image
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from google.cloud import vision
|
| 2 |
import os
|
| 3 |
+
import io
|
| 4 |
|
| 5 |
class GoogleVisionOCR:
|
| 6 |
def __init__(self):
|
| 7 |
+
# Initialize with either environment credentials or local key file
|
| 8 |
+
creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS", "receipt-vision-key.json")
|
| 9 |
+
if os.path.exists(creds_path):
|
| 10 |
+
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
|
| 11 |
self.client = vision.ImageAnnotatorClient()
|
| 12 |
|
| 13 |
+
def extract_text(self, image_content):
|
| 14 |
+
"""Extracts text from image using Google Vision API"""
|
| 15 |
+
try:
|
| 16 |
+
image = vision.Image(content=image_content)
|
| 17 |
+
response = self.client.text_detection(image=image)
|
| 18 |
+
texts = response.text_annotations
|
| 19 |
+
return texts[0].description if texts else ""
|
| 20 |
+
except Exception as e:
|
| 21 |
+
print(f"OCR Error: {str(e)}")
|
| 22 |
+
return ""
|
receipt_processor/parsers/__pycache__/base.cpython-312.pyc
ADDED
|
Binary file (944 Bytes). View file
|
|
|
receipt_processor/parsers/__pycache__/konzum_parser.cpython-312.pyc
ADDED
|
Binary file (2.26 kB). View file
|
|
|
receipt_processor/parsers/__pycache__/lidl_parser.cpython-312.pyc
ADDED
|
Binary file (2.48 kB). View file
|
|
|
receipt_processor/parsers/__pycache__/parser_selector.cpython-312.pyc
ADDED
|
Binary file (2.01 kB). View file
|
|
|
receipt_processor/parsers/__pycache__/plodine_parser.cpython-312.pyc
ADDED
|
Binary file (2.69 kB). View file
|
|
|
receipt_processor/parsers/__pycache__/studenac_parser.cpython-312.pyc
ADDED
|
Binary file (2.02 kB). View file
|
|
|
receipt_processor/parsers/base.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
|
| 3 |
+
class BaseParser(ABC):
|
| 4 |
+
@abstractmethod
|
| 5 |
+
def parse(self, text: str) -> dict:
|
| 6 |
+
pass
|
| 7 |
+
|
| 8 |
+
@classmethod
|
| 9 |
+
def get_parser_name(cls) -> str:
|
| 10 |
+
return cls.__name__.replace('Parser', '').lower()
|
receipt_processor/parsers/konzum_parser.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import BaseParser
|
| 2 |
+
import re
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
class KonzumParser(BaseParser):
|
| 6 |
+
def parse(self, text: str) -> dict:
|
| 7 |
+
result = {
|
| 8 |
+
"store": "Konzum",
|
| 9 |
+
"date": None,
|
| 10 |
+
"address": None,
|
| 11 |
+
"items": [],
|
| 12 |
+
"parser_used": "KonzumParser"
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
# Address extraction
|
| 16 |
+
address_matches = re.findall(r'Zagreb, [A-Za-zšđžčćŠĐŽČĆ0-9 ]+ \d+', text)
|
| 17 |
+
if len(address_matches) > 1:
|
| 18 |
+
result['address'] = address_matches[1]
|
| 19 |
+
|
| 20 |
+
# Date extraction
|
| 21 |
+
date_match = re.search(r'Datum[\s:]*(\d{2}\.\d{2}\.\d{4})', text)
|
| 22 |
+
if date_match:
|
| 23 |
+
try:
|
| 24 |
+
result['date'] = datetime.strptime(date_match.group(1), '%d.%m.%Y').date().isoformat()
|
| 25 |
+
except:
|
| 26 |
+
pass
|
| 27 |
+
|
| 28 |
+
# Item parsing
|
| 29 |
+
item_section = re.search(r'Naziv artikla.*?Kol Cijena\nIznos P\n(.*?)\nUKUPNO', text, re.DOTALL)
|
| 30 |
+
if item_section:
|
| 31 |
+
item_lines = re.finditer(
|
| 32 |
+
r'^([A-ZŠĐŽČĆ][A-Za-zšđžčć0-9 \/\.-]+?)\s+(\d+)\s+([\d,]+)',
|
| 33 |
+
item_section.group(1),
|
| 34 |
+
re.MULTILINE
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
for match in item_lines:
|
| 38 |
+
result['items'].append({
|
| 39 |
+
"name": match.group(1).strip(),
|
| 40 |
+
"quantity": int(match.group(2)),
|
| 41 |
+
"price": float(match.group(3).replace(',', '.'))
|
| 42 |
+
})
|
| 43 |
+
|
| 44 |
+
return result
|
receipt_processor/parsers/lidl_parser.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import BaseParser
|
| 2 |
+
import re
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
class LidlParser(BaseParser):
|
| 6 |
+
def parse(self, text: str) -> dict:
|
| 7 |
+
result = {
|
| 8 |
+
"store": "Lidl",
|
| 9 |
+
"date": None,
|
| 10 |
+
"address": None,
|
| 11 |
+
"items": [],
|
| 12 |
+
"parser_used": "LidlParser"
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
# Address extraction (your original working version)
|
| 16 |
+
address_match = re.search(
|
| 17 |
+
r'OIB:[^\n]*\n([^\n]*\d+,\s*Zagreb)\s*\n€',
|
| 18 |
+
text,
|
| 19 |
+
re.MULTILINE
|
| 20 |
+
)
|
| 21 |
+
if address_match:
|
| 22 |
+
result['address'] = address_match.group(1).strip()
|
| 23 |
+
|
| 24 |
+
# Date extraction (your working version)
|
| 25 |
+
date_match = re.search(r'Datum:\s*(\d{2}\.\d{2}\.\d{4})', text)
|
| 26 |
+
if date_match:
|
| 27 |
+
try:
|
| 28 |
+
result['date'] = datetime.strptime(
|
| 29 |
+
date_match.group(1),
|
| 30 |
+
'%d.%m.%Y'
|
| 31 |
+
).date().isoformat()
|
| 32 |
+
except:
|
| 33 |
+
pass
|
| 34 |
+
|
| 35 |
+
# Item parsing (your original working pattern)
|
| 36 |
+
item_section = re.search(
|
| 37 |
+
r'(?<=€\n)(.*?)(?=\nza platiti)',
|
| 38 |
+
text,
|
| 39 |
+
re.DOTALL
|
| 40 |
+
)
|
| 41 |
+
if item_section:
|
| 42 |
+
item_pattern = re.compile(
|
| 43 |
+
r'^(\d+)\s+((?:[^\d\n]|[\d,]+[^\n])+?)\n'
|
| 44 |
+
r'([\d,]+)\s+[A-Z]$',
|
| 45 |
+
re.MULTILINE
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
matches = item_pattern.finditer(item_section.group(1))
|
| 49 |
+
for match in matches:
|
| 50 |
+
quantity = int(match.group(1))
|
| 51 |
+
item_name = match.group(2).strip()
|
| 52 |
+
price = float(match.group(3).replace(',', '.'))
|
| 53 |
+
|
| 54 |
+
if not re.search(r'[<>*]|EUR:|^\d+$', item_name):
|
| 55 |
+
result['items'].append({
|
| 56 |
+
"name": item_name,
|
| 57 |
+
"quantity": quantity,
|
| 58 |
+
"price": price
|
| 59 |
+
})
|
| 60 |
+
|
| 61 |
+
return result
|
receipt_processor/parsers/parser_selector.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import importlib
|
| 2 |
+
from .base import BaseParser
|
| 3 |
+
|
| 4 |
+
class ParserSelector:
|
| 5 |
+
def __init__(self):
|
| 6 |
+
self.store_keywords = {
|
| 7 |
+
'lidl': ['lidl'],
|
| 8 |
+
'konzum': ['konzum'],
|
| 9 |
+
'plodine': ['plodine'],
|
| 10 |
+
'studenac': ['studenac']
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
def get_store_parser(self, text: str):
|
| 14 |
+
text_lower = text.lower()
|
| 15 |
+
|
| 16 |
+
for store, keywords in self.store_keywords.items():
|
| 17 |
+
if any(kw in text_lower for kw in keywords):
|
| 18 |
+
try:
|
| 19 |
+
module = importlib.import_module(f"receipt_processor.parsers.{store}_parser")
|
| 20 |
+
for attr_name in dir(module):
|
| 21 |
+
attr = getattr(module, attr_name)
|
| 22 |
+
try:
|
| 23 |
+
if issubclass(attr, BaseParser) and attr != BaseParser:
|
| 24 |
+
return attr()
|
| 25 |
+
except TypeError:
|
| 26 |
+
continue
|
| 27 |
+
except ModuleNotFoundError:
|
| 28 |
+
continue
|
| 29 |
+
raise ValueError(f"No parser found for text: {text[:50]}...")
|
receipt_processor/parsers/plodine_parser.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import BaseParser
|
| 2 |
+
import re
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
class PlodineParser(BaseParser):
|
| 6 |
+
def parse(self, text: str) -> dict:
|
| 7 |
+
result = {
|
| 8 |
+
"store": "Plodine",
|
| 9 |
+
"date": None,
|
| 10 |
+
"address": None,
|
| 11 |
+
"items": [],
|
| 12 |
+
"parser_used": "PlodineParser"
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
# Address extraction (your original working version)
|
| 16 |
+
address_match = re.search(
|
| 17 |
+
r'PLODINE d\.d\. Rijeka\n.*?\nOIB:.*?\n(.*?)\n(.*?)\n\d+',
|
| 18 |
+
text,
|
| 19 |
+
re.DOTALL
|
| 20 |
+
)
|
| 21 |
+
if address_match:
|
| 22 |
+
street = address_match.group(1).strip()
|
| 23 |
+
city = address_match.group(2).strip()
|
| 24 |
+
if "Rijeka" not in street:
|
| 25 |
+
result['address'] = f"{street}, {city}"
|
| 26 |
+
|
| 27 |
+
# Date extraction (your working version)
|
| 28 |
+
date_match = re.search(r'(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2})', text)
|
| 29 |
+
if date_match:
|
| 30 |
+
try:
|
| 31 |
+
result['date'] = datetime.strptime(
|
| 32 |
+
date_match.group(1),
|
| 33 |
+
'%d.%m.%Y %H:%M:%S'
|
| 34 |
+
).isoformat()
|
| 35 |
+
except:
|
| 36 |
+
pass
|
| 37 |
+
|
| 38 |
+
# Item parsing (your original working pattern)
|
| 39 |
+
item_section = re.search(
|
| 40 |
+
r'(?<=Artikal\nKol\nCijena\nIznos €\n)(.*?)(?=\nZA PLATITI)',
|
| 41 |
+
text,
|
| 42 |
+
re.DOTALL
|
| 43 |
+
)
|
| 44 |
+
if item_section:
|
| 45 |
+
item_pattern = re.compile(
|
| 46 |
+
r'^([^\n]+)\n' # Item name
|
| 47 |
+
r'(\d+,\d+|\d+)\s*x\s*([\d,]+)?\n?' # Quantity
|
| 48 |
+
r'([\d,]+)?', # Price
|
| 49 |
+
re.MULTILINE
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
matches = item_pattern.finditer(item_section.group(1))
|
| 53 |
+
for match in matches:
|
| 54 |
+
name = match.group(1).strip()
|
| 55 |
+
quantity = float(match.group(2).replace(',', '.'))
|
| 56 |
+
price_str = match.group(3) or match.group(4)
|
| 57 |
+
|
| 58 |
+
if price_str:
|
| 59 |
+
result['items'].append({
|
| 60 |
+
"name": name,
|
| 61 |
+
"quantity": quantity,
|
| 62 |
+
"price": float(price_str.replace(',', '.'))
|
| 63 |
+
})
|
| 64 |
+
|
| 65 |
+
return result
|
receipt_processor/parsers/studenac_parser.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import BaseParser
|
| 2 |
+
import re
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
class StudenacParser(BaseParser):
|
| 6 |
+
def parse(self, text: str) -> dict:
|
| 7 |
+
result = {
|
| 8 |
+
"store": "Studenac",
|
| 9 |
+
"date": None,
|
| 10 |
+
"address": None,
|
| 11 |
+
"items": [],
|
| 12 |
+
"parser_used": "StudenacParser"
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
# Address extraction (your working version)
|
| 16 |
+
address_match = re.search(
|
| 17 |
+
r'Prodavaonica \d+\n([^,]+,\s*[^\n]+)\nBlagajna',
|
| 18 |
+
text
|
| 19 |
+
)
|
| 20 |
+
if address_match:
|
| 21 |
+
result['address'] = address_match.group(1).strip()
|
| 22 |
+
|
| 23 |
+
# Date extraction (your working version)
|
| 24 |
+
date_match = re.search(r'Datum:\s*(\d{2}\.\d{2}\.\d{4})', text)
|
| 25 |
+
if date_match:
|
| 26 |
+
try:
|
| 27 |
+
result['date'] = datetime.strptime(
|
| 28 |
+
date_match.group(1),
|
| 29 |
+
'%d.%m.%Y'
|
| 30 |
+
).date().isoformat()
|
| 31 |
+
except:
|
| 32 |
+
pass
|
| 33 |
+
|
| 34 |
+
# Item parsing (your original working pattern)
|
| 35 |
+
item_match = re.search(
|
| 36 |
+
r'(C)\s+(TEREA SILVER)\n.*?\n(\d+)\s+([\d,]+)\n([\d,]+)',
|
| 37 |
+
text,
|
| 38 |
+
re.DOTALL
|
| 39 |
+
)
|
| 40 |
+
if item_match:
|
| 41 |
+
result['items'].append({
|
| 42 |
+
"name": item_match.group(2).strip(),
|
| 43 |
+
"quantity": int(item_match.group(3)),
|
| 44 |
+
"price": float(item_match.group(4).replace(',', '.'))
|
| 45 |
+
})
|
| 46 |
+
|
| 47 |
+
return result
|
receipt_processor/receipt_parser.py
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
class ReceiptParser:
|
| 4 |
-
@staticmethod
|
| 5 |
-
def parse_receipt_text(full_text: str) -> dict:
|
| 6 |
-
lines = full_text.splitlines()
|
| 7 |
-
receipt = {"store": None, "date": None, "total": None, "items": []}
|
| 8 |
-
|
| 9 |
-
# Store detection
|
| 10 |
-
for line in lines:
|
| 11 |
-
if any(kw in line.lower() for kw in ["konzum", "plodine", "studenac"]):
|
| 12 |
-
receipt["store"] = line.strip()
|
| 13 |
-
break
|
| 14 |
-
|
| 15 |
-
# Date detection
|
| 16 |
-
for line in lines:
|
| 17 |
-
if match := re.search(r'\b(\d{2}\.\d{2}\.\d{4})\b', line):
|
| 18 |
-
receipt["date"] = match.group(1)
|
| 19 |
-
break
|
| 20 |
-
|
| 21 |
-
# Total detection
|
| 22 |
-
for line in reversed(lines):
|
| 23 |
-
if any(word in line.lower() for word in ["ukupno", "za platiti"]):
|
| 24 |
-
if match := re.search(r'(\d+,\d{2})', line):
|
| 25 |
-
receipt["total"] = f"{match.group(1).replace(',', '.')} EUR"
|
| 26 |
-
break
|
| 27 |
-
|
| 28 |
-
# Item parsing logic
|
| 29 |
-
merged_lines = []
|
| 30 |
-
skip_next = False
|
| 31 |
-
for i, line in enumerate(lines):
|
| 32 |
-
if skip_next:
|
| 33 |
-
skip_next = False
|
| 34 |
-
continue
|
| 35 |
-
if re.search(r'\d+,\d{2}$', line):
|
| 36 |
-
if i+1 < len(lines) and re.match(r'^\d+,\d{2}', lines[i+1]):
|
| 37 |
-
merged_lines.append(f"{line} {lines[i+1]}")
|
| 38 |
-
skip_next = True
|
| 39 |
-
continue
|
| 40 |
-
merged_lines.append(line)
|
| 41 |
-
|
| 42 |
-
item_patterns = [
|
| 43 |
-
re.compile(r'(.+?)\s+(\d+)\s+(\d+,\d{2})\s+(\d+,\d{2})'),
|
| 44 |
-
re.compile(r'(.+?)\s+(\d+)\s+x\s+(\d+,\d{2})\s+(\d+,\d{2})'),
|
| 45 |
-
re.compile(r'(.+?)\s+(\d+)\s+(\d+)\s+(\d+,\d{2})'),
|
| 46 |
-
]
|
| 47 |
-
|
| 48 |
-
for line in merged_lines:
|
| 49 |
-
for pattern in item_patterns:
|
| 50 |
-
if match := pattern.match(line):
|
| 51 |
-
receipt["items"].append({
|
| 52 |
-
"name": match.group(1).strip().title(),
|
| 53 |
-
"qty": int(match.group(2)),
|
| 54 |
-
"price": match.group(4).replace(",", ".")
|
| 55 |
-
})
|
| 56 |
-
break
|
| 57 |
-
|
| 58 |
-
return receipt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|