Spaces:
Sleeping
Sleeping
Add document parser Docker service
Browse files- Dockerfile +23 -0
- README.md +5 -6
- scripts/parse_vendor_document.py +507 -0
- scripts/requirements-document-parser.txt +7 -0
- services/document-parser-api/main.py +85 -0
- services/document-parser-api/requirements.txt +8 -0
Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Container image for the Fresh Catch document parser API (FastAPI + Tesseract).
FROM python:3.11-slim

# System packages: the Tesseract binary used by pytesseract, plus shared
# libraries (libGL / GLib) required by imaging wheels.
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces runs the container as uid 1000. Create that user with a
# real home directory so the model cache (downloaded at request time by
# transformers) has somewhere writable to live.
RUN useradd --create-home --uid 1000 appuser
ENV HOME=/home/appuser \
    HF_HOME=/home/appuser/.cache/huggingface \
    PYTHONUNBUFFERED=1

WORKDIR /app

# Copy the requirement files first so the dependency layer is cached
# independently of source changes.
COPY scripts/requirements-document-parser.txt /app/scripts/requirements-document-parser.txt
COPY services/document-parser-api/requirements.txt /app/services/document-parser-api/requirements.txt
RUN pip install --no-cache-dir -r /app/services/document-parser-api/requirements.txt

COPY scripts/parse_vendor_document.py /app/scripts/parse_vendor_document.py
COPY services/document-parser-api/main.py /app/services/document-parser-api/main.py

WORKDIR /app/services/document-parser-api

# Drop privileges only after everything that needs root has been installed.
USER appuser

EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
---
|
| 2 |
title: Fresh Catch Parser
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
|
| 8 |
---
|
| 9 |
-
|
| 10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
title: Fresh Catch Parser
|
| 3 |
+
emoji: 🐟
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
---
|
| 9 |
+
Document parser API for Fresh Catch Inventory.
|
|
|
scripts/parse_vendor_document.py
ADDED
|
@@ -0,0 +1,507 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Parse vendor invoices (LayoutLMv3 FUNSD) or retail receipts (Donut CORD v2).
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python3 scripts/parse_vendor_document.py --image /path/to.png [--type invoice|receipt|auto]
|
| 7 |
+
|
| 8 |
+
Prints a single JSON object to stdout matching ParsedVendorInvoice.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import json
|
| 15 |
+
import re
|
| 16 |
+
import sys
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Any
|
| 20 |
+
|
| 21 |
+
RECEIPT_MODEL = "naver-clova-ix/donut-base-finetuned-cord-v2"
|
| 22 |
+
INVOICE_MODEL = "nielsr/layoutlmv3-finetuned-funsd"
|
| 23 |
+
|
| 24 |
+
INVOICE_HINTS = (
|
| 25 |
+
"invoice",
|
| 26 |
+
"inv #",
|
| 27 |
+
"inv no",
|
| 28 |
+
"bill to",
|
| 29 |
+
"ship to",
|
| 30 |
+
"purchase order",
|
| 31 |
+
"po #",
|
| 32 |
+
"remit to",
|
| 33 |
+
"net 30",
|
| 34 |
+
"del weight",
|
| 35 |
+
"unit price",
|
| 36 |
+
"vendor",
|
| 37 |
+
"food service",
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
RECEIPT_HINTS = (
|
| 41 |
+
"receipt",
|
| 42 |
+
"thank you",
|
| 43 |
+
"subtotal",
|
| 44 |
+
"sub total",
|
| 45 |
+
"change due",
|
| 46 |
+
"cashier",
|
| 47 |
+
"register",
|
| 48 |
+
"visa",
|
| 49 |
+
"mastercard",
|
| 50 |
+
"debit",
|
| 51 |
+
"loyalty",
|
| 52 |
+
"store #",
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass
class OcrWord:
    """One OCR token together with its pixel-space bounding rectangle."""

    # Recognised text of the token.
    text: str
    # Left edge and top edge of the rectangle.
    left: int
    top: int
    # Extent of the rectangle along each axis.
    width: int
    height: int

    @property
    def box(self) -> list[int]:
        """Return the rectangle as ``[x0, y0, x1, y1]`` corner coordinates."""
        right = self.left + self.width
        bottom = self.top + self.height
        return [self.left, self.top, right, bottom]
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def eprint(*args: object) -> None:
    """Write ``args`` to standard error, formatted exactly as ``print`` would."""
    print(*args, file=sys.stderr)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def load_image(path: Path):
    """Open the image at ``path`` and return it converted to RGB mode.

    The source file is opened through a context manager so its handle is
    released as soon as the RGB copy has been produced, instead of lingering
    until garbage collection.
    """
    from PIL import Image

    with Image.open(path) as source:
        # ``convert`` yields an independent image, so it stays valid after
        # the underlying file is closed.
        return source.convert("RGB")
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def ocr_words(image) -> list[OcrWord]:
    """Run Tesseract on ``image`` and return the usable word tokens.

    Blank tokens are dropped, as are tokens whose reported confidence lies in
    the range 0-34. A confidence of ``"-1"`` or ``""`` is treated as
    "unknown" (-1) and the token is kept.
    """
    import pytesseract

    table = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    kept: list[OcrWord] = []
    for idx, raw_text in enumerate(table["text"]):
        token = (raw_text or "").strip()
        if not token:
            continue

        raw_conf = table["conf"][idx]
        if raw_conf in ("-1", ""):
            score = -1
        else:
            score = int(float(raw_conf))
        if 0 <= score < 35:
            continue

        kept.append(
            OcrWord(
                text=token,
                left=int(table["left"][idx]),
                top=int(table["top"][idx]),
                width=int(table["width"][idx]),
                height=int(table["height"][idx]),
            )
        )
    return kept
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def normalize_boxes(words: list[OcrWord], width: int, height: int) -> list[list[int]]:
    """Rescale each word's ``box`` from pixel units onto a 0-1000 grid.

    X coordinates are scaled by ``width`` and Y coordinates by ``height``;
    every result is truncated to an int and clamped to ``[0, 1000]``.
    """

    def scaled(coordinate: int, extent: int) -> int:
        value = int(1000 * coordinate / extent)
        return min(1000, max(0, value))

    normalized: list[list[int]] = []
    for item in words:
        left, top, right, bottom = item.box
        normalized.append(
            [
                scaled(left, width),
                scaled(top, height),
                scaled(right, width),
                scaled(bottom, height),
            ]
        )
    return normalized
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def classify_document_type(words: list[OcrWord], forced: str | None) -> str:
    """Decide whether the OCR text looks like an ``"invoice"`` or a ``"receipt"``.

    A ``forced`` value of ``"invoice"`` or ``"receipt"`` is returned as-is.
    Otherwise each hint phrase found in the lower-cased text scores one point
    for its side, with a two-point bonus for the words "invoice"/"inv " or
    "receipt". The result is "receipt" only when the receipt score is strictly
    higher; every other outcome, ties included, is "invoice".
    """
    if forced in ("invoice", "receipt"):
        return forced

    haystack = " ".join(word.text for word in words).lower()

    def hits(hints) -> int:
        return sum(1 for hint in hints if hint in haystack)

    invoice_score = hits(INVOICE_HINTS)
    receipt_score = hits(RECEIPT_HINTS)

    if "invoice" in haystack or "inv " in haystack:
        invoice_score += 2
    if "receipt" in haystack:
        receipt_score += 2

    return "receipt" if receipt_score > invoice_score else "invoice"
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def parse_loose_number(value: Any) -> float | None:
    """Best-effort conversion of a loosely formatted amount to ``float``.

    Ints and floats are returned as floats. Strings are stripped of everything
    except digits, ``.``, ``,`` and ``-`` and then interpreted as follows:

    * both ``,`` and ``.`` present: the right-most one is the decimal mark and
      the other is a grouping separator ("1,234.56" and "1.234,56" both give
      1234.56);
    * several commas in strict groups of three ("1,234,567"): grouping
      separators;
    * otherwise a comma is treated as a decimal mark ("12,50" gives 12.5).

    Returns ``None`` for non-numeric types, empty strings and anything that
    still fails to parse.
    """
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        return None

    cleaned = re.sub(r"[^0-9.,-]", "", value)
    if not cleaned:
        return None

    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")
    if last_comma != -1 and last_dot != -1:
        if last_dot > last_comma:
            # "1,234.56": commas are grouping separators.
            cleaned = cleaned.replace(",", "")
        else:
            # "1.234,56": dots are grouping separators, comma is decimal.
            cleaned = cleaned.replace(".", "").replace(",", ".")
    elif re.fullmatch(r"-?\d{1,3}(?:,\d{3}){2,}", cleaned):
        # "1,234,567": more than one comma can only be grouping.
        cleaned = cleaned.replace(",", "")
    else:
        cleaned = cleaned.replace(",", ".")

    try:
        return float(cleaned)
    except ValueError:
        return None
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def normalize_date(value: str | None) -> str | None:
    """Normalise a date string to ``YYYY-MM-DD`` where it can be done safely.

    * Falsy input returns ``None``.
    * Text already shaped like ``YYYY-MM-DD`` is returned unchanged.
    * ``A/B/YY`` or ``A/B/YYYY`` is read month-first; when the first number
      cannot be a month (> 12) but the second can, it is read day-first.
      Two-digit years are placed in the 2000s.
    * Anything else, including slash dates whose month or day is out of range,
      is returned as the stripped input rather than being turned into an
      impossible ISO date.
    """
    if not value:
        return None
    value = value.strip()
    if re.match(r"^\d{4}-\d{2}-\d{2}$", value):
        return value

    match = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})$", value)
    if not match:
        return value

    month = int(match.group(1))
    day = int(match.group(2))
    year_text = match.group(3)

    if month > 12 >= day:
        # Only a day-first reading is possible (e.g. 25/12/2024).
        month, day = day, month
    if not (1 <= month <= 12 and 1 <= day <= 31):
        return value

    if len(year_text) == 2:
        year_text = f"20{year_text}"
    return f"{year_text}-{month:02d}-{day:02d}"
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def map_cord_json(cord: dict[str, Any]) -> dict[str, Any]:
    """Map a CORD-style receipt dict onto the parsed-invoice output shape.

    ``cord["menu"]`` may be a single dict or a list of dicts; each entry with
    a non-blank description becomes one line item. Header and totals fields
    are looked up under several alternative keys. Tax is additionally looked
    up as ``sub_total["tax_price"]``, because the CORD label set nests the tax
    amount inside the ``sub_total`` group rather than at the top level.

    ``confidence`` is "medium" when at least one line item was found,
    otherwise "low". ``rawNotes`` holds the first 4000 characters of the
    JSON-serialised input.
    """
    line_items: list[dict[str, Any]] = []
    menu = cord.get("menu")
    menus = menu if isinstance(menu, list) else [menu] if isinstance(menu, dict) else []

    for entry in menus:
        if not isinstance(entry, dict):
            continue
        description = (
            entry.get("nm")
            or entry.get("item")
            or entry.get("name")
            or entry.get("menu.nm")
        )
        if not description or not str(description).strip():
            continue
        line_items.append(
            {
                "description": str(description).strip(),
                "vendorItemNumber": None,
                "quantity": parse_loose_number(entry.get("cnt") or entry.get("num")),
                "unit": str(entry.get("unit") or entry.get("itemsubtotal") or "").strip() or None,
                "unitPrice": parse_loose_number(
                    entry.get("unitprice") or entry.get("price") or entry.get("itemprice")
                ),
                "lineTotal": parse_loose_number(
                    entry.get("price") or entry.get("cntprice") or entry.get("itemprice")
                ),
            }
        )

    sub_total = cord.get("sub_total") or cord.get("subtotal")
    tax = cord.get("tax") or cord.get("tax_price")
    total = cord.get("total") or cord.get("total_price") or cord.get("total_etc")

    def price_field(block: Any, *keys: str) -> float | None:
        # A dict block is searched for the first listed key it contains;
        # anything else is parsed directly as a number.
        if isinstance(block, dict):
            for key in keys:
                if key in block:
                    return parse_loose_number(block[key])
        return parse_loose_number(block)

    tax_amount = price_field(tax, "price", "tax_price")
    if tax_amount is None and isinstance(sub_total, dict):
        # Fall back to the tax amount nested inside the sub_total group.
        tax_amount = parse_loose_number(sub_total.get("tax_price"))

    return {
        "vendorName": str(cord.get("store") or cord.get("company") or cord.get("brand") or "").strip()
        or None,
        "invoiceNumber": str(cord.get("receipt_no") or cord.get("order_no") or "").strip() or None,
        "invoiceDate": normalize_date(
            str(cord.get("date") or cord.get("receipt_date") or "").strip() or None
        ),
        "subtotal": price_field(sub_total, "price", "subtotal_price", "sub_total_price"),
        "tax": tax_amount,
        "total": price_field(total, "total_price", "price", "total"),
        "currency": None,
        "confidence": "medium" if line_items else "low",
        "rawNotes": json.dumps(cord)[:4000] if cord else None,
        "lineItems": line_items,
    }
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def parse_receipt(image) -> dict[str, Any]:
    """Parse a receipt image with the Donut model named by ``RECEIPT_MODEL``.

    Loads the processor and model, runs greedy generation seeded with the
    ``<s_cord-v2>`` task prompt, strips the EOS/PAD tokens and the leading
    task tag from the decoded text, converts it with ``token2json`` and maps
    the result through ``map_cord_json``.
    """
    import torch
    from transformers import DonutProcessor, VisionEncoderDecoderModel

    donut_processor = DonutProcessor.from_pretrained(RECEIPT_MODEL)
    donut_model = VisionEncoderDecoderModel.from_pretrained(RECEIPT_MODEL)
    target = "cuda" if torch.cuda.is_available() else "cpu"
    donut_model.to(target)
    donut_model.eval()
    tokenizer = donut_processor.tokenizer

    pixels = donut_processor(image, return_tensors="pt").pixel_values.to(target)
    prompt_ids = tokenizer(
        "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
    ).input_ids.to(target)

    generated = donut_model.generate(
        pixels,
        decoder_input_ids=prompt_ids,
        max_length=donut_model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    decoded = donut_processor.batch_decode(generated.sequences)[0]
    for marker in (tokenizer.eos_token, tokenizer.pad_token):
        decoded = decoded.replace(marker, "")
    decoded = decoded.strip()
    # Drop only the first tag, i.e. the task-start token.
    decoded = re.sub(r"<.*?>", "", decoded, count=1).strip()

    return map_cord_json(donut_processor.token2json(decoded))
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def align_word_labels(word_texts: list[str], word_ids: list[int | None], predictions: list[int], id2label: dict) -> list[str]:
    """Project per-token predictions back onto the original words.

    Each word receives the label predicted for its *first* sub-token; later
    sub-tokens of the same word are ignored so they cannot overwrite it.
    Tokens with a ``None`` word id are skipped, as are word ids outside
    ``word_texts``. Words never seen keep the default ``"O"``.

    ``id2label`` is consulted with the integer id first and its string form
    second; unknown ids map to ``"O"``.
    """
    labels = ["O"] * len(word_texts)
    seen: set[int] = set()
    for word_id, pred in zip(word_ids, predictions):
        if word_id is None or word_id in seen:
            continue
        seen.add(word_id)
        if not 0 <= word_id < len(labels):
            continue
        labels[word_id] = id2label.get(pred, id2label.get(str(pred), "O"))
    return labels
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def group_entities(words: list[str], labels: list[str]) -> list[tuple[str, str]]:
    """Merge consecutive labelled words into ``(label, text)`` spans.

    ``B-``/``I-`` prefixes are stripped to obtain the base label. A span ends
    at an ``"O"`` label, at any ``B-`` label, or when the base label changes.
    Span text is the space-joined words; spans whose base label is empty are
    not emitted.
    """
    spans: list[tuple[str, str]] = []
    active: str | None = None
    buffer: list[str] = []

    for token, tag in zip(words, labels):
        marker = tag[:2]
        if tag == "O":
            base = None
        elif marker in ("B-", "I-"):
            base = tag[2:]
        else:
            base = tag

        if base is None or marker == "B-" or base != active:
            # Close the running span (if any) before starting over.
            if buffer and active:
                spans.append((active, " ".join(buffer).strip()))
            active = base
            buffer = [] if base is None else [token]
        else:
            buffer.append(token)

    if buffer and active:
        spans.append((active, " ".join(buffer).strip()))
    return spans
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def extract_qa_pairs(groups: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Pair each QUESTION span with the next ANSWER span.

    A later QUESTION replaces an unanswered one. An ANSWER with no open
    question is dropped. HEADER spans are emitted as ``("HEADER", text)`` and
    do not affect the open question.
    """
    pairs: list[tuple[str, str]] = []
    open_question: str | None = None

    for tag, phrase in groups:
        if tag.endswith("QUESTION"):
            open_question = phrase
            continue
        if tag.endswith("ANSWER") and open_question:
            pairs.append((open_question, phrase))
            open_question = None
            continue
        if tag.endswith("HEADER"):
            pairs.append(("HEADER", phrase))

    return pairs
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def extract_line_items_from_ocr(words: list[OcrWord]) -> list[dict[str, Any]]:
    """Heuristically rebuild table rows from OCR words and read line items.

    Words are bucketed into rows by ``round(top / 12) * 12`` and ordered
    left-to-right within a row. Rows are skipped when shorter than 4
    characters, when they contain a totals/header phrase, when they hold
    fewer than two parseable numbers, or when the text before the first
    number is shorter than 3 characters. The last number is taken as the line
    total and the one before it as the quantity; the unit price is their
    quotient rounded to 4 places. At most 40 items are returned.
    """
    if not words:
        return []

    skip_markers = (
        "subtotal",
        "sub total",
        "total",
        "tax",
        "balance",
        "thank you",
        "page ",
        "invoice",
        "bill to",
        "ship to",
    )

    by_row: dict[int, list[OcrWord]] = {}
    for token in words:
        row_key = round(token.top / 12) * 12
        by_row.setdefault(row_key, []).append(token)

    items: list[dict[str, Any]] = []
    for row_key in sorted(by_row):
        ordered = sorted(by_row[row_key], key=lambda w: w.left)
        line = " ".join(token.text for token in ordered)
        if len(line) < 4:
            continue

        lowered = line.lower()
        if any(marker in lowered for marker in skip_markers):
            continue

        amounts = []
        for hit in re.finditer(r"\d[\d,]*\.?\d*", line):
            parsed = parse_loose_number(hit.group())
            if parsed is not None:
                amounts.append(parsed)
        if len(amounts) < 2:
            continue

        quantity, line_total = amounts[-2], amounts[-1]
        # Everything from the first whitespace-preceded number onward is cut.
        description = re.sub(r"\s+\d[\d,]*\.?\d*.*$", "", line).strip()
        if len(description) < 3:
            continue

        if quantity and quantity > 0:
            unit_price = round(line_total / quantity, 4)
        else:
            unit_price = None

        items.append(
            {
                "description": description,
                "vendorItemNumber": None,
                "quantity": quantity,
                "unit": None,
                "unitPrice": unit_price,
                "lineTotal": line_total,
            }
        )

    return items[:40]
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def parse_invoice(image, words: list[OcrWord]) -> dict[str, Any]:
    """Parse an invoice image using LayoutLMv3 token labels plus OCR rows.

    ``words`` are the OCR tokens for ``image``. With no words, an all-empty
    "low" confidence result is returned without loading any model. Otherwise
    the model named by ``INVOICE_MODEL`` labels each word, labelled spans are
    paired into question/answer fields, and those pairs fill the vendor,
    invoice number, date and totals. Line items come from
    ``extract_line_items_from_ocr``.

    Confidence is "high" with line items and a vendor or invoice number,
    "medium" with line items only, and "low" otherwise.
    """
    if not words:
        # Nothing to classify: skip the heavy imports and model load entirely.
        return {
            "vendorName": None,
            "invoiceNumber": None,
            "invoiceDate": None,
            "subtotal": None,
            "tax": None,
            "total": None,
            "currency": None,
            "confidence": "low",
            "rawNotes": None,
            "lineItems": [],
        }

    import torch
    from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

    processor = LayoutLMv3Processor.from_pretrained(INVOICE_MODEL, apply_ocr=False)
    model = LayoutLMv3ForTokenClassification.from_pretrained(INVOICE_MODEL)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    width, height = image.size
    word_texts = [word.text for word in words]
    boxes = normalize_boxes(words, width, height)

    encoding = processor(
        image,
        word_texts,
        boxes=boxes,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Read the token-to-word alignment from the processor output itself. The
    # device-moved inputs below are a plain dict, which has no ``word_ids``.
    word_ids = encoding.word_ids(batch_index=0)
    model_inputs = {key: value.to(device) for key, value in encoding.items()}

    with torch.no_grad():
        outputs = model(**model_inputs)

    # Index the single batch row explicitly so the result is always a list.
    predictions = outputs.logits.argmax(-1)[0].tolist()

    id2label = model.config.id2label
    labels = align_word_labels(word_texts, word_ids, predictions, id2label)
    groups = group_entities(word_texts, labels)
    qa_pairs = extract_qa_pairs(groups)

    vendor_name = None
    invoice_number = None
    invoice_date = None
    total = None
    tax = None
    subtotal = None

    for question, answer in qa_pairs:
        q = question.lower()
        if question == "HEADER" and not vendor_name:
            vendor_name = answer
            continue
        if any(token in q for token in ("invoice", "inv", "bill")) and "date" in q:
            invoice_date = normalize_date(answer)
        elif any(token in q for token in ("invoice", "inv")) and any(
            token in q for token in ("no", "num", "#")
        ):
            # "Invoice No", "Invoice Number", "Inv #", ...
            invoice_number = answer
        elif "date" in q and "due" not in q and invoice_date is None:
            # A generic date only fills a gap; a payment due date must not
            # replace the invoice date.
            invoice_date = normalize_date(answer)
        elif "total" in q and "sub" not in q:
            total = parse_loose_number(answer)
        elif "tax" in q:
            tax = parse_loose_number(answer)
        elif "subtotal" in q or "sub total" in q:
            subtotal = parse_loose_number(answer)
        elif any(token in q for token in ("vendor", "supplier", "seller", "remit", "from")):
            vendor_name = answer

    line_items = extract_line_items_from_ocr(words)
    confidence = "high" if line_items and (invoice_number or vendor_name) else "medium" if line_items else "low"

    return {
        "vendorName": vendor_name,
        "invoiceNumber": invoice_number,
        "invoiceDate": invoice_date,
        "subtotal": subtotal,
        "tax": tax,
        "total": total,
        "currency": None,
        "confidence": confidence,
        "rawNotes": None,
        "lineItems": line_items,
    }
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
def main() -> int:
    """Command-line entry point: parse one image and print the result as JSON.

    Returns 0 after printing a single JSON object to stdout. Returns 1, with
    a message on stderr, when the image path does not exist or any step of
    the parse raises.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--image", required=True, help="Path to a PNG/JPG/WebP image")
    cli.add_argument(
        "--type",
        default="auto",
        choices=("auto", "invoice", "receipt"),
        help="Document type routing",
    )
    options = cli.parse_args()

    image_path = Path(options.image)
    if not image_path.exists():
        eprint(f"Image not found: {image_path}")
        return 1

    # "auto" means no forced type; the classifier decides from the OCR text.
    forced_type = None if options.type == "auto" else options.type

    try:
        image = load_image(image_path)
        words = ocr_words(image)
        doc_type = classify_document_type(words, forced_type)
        if doc_type == "receipt":
            result = parse_receipt(image)
        else:
            result = parse_invoice(image, words)
        print(json.dumps({"documentType": doc_type, **result}))
    except Exception as error:  # noqa: BLE001
        eprint(f"Document parse failed: {error}")
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/requirements-document-parser.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Optional local parser for invoice/receipt import (see scripts/parse_vendor_document.py).
|
| 2 |
+
# Requires Tesseract OCR installed on the host (macOS: brew install tesseract).
|
| 3 |
+
torch>=2.0
|
| 4 |
+
transformers>=4.36,<5
|
| 5 |
+
pillow>=10.0
|
| 6 |
+
pytesseract>=0.3.10
|
| 7 |
+
accelerate>=0.26
|
services/document-parser-api/main.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hosted document parser API for Fresh Catch Inventory.
|
| 3 |
+
|
| 4 |
+
Deploy to Hugging Face Spaces (Docker), Fly.io, or any VM with Python + Tesseract.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import subprocess
|
| 11 |
+
import sys
|
| 12 |
+
import tempfile
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
from fastapi import FastAPI, File, Header, HTTPException, Query, UploadFile
|
| 16 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 17 |
+
|
| 18 |
+
# Two directory levels above this file's directory.
REPO_ROOT = Path(__file__).resolve().parents[2]
PARSE_SCRIPT = REPO_ROOT / "scripts" / "parse_vendor_document.py"
# Optional shared secret; an empty value disables the bearer-token check.
SERVICE_SECRET = os.environ.get("DOCUMENT_PARSER_SERVICE_SECRET", "").strip()

# Comma-separated allow-list. Entries are trimmed and blanks dropped so that
# "https://a.example, https://b.example" matches both origins exactly.
CORS_ORIGINS = [
    origin.strip()
    for origin in os.environ.get("DOCUMENT_PARSER_CORS_ORIGINS", "*").split(",")
    if origin.strip()
]

app = FastAPI(title="Fresh Catch Document Parser", version="1.0.0")

# NOTE(review): the default "*" origin is combined with allow_credentials=True.
# Set DOCUMENT_PARSER_CORS_ORIGINS to explicit origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def verify_auth(authorization: str | None) -> None:
    """Reject the request unless it carries the configured bearer token.

    When ``SERVICE_SECRET`` is empty, authentication is disabled and every
    request passes. Otherwise ``authorization`` must equal
    ``"Bearer <SERVICE_SECRET>"`` exactly.

    Raises:
        HTTPException: 401 when the header is missing or does not match.
    """
    import hmac

    if not SERVICE_SECRET:
        return

    expected = f"Bearer {SERVICE_SECRET}".encode("utf-8")
    supplied = (authorization or "").encode("utf-8")
    # Constant-time comparison so response timing does not reveal how much of
    # the secret was guessed correctly. Bytes are used because compare_digest
    # only accepts ASCII-only ``str``.
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Unauthorized")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe; always responds with ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@app.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    type: str = Query("auto", pattern="^(auto|invoice|receipt)$"),
    authorization: str | None = Header(default=None),
) -> dict:
    """Parse an uploaded document image by running the parser script.

    The upload is written to a temporary file, ``PARSE_SCRIPT`` is run on it
    in a subprocess, and the JSON the script prints to stdout is returned.
    The temporary file is always removed afterwards.

    The subprocess timeout comes from ``DOCUMENT_PARSER_TIMEOUT_MS`` (default
    120000), converted to whole seconds with a floor of one second.

    Raises:
        HTTPException: 401 on failed auth, 400 for an empty upload, 504 when
            the parser exceeds the timeout, 500 when the script is missing,
            exits non-zero, or prints invalid JSON.
    """
    import asyncio
    import json

    verify_auth(authorization)

    if not PARSE_SCRIPT.exists():
        raise HTTPException(status_code=500, detail="parse_vendor_document.py not found")

    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Empty file")

    # Integer division alone would turn a sub-second setting into timeout=0,
    # which expires immediately.
    timeout_seconds = max(1, int(os.environ.get("DOCUMENT_PARSER_TIMEOUT_MS", "120000")) // 1000)

    suffix = Path(file.filename or "upload.png").suffix or ".png"
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    image_path = tmp.name

    try:
        # The write sits inside the try so a failed write still triggers cleanup.
        with tmp:
            tmp.write(contents)

        # subprocess.run blocks until the parser exits; run it in a worker
        # thread so the event loop keeps serving other requests meanwhile.
        completed = await asyncio.to_thread(
            subprocess.run,
            [sys.executable, str(PARSE_SCRIPT), "--image", image_path, "--type", type],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=str(REPO_ROOT),
        )
    except subprocess.TimeoutExpired as error:
        raise HTTPException(
            status_code=504,
            detail=f"Document parsing timed out after {timeout_seconds}s",
        ) from error
    finally:
        Path(image_path).unlink(missing_ok=True)

    if completed.returncode != 0:
        detail = (completed.stderr or completed.stdout or "Parse failed").strip()
        raise HTTPException(status_code=500, detail=detail[:2000])

    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError as error:
        raise HTTPException(status_code=500, detail=f"Invalid parser output: {error}") from error
|
services/document-parser-api/requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.115.0
|
| 2 |
+
uvicorn[standard]>=0.32.0
|
| 3 |
+
python-multipart>=0.0.12
|
| 4 |
+
torch>=2.0
|
| 5 |
+
transformers>=4.36,<5
|
| 6 |
+
pillow>=10.0
|
| 7 |
+
pytesseract>=0.3.10
|
| 8 |
+
accelerate>=0.26
|