Spaces:

stubdude
/

fresh-catch-parser

Sleeping

App Files Files Community

fresh-catch-parser / scripts /parse_vendor_document.py

stubdude

Add document parser Docker service

fbba60e 11 days ago

raw

history blame contribute delete

15.6 kB

	#!/usr/bin/env python3
	"""
	Parse vendor invoices (LayoutLMv3 FUNSD) or retail receipts (Donut CORD v2).

	Usage:
	    python3 scripts/parse_vendor_document.py --image /path/to.png [--type invoice|receipt|auto]

	Prints a single JSON object to stdout matching ParsedVendorInvoice.
	"""

	from __future__ import annotations

	import argparse
	import json
	import re
	import sys
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any

	# Hugging Face model identifiers, one per supported document family.
	RECEIPT_MODEL = "naver-clova-ix/donut-base-finetuned-cord-v2"
	INVOICE_MODEL = "nielsr/layoutlmv3-finetuned-funsd"

	# Lower-case phrases whose presence in the OCR text counts towards "invoice".
	INVOICE_HINTS = (
	    "invoice",
	    "inv #",
	    "inv no",
	    "bill to",
	    "ship to",
	    "purchase order",
	    "po #",
	    "remit to",
	    "net 30",
	    "del weight",
	    "unit price",
	    "vendor",
	    "food service",
	)

	# Lower-case phrases whose presence in the OCR text counts towards "receipt".
	RECEIPT_HINTS = (
	    "receipt",
	    "thank you",
	    "subtotal",
	    "sub total",
	    "change due",
	    "cashier",
	    "register",
	    "visa",
	    "mastercard",
	    "debit",
	    "loyalty",
	    "store #",
	)


	@dataclass
	class OcrWord:
	    """A single OCR token together with its bounding rectangle.

	    The rectangle is stored as an origin (``left``, ``top``) plus a size
	    (``width``, ``height``), in the coordinates reported by the OCR engine.
	    """

	    text: str
	    left: int
	    top: int
	    width: int
	    height: int

	    @property
	    def box(self) -> list[int]:
	        """Return the rectangle as ``[x0, y0, x1, y1]`` corner coordinates."""
	        right = self.left + self.width
	        bottom = self.top + self.height
	        return [self.left, self.top, right, bottom]


	def eprint(*args: object) -> None:
	    """Print ``args`` to standard error, keeping stdout free for the JSON result."""
	    stream = sys.stderr
	    print(*args, file=stream)


	def load_image(path: Path):
	    """Open the image at ``path`` and return it converted to RGB.

	    Pillow opens files lazily and keeps the underlying file handle open.
	    The context manager closes that handle once ``convert`` has produced
	    an independent in-memory image, instead of leaving it to the garbage
	    collector.

	    Args:
	        path: Location of the image file on disk.

	    Returns:
	        A ``PIL.Image.Image`` in ``"RGB"`` mode.
	    """
	    from PIL import Image

	    with Image.open(path) as source:
	        return source.convert("RGB")


	def ocr_words(image) -> list[OcrWord]:
	    """Run Tesseract on ``image`` and return the words worth keeping.

	    Blank tokens are dropped, as are tokens whose reported confidence is
	    between 0 and 34 inclusive. A confidence of ``"-1"`` or ``""`` is
	    treated as "unknown" and the token is kept.
	    """
	    import pytesseract

	    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
	    kept: list[OcrWord] = []
	    for idx, raw_text in enumerate(data["text"]):
	        token = (raw_text or "").strip()
	        if not token:
	            continue

	        raw_conf = data["conf"][idx]
	        if raw_conf in ("-1", ""):
	            confidence = -1
	        else:
	            confidence = int(float(raw_conf))
	        # Only a known (non-negative) confidence can disqualify a token.
	        if 0 <= confidence < 35:
	            continue

	        kept.append(
	            OcrWord(
	                text=token,
	                left=int(data["left"][idx]),
	                top=int(data["top"][idx]),
	                width=int(data["width"][idx]),
	                height=int(data["height"][idx]),
	            )
	        )
	    return kept


	def normalize_boxes(words: list[OcrWord], width: int, height: int) -> list[list[int]]:
	    """Rescale every word box to a 0-1000 grid relative to the image size.

	    Args:
	        words: Objects exposing a ``box`` of ``[x0, y0, x1, y1]``.
	        width: Image width used to scale x coordinates.
	        height: Image height used to scale y coordinates.

	    Returns:
	        One ``[x0, y0, x1, y1]`` list per word, each value clamped to 0..1000.
	    """

	    def scale(coord: int, extent: int) -> int:
	        # Truncate towards zero, then clamp into the 0..1000 range.
	        return min(1000, max(0, int(1000 * coord / extent)))

	    return [
	        [scale(x0, width), scale(y0, height), scale(x1, width), scale(y1, height)]
	        for x0, y0, x1, y1 in (word.box for word in words)
	    ]


	def classify_document_type(words: list[OcrWord], forced: str | None) -> str:
	    """Decide whether the OCR text looks like an invoice or a receipt.

	    An explicit ``forced`` value of ``"invoice"`` or ``"receipt"`` wins.
	    Otherwise each hint phrase found in the lower-cased text scores one
	    point, with two bonus points for the words "invoice"/"inv " or
	    "receipt". The result is "receipt" only when its score is strictly
	    higher; ties and everything else resolve to "invoice".
	    """
	    if forced in ("invoice", "receipt"):
	        return forced

	    haystack = " ".join(word.text for word in words).lower()
	    invoice_votes = len([hint for hint in INVOICE_HINTS if hint in haystack])
	    receipt_votes = len([hint for hint in RECEIPT_HINTS if hint in haystack])

	    if "invoice" in haystack or "inv " in haystack:
	        invoice_votes += 2
	    if "receipt" in haystack:
	        receipt_votes += 2

	    return "receipt" if receipt_votes > invoice_votes else "invoice"


	def parse_loose_number(value: Any) -> float | None:
	    """Best-effort conversion of a loosely formatted number to ``float``.

	    Ints and floats are returned as floats. Strings are stripped of every
	    character except digits, ``.``, ``,`` and ``-`` and then parsed:

	    * when both ``.`` and ``,`` occur, the right-most one is the decimal
	      mark and the other is a thousands separator
	      (``"1,234.56"`` and ``"1.234,56"`` both give ``1234.56``);
	    * several commas and no dot are thousands separators
	      (``"1,234,567"`` gives ``1234567.0``);
	    * a single comma and no dot is a decimal comma (``"12,50"`` gives ``12.5``).

	    Args:
	        value: Any object; only numbers and strings can produce a result.

	    Returns:
	        The parsed number, or ``None`` when nothing numeric can be recovered.
	    """
	    if isinstance(value, (int, float)):
	        return float(value)
	    if not isinstance(value, str):
	        return None

	    cleaned = re.sub(r"[^0-9.,-]", "", value)
	    if not cleaned:
	        return None

	    last_comma = cleaned.rfind(",")
	    last_dot = cleaned.rfind(".")
	    if last_comma != -1 and last_dot != -1:
	        if last_dot > last_comma:
	            # "1,234.56": commas only group digits.
	            cleaned = cleaned.replace(",", "")
	        else:
	            # "1.234,56": dots group digits, the comma is the decimal mark.
	            cleaned = cleaned.replace(".", "").replace(",", ".")
	    elif cleaned.count(",") > 1:
	        # "1,234,567": more than one comma cannot be a decimal mark.
	        cleaned = cleaned.replace(",", "")
	    else:
	        cleaned = cleaned.replace(",", ".")

	    try:
	        return float(cleaned)
	    except ValueError:
	        return None


	def normalize_date(value: str | None) -> str | None:
	    """Rewrite ``M/D/YY`` or ``M/D/YYYY`` dates as ``YYYY-MM-DD``.

	    Empty input gives ``None``. Text already in ``YYYY-MM-DD`` form, and
	    text in any other unrecognised form, is returned stripped but
	    otherwise unchanged. Two-digit years are prefixed with ``20``.
	    """
	    if not value:
	        return None

	    candidate = value.strip()
	    if re.match(r"^\d{4}-\d{2}-\d{2}$", candidate):
	        return candidate

	    slashed = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", candidate)
	    if slashed is None:
	        return candidate

	    month, day, year = slashed.group(1, 2, 3)
	    century_prefix = "20" if len(year) == 2 else ""
	    return "-".join((century_prefix + year, month.zfill(2), day.zfill(2)))


	def map_cord_json(cord: dict[str, Any]) -> dict[str, Any]:
	    """Convert a CORD-style receipt dict into the invoice result shape.

	    ``cord["menu"]`` may be a single dict or a list of dicts; each entry
	    with a non-blank name becomes one line item. Header fields and totals
	    are looked up under several alternative key names.

	    Tax is first looked up at the top level (``tax`` / ``tax_price``).
	    When that yields nothing and the subtotal block is a dict, its
	    ``tax_price`` entry is used, because the CORD schema nests the tax
	    amount inside ``sub_total``.

	    Args:
	        cord: Parsed receipt structure.

	    Returns:
	        A dict with the keys ``vendorName``, ``invoiceNumber``,
	        ``invoiceDate``, ``subtotal``, ``tax``, ``total``, ``currency``,
	        ``confidence``, ``rawNotes`` and ``lineItems``.
	    """

	    def price_field(block: Any, *keys: str) -> float | None:
	        # A dict block is probed by key (first present key wins); anything
	        # else is parsed directly as a number.
	        if isinstance(block, dict):
	            for key in keys:
	                if key in block:
	                    return parse_loose_number(block[key])
	        return parse_loose_number(block)

	    menu = cord.get("menu")
	    if isinstance(menu, list):
	        entries = menu
	    elif isinstance(menu, dict):
	        entries = [menu]
	    else:
	        entries = []

	    line_items: list[dict[str, Any]] = []
	    for entry in entries:
	        if not isinstance(entry, dict):
	            continue
	        description = (
	            entry.get("nm")
	            or entry.get("item")
	            or entry.get("name")
	            or entry.get("menu.nm")
	        )
	        if not description or not str(description).strip():
	            continue
	        line_items.append(
	            {
	                "description": str(description).strip(),
	                "vendorItemNumber": None,
	                "quantity": parse_loose_number(entry.get("cnt") or entry.get("num")),
	                "unit": str(entry.get("unit") or entry.get("itemsubtotal") or "").strip() or None,
	                "unitPrice": parse_loose_number(
	                    entry.get("unitprice") or entry.get("price") or entry.get("itemprice")
	                ),
	                "lineTotal": parse_loose_number(
	                    entry.get("price") or entry.get("cntprice") or entry.get("itemprice")
	                ),
	            }
	        )

	    sub_total = cord.get("sub_total") or cord.get("subtotal")
	    tax = cord.get("tax") or cord.get("tax_price")
	    total = cord.get("total") or cord.get("total_price") or cord.get("total_etc")

	    tax_amount = price_field(tax, "price", "tax_price")
	    if tax_amount is None and isinstance(sub_total, dict):
	        # Fall back to the tax figure nested inside the subtotal block.
	        tax_amount = parse_loose_number(sub_total.get("tax_price"))

	    vendor_name = str(cord.get("store") or cord.get("company") or cord.get("brand") or "").strip()
	    invoice_number = str(cord.get("receipt_no") or cord.get("order_no") or "").strip()
	    raw_date = str(cord.get("date") or cord.get("receipt_date") or "").strip()

	    return {
	        "vendorName": vendor_name or None,
	        "invoiceNumber": invoice_number or None,
	        "invoiceDate": normalize_date(raw_date or None),
	        "subtotal": price_field(sub_total, "price", "subtotal_price", "sub_total_price"),
	        "tax": tax_amount,
	        "total": price_field(total, "total_price", "price", "total"),
	        "currency": None,
	        "confidence": "medium" if line_items else "low",
	        # Keep a bounded copy of the raw structure for debugging.
	        "rawNotes": json.dumps(cord)[:4000] if cord else None,
	        "lineItems": line_items,
	    }


	def parse_receipt(image) -> dict[str, Any]:
	    """Run the Donut receipt model on ``image`` and map its output.

	    The processor and model are loaded from ``RECEIPT_MODEL`` and moved to
	    CUDA when available. The generated sequence is decoded, stripped of
	    EOS/PAD tokens and of its first ``<...>`` tag, converted to a dict
	    with ``token2json`` and passed to ``map_cord_json``.
	    """
	    import torch
	    from transformers import DonutProcessor, VisionEncoderDecoderModel

	    processor = DonutProcessor.from_pretrained(RECEIPT_MODEL)
	    model = VisionEncoderDecoderModel.from_pretrained(RECEIPT_MODEL)
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    model.to(device)
	    model.eval()

	    tokenizer = processor.tokenizer
	    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
	    prompt_ids = tokenizer(
	        "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
	    ).input_ids.to(device)

	    generation_options = {
	        "decoder_input_ids": prompt_ids,
	        "max_length": model.decoder.config.max_position_embeddings,
	        "early_stopping": True,
	        "pad_token_id": tokenizer.pad_token_id,
	        "eos_token_id": tokenizer.eos_token_id,
	        "use_cache": True,
	        "num_beams": 1,
	        "bad_words_ids": [[tokenizer.unk_token_id]],
	        "return_dict_in_generate": True,
	    }
	    outputs = model.generate(pixel_values, **generation_options)

	    decoded = processor.batch_decode(outputs.sequences)[0]
	    for special in (tokenizer.eos_token, tokenizer.pad_token):
	        decoded = decoded.replace(special, "")
	    decoded = decoded.strip()
	    # Drop only the first tag (presumably the task-start prompt token).
	    decoded = re.sub(r"<.*?>", "", decoded, count=1).strip()

	    return map_cord_json(processor.token2json(decoded))


	def align_word_labels(word_texts: list[str], word_ids: list[int | None], predictions: list[int], id2label: dict) -> list[str]:
	    """Project per-token predictions back onto the original words.

	    Each word takes the label predicted for its FIRST sub-token. Later
	    sub-tokens of the same word are ignored, since token-classification
	    heads are normally trained with only the first sub-token labelled;
	    letting later pieces overwrite the label would make the last
	    sub-token win. Words with no token (for example, cut off by
	    truncation) keep the default ``"O"``.

	    Args:
	        word_texts: The words that were tokenized.
	        word_ids: For each token, the index of its word, or ``None`` for
	            special/padding tokens.
	        predictions: Predicted class id for each token.
	        id2label: Class id to label name; keys may be ints or their
	            string form.

	    Returns:
	        One label per entry of ``word_texts``.
	    """
	    labels = ["O"] * len(word_texts)
	    labelled: set[int] = set()
	    for word_id, pred in zip(word_ids, predictions):
	        if word_id is None or word_id in labelled:
	            continue
	        if not 0 <= word_id < len(labels):
	            # Defensive: never index outside the word list.
	            continue
	        labelled.add(word_id)
	        labels[word_id] = id2label.get(pred, id2label.get(str(pred), "O"))
	    return labels


	def group_entities(words: list[str], labels: list[str]) -> list[tuple[str, str]]:
	    """Merge consecutive words that share a label into ``(label, text)`` spans.

	    ``B-``/``I-`` prefixes are removed from the returned label. A new span
	    starts on every ``B-`` label and whenever the base label changes; an
	    ``"O"`` label closes the current span. Spans whose base label is empty
	    are discarded.
	    """
	    spans: list[tuple[str, list[str]]] = []
	    open_base: str | None = None

	    for token, tag in zip(words, labels):
	        if tag == "O":
	            open_base = None
	            continue

	        marker = tag[:2]
	        base = tag[2:] if marker in ("B-", "I-") else tag
	        if marker == "B-" or open_base != base:
	            spans.append((base, [token]))
	            open_base = base
	        else:
	            # Same base label continues: the open span is always the last one.
	            spans[-1][1].append(token)

	    return [(base, " ".join(tokens).strip()) for base, tokens in spans if base]


	def extract_qa_pairs(groups: list[tuple[str, str]]) -> list[tuple[str, str]]:
	    """Pair each answer span with the question span that precedes it.

	    A ``...QUESTION`` span is remembered until the next ``...ANSWER`` span
	    consumes it; answers with no pending question are dropped. Every
	    ``...HEADER`` span is emitted as ``("HEADER", text)``.
	    """
	    pairs: list[tuple[str, str]] = []
	    awaiting: str | None = None

	    for label, text in groups:
	        if label.endswith("QUESTION"):
	            awaiting = text
	            continue
	        if label.endswith("ANSWER") and awaiting:
	            pairs.append((awaiting, text))
	            awaiting = None
	            continue
	        if label.endswith("HEADER"):
	            pairs.append(("HEADER", text))

	    return pairs


	def extract_line_items_from_ocr(words: list[OcrWord]) -> list[dict[str, Any]]:
	    """Heuristically rebuild line items from raw OCR words.

	    Words are bucketed into rows by their ``top`` coordinate (12-unit
	    bands) and ordered left to right. Rows containing summary or header
	    phrases are skipped. A row with at least two numbers becomes a line
	    item: the last number is the line total, the one before it the
	    quantity, and the text before the first number the description.

	    The number pattern accepts any digit run with optional thousands
	    commas and an optional decimal part, so values such as ``2``, ``12``
	    and ``4.99`` are recognised.

	    Args:
	        words: OCR words exposing ``text``, ``top`` and ``left``.

	    Returns:
	        At most 40 line-item dicts, in top-to-bottom order.
	    """
	    if not words:
	        return []

	    number_pattern = re.compile(r"\d[\d,]*\.?\d*")
	    # Everything from the first whitespace-separated number to end of row.
	    numeric_tail_pattern = re.compile(r"\s+\d[\d,]*\.?\d*.*$")
	    skip_phrases = (
	        "subtotal",
	        "sub total",
	        "total",
	        "tax",
	        "balance",
	        "thank you",
	        "page ",
	        "invoice",
	        "bill to",
	        "ship to",
	    )

	    rows: dict[int, list[OcrWord]] = {}
	    for word in words:
	        bucket = round(word.top / 12) * 12
	        rows.setdefault(bucket, []).append(word)

	    line_items: list[dict[str, Any]] = []
	    for bucket in sorted(rows):
	        row_words = sorted(rows[bucket], key=lambda w: w.left)
	        text = " ".join(word.text for word in row_words)
	        if len(text) < 4:
	            continue
	        lower = text.lower()
	        if any(phrase in lower for phrase in skip_phrases):
	            continue

	        parsed = (parse_loose_number(match.group()) for match in number_pattern.finditer(text))
	        numbers = [number for number in parsed if number is not None]
	        if len(numbers) < 2:
	            continue

	        quantity = numbers[-2]
	        line_total = numbers[-1]
	        description = numeric_tail_pattern.sub("", text).strip()
	        if len(description) < 3:
	            continue

	        line_items.append(
	            {
	                "description": description,
	                "vendorItemNumber": None,
	                "quantity": quantity,
	                "unit": None,
	                # Derived; skipped for a zero or negative quantity.
	                "unitPrice": round(line_total / quantity, 4) if quantity and quantity > 0 else None,
	                "lineTotal": line_total,
	            }
	        )

	    return line_items[:40]


	def parse_invoice(image, words: list[OcrWord]) -> dict[str, Any]:
	    """Extract invoice header fields and line items.

	    Header fields come from LayoutLMv3 token classification, read as
	    question/answer pairs; line items come from the OCR row heuristic in
	    ``extract_line_items_from_ocr``.

	    Args:
	        image: The page image; its ``size`` is used to normalise boxes.
	        words: OCR words for the page. An empty list short-circuits to an
	            all-``None`` result with ``"low"`` confidence, without loading
	            the model.

	    Returns:
	        A dict with the keys ``vendorName``, ``invoiceNumber``,
	        ``invoiceDate``, ``subtotal``, ``tax``, ``total``, ``currency``,
	        ``confidence``, ``rawNotes`` and ``lineItems``.
	    """
	    import torch
	    from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

	    if not words:
	        return {
	            "vendorName": None,
	            "invoiceNumber": None,
	            "invoiceDate": None,
	            "subtotal": None,
	            "tax": None,
	            "total": None,
	            "currency": None,
	            "confidence": "low",
	            "rawNotes": None,
	            "lineItems": [],
	        }

	    processor = LayoutLMv3Processor.from_pretrained(INVOICE_MODEL, apply_ocr=False)
	    model = LayoutLMv3ForTokenClassification.from_pretrained(INVOICE_MODEL)
	    device = "cuda" if torch.cuda.is_available() else "cpu"
	    model.to(device)
	    model.eval()

	    width, height = image.size
	    word_texts = [word.text for word in words]
	    boxes = normalize_boxes(words, width, height)

	    encoding = processor(
	        image,
	        word_texts,
	        boxes=boxes,
	        return_tensors="pt",
	        truncation=True,
	        padding="max_length",
	        max_length=512,
	    )
	    # Read the token-to-word mapping while ``encoding`` is still the
	    # processor's return object: the plain dict built below has no
	    # ``word_ids`` method.
	    word_ids = encoding.word_ids(batch_index=0)
	    model_inputs = {key: value.to(device) for key, value in encoding.items()}

	    with torch.no_grad():
	        outputs = model(**model_inputs)

	    # One document per call: index the batch dimension explicitly so the
	    # result is always a flat list of per-token class ids.
	    predictions = outputs.logits.argmax(-1)[0].tolist()

	    id2label = model.config.id2label
	    labels = align_word_labels(word_texts, word_ids, predictions, id2label)
	    groups = group_entities(word_texts, labels)
	    qa_pairs = extract_qa_pairs(groups)

	    vendor_name = None
	    invoice_number = None
	    invoice_date = None
	    total = None
	    tax = None
	    subtotal = None

	    for question, answer in qa_pairs:
	        q = question.lower()
	        if question == "HEADER" and not vendor_name:
	            # The first header span is taken as the vendor name.
	            vendor_name = answer
	            continue
	        if any(token in q for token in ("invoice", "inv", "bill")) and "date" in q:
	            invoice_date = normalize_date(answer)
	        elif "inv" in q and any(token in q for token in ("no", "num", "#")):
	            # "num" and "#" cover "Invoice Number" and "Invoice #", which
	            # the bare "no" test does not match.
	            invoice_number = answer
	        elif "date" in q:
	            invoice_date = normalize_date(answer)
	        elif "total" in q and "sub" not in q:
	            total = parse_loose_number(answer)
	        elif "tax" in q:
	            tax = parse_loose_number(answer)
	        elif "subtotal" in q or "sub total" in q:
	            subtotal = parse_loose_number(answer)
	        elif any(token in q for token in ("vendor", "supplier", "seller", "remit", "from")):
	            vendor_name = answer

	    line_items = extract_line_items_from_ocr(words)
	    if line_items and (invoice_number or vendor_name):
	        confidence = "high"
	    elif line_items:
	        confidence = "medium"
	    else:
	        confidence = "low"

	    return {
	        "vendorName": vendor_name,
	        "invoiceNumber": invoice_number,
	        "invoiceDate": invoice_date,
	        "subtotal": subtotal,
	        "tax": tax,
	        "total": total,
	        "currency": None,
	        "confidence": confidence,
	        "rawNotes": None,
	        "lineItems": line_items,
	    }


	def main() -> int:
	    """Command-line entry point: parse one document image to JSON.

	    Reads ``--image`` and ``--type`` from the command line, runs OCR and
	    the matching parser, and prints one JSON object to stdout.

	    Returns:
	        ``0`` on success; ``1`` when the image is missing or parsing
	        fails (the reason is written to stderr).
	    """
	    cli = argparse.ArgumentParser()
	    cli.add_argument("--image", required=True, help="Path to a PNG/JPG/WebP image")
	    cli.add_argument(
	        "--type",
	        default="auto",
	        choices=("auto", "invoice", "receipt"),
	        help="Document type routing",
	    )
	    args = cli.parse_args()

	    image_path = Path(args.image)
	    if not image_path.exists():
	        eprint(f"Image not found: {image_path}")
	        return 1

	    # "auto" means: let the classifier decide.
	    forced_type = args.type if args.type != "auto" else None

	    try:
	        image = load_image(image_path)
	        words = ocr_words(image)
	        doc_type = classify_document_type(words, forced_type)
	        if doc_type == "receipt":
	            result = parse_receipt(image)
	        else:
	            result = parse_invoice(image, words)
	        print(json.dumps({"documentType": doc_type, **result}))
	    except Exception as error:  # noqa: BLE001
	        # Top-level boundary: report any failure on stderr and exit non-zero.
	        eprint(f"Document parse failed: {error}")
	        return 1
	    return 0


	# Script entry point: use main()'s return value as the process exit status.
	if __name__ == "__main__":
	    sys.exit(main())