Spaces:

stubdude
/

fresh-catch-parser

Sleeping

App Files Files Community

stubdude committed on May 23

Commit

fbba60e

1 Parent(s): d543c2a

Add document parser Docker service

Browse files

Files changed (6) hide show

Dockerfile +23 -0
README.md +5 -6
scripts/parse_vendor_document.py +507 -0
scripts/requirements-document-parser.txt +7 -0
services/document-parser-api/main.py +85 -0
services/document-parser-api/requirements.txt +8 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

# Image for the Fresh Catch document parser API (FastAPI service that shells
# out to scripts/parse_vendor_document.py).
FROM python:3.11-slim

# System packages: the Tesseract binary used by pytesseract, plus the shared
# libraries installed alongside it.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces runs Docker containers as UID 1000. Create that user
# with a real home directory so the model cache has somewhere writable to go.
RUN useradd --create-home --uid 1000 user

WORKDIR /app
COPY scripts/requirements-document-parser.txt /app/scripts/requirements-document-parser.txt
COPY services/document-parser-api/requirements.txt /app/services/document-parser-api/requirements.txt
# Dependencies are installed system-wide (as root) before dropping privileges.
RUN pip install --no-cache-dir -r /app/services/document-parser-api/requirements.txt
COPY scripts/parse_vendor_document.py /app/scripts/parse_vendor_document.py
COPY services/document-parser-api/main.py /app/services/document-parser-api/main.py

# HF_HOME pins the transformers/hub download cache to a directory the
# runtime user owns; the models are fetched on first parse, not at build time.
ENV PYTHONUNBUFFERED=1 \
    HOME=/home/user \
    HF_HOME=/home/user/.cache/huggingface
USER user
WORKDIR /app/services/document-parser-api
EXPOSE 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,10 +1,9 @@
 ---
 title: Fresh Catch Parser
-emoji: 🐠
-colorFrom: gray
-colorTo: red
 sdk: docker
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Fresh Catch Parser
+emoji: 🐟
+colorFrom: blue
+colorTo: green
 sdk: docker
+app_port: 7860
 ---
+Document parser API for Fresh Catch Inventory.

scripts/parse_vendor_document.py ADDED Viewed

	@@ -0,0 +1,507 @@

+#!/usr/bin/env python3
+"""
+Parse vendor invoices (LayoutLMv3 FUNSD) or retail receipts (Donut CORD v2).
+Usage:
+  python3 scripts/parse_vendor_document.py --image /path/to.png [--type invoice|receipt|auto]
+Prints a single JSON object to stdout matching ParsedVendorInvoice.
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
# Hugging Face model ids: Donut (CORD v2) for receipts, LayoutLMv3 (FUNSD) for invoices.
RECEIPT_MODEL: str = "naver-clova-ix/donut-base-finetuned-cord-v2"
INVOICE_MODEL: str = "nielsr/layoutlmv3-finetuned-funsd"

# Lower-case substrings whose presence in the OCR text counts towards "invoice".
INVOICE_HINTS: tuple[str, ...] = (
    "invoice", "inv #", "inv no",
    "bill to", "ship to",
    "purchase order", "po #",
    "remit to", "net 30",
    "del weight", "unit price",
    "vendor", "food service",
)

# Lower-case substrings whose presence in the OCR text counts towards "receipt".
RECEIPT_HINTS: tuple[str, ...] = (
    "receipt", "thank you",
    "subtotal", "sub total",
    "change due", "cashier", "register",
    "visa", "mastercard", "debit",
    "loyalty", "store #",
)
@dataclass
class OcrWord:
    """One OCR-recognised word and its pixel bounding box (origin + size)."""

    text: str
    left: int
    top: int
    width: int
    height: int

    @property
    def box(self) -> list[int]:
        """Return the box as ``[x0, y0, x1, y1]`` corner coordinates."""
        right_edge = self.left + self.width
        bottom_edge = self.top + self.height
        return [self.left, self.top, right_edge, bottom_edge]
def eprint(*args: object) -> None:
    """Print ``args`` to stderr, keeping stdout free for the JSON payload."""
    print(*args, file=sys.stderr)
def load_image(path: Path):
    """Open the image at ``path`` and return it converted to RGB.

    Pillow is imported lazily so the module can be imported without it.
    """
    from PIL import Image

    # Image.open() keeps the underlying file open; convert() yields a new
    # in-memory image, so the source handle can be released right away.
    with Image.open(path) as source:
        return source.convert("RGB")
def ocr_words(image) -> list[OcrWord]:
    """Run Tesseract on ``image`` and return the usable words with their boxes.

    Blank tokens are skipped, and so are tokens whose confidence is reported
    in the range 0-34. Tokens with no confidence ("-1" or "") are kept.
    """
    import pytesseract

    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    found: list[OcrWord] = []
    for index, raw_text in enumerate(data["text"]):
        token = (raw_text or "").strip()
        if not token:
            continue
        raw_conf = data["conf"][index]
        confidence = -1 if raw_conf in ("-1", "") else int(float(raw_conf))
        if 0 <= confidence < 35:
            continue
        geometry = {
            field: int(data[field][index])
            for field in ("left", "top", "width", "height")
        }
        found.append(OcrWord(text=token, **geometry))
    return found
def normalize_boxes(words: list[OcrWord], width: int, height: int) -> list[list[int]]:
    """Rescale each word's pixel box to the 0-1000 integer grid.

    ``width`` and ``height`` are the image dimensions in pixels; results are
    clamped to ``[0, 1000]``.
    """

    def to_grid(coordinate: int, extent: int) -> int:
        return min(1000, max(0, int(1000 * coordinate / extent)))

    return [
        [
            to_grid(x0, width),
            to_grid(y0, height),
            to_grid(x1, width),
            to_grid(y1, height),
        ]
        for x0, y0, x1, y1 in (word.box for word in words)
    ]
def classify_document_type(words: list[OcrWord], forced: str | None) -> str:
    """Decide whether the OCR text looks like an ``"invoice"`` or a ``"receipt"``.

    A ``forced`` value of "invoice" or "receipt" is returned untouched.
    Otherwise each hint found in the lower-cased text scores one point, with
    two bonus points for the words "invoice"/"inv " or "receipt". Only a
    strictly higher receipt score yields "receipt"; ties go to "invoice".
    """
    if forced in ("invoice", "receipt"):
        return forced

    haystack = " ".join(word.text for word in words).lower()

    def hits(hints) -> int:
        return sum(1 for hint in hints if hint in haystack)

    invoice_bonus = 2 if ("invoice" in haystack or "inv " in haystack) else 0
    receipt_bonus = 2 if "receipt" in haystack else 0
    invoice_score = hits(INVOICE_HINTS) + invoice_bonus
    receipt_score = hits(RECEIPT_HINTS) + receipt_bonus

    return "receipt" if receipt_score > invoice_score else "invoice"
def parse_loose_number(value: Any) -> float | None:
    """Best-effort conversion of an OCR/model value to ``float``.

    Numbers pass through. Strings are stripped of everything except digits,
    ``.``, ``,`` and ``-``, then the separators are interpreted:

    * both ``.`` and ``,`` present: the right-most one is the decimal mark
      and the other is a thousands separator ("1,234.56", "1.234,56");
    * only commas, grouped in threes after a non-zero lead ("1,234",
      "12,345,678"): thousands separators;
    * any other comma ("12,50"): decimal comma.

    Returns ``None`` for non-string/non-numeric input or unparseable text.
    """
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        return None

    cleaned = re.sub(r"[^0-9.,-]", "", value)
    if not cleaned:
        return None

    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")
    if last_comma != -1 and last_dot != -1:
        if last_dot > last_comma:
            cleaned = cleaned.replace(",", "")
        else:
            cleaned = cleaned.replace(".", "").replace(",", ".")
    elif last_comma != -1:
        if re.fullmatch(r"-?[1-9]\d{0,2}(?:,\d{3})+", cleaned):
            cleaned = cleaned.replace(",", "")
        else:
            cleaned = cleaned.replace(",", ".")

    try:
        return float(cleaned)
    except ValueError:
        return None
def normalize_date(value: str | None) -> str | None:
    """Normalise a date string to ``YYYY-MM-DD`` when it can be done safely.

    * ``None``, empty and whitespace-only input return ``None``.
    * Text already shaped like ``YYYY-MM-DD`` is returned as is.
    * ``M/D/YY`` and ``M/D/YYYY`` (month first) are converted; two-digit
      years are assumed to be 20YY.
    * Anything else, including month/day combinations that are not a real
      calendar date, is returned stripped but otherwise unchanged, so an
      invalid ISO date is never produced.
    """
    from datetime import date

    if not value:
        return None
    value = value.strip()
    if not value:
        return None
    if re.match(r"^\d{4}-\d{2}-\d{2}$", value):
        return value

    # Years must be exactly 2 or 4 digits; a 3-digit year is left untouched.
    match = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2}|\d{4})$", value)
    if not match:
        return value
    month, day, year = match.groups()
    if len(year) == 2:
        year = f"20{year}"
    try:
        parsed = date(int(year), int(month), int(day))
    except ValueError:
        return value
    return parsed.isoformat()
def map_cord_json(cord: dict[str, Any]) -> dict[str, Any]:
    """Map a Donut CORD-style JSON dict onto the parsed-invoice payload shape.

    ``cord["menu"]`` may be a single dict or a list of dicts; entries without
    a usable name are skipped. Amounts go through ``parse_loose_number`` and
    the date through ``normalize_date``. ``confidence`` is "medium" when at
    least one line item was found, otherwise "low". ``rawNotes`` carries the
    first 4000 characters of the serialised input.
    """
    line_items: list[dict[str, Any]] = []
    menu = cord.get("menu")
    menus = menu if isinstance(menu, list) else [menu] if isinstance(menu, dict) else []
    for entry in menus:
        if not isinstance(entry, dict):
            continue
        description = (
            entry.get("nm")
            or entry.get("item")
            or entry.get("name")
            or entry.get("menu.nm")
        )
        if not description or not str(description).strip():
            continue
        line_items.append(
            {
                "description": str(description).strip(),
                "vendorItemNumber": None,
                "quantity": parse_loose_number(entry.get("cnt") or entry.get("num")),
                "unit": str(entry.get("unit") or entry.get("itemsubtotal") or "").strip() or None,
                "unitPrice": parse_loose_number(
                    entry.get("unitprice") or entry.get("price") or entry.get("itemprice")
                ),
                "lineTotal": parse_loose_number(
                    entry.get("price") or entry.get("cntprice") or entry.get("itemprice")
                ),
            }
        )

    sub_total = cord.get("sub_total") or cord.get("subtotal")
    tax = cord.get("tax") or cord.get("tax_price")
    # CORD nests the tax amount inside the sub_total group, so fall back to
    # it when no top-level tax field was produced.
    if not tax and isinstance(sub_total, dict):
        tax = sub_total.get("tax_price")
    total = cord.get("total") or cord.get("total_price") or cord.get("total_etc")

    def price_field(block: Any, *keys: str) -> float | None:
        # A dict block yields the first listed key it contains; any other
        # value is parsed directly.
        if isinstance(block, dict):
            for key in keys:
                if key in block:
                    return parse_loose_number(block[key])
        return parse_loose_number(block)

    return {
        "vendorName": str(cord.get("store") or cord.get("company") or cord.get("brand") or "").strip()
        or None,
        "invoiceNumber": str(cord.get("receipt_no") or cord.get("order_no") or "").strip() or None,
        "invoiceDate": normalize_date(
            str(cord.get("date") or cord.get("receipt_date") or "").strip() or None
        ),
        "subtotal": price_field(sub_total, "price", "subtotal_price", "sub_total_price"),
        "tax": price_field(tax, "price", "tax_price"),
        "total": price_field(total, "total_price", "price", "total"),
        "currency": None,
        "confidence": "medium" if line_items else "low",
        "rawNotes": json.dumps(cord)[:4000] if cord else None,
        "lineItems": line_items,
    }
def parse_receipt(image) -> dict[str, Any]:
    """Run the ``RECEIPT_MODEL`` Donut checkpoint on ``image`` and map its output.

    The model is loaded on every call and moved to CUDA when available. The
    decoded sequence is turned into JSON by the processor and then handed to
    ``map_cord_json``.
    """
    import torch
    from transformers import DonutProcessor, VisionEncoderDecoderModel

    processor = DonutProcessor.from_pretrained(RECEIPT_MODEL)
    model = VisionEncoderDecoderModel.from_pretrained(RECEIPT_MODEL)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    tokenizer = processor.tokenizer
    pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)
    task_prompt = "<s_cord-v2>"
    prompt_ids = tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids.to(device)

    generated = model.generate(
        pixel_values,
        decoder_input_ids=prompt_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    decoded = processor.batch_decode(generated.sequences)[0]
    for special in (tokenizer.eos_token, tokenizer.pad_token):
        decoded = decoded.replace(special, "")
    decoded = decoded.strip()
    # Drop only the first tag, which is the task prompt.
    decoded = re.sub(r"<.*?>", "", decoded, count=1).strip()
    return map_cord_json(processor.token2json(decoded))
def align_word_labels(word_texts: list[str], word_ids: list[int | None], predictions: list[int], id2label: dict) -> list[str]:
    """Collapse per-token predictions into one label per word.

    ``word_ids[i]`` is the index of the word token ``i`` belongs to, or
    ``None`` for special/padding tokens. Each word takes the prediction of
    its first sub-token; later sub-tokens are ignored rather than being
    allowed to overwrite it. Words with no token (for example truncated
    ones) keep "O". ``id2label`` may be keyed by int or by str.
    """
    labels = ["O"] * len(word_texts)
    labelled: set[int] = set()
    for word_id, pred in zip(word_ids, predictions):
        if word_id is None or word_id in labelled:
            continue
        labelled.add(word_id)
        labels[word_id] = id2label.get(pred, id2label.get(str(pred), "O"))
    return labels
def group_entities(words: list[str], labels: list[str]) -> list[tuple[str, str]]:
    """Merge consecutive BIO-tagged words into ``(label, text)`` spans.

    "O" closes the open span. A "B-" tag, or a change of base label, starts a
    new one; otherwise the word extends the open span. The "B-"/"I-" prefix is
    removed from the reported label, and spans with an empty base label are
    dropped.
    """
    spans: list[tuple[str, list[str]]] = []
    active: str | None = None  # base label of the open span, None when closed

    for word, label in zip(words, labels):
        if label == "O":
            active = None
            continue
        base = label[2:] if label[:2] in ("B-", "I-") else label
        if label.startswith("B-") or active != base:
            spans.append((base, [word]))
            active = base
        else:
            spans[-1][1].append(word)

    return [(base, " ".join(tokens).strip()) for base, tokens in spans if base]
def extract_qa_pairs(groups: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Pair each QUESTION span with the next ANSWER span.

    A later QUESTION replaces an unanswered one. An ANSWER with no pending
    question is ignored. HEADER spans are emitted as ``("HEADER", text)``.
    """
    pairs: list[tuple[str, str]] = []
    open_question: str | None = None
    for label, text in groups:
        if label.endswith("QUESTION"):
            open_question = text
            continue
        if label.endswith("ANSWER") and open_question:
            pairs.append((open_question, text))
            open_question = None
            continue
        if label.endswith("HEADER"):
            pairs.append(("HEADER", text))
    return pairs
def extract_line_items_from_ocr(
    words: list[OcrWord],
    *,
    row_height: int = 12,
    max_items: int = 40,
) -> list[dict[str, Any]]:
    """Heuristically rebuild invoice line items from positioned OCR words.

    Words are bucketed into rows by rounding ``top`` to the nearest multiple
    of ``row_height`` pixels, then each row is read left to right. Rows that
    mention totals, tax or header phrases are skipped, as are rows with fewer
    than two numbers. The last number is taken as the line total and the one
    before it as the quantity; the description is the text before the first
    whitespace-preceded number.

    Args:
        words: OCR words with pixel positions.
        row_height: Row bucket size in pixels; raise it for high-resolution
            scans. Must be positive.
        max_items: Maximum number of line items returned. Must not be negative.

    Raises:
        ValueError: If ``row_height`` or ``max_items`` is out of range.
    """
    if row_height <= 0:
        raise ValueError("row_height must be positive")
    if max_items < 0:
        raise ValueError("max_items must not be negative")
    if not words:
        return []

    skip_markers = (
        "subtotal",
        "sub total",
        "total",
        "tax",
        "balance",
        "thank you",
        "page ",
        "invoice",
        "bill to",
        "ship to",
    )
    number_pattern = re.compile(r"\d[\d,]*\.?\d*")
    trailing_numbers = re.compile(r"\s+\d[\d,]*\.?\d*.*$")

    rows: dict[int, list[OcrWord]] = {}
    for word in words:
        bucket = round(word.top / row_height) * row_height
        rows.setdefault(bucket, []).append(word)

    line_items: list[dict[str, Any]] = []
    for _, row_words in sorted(rows.items()):
        row_words = sorted(row_words, key=lambda w: w.left)
        text = " ".join(word.text for word in row_words)
        if len(text) < 4:
            continue
        lower = text.lower()
        if any(marker in lower for marker in skip_markers):
            continue
        parsed = (parse_loose_number(match.group()) for match in number_pattern.finditer(text))
        numbers = [n for n in parsed if n is not None]
        if len(numbers) < 2:
            continue
        # NOTE(review): with "qty unit-price total" rows the second-to-last
        # number is the unit price, not the quantity. Confirm the expected
        # column layout before relying on quantity/unitPrice.
        quantity = numbers[-2]
        line_total = numbers[-1]
        description = trailing_numbers.sub("", text).strip()
        if len(description) < 3:
            continue
        line_items.append(
            {
                "description": description,
                "vendorItemNumber": None,
                "quantity": quantity,
                "unit": None,
                "unitPrice": round(line_total / quantity, 4) if quantity and quantity > 0 else None,
                "lineTotal": line_total,
            }
        )
    return line_items[:max_items]
def parse_invoice(image, words: list[OcrWord]) -> dict[str, Any]:
    """Extract invoice header fields and line items from ``image`` and its OCR words.

    The ``INVOICE_MODEL`` LayoutLMv3 token classifier tags each word; the tags
    are grouped into question/answer/header spans and matched by keyword to
    vendor name, invoice number, date, subtotal, tax and total. Line items
    come from the row heuristic in ``extract_line_items_from_ocr``, not from
    the model.

    Returns the parsed-invoice dict. With no OCR words, an all-empty result
    with ``confidence == "low"`` is returned without loading the model.
    """
    if not words:
        # Checked before the heavy torch/transformers imports below.
        return {
            "vendorName": None,
            "invoiceNumber": None,
            "invoiceDate": None,
            "subtotal": None,
            "tax": None,
            "total": None,
            "currency": None,
            "confidence": "low",
            "rawNotes": None,
            "lineItems": [],
        }

    import torch
    from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

    processor = LayoutLMv3Processor.from_pretrained(INVOICE_MODEL, apply_ocr=False)
    model = LayoutLMv3ForTokenClassification.from_pretrained(INVOICE_MODEL)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    width, height = image.size
    word_texts = [word.text for word in words]
    boxes = normalize_boxes(words, width, height)
    encoding = processor(
        image,
        word_texts,
        boxes=boxes,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # word_ids() exists only on the encoding object the processor returns, so
    # read it before the tensors are copied into a plain dict for the model.
    word_ids = encoding.word_ids(batch_index=0)
    model_inputs = {key: value.to(device) for key, value in encoding.items()}
    with torch.no_grad():
        outputs = model(**model_inputs)
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    if isinstance(predictions, int):
        predictions = [predictions]

    id2label = model.config.id2label
    labels = align_word_labels(word_texts, word_ids, predictions, id2label)
    groups = group_entities(word_texts, labels)
    qa_pairs = extract_qa_pairs(groups)

    vendor_name = None
    invoice_number = None
    invoice_date = None
    total = None
    tax = None
    subtotal = None
    for question, answer in qa_pairs:
        q = question.lower()
        # The first HEADER span is used as the vendor name.
        if question == "HEADER" and not vendor_name:
            vendor_name = answer
            continue
        if any(token in q for token in ("invoice", "inv", "bill")) and "date" in q:
            invoice_date = normalize_date(answer)
        elif any(token in q for token in ("invoice", "inv")) and "no" in q:
            invoice_number = answer
        elif "date" in q:
            invoice_date = normalize_date(answer)
        elif "total" in q and "sub" not in q:
            total = parse_loose_number(answer)
        elif "tax" in q:
            tax = parse_loose_number(answer)
        elif "subtotal" in q or "sub total" in q:
            subtotal = parse_loose_number(answer)
        elif any(token in q for token in ("vendor", "supplier", "seller", "remit", "from")):
            vendor_name = answer

    line_items = extract_line_items_from_ocr(words)
    if line_items and (invoice_number or vendor_name):
        confidence = "high"
    elif line_items:
        confidence = "medium"
    else:
        confidence = "low"
    return {
        "vendorName": vendor_name,
        "invoiceNumber": invoice_number,
        "invoiceDate": invoice_date,
        "subtotal": subtotal,
        "tax": tax,
        "total": total,
        "currency": None,
        "confidence": confidence,
        "rawNotes": None,
        "lineItems": line_items,
    }
def main() -> int:
    """Command-line entry point: parse one image and print the result as JSON.

    Returns 0 on success. Returns 1, with a message on stderr, when the image
    path does not exist or any step of the parse fails.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--image", required=True, help="Path to a PNG/JPG/WebP image")
    cli.add_argument(
        "--type",
        default="auto",
        choices=("auto", "invoice", "receipt"),
        help="Document type routing",
    )
    options = cli.parse_args()

    image_path = Path(options.image)
    if not image_path.exists():
        eprint(f"Image not found: {image_path}")
        return 1

    forced_type = None if options.type == "auto" else options.type
    try:
        image = load_image(image_path)
        words = ocr_words(image)
        doc_type = classify_document_type(words, forced_type)
        if doc_type == "receipt":
            result = parse_receipt(image)
        else:
            result = parse_invoice(image, words)
        print(json.dumps({"documentType": doc_type, **result}))
        return 0
    except Exception as error:  # noqa: BLE001
        # Top-level boundary: report the failure instead of a traceback.
        eprint(f"Document parse failed: {error}")
        return 1


if __name__ == "__main__":
    raise SystemExit(main())

scripts/requirements-document-parser.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+# Optional local parser for invoice/receipt import (see scripts/parse_vendor_document.py).
+# Requires Tesseract OCR installed on the host (macOS: brew install tesseract).
+torch>=2.0
+transformers>=4.36,<5
+pillow>=10.0
+pytesseract>=0.3.10
+accelerate>=0.26

services/document-parser-api/main.py ADDED Viewed

	@@ -0,0 +1,85 @@

+"""
+Hosted document parser API for Fresh Catch Inventory.
+Deploy to Hugging Face Spaces (Docker), Fly.io, or any VM with Python + Tesseract.
+"""
+from __future__ import annotations
+import os
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from fastapi import FastAPI, File, Header, HTTPException, Query, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
# main.py lives two directories below the repository root.
REPO_ROOT = Path(__file__).resolve().parents[2]
PARSE_SCRIPT = REPO_ROOT / "scripts" / "parse_vendor_document.py"
# An empty secret disables bearer-token checking (see verify_auth).
SERVICE_SECRET = os.environ.get("DOCUMENT_PARSER_SERVICE_SECRET", "").strip()

# Comma-separated origin list; surrounding whitespace and empty entries are
# dropped so "https://a.example, https://b.example" matches as intended.
_CORS_ORIGINS = [
    origin.strip()
    for origin in os.environ.get("DOCUMENT_PARSER_CORS_ORIGINS", "*").split(",")
    if origin.strip()
]

app = FastAPI(title="Fresh Catch Document Parser", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=_CORS_ORIGINS,
    # Credentialed CORS is only enabled for an explicit origin allow-list,
    # never together with the "*" wildcard.
    allow_credentials="*" not in _CORS_ORIGINS,
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)
def verify_auth(authorization: str | None) -> None:
    """Reject the request unless it carries the configured bearer token.

    Does nothing when ``SERVICE_SECRET`` is empty. Otherwise the header must
    equal ``"Bearer <secret>"`` exactly.

    Raises:
        HTTPException: 401 when the header is missing or does not match.
    """
    import hmac

    if not SERVICE_SECRET:
        return
    expected = f"Bearer {SERVICE_SECRET}".encode("utf-8")
    supplied = (authorization or "").encode("utf-8")
    # Constant-time comparison so response timing does not leak how much of
    # the secret matched. Bytes are used because compare_digest rejects
    # non-ASCII str.
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Unauthorized")
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe: always answers with a fixed ``ok`` status body."""
    status_body = {"status": "ok"}
    return status_body
@app.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    type: str = Query("auto", pattern="^(auto|invoice|receipt)$"),
    authorization: str | None = Header(default=None),
) -> dict:
    """Parse an uploaded document image by running the parser script.

    The upload is written to a temporary file, ``PARSE_SCRIPT`` is run on it
    in a subprocess, and the script's JSON stdout is returned. The temporary
    file is always removed afterwards.

    Args:
        file: Uploaded image.
        type: Routing hint passed to the script: auto, invoice or receipt.
        authorization: ``Authorization`` header, checked by ``verify_auth``.

    Raises:
        HTTPException: 401 unauthorized, 400 empty upload, 504 parser timeout,
            500 missing script, parser failure or unparseable parser output.
    """
    import asyncio
    import json

    verify_auth(authorization)
    if not PARSE_SCRIPT.exists():
        raise HTTPException(status_code=500, detail="parse_vendor_document.py not found")
    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Empty file")

    # The env value is in milliseconds; clamp so values under 1000 do not
    # truncate to a zero-second timeout.
    timeout_seconds = max(1, int(os.environ.get("DOCUMENT_PARSER_TIMEOUT_MS", "120000")) // 1000)

    suffix = Path(file.filename or "upload.png").suffix or ".png"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(contents)
        image_path = tmp.name
    try:
        # Run the blocking subprocess in a worker thread so the event loop
        # (and /health) stays responsive while a document is parsed.
        completed = await asyncio.to_thread(
            subprocess.run,
            [sys.executable, str(PARSE_SCRIPT), "--image", image_path, "--type", type],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=str(REPO_ROOT),
        )
    except subprocess.TimeoutExpired as error:
        raise HTTPException(
            status_code=504, detail=f"Parse timed out after {timeout_seconds}s"
        ) from error
    finally:
        Path(image_path).unlink(missing_ok=True)

    if completed.returncode != 0:
        detail = (completed.stderr or completed.stdout or "Parse failed").strip()
        raise HTTPException(status_code=500, detail=detail[:2000])
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError as error:
        raise HTTPException(status_code=500, detail=f"Invalid parser output: {error}") from error

services/document-parser-api/requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+fastapi>=0.115.0
+uvicorn[standard]>=0.32.0
+python-multipart>=0.0.12
+torch>=2.0
+transformers>=4.36,<5
+pillow>=10.0
+pytesseract>=0.3.10
+accelerate>=0.26