stubdude committed on
Commit
fbba60e
·
1 Parent(s): d543c2a

Add document parser Docker service

Browse files
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# System packages: the Tesseract binary used by pytesseract, plus shared
# libraries commonly required by imaging wheels.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Docker Spaces run the container as UID 1000. Create that user
# so the process has a real, writable home directory for the model cache.
RUN useradd --create-home --uid 1000 user

WORKDIR /app

# Copy the requirement files first so the dependency layer is cached
# independently of source changes.
COPY scripts/requirements-document-parser.txt /app/scripts/requirements-document-parser.txt
COPY services/document-parser-api/requirements.txt /app/services/document-parser-api/requirements.txt
RUN pip install --no-cache-dir -r /app/services/document-parser-api/requirements.txt

COPY scripts/parse_vendor_document.py /app/scripts/parse_vendor_document.py
COPY services/document-parser-api/main.py /app/services/document-parser-api/main.py

# The parser downloads its models at runtime; point the Hugging Face cache at
# a directory the unprivileged user can write to.
ENV PYTHONUNBUFFERED=1 \
    HOME=/home/user \
    HF_HOME=/home/user/.cache/huggingface

USER user
WORKDIR /app/services/document-parser-api

EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,9 @@
1
  ---
2
  title: Fresh Catch Parser
3
- emoji: 🐠
4
- colorFrom: gray
5
- colorTo: red
6
  sdk: docker
7
- pinned: false
8
  ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Fresh Catch Parser
3
+ emoji: 🐟
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
+ app_port: 7860
8
  ---
9
+ Document parser API for Fresh Catch Inventory.
 
scripts/parse_vendor_document.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Parse vendor invoices (LayoutLMv3 FUNSD) or retail receipts (Donut CORD v2).
4
+
5
+ Usage:
6
+ python3 scripts/parse_vendor_document.py --image /path/to.png [--type invoice|receipt|auto]
7
+
8
+ Prints a single JSON object to stdout matching ParsedVendorInvoice.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import json
15
+ import re
16
+ import sys
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ RECEIPT_MODEL = "naver-clova-ix/donut-base-finetuned-cord-v2"
22
+ INVOICE_MODEL = "nielsr/layoutlmv3-finetuned-funsd"
23
+
24
+ INVOICE_HINTS = (
25
+ "invoice",
26
+ "inv #",
27
+ "inv no",
28
+ "bill to",
29
+ "ship to",
30
+ "purchase order",
31
+ "po #",
32
+ "remit to",
33
+ "net 30",
34
+ "del weight",
35
+ "unit price",
36
+ "vendor",
37
+ "food service",
38
+ )
39
+
40
+ RECEIPT_HINTS = (
41
+ "receipt",
42
+ "thank you",
43
+ "subtotal",
44
+ "sub total",
45
+ "change due",
46
+ "cashier",
47
+ "register",
48
+ "visa",
49
+ "mastercard",
50
+ "debit",
51
+ "loyalty",
52
+ "store #",
53
+ )
54
+
55
+
56
@dataclass
class OcrWord:
    """A single OCR token together with its pixel-space bounding rectangle."""

    # Recognised text of the token.
    text: str
    # Left and top edge of the rectangle.
    left: int
    top: int
    # Extent of the rectangle.
    width: int
    height: int

    @property
    def box(self) -> list[int]:
        """Return the rectangle as ``[x0, y0, x1, y1]`` corner coordinates."""
        right = self.left + self.width
        bottom = self.top + self.height
        return [self.left, self.top, right, bottom]
67
+
68
+
69
def eprint(*args: object) -> None:
    """Write the space-joined arguments plus a newline to standard error."""
    sys.stderr.write(" ".join(str(item) for item in args) + "\n")
71
+
72
+
73
def load_image(path: Path):
    """Open the image at *path* and return it converted to RGB.

    The file is opened in a ``with`` block so the underlying file handle is
    closed as soon as the RGB copy has been produced, instead of lingering
    until garbage collection.
    """
    from PIL import Image

    with Image.open(path) as source:
        # convert() returns a new, fully loaded image, so it stays valid after
        # the source file is closed.
        return source.convert("RGB")
78
+
79
+
80
def ocr_words(image) -> list[OcrWord]:
    """Run Tesseract on *image* and return the usable words with their boxes.

    Blank tokens are dropped, and so are tokens whose reported confidence is
    between 0 and 34. Tokens with no confidence value ("-1" or "") are kept.
    """
    import pytesseract

    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    kept: list[OcrWord] = []
    for idx, raw_text in enumerate(data["text"]):
        token = (raw_text or "").strip()
        if not token:
            continue
        raw_conf = data["conf"][idx]
        score = -1 if raw_conf in ("-1", "") else int(float(raw_conf))
        if 0 <= score < 35:
            continue
        kept.append(
            OcrWord(
                text=token,
                left=int(data["left"][idx]),
                top=int(data["top"][idx]),
                width=int(data["width"][idx]),
                height=int(data["height"][idx]),
            )
        )
    return kept
103
+
104
+
105
def normalize_boxes(words: list[OcrWord], width: int, height: int) -> list[list[int]]:
    """Rescale each word's pixel box to integer coordinates clamped to 0..1000.

    *width* and *height* are the dimensions the x and y coordinates are
    divided by.
    """

    def scaled(coord: int, extent: int) -> int:
        # Map a pixel coordinate onto the 0..1000 grid and clamp it.
        return min(1000, max(0, int(1000 * coord / extent)))

    return [
        [scaled(x0, width), scaled(y0, height), scaled(x1, width), scaled(y1, height)]
        for x0, y0, x1, y1 in (word.box for word in words)
    ]
118
+
119
+
120
def classify_document_type(words: list[OcrWord], forced: str | None) -> str:
    """Decide whether the OCR text looks like an invoice or a receipt.

    A *forced* value of "invoice" or "receipt" is returned unchanged.
    Otherwise each hint phrase found in the lower-cased text scores one point
    for its side, with a two-point bonus for the words "invoice"/"inv " and
    "receipt". The result is "receipt" only when the receipt score is strictly
    higher; ties and everything else resolve to "invoice".
    """
    if forced in ("invoice", "receipt"):
        return forced

    haystack = " ".join(word.text for word in words).lower()
    invoice_votes = len([hint for hint in INVOICE_HINTS if hint in haystack])
    receipt_votes = len([hint for hint in RECEIPT_HINTS if hint in haystack])

    if "invoice" in haystack or "inv " in haystack:
        invoice_votes += 2
    if "receipt" in haystack:
        receipt_votes += 2

    return "receipt" if receipt_votes > invoice_votes else "invoice"
138
+
139
+
140
def parse_loose_number(value: Any) -> float | None:
    """Best-effort conversion of an OCR/model value to ``float``.

    Ints and floats are converted directly. Strings are stripped of everything
    except digits, ``.``, ``,`` and ``-``, then interpreted as follows:

    * ``1,234`` / ``1,234.56`` / ``1,346,000`` - commas are thousands
      separators (every comma group has exactly three digits).
    * ``1.234,56`` / ``1.234.567`` - dots are thousands separators and the
      comma, if present, is the decimal mark.
    * anything else - a comma is treated as a decimal mark (``12,5`` -> 12.5).

    Returns ``None`` for non-string/non-numeric input, for strings with no
    numeric characters, and for strings that still fail ``float()``.

    NOTE: a lone comma followed by exactly three digits ("1,234") is read as
    1234, not 1.234; the previous blanket comma-to-dot replacement made
    grouped amounts such as "1,234.56" unparseable.
    """
    if isinstance(value, (int, float)):
        return float(value)
    if not isinstance(value, str):
        return None

    cleaned = re.sub(r"[^0-9.,-]", "", value)
    if not cleaned:
        return None

    if re.fullmatch(r"-?\d{1,3}(?:,\d{3})+(?:\.\d+)?", cleaned):
        # Comma-grouped thousands with an optional dot decimal part.
        cleaned = cleaned.replace(",", "")
    elif re.fullmatch(r"-?\d{1,3}(?:\.\d{3})+,\d+|-?\d{1,3}(?:\.\d{3}){2,}", cleaned):
        # Dot-grouped thousands with an optional comma decimal part.
        cleaned = cleaned.replace(".", "").replace(",", ".")
    else:
        # Fallback: a comma acts as the decimal mark.
        cleaned = cleaned.replace(",", ".")

    try:
        return float(cleaned)
    except ValueError:
        return None
152
+
153
+
154
def normalize_date(value: str | None) -> str | None:
    """Convert an ``M/D/Y`` date string to ``YYYY-MM-DD`` where possible.

    Falsy input yields ``None``. A value already in ``YYYY-MM-DD`` form is
    returned stripped. A slash-separated value has its month and day
    zero-padded, and a two-digit year is prefixed with "20". Any other text is
    returned stripped but otherwise unchanged.
    """
    if not value:
        return None

    candidate = value.strip()
    if re.match(r"^\d{4}-\d{2}-\d{2}$", candidate):
        return candidate

    parsed = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", candidate)
    if parsed is None:
        return candidate

    month, day, year = parsed.groups()
    full_year = f"20{year}" if len(year) == 2 else year
    return "-".join((full_year, month.zfill(2), day.zfill(2)))
167
+
168
+
169
def map_cord_json(cord: dict[str, Any]) -> dict[str, Any]:
    """Map a Donut CORD-style JSON dict onto the parsed-invoice payload shape.

    ``cord["menu"]`` may be a single dict or a list of dicts; each entry with
    a non-blank description becomes one line item. Header and summary fields
    are read from several alternative keys and fall back to ``None``.

    The tax amount is read from a top-level ``tax``/``tax_price`` entry first.
    When that yields nothing it falls back to ``sub_total["tax_price"]``,
    which is where CORD-v2 style output typically carries it. Previously that
    value was never picked up.

    Returns a dict with the keys vendorName, invoiceNumber, invoiceDate,
    subtotal, tax, total, currency, confidence, rawNotes and lineItems.
    ``confidence`` is "medium" when at least one line item was found,
    otherwise "low". ``rawNotes`` is the JSON dump of *cord* cut to 4000
    characters.
    """
    line_items: list[dict[str, Any]] = []
    menu = cord.get("menu")
    menus = menu if isinstance(menu, list) else [menu] if isinstance(menu, dict) else []

    for entry in menus:
        if not isinstance(entry, dict):
            continue
        description = (
            entry.get("nm")
            or entry.get("item")
            or entry.get("name")
            or entry.get("menu.nm")
        )
        if not description or not str(description).strip():
            continue
        line_items.append(
            {
                "description": str(description).strip(),
                "vendorItemNumber": None,
                "quantity": parse_loose_number(entry.get("cnt") or entry.get("num")),
                "unit": str(entry.get("unit") or entry.get("itemsubtotal") or "").strip() or None,
                "unitPrice": parse_loose_number(
                    entry.get("unitprice") or entry.get("price") or entry.get("itemprice")
                ),
                "lineTotal": parse_loose_number(
                    entry.get("price") or entry.get("cntprice") or entry.get("itemprice")
                ),
            }
        )

    sub_total = cord.get("sub_total") or cord.get("subtotal")
    tax = cord.get("tax") or cord.get("tax_price")
    total = cord.get("total") or cord.get("total_price") or cord.get("total_etc")

    def price_field(block: Any, *keys: str) -> float | None:
        # A summary block is either a dict keyed by one of *keys* or a bare value.
        if isinstance(block, dict):
            for key in keys:
                if key in block:
                    return parse_loose_number(block[key])
        return parse_loose_number(block)

    tax_amount = price_field(tax, "price", "tax_price")
    if tax_amount is None and isinstance(sub_total, dict):
        # Fall back to the tax figure nested inside the sub_total block.
        tax_amount = parse_loose_number(sub_total.get("tax_price"))

    return {
        "vendorName": str(cord.get("store") or cord.get("company") or cord.get("brand") or "").strip()
        or None,
        "invoiceNumber": str(cord.get("receipt_no") or cord.get("order_no") or "").strip() or None,
        "invoiceDate": normalize_date(
            str(cord.get("date") or cord.get("receipt_date") or "").strip() or None
        ),
        "subtotal": price_field(sub_total, "price", "subtotal_price", "sub_total_price"),
        "tax": tax_amount,
        "total": price_field(total, "total_price", "price", "total"),
        "currency": None,
        "confidence": "medium" if line_items else "low",
        "rawNotes": json.dumps(cord)[:4000] if cord else None,
        "lineItems": line_items,
    }
226
+
227
+
228
def parse_receipt(image) -> dict[str, Any]:
    """Run the Donut receipt model on *image* and map its output to the payload.

    Loads the processor and model named by ``RECEIPT_MODEL``, generates a token
    sequence from the ``<s_cord-v2>`` prompt, strips the EOS/PAD markers and
    the first tag, converts the sequence to JSON and hands it to
    ``map_cord_json``. Uses CUDA when torch reports it as available.
    """
    import torch
    from transformers import DonutProcessor, VisionEncoderDecoderModel

    processor = DonutProcessor.from_pretrained(RECEIPT_MODEL)
    model = VisionEncoderDecoderModel.from_pretrained(RECEIPT_MODEL)
    target = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(target)
    model.eval()

    tokenizer = processor.tokenizer
    pixels = processor(image, return_tensors="pt").pixel_values.to(target)
    prompt_ids = tokenizer(
        "<s_cord-v2>", add_special_tokens=False, return_tensors="pt"
    ).input_ids.to(target)

    generation_options = {
        "decoder_input_ids": prompt_ids,
        "max_length": model.decoder.config.max_position_embeddings,
        "early_stopping": True,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "use_cache": True,
        "num_beams": 1,
        "bad_words_ids": [[tokenizer.unk_token_id]],
        "return_dict_in_generate": True,
    }
    generated = model.generate(pixels, **generation_options)

    decoded = processor.batch_decode(generated.sequences)[0]
    for marker in (tokenizer.eos_token, tokenizer.pad_token):
        decoded = decoded.replace(marker, "")
    decoded = decoded.strip()
    # Drop only the first tag in the decoded text.
    decoded = re.sub(r"<.*?>", "", decoded, count=1).strip()
    return map_cord_json(processor.token2json(decoded))
266
+
267
+
268
def align_word_labels(word_texts: list[str], word_ids: list[int | None], predictions: list[int], id2label: dict) -> list[str]:
    """Collapse token-level predictions to one label per input word.

    *word_ids* maps each token position to the index of the word it came from,
    with ``None`` for tokens that belong to no word. A word split into several
    sub-tokens takes the label of its FIRST sub-token; later sub-tokens no
    longer overwrite it. Word ids outside ``range(len(word_texts))`` are
    ignored rather than raising or wrapping around.

    *id2label* may be keyed by ``int`` or by ``str``; unknown ids map to "O".
    Words with no token (for example, truncated away) keep the default "O".
    """
    labels = ["O"] * len(word_texts)
    labelled: set[int] = set()
    for word_id, pred in zip(word_ids, predictions):
        if word_id is None or word_id in labelled:
            continue
        if not 0 <= word_id < len(labels):
            continue
        labelled.add(word_id)
        labels[word_id] = id2label.get(pred, id2label.get(str(pred), "O"))
    return labels
276
+
277
+
278
def group_entities(words: list[str], labels: list[str]) -> list[tuple[str, str]]:
    """Merge consecutive BIO-tagged words into ``(label, text)`` entities.

    An "O" tag closes the current entity. A "B-" tag, or a change of base
    label, starts a new one; otherwise the word is appended to the open
    entity. The "B-"/"I-" prefix is removed from the reported label, and
    entities whose base label is empty are dropped.
    """
    runs: list[tuple[str, list[str]]] = []
    run_open = False

    for token, tag in zip(words, labels):
        if tag == "O":
            run_open = False
            continue
        marker = tag[:2]
        kind = tag[2:] if marker in ("B-", "I-") else tag
        if run_open and marker != "B-" and runs[-1][0] == kind:
            runs[-1][1].append(token)
        else:
            runs.append((kind, [token]))
            run_open = True

    return [(kind, " ".join(tokens).strip()) for kind, tokens in runs if kind]
304
+
305
+
306
def extract_qa_pairs(groups: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Pair each QUESTION entity with the next ANSWER entity.

    A later QUESTION replaces an unanswered one, and an ANSWER with no pending
    question is discarded. HEADER entities are emitted as ``("HEADER", text)``
    and do not affect the pending question.
    """
    collected: list[tuple[str, str]] = []
    question: str | None = None

    for kind, content in groups:
        if kind.endswith("QUESTION"):
            question = content
            continue
        if kind.endswith("ANSWER"):
            if question:
                collected.append((question, content))
                question = None
            continue
        if kind.endswith("HEADER"):
            collected.append(("HEADER", content))

    return collected
318
+
319
+
320
def extract_line_items_from_ocr(words: list[OcrWord]) -> list[dict[str, Any]]:
    """Heuristically rebuild invoice line items from positioned OCR words.

    Words are grouped into rows by rounding ``top`` to the nearest multiple of
    12, and each row is read left to right. Rows are skipped when they are
    shorter than 4 characters, contain a summary/header keyword, hold fewer
    than two numbers, or leave a description shorter than 3 characters.

    The last number in a row is the line total. When the row has at least
    three numbers and the two before the total multiply to it (within 1%),
    they are taken as quantity and unit price; previously such a row reported
    the unit price as the quantity. Otherwise the second-to-last number is the
    quantity and the unit price is ``lineTotal / quantity``, or ``None`` when
    the quantity is zero.

    Returns at most 40 items.
    """
    if not words:
        return []

    rows: dict[int, list[OcrWord]] = {}
    for word in words:
        bucket = round(word.top / 12) * 12
        rows.setdefault(bucket, []).append(word)

    skip_markers = (
        "subtotal",
        "sub total",
        "total",
        "tax",
        "balance",
        "thank you",
        "page ",
        "invoice",
        "bill to",
        "ship to",
    )

    line_items: list[dict[str, Any]] = []
    for _, row_words in sorted(rows.items()):
        row_words = sorted(row_words, key=lambda w: w.left)
        text = " ".join(word.text for word in row_words)
        if len(text) < 4:
            continue
        lower = text.lower()
        if any(skip in lower for skip in skip_markers):
            continue

        parsed = (
            parse_loose_number(match.group())
            for match in re.finditer(r"\d[\d,]*\.?\d*", text)
        )
        numbers = [n for n in parsed if n is not None]
        if len(numbers) < 2:
            continue

        line_total = numbers[-1]
        quantity = numbers[-2]
        unit_price = None
        tolerance = 0.01 * max(1.0, abs(line_total))
        if len(numbers) >= 3 and abs(numbers[-3] * numbers[-2] - line_total) <= tolerance:
            # "qty  unit-price  total" layout: the product confirms the columns.
            quantity, unit_price = numbers[-3], numbers[-2]
        elif quantity > 0:
            unit_price = round(line_total / quantity, 4)

        # Everything from the first whitespace-preceded number onward is not description.
        description = re.sub(r"\s+\d[\d,]*\.?\d*.*$", "", text).strip()
        if len(description) < 3:
            continue

        line_items.append(
            {
                "description": description,
                "vendorItemNumber": None,
                "quantity": quantity,
                "unit": None,
                "unitPrice": unit_price,
                "lineTotal": line_total,
            }
        )

    return line_items[:40]
379
+
380
+
381
def parse_invoice(image, words: list[OcrWord]) -> dict[str, Any]:
    """Extract invoice header fields and line items from *image* and its OCR words.

    Runs the LayoutLMv3 token classifier named by ``INVOICE_MODEL`` over the
    words and their normalised boxes, groups the predicted labels into
    question/answer pairs, and matches the question text against keywords to
    fill vendor name, invoice number, date, subtotal, tax and total. Line
    items come from ``extract_line_items_from_ocr``.

    Returns the parsed-invoice payload dict. With no OCR words an all-``None``
    payload with confidence "low" is returned without loading the model.
    Confidence is "high" when line items exist alongside an invoice number or
    vendor name, "medium" with line items only, otherwise "low".
    """
    import torch
    from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

    if not words:
        return {
            "vendorName": None,
            "invoiceNumber": None,
            "invoiceDate": None,
            "subtotal": None,
            "tax": None,
            "total": None,
            "currency": None,
            "confidence": "low",
            "rawNotes": None,
            "lineItems": [],
        }

    processor = LayoutLMv3Processor.from_pretrained(INVOICE_MODEL, apply_ocr=False)
    model = LayoutLMv3ForTokenClassification.from_pretrained(INVOICE_MODEL)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    width, height = image.size
    word_texts = [word.text for word in words]
    boxes = normalize_boxes(words, width, height)

    encoding = processor(
        image,
        word_texts,
        boxes=boxes,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # word_ids() lives on the processor's encoding object, not on a plain
    # dict, so capture it BEFORE rebuilding the inputs as a dict on the device.
    word_ids = encoding.word_ids(batch_index=0)
    model_inputs = {key: value.to(device) for key, value in encoding.items()}

    with torch.no_grad():
        outputs = model(**model_inputs)

    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    if isinstance(predictions, int):
        predictions = [predictions]

    id2label = model.config.id2label
    labels = align_word_labels(word_texts, word_ids, predictions, id2label)
    groups = group_entities(word_texts, labels)
    qa_pairs = extract_qa_pairs(groups)

    vendor_name = None
    invoice_number = None
    invoice_date = None
    total = None
    tax = None
    subtotal = None

    for question, answer in qa_pairs:
        q = question.lower()
        if question == "HEADER" and not vendor_name:
            vendor_name = answer
            continue
        if any(token in q for token in ("invoice", "inv", "bill")) and "date" in q:
            invoice_date = normalize_date(answer)
        elif any(token in q for token in ("invoice", "inv")) and any(
            marker in q for marker in ("no", "num", "#")
        ):
            # "num" and "#" cover "Invoice Number" / "Invoice #", which the
            # bare "no" substring test missed.
            invoice_number = answer
        elif "date" in q:
            invoice_date = normalize_date(answer)
        elif "total" in q and "sub" not in q:
            total = parse_loose_number(answer)
        elif "tax" in q:
            tax = parse_loose_number(answer)
        elif "subtotal" in q or "sub total" in q:
            subtotal = parse_loose_number(answer)
        elif any(token in q for token in ("vendor", "supplier", "seller", "remit", "from")):
            vendor_name = answer

    line_items = extract_line_items_from_ocr(words)
    if line_items and (invoice_number or vendor_name):
        confidence = "high"
    elif line_items:
        confidence = "medium"
    else:
        confidence = "low"

    return {
        "vendorName": vendor_name,
        "invoiceNumber": invoice_number,
        "invoiceDate": invoice_date,
        "subtotal": subtotal,
        "tax": tax,
        "total": total,
        "currency": None,
        "confidence": confidence,
        "rawNotes": None,
        "lineItems": line_items,
    }
475
+
476
+
477
def main() -> int:
    """Command-line entry point: parse one image and print the JSON payload.

    Returns 0 on success and 1 when the image path does not exist or any step
    of the pipeline raises. Failures are reported on standard error.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--image", required=True, help="Path to a PNG/JPG/WebP image")
    cli.add_argument(
        "--type",
        default="auto",
        choices=("auto", "invoice", "receipt"),
        help="Document type routing",
    )
    options = cli.parse_args()

    source = Path(options.image)
    if not source.exists():
        eprint(f"Image not found: {source}")
        return 1

    # "auto" means no forced type; the classifier decides.
    forced = options.type if options.type != "auto" else None

    try:
        picture = load_image(source)
        tokens = ocr_words(picture)
        kind = classify_document_type(tokens, forced)
        if kind == "receipt":
            fields = parse_receipt(picture)
        else:
            fields = parse_invoice(picture, tokens)
        print(json.dumps({"documentType": kind, **fields}))
    except Exception as error:  # noqa: BLE001
        eprint(f"Document parse failed: {error}")
        return 1
    return 0
504
+
505
+
506
+ if __name__ == "__main__":
507
+ raise SystemExit(main())
scripts/requirements-document-parser.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Optional local parser for invoice/receipt import (see scripts/parse_vendor_document.py).
2
+ # Requires Tesseract OCR installed on the host (macOS: brew install tesseract).
3
+ torch>=2.0
4
+ transformers>=4.36,<5
5
+ pillow>=10.0
6
+ pytesseract>=0.3.10
7
+ accelerate>=0.26
services/document-parser-api/main.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hosted document parser API for Fresh Catch Inventory.
3
+
4
+ Deploy to Hugging Face Spaces (Docker), Fly.io, or any VM with Python + Tesseract.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ from pathlib import Path
14
+
15
+ from fastapi import FastAPI, File, Header, HTTPException, Query, UploadFile
16
+ from fastapi.middleware.cors import CORSMiddleware
17
+
18
# Repository root: two directory levels above this file's directory.
REPO_ROOT = Path(__file__).resolve().parents[2]
PARSE_SCRIPT = REPO_ROOT / "scripts" / "parse_vendor_document.py"
# Shared bearer secret; when empty, verify_auth() lets every request through.
SERVICE_SECRET = os.environ.get("DOCUMENT_PARSER_SERVICE_SECRET", "").strip()

app = FastAPI(title="Fresh Catch Document Parser", version="1.0.0")

# Comma-separated origin list. Entries are stripped so "a.com, b.com" matches,
# and an empty setting falls back to the wildcard.
_CORS_ORIGINS = [
    origin.strip()
    for origin in os.environ.get("DOCUMENT_PARSER_CORS_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_CORS_ORIGINS,
    # Credentialed CORS is only enabled for an explicit origin list; combining
    # it with the wildcard would let any site send credentialed requests.
    allow_credentials="*" not in _CORS_ORIGINS,
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)
31
+
32
+
33
def verify_auth(authorization: str | None) -> None:
    """Reject the request unless it carries the configured bearer secret.

    When ``SERVICE_SECRET`` is empty no check is performed. Otherwise
    *authorization* must equal ``"Bearer <secret>"`` exactly.

    Raises:
        HTTPException: status 401 when the header is missing or wrong.
    """
    import hmac

    if not SERVICE_SECRET:
        return

    expected = f"Bearer {SERVICE_SECRET}".encode("utf-8")
    supplied = (authorization or "").encode("utf-8")
    # Constant-time comparison so response timing does not reveal how much of
    # the secret matched.
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Unauthorized")
38
+
39
+
40
@app.get("/health")
def health() -> dict[str, str]:
    """Health endpoint: always reports status "ok"."""
    payload = {"status": "ok"}
    return payload
43
+
44
+
45
@app.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    type: str = Query("auto", pattern="^(auto|invoice|receipt)$"),
    authorization: str | None = Header(default=None),
) -> dict:
    """Parse an uploaded document image by running the parser script.

    The upload is written to a temporary file and passed to
    ``parse_vendor_document.py`` in a subprocess; the temporary file is always
    removed afterwards. The subprocess runs in a worker thread so a slow parse
    does not block other requests such as ``/health``.

    The time limit comes from ``DOCUMENT_PARSER_TIMEOUT_MS`` (default 120000)
    and is never less than one second.

    Returns the JSON object printed by the script.

    Raises:
        HTTPException: 401 on bad credentials, 400 for an empty upload, 504
            when the parser exceeds the time limit, 500 when the script is
            missing, exits non-zero, or prints invalid JSON.
    """
    import asyncio
    import json

    verify_auth(authorization)

    if not PARSE_SCRIPT.exists():
        raise HTTPException(status_code=500, detail="parse_vendor_document.py not found")

    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Empty file")

    # Clamp to one second: a setting below 1000 ms would otherwise become a
    # zero timeout and fail every request.
    timeout_seconds = max(1, int(os.environ.get("DOCUMENT_PARSER_TIMEOUT_MS", "120000")) // 1000)

    suffix = Path(file.filename or "upload.png").suffix or ".png"
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
        tmp.write(contents)
        image_path = tmp.name

    try:
        completed = await asyncio.to_thread(
            subprocess.run,
            [sys.executable, str(PARSE_SCRIPT), "--image", image_path, "--type", type],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=str(REPO_ROOT),
        )
    except subprocess.TimeoutExpired as error:
        raise HTTPException(
            status_code=504, detail=f"Parser timed out after {timeout_seconds}s"
        ) from error
    finally:
        Path(image_path).unlink(missing_ok=True)

    if completed.returncode != 0:
        detail = (completed.stderr or completed.stdout or "Parse failed").strip()
        raise HTTPException(status_code=500, detail=detail[:2000])

    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError as error:
        raise HTTPException(status_code=500, detail=f"Invalid parser output: {error}") from error
services/document-parser-api/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.115.0
2
+ uvicorn[standard]>=0.32.0
3
+ python-multipart>=0.0.12
4
+ torch>=2.0
5
+ transformers>=4.36,<5
6
+ pillow>=10.0
7
+ pytesseract>=0.3.10
8
+ accelerate>=0.26