Spaces:
Running
Running
| import re | |
| import io | |
| from datetime import datetime | |
| from typing import Optional, List, Any | |
| import pdfplumber | |
| from fastapi import FastAPI, UploadFile, File, HTTPException | |
| from pydantic import BaseModel, Field | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Pydantic models | |
| # ───────────────────────────────────────────────────────────────────────────── | |
class VoucherLine(BaseModel):
    """A single article line of a parsed order confirmation.

    The parsers in this file build instances from a dict keyed by
    ``$type``, ``Number``, ``Quantity``, ``Price`` and ``VatCode``.
    """

    # Type discriminator string. It is exposed under the alias "$type"
    # because "$type" is not a valid Python attribute name.
    type: str = Field(
        default="E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
        alias="$type",
    )
    # Article number exactly as captured from the supplier document.
    Number: str
    Quantity: float
    # Unit price as captured by the supplier-specific parser.
    Price: float
    # Fields below are currently disabled.
    # SellPrice: float
    # Description: str = ""
    VatCode: str = "01"
    # DeliveryDate: Optional[str] = None

    # Lets the aliased field be populated by its Python name ("type") too.
    model_config = {"populate_by_name": True}
class VoucherResponse(BaseModel):
    """Header data and article lines extracted from one order confirmation.

    The three ``Optional`` fields have no default, so every parser passes
    them explicitly (possibly as ``None``).
    """

    # Supplier: str
    # Order number found in the document; the parsers pass "" when the
    # label is not found.
    OrderNumber: str
    # Date strings produced by to_iso()/german_date_to_iso(): ISO
    # "YYYY-MM-DD" when the text could be parsed, otherwise the raw text.
    DeliveryDate: Optional[str]
    CustomerNumber: Optional[str]
    VoucherDate: Optional[str]
    # Currency: str
    # Never filled by the parsers in this file; defaults to an empty list.
    AdditionalFields: List[Any] = []
    VoucherLines: List[VoucherLine]
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Sell-price markup | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Multiplier applied to a supplier's unit price to obtain the sell price,
# keyed by the supplier name returned by identify_supplier().
SELL_PRICE_MARKUP = dict(
    Trelleborg=1.35,
    Cleanfix=1.30,
    Polyflex=1.30,
)
def sell_price(unit_price: float, supplier: str) -> float:
    """Return the marked-up sell price for *unit_price*, rounded to 2 decimals.

    The factor is looked up in ``SELL_PRICE_MARKUP``; a supplier that is not
    listed there gets a factor of 1.40.
    """
    try:
        factor = SELL_PRICE_MARKUP[supplier]
    except KeyError:
        factor = 1.40
    marked_up = unit_price * factor
    return round(marked_up, 2)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Helpers | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def extract_text(pdf_bytes: bytes) -> str:
    """Return the text of every page of the PDF, joined with newlines.

    Pages for which pdfplumber yields no text (``None`` or ``""``) are
    left out of the result.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        # Extract while the document is still open.
        page_texts = [page.extract_text() for page in document.pages]
    return "\n".join(chunk for chunk in page_texts if chunk)
def to_iso(raw: str) -> Optional[str]:
    """Convert a day-first date string to ISO ``YYYY-MM-DD``.

    Accepted layouts are ``dd.mm.yyyy``, ``dd.mm.yy``, ``dd/mm/yyyy`` and
    ``dd/mm/yy``. When none of them fits, the stripped input is returned
    unchanged.
    """
    cleaned = raw.strip()
    known_formats = ("%d.%m.%Y", "%d.%m.%y", "%d/%m/%Y", "%d/%m/%y")
    for layout in known_formats:
        try:
            parsed = datetime.strptime(cleaned, layout)
        except ValueError:
            continue
        return parsed.date().isoformat()
    return cleaned
# Lower-case German month name -> two-digit month number.
GERMAN_MONTHS = {
    month_name: f"{position:02d}"
    for position, month_name in enumerate(
        (
            "januar",
            "februar",
            "märz",
            "april",
            "mai",
            "juni",
            "juli",
            "august",
            "september",
            "oktober",
            "november",
            "dezember",
        ),
        start=1,
    )
}
def german_date_to_iso(raw: str) -> Optional[str]:
    """Convert a date such as ``6. März 2026`` to ISO ``YYYY-MM-DD``.

    The month name is looked up case-insensitively in ``GERMAN_MONTHS``.
    Input that does not start with "<day>. <word> <4-digit year>", or whose
    month word is unknown, is handed to ``to_iso`` instead.
    """
    found = re.match(r"(\d{1,2})\.\s*(\w+)\s+(\d{4})", raw.strip(), re.IGNORECASE)
    if found is None:
        return to_iso(raw)

    day_text, month_word, year_text = found.groups()
    month_number = GERMAN_MONTHS.get(month_word.lower())
    if not month_number:
        return to_iso(raw)
    return f"{year_text}-{month_number}-{day_text.zfill(2)}"
def num(s: str) -> float:
    """Parse a decimal number that may use a comma as the decimal mark.

    A value containing only commas or only dots is treated as before: every
    comma is read as a decimal point ("13,30" -> 13.3, "52.51" -> 52.51).
    When both separators occur, the right-most one is the decimal mark and
    the other one is a thousands separator ("1.467,91" and "1,467.91" both
    give 1467.91); previously such input raised ValueError.

    Args:
        s: Number as text; surrounding whitespace is ignored.

    Returns:
        The parsed value.

    Raises:
        ValueError: If the text is still not a valid float after
            normalisation (e.g. "1,2,3" or an empty string).
    """
    cleaned = s.strip()
    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            # European style: dots group thousands, comma is the decimal mark.
            cleaned = cleaned.replace(".", "").replace(",", ".")
        else:
            # English style: commas group thousands, dot is the decimal mark.
            cleaned = cleaned.replace(",", "")
    else:
        cleaned = cleaned.replace(",", ".")
    return float(cleaned)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Supplier identification | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def identify_supplier(text: str) -> str:
    """Return the supplier name detected in *text*, or ``"Unknown"``.

    Checks run in a fixed order (Trelleborg, Cleanfix, Polyflex) and the
    first hit wins. Names are matched case-insensitively; Cleanfix is also
    recognised by an "Auftragsbestätigung VA<digits>" marker.
    """
    shouted = text.upper()

    def looks_like_cleanfix() -> bool:
        if "CLEANFIX" in shouted:
            return True
        return re.search(r"Auftragsbestätigung VA\d+", text) is not None

    ordered_checks = (
        ("Trelleborg", lambda: "TRELLEBORG" in shouted),
        ("Cleanfix", looks_like_cleanfix),
        ("Polyflex", lambda: "POLYFLEX" in shouted),
    )
    return next(
        (supplier for supplier, applies in ordered_checks if applies()),
        "Unknown",
    )
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Parser: Trelleborg | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Relevant extracted text structure: | |
| # IHRE REFERENZ VON RECHNUNG AN Referenzen | |
| # 2600364 100 D01 Herr Baumann | |
| # UNSERE REFERENZ IE SCHLAUCHSERVICE BAUMANN GMBH | |
| # 0010223953 | |
| # ... | |
| # ADE - Alexandra DENEU +33 473 258 206 10/03/26 1/ 2 | |
| # ... | |
| # Pos10 ZollNr : 4009410000 Ursp.:FR | |
| # 0060068 CITERDIAL 38 L 13,30 11/03/26 M 13,3 52,51 | |
| # %RAB: 33,00 35,181 467,91 | |
def parse_trelleborg(text: str) -> VoucherResponse:
    """Parse the extracted text of a Trelleborg order confirmation.

    Header fields:
      * voucher date    - first ``dd/mm/yy`` token found anywhere in the text
      * order number    - first 7-digit number that starts a line after
                          "IHRE REFERENZ"
      * customer number - first token on the line after "UNSERE REFERENZ"

    Every article is a three-line block ("PosNN ZollNr ..." header, the
    article row, the "%RAB:" row). ``Price`` is the second number of the
    "%RAB:" row, i.e. the unit price after discount, rounded to 4 decimals.

    Args:
        text: Full text extracted from the PDF.

    Returns:
        A VoucherResponse; ``VoucherLines`` is empty when no block matches.

    Raises:
        ValueError: If a matched quantity or price cannot be parsed by
            ``num``.
    """
    # Voucher date from header line, e.g. "10/03/26".
    voucher_date = None
    m = re.search(r"\b(\d{2}/\d{2}/\d{2})\b", text)
    if m:
        voucher_date = to_iso(m.group(1))

    # Our order number: first 7-digit number after the "IHRE REFERENZ" line.
    our_order = ""
    m = re.search(r"IHRE REFERENZ\b.*?\n(\d{7})\b", text, re.DOTALL)
    if m:
        our_order = m.group(1)

    # Supplier's reference number ("UNSERE REFERENZ" line, then next line).
    customer_num = None
    m = re.search(r"UNSERE REFERENZ\b.*?\n(\S+)", text, re.DOTALL)
    if m:
        customer_num = m.group(1)

    # Article blocks:
    #   Line A: Pos10 ZollNr : 4009410000 Ursp.:FR
    #   Line B: 0060068 CITERDIAL 38 L 13,30 11/03/26 M 13,3 52,51
    #   Line C: %RAB: 33,00 35,181 467,91
    # The description group may still wrap over a line break, but it must
    # not run across a "%RAB:" row or the next "PosNN ZollNr" header.
    # Without that guard, a block whose row does not fit the pattern would
    # be paired with the quantity and price of the following article.
    block_re = re.compile(
        r"Pos\d+\s+ZollNr\s*:\s*\S+\s+Ursp\.:\s*\S+\s*\n"
        r"(\S+)\s+((?:(?!%RAB:|Pos\d+\s+ZollNr).)+?)\s+(\d{2}/\d{2}/\d{2})"
        r"\s+M\s+([\d,]+)\s+([\d,]+)\s*\n"
        r"\s*%RAB:\s*([\d,]+)\s+([\d,]+)\s+([\d,]+)",
        re.DOTALL,
    )

    voucher_lines = []
    delivery_date = voucher_date
    for m in block_re.finditer(text):
        line_date = to_iso(m.group(3))
        # group(5) is the gross unit price and group(6) the discount %;
        # only the discounted unit price in group(7) is reported.
        net_price = num(m.group(7))

        # Take over a line date only while the delivery date still equals
        # the voucher date.
        if delivery_date == voucher_date and line_date:
            delivery_date = line_date

        voucher_lines.append(VoucherLine(**{
            "$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
            "Number": m.group(1),
            "Quantity": num(m.group(4)),
            "Price": round(net_price, 4),
            "VatCode": "01",
        }))

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Parser: Cleanfix | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Relevant extracted text structure: | |
| # Datum 10.03.2026 | |
| # Ihre Bestellnr. 2600370 | |
| # Debitorennr. 35228 | |
| # Auslieferdatum 11.03.2026 | |
| # Auftragsbestätigung VA516165 | |
| # Pos Artikelnr. Menge / Einheit VK-Preis % Betrag | |
| # 1 710.657 2.00Stück 87.05 35 113.16 ← no space between qty and "Stück"! | |
| # Ladesteckdose FT80A 16mm2 | |
| # 2 710.656 2.00Stück 64.40 35 83.72 | |
| # Ladestecker FT80A 16mm2 | |
| # 3 607.000 1.00Stück 105.75 35 68.74 | |
| # Treppenadapter 1Düse, 23cm | |
def parse_cleanfix(text: str) -> VoucherResponse:
    """Parse the extracted text of a Cleanfix order confirmation.

    Header fields are located by their labels: "Datum" (voucher date),
    "Auslieferdatum" (delivery date, falling back to the voucher date),
    "Ihre Bestellnr." (order number) and "Debitorennr." (customer number).

    Args:
        text: Full text extracted from the PDF.

    Returns:
        A VoucherResponse with one VoucherLine per matched article row;
        ``VoucherLines`` is empty when no row matches.

    Raises:
        ValueError: If a matched quantity or price is not a valid float
            (e.g. it contains several dots).
    """
    # Voucher date
    voucher_date = None
    m = re.search(r"Datum\s+(\d{2}\.\d{2}\.\d{4})", text)
    if m:
        voucher_date = to_iso(m.group(1))

    # Delivery date
    delivery_date = voucher_date
    m = re.search(r"Auslieferdatum\s+(\d{2}\.\d{2}\.\d{4})", text)
    if m:
        delivery_date = to_iso(m.group(1))

    # Our order number
    our_order = ""
    m = re.search(r"Ihre Bestellnr\.\s+(\d+)", text)
    if m:
        our_order = m.group(1)

    # Customer number
    customer_num = None
    m = re.search(r"Debitorennr\.\s+(\S+)", text)
    if m:
        customer_num = m.group(1)

    # Article row, e.g. "1 710.657 2.00Stück 87.05 35 113.16".
    # Quantity and "Stück" are concatenated in the extracted text.
    # Groups: 1 position, 2 article number, 3 quantity, 4 VK-Preis,
    # 5 discount %, 6 line total. The description sits on the following
    # line and is not part of the response, so it is not read.
    line_re = re.compile(
        r"^(\d+)\s+([\d.]+)\s+([\d.]+)Stück\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$",
        re.MULTILINE,
    )

    # NOTE(review): Price is the VK-Preis before the % discount, whereas
    # parse_trelleborg reports the discounted unit price - confirm which
    # one the consumer of this payload expects.
    voucher_lines = [
        VoucherLine(**{
            "$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
            "Number": row.group(2),
            "Quantity": float(row.group(3)),
            "Price": float(row.group(4)),
            "VatCode": "01",
        })
        for row in line_re.finditer(text)
    ]

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Parser: Polyflex | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Relevant extracted text structure: | |
| # Kundennummer: D00030 | |
| # Ihre Bestellung 2600357 | |
| # Würenlos, 06.03.26 | |
| # 21200025 Schlauch POLYWELL antistatisch 60.00 Meter 6.80 45.00 224.40 | |
| # id=25mm, 2x30m | |
| # Warenausgangsdatum: 6. März 2026 | |
def parse_polyflex(text: str) -> VoucherResponse:
    """Parse the extracted text of a Polyflex order confirmation.

    Header fields:
      * voucher date    - the date following "Würenlos," (e.g. "06.03.26")
      * delivery date   - the text after "Warenausgangsdatum:" converted by
                          ``german_date_to_iso``; falls back to the voucher
                          date when the label is missing
      * order number    - digits after "Ihre Bestellung"
      * customer number - token after "Kundennummer:"

    Args:
        text: Full text extracted from the PDF.

    Returns:
        A VoucherResponse with one VoucherLine per matched article row;
        ``VoucherLines`` is empty when no row matches.

    Raises:
        ValueError: If a matched quantity or price is not a valid float
            (e.g. it contains several dots).
    """
    # Voucher date, e.g. "Würenlos, 06.03.26"
    voucher_date = None
    m = re.search(r"Würenlos,?\s+(\d{2}\.\d{2}\.\d{2,4})", text)
    if m:
        voucher_date = to_iso(m.group(1))

    # Dispatch / delivery date, e.g. "Warenausgangsdatum: 6. März 2026"
    delivery_date = voucher_date
    m = re.search(r"Warenausgangsdatum:\s+(.+)", text)
    if m:
        delivery_date = german_date_to_iso(m.group(1).strip())

    # Our order number
    our_order = ""
    m = re.search(r"Ihre Bestellung\s+(\d+)", text)
    if m:
        our_order = m.group(1)

    # Customer number
    customer_num = None
    m = re.search(r"Kundennummer:\s+(\S+)", text)
    if m:
        customer_num = m.group(1)

    # Article row, e.g.
    # "21200025 Schlauch POLYWELL antistatisch 60.00 Meter 6.80 45.00 224.40"
    # Groups: 1 article number, 2 description, 3 quantity, 4 unit price,
    # 5 discount, 6 line total. The description (and its continuation on
    # the following line) is not part of the response, so it is not read.
    line_re = re.compile(
        r"^(\d{5,12})\s+(.+?)\s+([\d.]+)\s+Meter\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$",
        re.MULTILINE,
    )

    voucher_lines = [
        VoucherLine(**{
            "$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
            "Number": row.group(1),
            "Quantity": float(row.group(3)),
            "Price": float(row.group(4)),
            "VatCode": "01",
        })
        for row in line_re.finditer(text)
    ]

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )
| # ───────────────────────────────────────────────────────────────────────────── | |
| # Dispatcher | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def parse_pdf(pdf_bytes: bytes) -> VoucherResponse:
    """Extract the PDF text, detect the supplier and run its parser.

    Raises:
        ValueError: If the supplier cannot be identified; the message
            carries the first 400 characters of the extracted text.
    """
    text = extract_text(pdf_bytes)
    parser_by_supplier = {
        "Trelleborg": parse_trelleborg,
        "Cleanfix": parse_cleanfix,
        "Polyflex": parse_polyflex,
    }
    parser = parser_by_supplier.get(identify_supplier(text))
    if parser is None:
        raise ValueError(
            f"Could not identify supplier.\nExtracted text snippet:\n{text[:400]}"
        )
    return parser(text)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # FastAPI | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# ASGI application object; title, description and version are passed to
# FastAPI as metadata.
app = FastAPI(
    version="2.0.0",
    title="Order Confirmation PDF Extractor",
    description=(
        "Upload a supplier order-confirmation PDF "
        "(Trelleborg / Cleanfix / Polyflex) and receive ERP-ready JSON."
    ),
)
# NOTE(review): no route decorator is visible on this handler in the text
# under review - confirm that it is registered on `app`.
async def extract_order(
    file: UploadFile = File(..., description="Supplier order-confirmation PDF"),
):
    """Parse an uploaded order-confirmation PDF and return the voucher data.

    Args:
        file: The uploaded file; its name must end in ".pdf"
            (case-insensitive).

    Returns:
        The VoucherResponse produced by ``parse_pdf``.

    Raises:
        HTTPException: 400 when the upload has no ".pdf" file name or is
            empty, 422 when parsing raises ValueError (e.g. the supplier
            is not recognised), 500 for any other parsing failure.
    """
    # The file name can be missing; treat that like a non-PDF upload
    # instead of failing on None.lower().
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")

    content = await file.read()
    if not content:
        # An empty body is a client error, not a parser crash.
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        return parse_pdf(content)
    except ValueError as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # Top-level boundary: report any other failure as a server error.
        raise HTTPException(status_code=500, detail=f"Parsing error: {e}") from e
def health():
    """Return a constant payload signalling that the service is up."""
    payload = {}
    payload["status"] = "ok"
    return payload