import re
import io
from datetime import datetime
from typing import Optional, List, Any

import pdfplumber
from fastapi import FastAPI, UploadFile, File, HTTPException
from pydantic import BaseModel, Field


# ─────────────────────────────────────────────────────────────────────────────
# Pydantic models
# ─────────────────────────────────────────────────────────────────────────────

# One article row of a voucher. The `type` attribute is exposed under the
# alias "$type"; populate_by_name lets callers use either spelling.
class VoucherLine(BaseModel):
    type: str = Field(
        alias="$type",
        default="E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
    )
    Number: str
    Quantity: float
    Price: float
    # SellPrice: float            (currently disabled)
    # Description: str = ""       (currently disabled)
    VatCode: str = "01"
    # DeliveryDate: Optional[str] = None   (currently disabled)

    model_config = {"populate_by_name": True}


# Header data of one order confirmation plus its article rows.
class VoucherResponse(BaseModel):
    # Supplier: str               (currently disabled)
    OrderNumber: str
    DeliveryDate: Optional[str]
    CustomerNumber: Optional[str]
    VoucherDate: Optional[str]
    # Currency: str               (currently disabled)
    AdditionalFields: List[Any] = []
    VoucherLines: List[VoucherLine]


# ─────────────────────────────────────────────────────────────────────────────
# Sell-price markup
# ─────────────────────────────────────────────────────────────────────────────

# Multiplier applied to a purchase price, keyed by supplier name.
SELL_PRICE_MARKUP = {
    "Trelleborg": 1.35,
    "Cleanfix": 1.30,
    "Polyflex": 1.30,
}


def sell_price(unit_price: float, supplier: str) -> float:
    """Return *unit_price* times the supplier's markup, rounded to 2 decimals.

    Suppliers without an entry in SELL_PRICE_MARKUP use a factor of 1.40.
    """
    if supplier in SELL_PRICE_MARKUP:
        factor = SELL_PRICE_MARKUP[supplier]
    else:
        factor = 1.40
    return round(factor * unit_price, 2)


# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────

def extract_text(pdf_bytes: bytes) -> str:
    """Return the text of all pages joined by newlines; empty pages are skipped."""
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        page_texts = [page.extract_text() for page in document.pages]
    return "\n".join(chunk for chunk in page_texts if chunk)


# Numeric day-first layouts understood by to_iso, tried in this order.
_DATE_FORMATS = ("%d.%m.%Y", "%d.%m.%y", "%d/%m/%Y", "%d/%m/%y")


def to_iso(raw: str) -> Optional[str]:
    """Convert a day-first numeric date to ISO ``YYYY-MM-DD``.

    The input is stripped first. When no known layout fits, the stripped
    input is returned unchanged rather than raising.
    """
    candidate = raw.strip()
    for layout in _DATE_FORMATS:
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue
        return parsed.date().isoformat()
    return candidate


# Lower-case German month names mapped to two-digit month numbers.
GERMAN_MONTHS = {
    "januar": "01",
    "februar": "02",
    "märz": "03",
    "april": "04",
    "mai": "05",
    "juni": "06",
    "juli": "07",
    "august": "08",
    "september": "09",
    "oktober": "10",
    "november": "11",
    "dezember": "12",
}


def german_date_to_iso(raw: str) -> Optional[str]:
    """Convert a date such as "6. März 2026" to ISO ``YYYY-MM-DD``.

    Anything that does not start with "<day>. <known month name> <year>"
    is handed to to_iso instead.
    """
    found = re.match(r"(\d{1,2})\.\s*(\w+)\s+(\d{4})", raw.strip(), re.IGNORECASE)
    if found is None:
        return to_iso(raw)

    day_text, month_name, year_text = found.groups()
    month_code = GERMAN_MONTHS.get(month_name.lower())
    if not month_code:
        # Unknown month word: fall back to the numeric parser.
        return to_iso(raw)
    return f"{year_text}-{month_code}-{day_text.zfill(2)}"


def num(s: str) -> float:
    """Parse a number written with a decimal comma (or point) into a float."""
    normalised = s.replace(",", ".")
    return float(normalised)


# ─────────────────────────────────────────────────────────────────────────────
# Supplier identification
# ─────────────────────────────────────────────────────────────────────────────

def identify_supplier(text: str) -> str:
    """Return "Trelleborg", "Cleanfix", "Polyflex" or "Unknown" for *text*.

    Supplier names are matched case-insensitively and checked in that order;
    an "Auftragsbestätigung VA<digits>" marker also counts as Cleanfix.
    """
    haystack = text.upper()
    if "TRELLEBORG" in haystack:
        supplier = "Trelleborg"
    elif "CLEANFIX" in haystack or re.search(r"Auftragsbestätigung VA\d+", text):
        supplier = "Cleanfix"
    elif "POLYFLEX" in haystack:
        supplier = "Polyflex"
    else:
        supplier = "Unknown"
    return supplier


# ─────────────────────────────────────────────────────────────────────────────
# Parser: Trelleborg
# ─────────────────────────────────────────────────────────────────────────────
# Relevant extracted text structure:
#   IHRE REFERENZ VON RECHNUNG AN Referenzen
#   2600364 100 D01 Herr Baumann
#   UNSERE REFERENZ IE SCHLAUCHSERVICE BAUMANN GMBH
#   0010223953
#   ...
#   ADE - Alexandra DENEU +33 473 258 206 10/03/26 1/ 2
#   ...
#   Pos10 ZollNr : 4009410000 Ursp.:FR
#   0060068 CITERDIAL 38 L 13,30 11/03/26 M 13,3 52,51
#   %RAB: 33,00 35,181 467,91

# Value of the "$type" key sent with every article line.
_ARTICLE_LINE_TYPE = (
    "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, "
    "E3k.Web.Objects.DataTransfer"
)


def _first_group(pattern: str, text: str, flags: int = 0) -> Optional[str]:
    """Return group 1 of the first match of *pattern* in *text*, else None."""
    found = re.search(pattern, text, flags)
    return found.group(1) if found is not None else None


def _article_line(number: str, quantity: float, price: float) -> VoucherLine:
    """Build one article VoucherLine with VAT code "01"."""
    return VoucherLine(**{
        "$type": _ARTICLE_LINE_TYPE,
        "Number": number,
        "Quantity": quantity,
        "Price": price,
        "VatCode": "01",
    })


# Article block, three text lines per article:
#   Line A: Pos10 ZollNr : 4009410000 Ursp.:FR
#   Line B: 0060068 CITERDIAL 38 L 13,30 11/03/26 M 13,3 52,51
#   Line C: %RAB: 33,00 35,181 467,91
# Groups: 1 article no., 2 description, 3 line date, 4 quantity,
#         5 gross unit price, 6 discount %, 7 net unit price, 8 line total.
_TRELLEBORG_BLOCK_RE = re.compile(
    r"Pos\d+\s+ZollNr\s*:\s*\S+\s+Ursp\.:\s*\S+\s*\n"
    r"(\S+)\s+(.+?)\s+(\d{2}/\d{2}/\d{2})\s+M\s+([\d,]+)\s+([\d,]+)\s*\n"
    r"\s*%RAB:\s*([\d,]+)\s+([\d,]+)\s+([\d,]+)",
    re.DOTALL,
)


def parse_trelleborg(text: str) -> VoucherResponse:
    """Parse the extracted text of a Trelleborg order confirmation.

    Args:
        text: Full text as returned by extract_text.

    Returns:
        A VoucherResponse whose lines carry the article number, the quantity
        and the unit price after discount (rounded to 4 decimals).

    Raises:
        ValueError: If a matched quantity or net price is not a valid number.
    """
    # Voucher date: the first dd/mm/yy token anywhere in the text.
    raw_date = _first_group(r"\b(\d{2}/\d{2}/\d{2})\b", text)
    voucher_date = to_iso(raw_date) if raw_date is not None else None

    # Our order number: first 7-digit number opening a line after "IHRE REFERENZ".
    our_order = _first_group(r"IHRE REFERENZ\b.*?\n(\d{7})\b", text, re.DOTALL) or ""

    # Supplier's reference: first token of a line following "UNSERE REFERENZ".
    customer_num = _first_group(r"UNSERE REFERENZ\b.*?\n(\S+)", text, re.DOTALL)

    voucher_lines = []
    delivery_date = voucher_date
    for block in _TRELLEBORG_BLOCK_RE.finditer(text):
        line_date = to_iso(block.group(3))
        # Take a line date only while the delivery date still equals the
        # voucher date, i.e. the first differing line date wins.
        if delivery_date == voucher_date and line_date:
            delivery_date = line_date

        # NOTE(review): this parser emits the discounted price (group 7)
        # whereas the Cleanfix and Polyflex parsers emit the price before
        # discount — confirm that difference is intended.
        net_price = num(block.group(7))
        voucher_lines.append(
            _article_line(block.group(1), num(block.group(4)), round(net_price, 4))
        )

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )


# ─────────────────────────────────────────────────────────────────────────────
# Parser: Cleanfix
# ─────────────────────────────────────────────────────────────────────────────
# Relevant extracted text structure:
#   Datum 10.03.2026
#   Ihre Bestellnr. 2600370
#   Debitorennr. 35228
#   Auslieferdatum 11.03.2026
#   Auftragsbestätigung VA516165
#   Pos Artikelnr. Menge / Einheit VK-Preis % Betrag
#   1 710.657 2.00Stück 87.05 35 113.16   ← no space between qty and "Stück"!
#   Ladesteckdose FT80A 16mm2
#   2 710.656 2.00Stück 64.40 35 83.72
#   Ladestecker FT80A 16mm2
#   3 607.000 1.00Stück 105.75 35 68.74
#   Treppenadapter 1Düse, 23cm

# Article row, e.g. "1 710.657 2.00Stück 87.05 35 113.16".
# Groups: 1 position, 2 article no., 3 quantity (fused with "Stück"),
#         4 VK-Preis, 5 discount %, 6 line total.
_CLEANFIX_LINE_RE = re.compile(
    r"^(\d+)\s+([\d.]+)\s+([\d.]+)Stück\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$",
    re.MULTILINE,
)


def parse_cleanfix(text: str) -> VoucherResponse:
    """Parse the extracted text of a Cleanfix order confirmation.

    Args:
        text: Full text as returned by extract_text.

    Returns:
        A VoucherResponse whose lines carry the article number, the quantity
        and the VK-Preis column. The description printed on the line below
        each row is not emitted.

    Raises:
        ValueError: If a matched quantity or price is not a valid number.
    """
    raw_voucher = _first_group(r"Datum\s+(\d{2}\.\d{2}\.\d{4})", text)
    voucher_date = to_iso(raw_voucher) if raw_voucher is not None else None

    # Delivery date falls back to the voucher date when none is printed.
    raw_delivery = _first_group(r"Auslieferdatum\s+(\d{2}\.\d{2}\.\d{4})", text)
    delivery_date = to_iso(raw_delivery) if raw_delivery is not None else voucher_date

    our_order = _first_group(r"Ihre Bestellnr\.\s+(\d+)", text) or ""
    customer_num = _first_group(r"Debitorennr\.\s+(\S+)", text)

    voucher_lines = [
        _article_line(row.group(2), float(row.group(3)), float(row.group(4)))
        for row in _CLEANFIX_LINE_RE.finditer(text)
    ]

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )


# ─────────────────────────────────────────────────────────────────────────────
# Parser: Polyflex
# ─────────────────────────────────────────────────────────────────────────────
# Relevant extracted text structure:
#   Kundennummer: D00030
#   Ihre Bestellung 2600357
#   Würenlos, 06.03.26
#   21200025 Schlauch POLYWELL antistatisch 60.00 Meter 6.80 45.00 224.40
#   id=25mm, 2x30m
#   Warenausgangsdatum: 6. März 2026

# Article row, e.g.
#   "21200025 Schlauch POLYWELL antistatisch 60.00 Meter 6.80 45.00 224.40"
# Groups: 1 article no., 2 description, 3 quantity, 4 unit price,
#         5 discount, 6 line total.
_POLYFLEX_LINE_RE = re.compile(
    r"^(\d{5,12})\s+(.+?)\s+([\d.]+)\s+Meter\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$",
    re.MULTILINE,
)


def parse_polyflex(text: str) -> VoucherResponse:
    """Parse the extracted text of a Polyflex order confirmation.

    Args:
        text: Full text as returned by extract_text.

    Returns:
        A VoucherResponse whose lines carry the article number, the quantity
        and the unit-price column. Descriptions (including a continuation
        line such as "id=25mm, 2x30m") are not emitted.

    Raises:
        ValueError: If a matched quantity or price is not a valid number.
    """
    # Voucher date, e.g. "Würenlos, 06.03.26".
    raw_voucher = _first_group(r"Würenlos,?\s+(\d{2}\.\d{2}\.\d{2,4})", text)
    voucher_date = to_iso(raw_voucher) if raw_voucher is not None else None

    # Dispatch / delivery date, e.g. "Warenausgangsdatum: 6. März 2026";
    # falls back to the voucher date when the label is absent.
    raw_delivery = _first_group(r"Warenausgangsdatum:\s+(.+)", text)
    if raw_delivery is not None:
        delivery_date = german_date_to_iso(raw_delivery.strip())
    else:
        delivery_date = voucher_date

    our_order = _first_group(r"Ihre Bestellung\s+(\d+)", text) or ""
    customer_num = _first_group(r"Kundennummer:\s+(\S+)", text)

    voucher_lines = [
        _article_line(row.group(1), float(row.group(3)), float(row.group(4)))
        for row in _POLYFLEX_LINE_RE.finditer(text)
    ]

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )


# ─────────────────────────────────────────────────────────────────────────────
# Dispatcher
# ─────────────────────────────────────────────────────────────────────────────

# Supplier name (as returned by identify_supplier) -> parser for its layout.
_PARSERS = {
    "Trelleborg": parse_trelleborg,
    "Cleanfix": parse_cleanfix,
    "Polyflex": parse_polyflex,
}


def parse_pdf(pdf_bytes: bytes) -> VoucherResponse:
    """Extract the text of a PDF and run the matching supplier parser.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.

    Returns:
        The VoucherResponse produced by the supplier-specific parser.

    Raises:
        ValueError: If the supplier cannot be identified; the message
            contains the first 400 characters of the extracted text.
    """
    text = extract_text(pdf_bytes)
    parser = _PARSERS.get(identify_supplier(text))
    if parser is None:
        raise ValueError(
            f"Could not identify supplier.\nExtracted text snippet:\n{text[:400]}"
        )
    return parser(text)


# ─────────────────────────────────────────────────────────────────────────────
# FastAPI
# ─────────────────────────────────────────────────────────────────────────────

app = FastAPI(
    title="Order Confirmation PDF Extractor",
    description=(
        "Upload a supplier order-confirmation PDF "
        "(Trelleborg / Cleanfix / Polyflex) and receive ERP-ready JSON."
    ),
    version="2.0.0",
)


@app.post(
    "/extract",
    response_model=VoucherResponse,
    summary="Extract order data from a supplier PDF",
)
async def extract_order(
    file: UploadFile = File(..., description="Supplier order-confirmation PDF"),
):
    """Parse an uploaded order-confirmation PDF.

    Responds with 400 when the upload is not named ``*.pdf``, 422 when the
    document cannot be interpreted, and 500 on any other parsing failure.
    """
    # The upload's filename is optional; a missing name must yield the same
    # 400 as a wrong extension instead of an AttributeError.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")

    content = await file.read()
    try:
        result = parse_pdf(content)
    except ValueError as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # Top-level boundary: report any other failure as a server error.
        raise HTTPException(status_code=500, detail=f"Parsing error: {e}") from e
    return result


@app.get("/health", summary="Health check")
def health():
    """Return a static payload showing the service is running."""
    return {"status": "ok"}