AiDeveloper1 committed on
Commit
199b6b1
·
verified ·
1 Parent(s): 5f3dc8a

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +12 -0
  2. main.py +436 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

# RUN useradd -m -u 1000 user
# USER user
# ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# The dependency manifest has to be inside the image before pip can read it.
# Copying it on its own (ahead of the rest of the sources) also lets Docker
# reuse the cached install layer when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import io
3
+ from datetime import datetime
4
+ from typing import Optional, List, Any
5
+
6
+ import pdfplumber
7
+ from fastapi import FastAPI, UploadFile, File, HTTPException
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
+ # ─────────────────────────────────────────────────────────────────────────────
12
+ # Pydantic models
13
+ # ─────────────────────────────────────────────────────────────────────────────
14
+
15
class VoucherLine(BaseModel):
    """A single article position of a voucher.

    The ``type`` attribute is published under the alias ``$type`` and
    defaults to the ArticleVoucherLine type identifier string.
    """

    # Accept both the attribute name (``type``) and its alias (``$type``)
    # when a model instance is constructed.
    model_config = {"populate_by_name": True}

    type: str = Field(
        alias="$type",
        default=(
            "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, "
            "E3k.Web.Objects.DataTransfer"
        ),
    )
    Number: str      # article number as printed on the document
    Quantity: float
    Price: float
    VatCode: str = "01"
    # Currently disabled fields: SellPrice, Description, DeliveryDate.
29
+
30
+
31
class VoucherResponse(BaseModel):
    """Header data and article lines extracted from one order confirmation.

    The three ``Optional`` header fields carry no default, so every parser
    has to pass them explicitly (``None`` is allowed).
    """

    # Currently disabled fields: Supplier, Currency.
    OrderNumber: str
    DeliveryDate: Optional[str]
    CustomerNumber: Optional[str]
    VoucherDate: Optional[str]
    AdditionalFields: List[Any] = []
    VoucherLines: List[VoucherLine]
40
+
41
+
42
+ # ─────────────────────────────────────────────────────────────────────────────
43
+ # Sell-price markup
44
+ # ─────────────────────────────────────────────────────────────────────────────
45
# Multiplier applied to a unit price, keyed by supplier name.
SELL_PRICE_MARKUP = dict(
    Trelleborg=1.35,
    Cleanfix=1.30,
    Polyflex=1.30,
)


def sell_price(unit_price: float, supplier: str) -> float:
    """Return *unit_price* times the supplier's markup, rounded to 2 decimals.

    Suppliers that have no entry in ``SELL_PRICE_MARKUP`` use a factor of 1.40.
    """
    try:
        factor = SELL_PRICE_MARKUP[supplier]
    except KeyError:
        factor = 1.40
    marked_up = unit_price * factor
    return round(marked_up, 2)
55
+
56
+
57
+ # ─────────────────────────────────────────────────────────────────────────────
58
+ # Helpers
59
+ # ─────────────────────────────────────────────────────────────────────────────
60
+
61
def extract_text(pdf_bytes: bytes) -> str:
    """Return the text of all pages of the PDF, joined with newlines.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are left out of the result.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        # Pull the text while the document is still open.
        page_texts = [page.extract_text() for page in document.pages]
    return "\n".join(filter(None, page_texts))
69
+
70
+
71
def to_iso(raw: str) -> Optional[str]:
    """Convert a numeric day-first date into ISO ``YYYY-MM-DD`` form.

    Accepted layouts are ``dd.mm.yyyy``, ``dd.mm.yy``, ``dd/mm/yyyy`` and
    ``dd/mm/yy``. Surrounding whitespace is ignored. When none of the
    layouts fits, the stripped input is returned unchanged.
    """
    candidate = raw.strip()
    known_layouts = ("%d.%m.%Y", "%d.%m.%y", "%d/%m/%Y", "%d/%m/%y")
    for layout in known_layouts:
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue  # layout does not fit, try the next one
        return parsed.date().isoformat()
    return candidate
79
+
80
+
81
# Lower-case German month names mapped to their two-digit month number.
GERMAN_MONTHS = {
    "januar": "01", "februar": "02", "märz": "03", "april": "04",
    "mai": "05", "juni": "06", "juli": "07", "august": "08",
    "september": "09", "oktober": "10", "november": "11", "dezember": "12",
}


def german_date_to_iso(raw: str) -> Optional[str]:
    """Convert a German long-form date such as ``6. März 2026`` to ISO form.

    The input must start with ``<day>. <month name> <4-digit year>``; any
    text after the year is ignored. The month name is looked up
    case-insensitively in ``GERMAN_MONTHS``. Inputs that do not match, or
    whose month name is unknown, are handed to ``to_iso`` unchanged.

    Returns:
        ``YYYY-MM-DD`` for a recognised long-form date, otherwise whatever
        ``to_iso(raw)`` returns.
    """
    import unicodedata  # local import keeps this block self-contained

    # Extracted text can carry umlauts in decomposed form ("a" + U+0308).
    # The combining mark is not a word character, so without composing it
    # first the month name "März" would be cut off after "Ma" and the
    # match would fail.
    composed = unicodedata.normalize("NFC", raw.strip())

    m = re.match(r"(\d{1,2})\.\s*(\w+)\s+(\d{4})", composed)
    if m:
        month = GERMAN_MONTHS.get(m.group(2).lower())
        if month:
            day = m.group(1).zfill(2)
            return f"{m.group(3)}-{month}-{day}"
    return to_iso(raw)
97
+
98
+
99
def num(s: str) -> float:
    """Parse a decimal number written with a comma or a dot as decimal mark.

    A lone comma is treated as the decimal separator (``"13,3"`` -> 13.3).
    When both ``,`` and ``.`` occur, the one appearing last is the decimal
    mark and the other one is dropped as a thousands separator, so
    ``"1.234,56"`` and ``"1,234.56"`` both yield 1234.56. Plain and
    non-breaking spaces used for digit grouping are removed.

    Raises:
        ValueError: if the cleaned-up text is not a valid float literal.
    """
    cleaned = s.strip().replace("\u00a0", "").replace(" ", "")
    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")
    if last_comma != -1 and last_dot != -1:
        # Both separators present: the earlier one only groups thousands.
        thousands = "." if last_comma > last_dot else ","
        cleaned = cleaned.replace(thousands, "")
    return float(cleaned.replace(",", "."))
101
+
102
+
103
+ # ─────────────────────────────────────────────────────────────────────────────
104
+ # Supplier identification
105
+ # ─────────────────────────────────────────────────────────────────────────────
106
+
107
def identify_supplier(text: str) -> str:
    """Return the supplier name detected in the extracted text.

    Rules are tried in order and the first hit wins:
    "Trelleborg" and "Polyflex" are recognised by their name appearing
    anywhere (case-insensitive); "Cleanfix" by its name or by the marker
    "Auftragsbestätigung VA<digits>". Without any hit the result is
    "Unknown".
    """
    haystack = text.upper()
    rules = (
        ("Trelleborg", lambda: "TRELLEBORG" in haystack),
        (
            "Cleanfix",
            lambda: "CLEANFIX" in haystack
            or re.search(r"Auftragsbestätigung VA\d+", text) is not None,
        ),
        ("Polyflex", lambda: "POLYFLEX" in haystack),
    )
    return next((name for name, applies in rules if applies()), "Unknown")
116
+
117
+
118
+ # ─────────────────────────────────────────────────────────────────────────────
119
+ # Parser: Trelleborg
120
+ # ─────────────────────────────────────────────────────────────────────────────
121
+ # Relevant extracted text structure:
122
+ # IHRE REFERENZ VON RECHNUNG AN Referenzen
123
+ # 2600364 100 D01 Herr Baumann
124
+ # UNSERE REFERENZ IE SCHLAUCHSERVICE BAUMANN GMBH
125
+ # 0010223953
126
+ # ...
127
+ # ADE - Alexandra DENEU +33 473 258 206 10/03/26 1/ 2
128
+ # ...
129
+ # Pos10 ZollNr : 4009410000 Ursp.:FR
130
+ # 0060068 CITERDIAL 38 L 13,30 11/03/26 M 13,3 52,51
131
+ # %RAB: 33,00 35,181 467,91
132
+
133
def parse_trelleborg(text: str) -> VoucherResponse:
    """Parse the extracted text of a Trelleborg order confirmation.

    Header fields:
      * VoucherDate    - first ``dd/mm/yy`` date found anywhere in the text.
      * OrderNumber    - first 7-digit number that starts a line after
                         "IHRE REFERENZ"; empty string when not found.
      * CustomerNumber - first token of a line following "UNSERE REFERENZ"
                         (the supplier's reference number).
      * DeliveryDate   - starts out as the voucher date and is replaced by
                         an article line's date for as long as it still
                         equals the voucher date.

    Article lines are read from three-line blocks of the form::

        Pos10 ZollNr : 4009410000 Ursp.:FR
        0060068 CITERDIAL 38 L 13,30 11/03/26 M 13,3 52,51
        %RAB: 33,00 35,181 467,91

    Only blocks whose unit column is "M" are matched by the pattern. The
    reported Price is the unit price after discount (second number of the
    %RAB line).
    """
    # Voucher date from the header line, e.g. "10/03/26"
    voucher_date = None
    m = re.search(r"\b(\d{2}/\d{2}/\d{2})\b", text)
    if m:
        voucher_date = to_iso(m.group(1))

    # Our order number: first 7-digit number after the "IHRE REFERENZ" line
    our_order = ""
    m = re.search(r"IHRE REFERENZ\b.*?\n(\d{7})\b", text, re.DOTALL)
    if m:
        our_order = m.group(1)

    # Supplier's reference number ("UNSERE REFERENZ" line, then next line)
    customer_num = None
    m = re.search(r"UNSERE REFERENZ\b.*?\n(\S+)", text, re.DOTALL)
    if m:
        customer_num = m.group(1)

    # Groups: 1 article no. | 2 description | 3 line date | 4 quantity |
    #         5 gross unit price | 6 discount % | 7 net unit price | 8 total
    block_re = re.compile(
        r"Pos\d+\s+ZollNr\s*:\s*\S+\s+Ursp\.:\s*\S+\s*\n"
        r"(\S+)\s+(.+?)\s+(\d{2}/\d{2}/\d{2})\s+M\s+([\d,]+)\s+([\d,]+)\s*\n"
        r"\s*%RAB:\s*([\d,]+)\s+([\d,]+)\s+([\d,]+)",
        re.DOTALL,
    )

    voucher_lines = []
    delivery_date = voucher_date

    for m in block_re.finditer(text):
        line_date = to_iso(m.group(3))
        net_price = num(m.group(7))  # unit price after discount

        # The voucher date doubles as the "not overridden yet" marker.
        if delivery_date == voucher_date and line_date:
            delivery_date = line_date

        voucher_lines.append(VoucherLine(**{
            "$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
            "Number": m.group(1),
            "Quantity": num(m.group(4)),
            "Price": round(net_price, 4),
            "VatCode": "01",
        }))

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )
200
+
201
+
202
+ # ─────────────────────────────────────────────────────────────────────────────
203
+ # Parser: Cleanfix
204
+ # ─────────────────────────────────────────────────────────────────────────────
205
+ # Relevant extracted text structure:
206
+ # Datum 10.03.2026
207
+ # Ihre Bestellnr. 2600370
208
+ # Debitorennr. 35228
209
+ # Auslieferdatum 11.03.2026
210
+ # Auftragsbestätigung VA516165
211
+ # Pos Artikelnr. Menge / Einheit VK-Preis % Betrag
212
+ # 1 710.657 2.00Stück 87.05 35 113.16 ← no space between qty and "Stück"!
213
+ # Ladesteckdose FT80A 16mm2
214
+ # 2 710.656 2.00Stück 64.40 35 83.72
215
+ # Ladestecker FT80A 16mm2
216
+ # 3 607.000 1.00Stück 105.75 35 68.74
217
+ # Treppenadapter 1Düse, 23cm
218
+
219
def parse_cleanfix(text: str) -> VoucherResponse:
    """Parse the extracted text of a Cleanfix order confirmation.

    Header fields:
      * VoucherDate    - ``dd.mm.yyyy`` date following "Datum".
      * DeliveryDate   - date following "Auslieferdatum"; falls back to the
                         voucher date when that label is missing.
      * OrderNumber    - digits following "Ihre Bestellnr."; empty string
                         when not found.
      * CustomerNumber - token following "Debitorennr.".

    Article rows look like ``1 710.657 2.00Stück 87.05 35 113.16``: the
    quantity and the unit "Stück" are glued together in the extracted text.
    Rows with any other unit are not matched by the pattern. The
    description printed on the following line is not part of the output
    and is therefore not read.
    """
    # Voucher date
    voucher_date = None
    m = re.search(r"Datum\s+(\d{2}\.\d{2}\.\d{4})", text)
    if m:
        voucher_date = to_iso(m.group(1))

    # Delivery date
    delivery_date = voucher_date
    m = re.search(r"Auslieferdatum\s+(\d{2}\.\d{2}\.\d{4})", text)
    if m:
        delivery_date = to_iso(m.group(1))

    # Our order number
    our_order = ""
    m = re.search(r"Ihre Bestellnr\.\s+(\d+)", text)
    if m:
        our_order = m.group(1)

    # Customer number
    customer_num = None
    m = re.search(r"Debitorennr\.\s+(\S+)", text)
    if m:
        customer_num = m.group(1)

    # Groups: 1 position | 2 article no. | 3 quantity | 4 VK-Preis (gross) |
    #         5 discount % | 6 line total
    line_re = re.compile(
        r"^(\d+)\s+([\d.]+)\s+([\d.]+)Stück\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$",
        re.MULTILINE,
    )

    voucher_lines = []
    for m in line_re.finditer(text):
        voucher_lines.append(VoucherLine(**{
            "$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
            "Number": m.group(2),
            "Quantity": float(m.group(3)),
            # NOTE(review): this is the gross VK-Preis; the discount column
            # (group 5) is not applied, whereas parse_trelleborg reports the
            # discounted unit price. Confirm which one the consumer expects.
            "Price": float(m.group(4)),
            "VatCode": "01",
        }))

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )
293
+
294
+
295
+ # ─────────────────────────────────────────────────────────────────────────────
296
+ # Parser: Polyflex
297
+ # ─────────────────────────────────────────────────────────────────────────────
298
+ # Relevant extracted text structure:
299
+ # Kundennummer: D00030
300
+ # Ihre Bestellung 2600357
301
+ # Würenlos, 06.03.26
302
+ # 21200025 Schlauch POLYWELL antistatisch 60.00 Meter 6.80 45.00 224.40
303
+ # id=25mm, 2x30m
304
+ # Warenausgangsdatum: 6. März 2026
305
+
306
def parse_polyflex(text: str) -> VoucherResponse:
    """Parse the extracted text of a Polyflex order confirmation.

    Header fields:
      * VoucherDate    - date following "Würenlos," (``dd.mm.yy`` or
                         ``dd.mm.yyyy``).
      * DeliveryDate   - rest of the line after "Warenausgangsdatum:",
                         converted with ``german_date_to_iso``; falls back
                         to the voucher date when the label is missing.
      * OrderNumber    - digits following "Ihre Bestellung"; empty string
                         when not found.
      * CustomerNumber - token following "Kundennummer:".

    Article rows look like
    ``21200025 Schlauch POLYWELL antistatisch 60.00 Meter 6.80 45.00 224.40``.
    Only rows with the unit "Meter" are matched by the pattern. The
    description (including its continuation line) is not part of the
    output and is therefore not assembled.
    """
    # Voucher date, e.g. "Würenlos, 06.03.26"
    voucher_date = None
    m = re.search(r"Würenlos,?\s+(\d{2}\.\d{2}\.\d{2,4})", text)
    if m:
        voucher_date = to_iso(m.group(1))

    # Dispatch / delivery date, e.g. "Warenausgangsdatum: 6. März 2026"
    delivery_date = voucher_date
    m = re.search(r"Warenausgangsdatum:\s+(.+)", text)
    if m:
        delivery_date = german_date_to_iso(m.group(1).strip())

    # Our order number
    our_order = ""
    m = re.search(r"Ihre Bestellung\s+(\d+)", text)
    if m:
        our_order = m.group(1)

    # Customer number
    customer_num = None
    m = re.search(r"Kundennummer:\s+(\S+)", text)
    if m:
        customer_num = m.group(1)

    # Groups: 1 article no. | 2 description | 3 quantity | 4 unit price |
    #         5 discount % | 6 line total
    line_re = re.compile(
        r"^(\d{5,12})\s+(.+?)\s+([\d.]+)\s+Meter\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s*$",
        re.MULTILINE,
    )

    voucher_lines = []
    for m in line_re.finditer(text):
        voucher_lines.append(VoucherLine(**{
            "$type": "E3k.Web.Objects.DataTransfer.VoucherLines.ArticleVoucherLine, E3k.Web.Objects.DataTransfer",
            "Number": m.group(1),
            "Quantity": float(m.group(3)),
            # NOTE(review): unit price before the discount in group 5 is
            # applied; parse_trelleborg reports the discounted price instead.
            # Confirm which one the consumer expects.
            "Price": float(m.group(4)),
            "VatCode": "01",
        }))

    return VoucherResponse(
        OrderNumber=our_order,
        DeliveryDate=delivery_date,
        CustomerNumber=customer_num,
        VoucherDate=voucher_date,
        VoucherLines=voucher_lines,
    )
376
+
377
+
378
+ # ─────────────────────────────────────────────────────────────────────────────
379
+ # Dispatcher
380
+ # ─────────────────────────────────────────────────────────────────────────────
381
+
382
def parse_pdf(pdf_bytes: bytes) -> VoucherResponse:
    """Extract the PDF text, detect the supplier and run its parser.

    Raises:
        ValueError: when no supplier can be identified; the message carries
            the first 400 characters of the extracted text.
    """
    text = extract_text(pdf_bytes)

    # Supplier name as returned by identify_supplier -> matching parser.
    parsers = {
        "Trelleborg": parse_trelleborg,
        "Cleanfix": parse_cleanfix,
        "Polyflex": parse_polyflex,
    }
    parser = parsers.get(identify_supplier(text))
    if parser is None:
        raise ValueError(
            f"Could not identify supplier.\nExtracted text snippet:\n{text[:400]}"
        )
    return parser(text)
396
+
397
+
398
+ # ─────────────────────────────────────────────────────────────────────────────
399
+ # FastAPI
400
+ # ─────────────────────────────────────────────────────────────────────────────
401
+
402
# Application object that the route decorators below register on.
app = FastAPI(
    version="2.0.0",
    title="Order Confirmation PDF Extractor",
    description=(
        "Upload a supplier order-confirmation PDF (Trelleborg / Cleanfix / Polyflex) "
        "and receive ERP-ready JSON."
    ),
)
410
+
411
+
412
@app.post(
    "/extract",
    response_model=VoucherResponse,
    summary="Extract order data from a supplier PDF",
)
async def extract_order(
    file: UploadFile = File(..., description="Supplier order-confirmation PDF"),
):
    """Parse an uploaded order-confirmation PDF and return its voucher data.

    Raises:
        HTTPException 400: the upload has no ``.pdf`` file name or is empty.
        HTTPException 422: ``parse_pdf`` raised ``ValueError`` (for example
            because the supplier could not be identified).
        HTTPException 500: any other error raised while parsing.
    """
    # The file name of an upload may be missing (None); treat that like a
    # non-PDF name instead of failing with AttributeError.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are accepted.")

    content = await file.read()
    if not content:
        # Reject empty uploads up front rather than reporting them as an
        # internal parsing failure.
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        result = parse_pdf(content)
    except ValueError as e:
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # Top-level boundary: turn unexpected parser failures into a 500
        # while keeping the original exception chained for the logs.
        raise HTTPException(status_code=500, detail=f"Parsing error: {e}") from e

    return result
432
+
433
+
434
@app.get("/health", summary="Health check")
def health():
    """Report that the service is up; always returns ``{"status": "ok"}``."""
    payload = dict(status="ok")
    return payload
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi[standard]
2
+ uvicorn[standard]
3
+ pdfplumber
4
+ pydantic
5
+ python-multipart