Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,89 +4,124 @@ import re
|
|
| 4 |
from transformers import LayoutLMForTokenClassification, AutoTokenizer
|
| 5 |
import torch
|
| 6 |
|
| 7 |
-
# Wczytanie modelu LayoutLMv3
|
| 8 |
model_name = "kryman27/layoutlmv3-finetuned"
|
| 9 |
model = LayoutLMForTokenClassification.from_pretrained(model_name)
|
| 10 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 11 |
|
| 12 |
-
#
|
| 13 |
-
nip_pattern = re.compile(r'\
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def extract_invoice_data(pdf_file):
|
| 20 |
with pdfplumber.open(pdf_file) as pdf:
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
full_text = "\n".join(full_text)
|
| 35 |
-
|
| 36 |
-
# Tokenizacja + bounding boxes
|
| 37 |
-
encoding = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt", truncation=True)
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
if
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
#
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
|
| 67 |
-
#
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
if any(keyword in line for keyword in payment_keywords):
|
| 71 |
-
date_match = data_pattern.search(line)
|
| 72 |
-
if date_match:
|
| 73 |
-
payment_date = date_match.group()
|
| 74 |
-
break
|
| 75 |
|
| 76 |
-
|
| 77 |
-
"Sprzedawca":
|
| 78 |
-
"
|
| 79 |
-
"
|
| 80 |
-
"
|
|
|
|
|
|
|
| 81 |
}
|
|
|
|
| 82 |
|
| 83 |
-
# Interfejs użytkownika
|
| 84 |
iface = gr.Interface(
|
| 85 |
fn=extract_invoice_data,
|
| 86 |
inputs=gr.File(label="Wybierz plik PDF"),
|
| 87 |
outputs="json",
|
| 88 |
title="Ekstrakcja danych z faktury",
|
| 89 |
-
description="Prześlij plik PDF, a
|
| 90 |
)
|
| 91 |
|
| 92 |
if __name__ == "__main__":
|
|
|
|
| 4 |
from transformers import LayoutLMForTokenClassification, AutoTokenizer
|
| 5 |
import torch
|
| 6 |
|
|
|
|
| 7 |
# Load the token-classification model and its tokenizer from the Hub checkpoint
# when the module is imported.
# NOTE(review): the checkpoint name mentions LayoutLMv3 while the LayoutLM
# class is used to load it — confirm the architecture matches.
# NOTE(review): neither `model` nor `tokenizer` appears to be referenced by the
# regex-based extraction visible in this file — confirm they are still needed.
model_name = "kryman27/layoutlmv3-finetuned"
model = LayoutLMForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 10 |
|
| 11 |
+
# Regex patterns for the invoice fields extracted from the PDF text.

# Polish tax id (NIP), optionally prefixed with "PL". Accepts ten contiguous
# digits as well as the hyphen-grouped forms 3-3-2-2 and 3-2-2-3.
nip_pattern = re.compile(
    r'\b(?:PL\s?)?(?:\d{10}|\d{3}-\d{3}-\d{2}-\d{2}|\d{3}-\d{2}-\d{2}-\d{3})\b'
)
# Invoice number following "Faktura [VAT] nr" / "Faktura #"; group 1 is the number.
invoice_number_pattern = re.compile(r'Faktura\s*(?:VAT)?\s*(?:nr\.?|#)\s*([\w\-/]+)', re.IGNORECASE)
# Dates below are captured in group 1 as DD?MM?YYYY with ".", "-" or "/" separators.
sale_date_pattern = re.compile(r'Data\s+wystawienia[:\s]*([\d]{2}[.\-/][\d]{2}[.\-/][\d]{4})', re.IGNORECASE)
delivery_date_pattern = re.compile(r'Data\s+dostawy[:\s]*([\d]{2}[.\-/][\d]{2}[.\-/][\d]{4})', re.IGNORECASE)
payment_date_pattern = re.compile(r'(?:Termin\s+płatności|Data\s+płatności)[:\s]*([\d]{2}[.\-/][\d]{2}[.\-/][\d]{4})', re.IGNORECASE)
# Order number after "Zamówienie Nr"; group 1 is the number.
order_number_pattern = re.compile(r'Zamówienie\s*Nr[:\s]*([\w\-/]+)', re.IGNORECASE)
order_date_pattern = re.compile(r'Data\s+zamówienia[:\s]*([\d]{2}[.\-/][\d]{2}[.\-/][\d]{4})', re.IGNORECASE)
# Sales order number after "Zlecenie sprzedaży Nr"; group 1 is the number.
sale_order_pattern = re.compile(r'Zlecenie\s+sprzedaży\s*Nr[:\s]*([\w\-/]+)', re.IGNORECASE)
# Paid amount after "Kwota zapłacona"; group 1 is the raw digits with "." / "," separators.
payment_amount_pattern = re.compile(r'(?:Kwota\s+zapłacona)[:\s]*([\d.,]+)', re.IGNORECASE)
# Payment method after "Forma płatności"; group 1 is a single word (may contain "/").
payment_method_pattern = re.compile(r'(?:Forma\s+płatności)[:\s]*([\w/]+)', re.IGNORECASE)
|
| 22 |
+
|
| 23 |
+
def extract_section(text, section_title):
    """Return the text that follows ``"<section_title>:"`` in *text*.

    The section starts right after the colon and ends just before the first
    newline that is followed by a non-whitespace character (or at the end of
    the text), so indented continuation lines stay part of the section.
    Matching is case-insensitive.

    Args:
        text: Full text to search.
        section_title: Literal heading to look for (without the colon).

    Returns:
        The section body with surrounding whitespace stripped, or ``None``
        when the heading does not occur.
    """
    # Escape the title so characters such as "(", "." or "+" are matched
    # literally instead of being interpreted as regex syntax.
    heading = re.escape(section_title)
    pattern = re.compile(rf'{heading}:(.*?)(?=\n\S|$)', re.IGNORECASE | re.DOTALL)
    match = pattern.search(text)
    if match is None:
        return None
    return match.group(1).strip()
|
| 27 |
|
| 28 |
# Placeholder returned for every field that could not be located.
_NOT_FOUND = "Nie znaleziono"


def _parse_amount(raw):
    """Convert a captured amount such as ``1.234,56`` or ``1,234.56`` to float.

    Trailing separators are dropped. When both "," and "." occur, or exactly
    one separator occurs, the last separator is the decimal point and the
    rest are grouping marks. A single separator kind repeated several times
    is treated as grouping only. Returns ``None`` when nothing numeric is left.
    """
    cleaned = raw.rstrip(".,")
    if not cleaned:
        return None
    last_comma = cleaned.rfind(",")
    last_dot = cleaned.rfind(".")
    separator_count = cleaned.count(",") + cleaned.count(".")
    has_both_kinds = last_comma != -1 and last_dot != -1
    if has_both_kinds or separator_count == 1:
        decimal_at = max(last_comma, last_dot)
        whole = cleaned[:decimal_at]
        fraction = cleaned[decimal_at + 1:]
    else:
        whole, fraction = cleaned, ""
    whole = whole.replace(",", "").replace(".", "")
    try:
        return float(f"{whole or '0'}.{fraction or '0'}")
    except ValueError:
        return None


def _group_or(match, default=_NOT_FOUND, group=1):
    """Return ``match.group(group)``, or *default* when there is no match."""
    return match.group(group) if match else default


def _amount_or_not_found(match):
    """Return the amount in group 1 as float, or the not-found placeholder."""
    amount = _parse_amount(match.group(1)) if match else None
    return _NOT_FOUND if amount is None else amount


def extract_invoice_data(pdf_file):
    """Extract seller, buyer, invoice and payment details from a PDF invoice.

    The text of all pages is joined and searched with the module-level regex
    patterns; the seller and buyer blocks are located via ``extract_section``.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (here: the value
            Gradio passes for the uploaded file).

    Returns:
        A dict with the keys "Sprzedawca", "Nabywca", "Faktura", "Płatność",
        "Pozycje" and "Podsumowanie". Missing fields hold "Nie znaleziono"
        (or "Nie podano" for the buyer's NIP); amounts are floats when they
        can be parsed. "Pozycje" is always an empty list because line-item
        (table) extraction is not implemented yet.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract every page exactly once instead of once for the filter
        # and once more for the join.
        page_texts = [page.extract_text() for page in pdf.pages]
    full_text = "\n".join(text for text in page_texts if text)

    # A missing section is treated as empty text: every search below then
    # fails and falls back to its placeholder, which yields the same result
    # as a dedicated "section not found" branch.
    seller_text = extract_section(full_text, "Sprzedawca") or ""
    buyer_text = extract_section(full_text, "Nabywca") or ""

    # Seller: first line is taken as the name, second line as the address.
    seller_lines = seller_text.splitlines()
    sprzedawca = {
        "Nazwa": seller_lines[0].strip() if seller_lines else _NOT_FOUND,
        "NIP": _group_or(nip_pattern.search(seller_text), group=0),
        "Numer Rejestracyjny BDO": _group_or(
            re.search(r'BDO[:\s]*([\w\d]+)', seller_text, re.IGNORECASE)
        ),
        "Adres": seller_lines[1].strip() if len(seller_lines) > 1 else _NOT_FOUND,
        "Telefon": _group_or(
            re.search(r'tel\.?[:\s]*([\+\d\s()-]+)', seller_text, re.IGNORECASE)
        ).strip(),
        "Fax": _group_or(
            re.search(r'fax\.?[:\s]*([\+\d\s()-]+)', seller_text, re.IGNORECASE)
        ).strip(),
    }

    # Buyer: same layout assumption; a missing NIP is reported as "Nie podano".
    buyer_lines = buyer_text.splitlines()
    nabywca = {
        "Nazwa": buyer_lines[0].strip() if buyer_lines else _NOT_FOUND,
        "NIP": _group_or(nip_pattern.search(buyer_text), default="Nie podano", group=0),
        "Adres": buyer_lines[1].strip() if len(buyer_lines) > 1 else _NOT_FOUND,
        "Nr Klienta": _group_or(
            re.search(r'Nr\s+Klienta[:\s]*([\w\d]+)', buyer_text, re.IGNORECASE)
        ),
    }

    # Invoice header fields, searched in the whole document.
    faktura = {
        "Numer": _group_or(invoice_number_pattern.search(full_text)),
        "Data Wystawienia": _group_or(sale_date_pattern.search(full_text)),
        "Data Dostawy": _group_or(delivery_date_pattern.search(full_text)),
        "Zamówienie Nr": _group_or(order_number_pattern.search(full_text)),
        "Data Zamówienia": _group_or(order_date_pattern.search(full_text)),
        "Zlecenie Sprzedaży Nr": _group_or(sale_order_pattern.search(full_text)),
    }

    # Payment details.
    platnosc = {
        "Termin Zapłaty": _group_or(payment_date_pattern.search(full_text)),
        "Forma Zapłaty": _group_or(payment_method_pattern.search(full_text)),
        "Kwota Zapłacona": _amount_or_not_found(payment_amount_pattern.search(full_text)),
    }

    # Summary (illustrative): the first amount after "Razem" is reported as
    # the gross total.
    podsumowanie = {
        "Suma Brutto": _amount_or_not_found(
            re.search(r'Razem[:\s]*([\d.,]+)', full_text)
        ),
    }

    return {
        "Sprzedawca": sprzedawca,
        "Nabywca": nabywca,
        "Faktura": faktura,
        "Płatność": platnosc,
        "Pozycje": [],  # table (line-item) extraction still to be implemented
        "Podsumowanie": podsumowanie,
    }
|
| 118 |
|
|
|
|
| 119 |
# Gradio UI: a single PDF upload is passed to extract_invoice_data and the
# returned dictionary is rendered as JSON.
iface = gr.Interface(
    fn=extract_invoice_data,
    inputs=gr.File(label="Wybierz plik PDF"),
    outputs="json",
    title="Ekstrakcja danych z faktury",
    description="Prześlij plik PDF, a narzędzie zwróci szczegółowe dane faktury."
)
|
| 126 |
|
| 127 |
if __name__ == "__main__":
|