Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,4 +19,49 @@ def extract_invoice_data(pdf_file):
|
|
| 19 |
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
|
| 20 |
|
| 21 |
# Tokenizacja danych z uwzględnieniem układu dokumentu
|
| 22 |
-
tokens = tokenizer(full_text, return_tensors="pt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
full_text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
|
| 20 |
|
| 21 |
# Tokenizacja danych z uwzględnieniem układu dokumentu
|
| 22 |
+
tokens = tokenizer(full_text, return_tensors="pt", truncation=True)
|
| 23 |
+
|
| 24 |
+
# Predykcja modelu
|
| 25 |
+
outputs = model(**tokens)
|
| 26 |
+
predictions = outputs.logits.argmax(-1).squeeze().tolist()
|
| 27 |
+
|
| 28 |
+
# Przetwarzanie wyników
|
| 29 |
+
entities = []
|
| 30 |
+
for token, pred in zip(tokens.tokens(), predictions):
|
| 31 |
+
if pred > 0: # Pomijamy tło
|
| 32 |
+
entities.append((token, model.config.id2label[pred]))
|
| 33 |
+
|
| 34 |
+
# Wyszukiwanie kluczowych wartości
|
| 35 |
+
seller_name = [token for token, label in entities if "ORG" in label]
|
| 36 |
+
seller_nip = nip_pattern.search(full_text)
|
| 37 |
+
kwoty = kwota_pattern.findall(full_text)
|
| 38 |
+
kwoty = [float(k.replace(",", ".")) for k in kwoty if k.replace(",", ".").replace(".", "").isdigit()]
|
| 39 |
+
total_amount = max(kwoty) if kwoty else None
|
| 40 |
+
|
| 41 |
+
# Szukamy daty płatności
|
| 42 |
+
payment_date = None
|
| 43 |
+
for line in full_text.split("\n"):
|
| 44 |
+
if any(keyword in line.lower() for keyword in payment_keywords):
|
| 45 |
+
date_match = data_pattern.search(line)
|
| 46 |
+
if date_match:
|
| 47 |
+
payment_date = date_match.group()
|
| 48 |
+
break
|
| 49 |
+
|
| 50 |
+
return {
|
| 51 |
+
"Sprzedawca": " ".join(seller_name) if seller_name else "Nie znaleziono",
|
| 52 |
+
"NIP": seller_nip.group() if seller_nip else "Nie znaleziono",
|
| 53 |
+
"Kwota całkowita": total_amount if total_amount else "Nie znaleziono",
|
| 54 |
+
"Data płatności": payment_date if payment_date else "Nie znaleziono"
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# Interfejs użytkownika
|
| 58 |
+
iface = gr.Interface(
|
| 59 |
+
fn=extract_invoice_data,
|
| 60 |
+
inputs=gr.File(label="Wybierz plik PDF"),
|
| 61 |
+
outputs="json",
|
| 62 |
+
title="Ekstrakcja danych z faktury",
|
| 63 |
+
description="Prześlij plik PDF, a model zwróci dane sprzedawcy, NIP, kwotę i datę płatności."
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
if __name__ == "__main__":
|
| 67 |
+
iface.launch()
|