Spaces:

Alexvatti
/

PDF-Invoce-Extract

Sleeping

App Files Files Community

Alexvatti committed on Apr 19, 2025

Commit

b6c1dae

verified ·

1 Parent(s): a8f360f

Create app.py

Browse files

Files changed (1) hide show

app.py +53 -0

app.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import streamlit as st
+import fitz  # PyMuPDF
+import pytesseract
+from PIL import Image
+from pdf2image import convert_from_bytes
+import pandas as pd
+import re
# ---- Page setup ----------------------------------------------------------
# The page configuration call is kept ahead of every other Streamlit call.
st.set_page_config(
    page_title="Invoice Extractor",
    layout="centered",
)

# Heading and a one-line description of what the app does.
st.title("🧾 PDF Invoice Data Extractor")
st.write(
    "Upload a PDF invoice and extract mentioned details like "
    "Invoice Number, Date, Total, and more."
)

# Upload widget restricted to PDF files; the rest of the script only runs
# its extraction step once this value is set.
uploaded_file = st.file_uploader(
    "Upload your invoice PDF",
    type=["pdf"],
)
def extract_text_from_pdf(pdf_file):
    """Return the OCR text of every page of a PDF, in page order.

    The PDF bytes are rendered to one image per page with
    ``convert_from_bytes`` and each image is passed through
    ``pytesseract.image_to_string``. The per-page results are concatenated
    with no extra separator added between pages.

    Args:
        pdf_file: Binary file-like object exposing ``read()``, such as the
            value returned by the upload widget.

    Returns:
        A single string holding the recognised text of all pages.
    """
    # Rewind before reading: if the stream was already consumed (for example
    # by an earlier pass over the same upload), read() would otherwise return
    # empty or truncated bytes.
    try:
        pdf_file.seek(0)
    except (AttributeError, OSError):
        # Stream cannot be rewound; read from the current position instead.
        pass

    page_images = convert_from_bytes(pdf_file.read())
    # join() builds the result in one pass instead of repeated `+=`.
    return "".join(pytesseract.image_to_string(page) for page in page_images)
# Placeholder reported for any field that could not be located.
_NOT_FOUND = "Not found"

# Every pattern has exactly one capture group: the field value.
# All of them tolerate whitespace before the optional ':' / '-' separator.

# `\b` keeps the bare "No" label from firing inside ordinary words such as
# "Another" or "Notes" (the pattern is case-insensitive).
_INVOICE_NUMBER_RE = re.compile(
    r'\b(?:Invoice\s*Number|No\b\.?)\s*[:\-]?\s*([A-Za-z0-9\-]+)',
    re.IGNORECASE,
)

# Case-insensitive so all-caps labels ("DATE:") are recognised; the leading
# `\b` stops words that merely end in "date" (e.g. "update") from matching.
# Day/month may be one or two digits.
_DATE_RE = re.compile(
    r'\b(?:Invoice\s*)?Date\s*[:\-]?\s*'
    r'([0-9]{1,4}[\/\-\.][0-9]{1,2}[\/\-\.][0-9]{1,4})',
    re.IGNORECASE,
)

# Amount must carry exactly two decimals; an optional currency sign is skipped.
_TOTAL_AMOUNT_RE = re.compile(
    r'(?:Total\s*Amount|Amount\s*Due|Grand\s*Total)\s*[:\-]?\s*[\$₹€]?\s*'
    r'([0-9,]+\.\d{2})',
    re.IGNORECASE,
)

# Deliberately case-sensitive: a lowercase "from" is too common in running
# text to be treated as a label. Captures the rest of the line.
_SUPPLIER_RE = re.compile(r'\b(?:From|Supplier|Billed\s*By)\b\s*[:\-]?\s*(.*)')


def parse_invoice_text(text):
    """Extract a few well-known invoice fields from free text using regexes.

    For each field the first match in ``text`` wins.

    Args:
        text: Invoice text (typically OCR output). ``None`` or an empty
            string is accepted and yields "Not found" for every field.

    Returns:
        A dict with the keys "Invoice Number", "Date", "Total Amount" and
        "Supplier Name". Each value is the matched string with surrounding
        whitespace removed, or "Not found" when the field is absent or empty.
    """
    text = text or ""

    def first_value(pattern):
        """Return the stripped capture of *pattern*, or the placeholder."""
        match = pattern.search(text)
        value = match.group(1).strip() if match else ""
        # An empty capture (label with nothing after it) counts as missing.
        return value or _NOT_FOUND

    return {
        "Invoice Number": first_value(_INVOICE_NUMBER_RE),
        "Date": first_value(_DATE_RE),
        "Total Amount": first_value(_TOTAL_AMOUNT_RE),
        "Supplier Name": first_value(_SUPPLIER_RE),
    }
# ---- Main flow: runs only once a PDF has been uploaded --------------------
if uploaded_file is not None:
    # The spinner wraps only the slow part (OCR + parsing), so it is gone
    # before the results are rendered.
    with st.spinner("🔍 Extracting data from invoice..."):
        text = extract_text_from_pdf(uploaded_file)
        extracted_data = parse_invoice_text(text)

    # One single-row table feeds both the on-page view and the download,
    # instead of building the same DataFrame twice.
    df = pd.DataFrame([extracted_data])

    st.success("✅ Extraction Complete!")
    st.subheader("Extracted Information:")
    st.write(df)

    # Offer the extracted row as a CSV download.
    csv = df.to_csv(index=False)
    st.download_button("📥 Download as CSV", csv, "invoice_data.csv", "text/csv")