Alexvatti committed on
Commit
b6c1dae
·
verified ·
1 Parent(s): a8f360f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import fitz # PyMuPDF
3
+ import pytesseract
4
+ from PIL import Image
5
+ from pdf2image import convert_from_bytes
6
+ import pandas as pd
7
+ import re
8
+
9
# --- Page setup -----------------------------------------------------------
st.set_page_config(
    page_title="Invoice Extractor",
    layout="centered",
)

st.title("🧾 PDF Invoice Data Extractor")
st.write(
    "Upload a PDF invoice and extract mentioned details like "
    "Invoice Number, Date, Total, and more."
)

# The uploader is restricted to PDF files; the result is tested for
# truthiness further down before any extraction is attempted.
uploaded_file = st.file_uploader("Upload your invoice PDF", type=["pdf"])
15
+
16
def extract_text_from_pdf(pdf_file):
    """Return the OCR text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: A binary file-like object; its remaining content is
            consumed with ``read()`` and handed to ``convert_from_bytes``.

    Returns:
        The strings produced by ``pytesseract.image_to_string`` for each
        page image, joined with no separator.
    """
    page_images = convert_from_bytes(pdf_file.read())
    return "".join(pytesseract.image_to_string(page) for page in page_images)
24
+
25
_NOT_FOUND = "Not found"

# Labels are anchored with \b so they cannot match inside longer words
# ("Notes", "Innovation", "Subtotal"). Each pattern captures the value in
# group 1.
_INVOICE_NUMBER_RE = re.compile(
    r'\b(?:Invoice\s*Number|No)(?![A-Za-z])\.?\s*[:\-]?\s*([A-Za-z0-9\-]+)',
    re.IGNORECASE,
)
_DATE_RE = re.compile(
    r'\bDate\s*[:\-]?\s*'
    r'([0-9]{1,4}[\/\-\.][0-9]{1,2}[\/\-\.][0-9]{2,4})',
    re.IGNORECASE,
)
_TOTAL_AMOUNT_RE = re.compile(
    r'\b(?:Total\s*Amount|Amount\s*Due|Grand\s*Total)\s*[:\-]?\s*'
    r'[\$₹€]?\s*([0-9,]+\.\d{2})',
    re.IGNORECASE,
)
# Deliberately not IGNORECASE: a lowercase "from" is common in running
# prose ("30 days from ...") and would produce false supplier matches.
_SUPPLIER_RE = re.compile(
    r'\b(?:From|FROM|Supplier|SUPPLIER|Billed\s*By|BILLED\s*BY)\b'
    r'\s*[:\-]?\s*(.*)'
)


def _first_capture(pattern, text):
    """Return the stripped first capture of *pattern* in *text*, or "Not found"."""
    match = pattern.search(text)
    value = match.group(1).strip() if match else ""
    return value or _NOT_FOUND


def parse_invoice_text(text):
    """Extract common invoice fields from OCR text using regular expressions.

    Args:
        text: Raw text of the invoice. ``None`` or an empty string is
            accepted and yields "Not found" for every field.

    Returns:
        A dict with the keys "Invoice Number", "Date", "Total Amount" and
        "Supplier Name". Each value is the first match for that field with
        surrounding whitespace removed, or the string "Not found" when the
        label is absent or has no value after it.
    """
    text = text or ""
    return {
        "Invoice Number": _first_capture(_INVOICE_NUMBER_RE, text),
        "Date": _first_capture(_DATE_RE, text),
        "Total Amount": _first_capture(_TOTAL_AMOUNT_RE, text),
        # `.` never matches a newline, so the capture is already limited
        # to a single line; stripping removes trailing spaces or "\r".
        "Supplier Name": _first_capture(_SUPPLIER_RE, text),
    }
40
+
41
if uploaded_file:
    # OCR and field parsing run while the spinner is shown.
    with st.spinner("🔍 Extracting data from invoice..."):
        text = extract_text_from_pdf(uploaded_file)
        extracted_data = parse_invoice_text(text)

    # One-row table: a column per extracted field.
    df = pd.DataFrame([extracted_data])

    st.success("✅ Extraction Complete!")
    st.subheader("Extracted Information:")
    st.write(df)

    # Offer the same table as a CSV download.
    csv = df.to_csv(index=False)
    st.download_button(
        "📥 Download as CSV",
        csv,
        file_name="invoice_data.csv",
        mime="text/csv",
    )