Alexvatti committed on
Commit
c53700e
·
verified ·
1 Parent(s): fce75b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -2,28 +2,30 @@ import streamlit as st
2
  import fitz # PyMuPDF
3
  import pytesseract
4
  from PIL import Image
5
- from pdf2image import convert_from_bytes
6
  import pandas as pd
7
  import re
 
8
 
9
  st.set_page_config(page_title="Invoice Extractor", layout="centered")
10
 
11
  st.title("🧾 PDF Invoice Data Extractor")
12
- st.write("Upload a PDF invoice and extract mentioned details like Invoice Number, Date, Total, and more.")
13
 
14
  uploaded_file = st.file_uploader("Upload your invoice PDF", type=["pdf"])
15
 
 
16
  def extract_text_from_pdf(pdf_file):
17
  text = ""
18
- images = convert_from_bytes(pdf_file.read())
19
 
20
- for img in images:
 
 
21
  text += pytesseract.image_to_string(img)
22
 
23
  return text
24
 
25
  def parse_invoice_text(text):
26
- # Simple regex-based field extraction
27
  data = {}
28
  data['Invoice Number'] = re.search(r'(Invoice\s*Number|No\.?)[:\-]?\s*([A-Za-z0-9\-]+)', text, re.IGNORECASE)
29
  data['Date'] = re.search(r'(Date|Invoice Date)[:\-]?\s*([0-9]{2,4}[\/\-\.][0-9]{2}[\/\-\.][0-9]{2,4})', text)
@@ -51,3 +53,4 @@ if uploaded_file:
51
  df = pd.DataFrame([extracted_data])
52
  csv = df.to_csv(index=False)
53
  st.download_button("📥 Download as CSV", csv, "invoice_data.csv", "text/csv")
 
 
2
  import fitz # PyMuPDF
3
  import pytesseract
4
  from PIL import Image
 
5
  import pandas as pd
6
  import re
7
+ import io
8
 
9
  st.set_page_config(page_title="Invoice Extractor", layout="centered")
10
 
11
  st.title("🧾 PDF Invoice Data Extractor")
12
+ st.write("Upload a PDF invoice and extract details like Invoice Number, Date, Total, and more.")
13
 
14
  uploaded_file = st.file_uploader("Upload your invoice PDF", type=["pdf"])
15
 
16
+ # 📌 Renders PDF pages with PyMuPDF (fitz) instead of pdf2image
17
  def extract_text_from_pdf(pdf_file):
18
  text = ""
19
+ doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
20
 
21
+ for page in doc:
22
+ pix = page.get_pixmap(dpi=300) # high-res rendering
23
+ img = Image.open(io.BytesIO(pix.tobytes("png")))
24
  text += pytesseract.image_to_string(img)
25
 
26
  return text
27
 
28
  def parse_invoice_text(text):
 
29
  data = {}
30
  data['Invoice Number'] = re.search(r'(Invoice\s*Number|No\.?)[:\-]?\s*([A-Za-z0-9\-]+)', text, re.IGNORECASE)
31
  data['Date'] = re.search(r'(Date|Invoice Date)[:\-]?\s*([0-9]{2,4}[\/\-\.][0-9]{2}[\/\-\.][0-9]{2,4})', text)
 
53
  df = pd.DataFrame([extracted_data])
54
  csv = df.to_csv(index=False)
55
  st.download_button("📥 Download as CSV", csv, "invoice_data.csv", "text/csv")
56
+