yashm committed on
Commit
da4ce7a
·
verified ·
1 Parent(s): 48902e5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app that extracts tables from one page of an uploaded PDF.

The user uploads a PDF, picks a page, and presses "Extract Tables". Every
table Camelot finds on that page is shown and offered as CSV and Excel
downloads.
"""

import io
import os
import tempfile

import camelot
import fitz  # PyMuPDF
import pandas as pd
import streamlit as st

XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

# Key under which the most recent extraction result is kept across reruns.
_RESULT_KEY = "extraction_result"


def _count_pages(pdf_bytes):
    """Return the number of pages in the PDF given as raw bytes."""
    # Open in a context manager so the document is closed again; the
    # original script left it open on every rerun.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
        return document.page_count


def _extract_tables(pdf_bytes, page_num):
    """Return one DataFrame per table Camelot finds on ``page_num``.

    Camelot is given a file path, so the bytes are written to a unique
    temporary file that is removed afterwards. A fixed name such as
    ``temp.pdf`` would be shared (and overwritten) by concurrent sessions.
    """
    # Keep the ".pdf" suffix the original file name had.
    # NOTE(review): Camelot appears to validate the extension - confirm.
    fd, path = tempfile.mkstemp(suffix=".pdf")
    try:
        with os.fdopen(fd, "wb") as handle:
            handle.write(pdf_bytes)
        tables = camelot.read_pdf(path, pages=str(page_num))
        return [table.df for table in tables]
    finally:
        os.remove(path)


def _to_excel_bytes(df):
    """Serialize ``df`` to an in-memory .xlsx file and return its bytes.

    ``DataFrame.to_excel`` needs a target to write to and returns ``None``,
    so the workbook is written into a ``BytesIO`` buffer through an
    ``ExcelWriter`` and the buffer contents are returned.
    """
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
        df.to_excel(writer, index=False)
    return buffer.getvalue()


# Set the title of the Streamlit app
st.title("PDF Table Extractor")

# Instructions
st.write("Upload a PDF file containing tables, and this app will extract the tables for you.")

# File uploader widget
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    pdf_bytes = uploaded_file.getvalue()

    # Show the number of pages in the PDF
    num_pages = _count_pages(pdf_bytes)
    st.write(f"The uploaded PDF has {num_pages} pages.")

    # Let the user select a page to extract tables from
    page_num = st.number_input(
        "Select the page number to extract tables from",
        min_value=1,
        max_value=num_pages,
        value=1,
    )

    # Identifies what the stored result belongs to, so a result for another
    # file or page is never shown.
    request = (uploaded_file.name, uploaded_file.size, int(page_num))

    if st.button("Extract Tables"):
        st.session_state[_RESULT_KEY] = (
            request,
            _extract_tables(pdf_bytes, int(page_num)),
        )

    # Render from session state rather than inside the button branch:
    # clicking a download button reruns the script with the button reported
    # as not pressed, which would otherwise make the tables disappear.
    stored = st.session_state.get(_RESULT_KEY)
    if stored is not None and stored[0] == request:
        frames = stored[1]
        if frames:
            st.write(f"Found {len(frames)} table(s) on page {page_num}.")

            for number, df in enumerate(frames, start=1):
                st.write(f"Table {number}:")

                # Display the extracted table
                st.dataframe(df)

                # Download button for CSV
                st.download_button(
                    label=f"Download Table {number} as CSV",
                    data=df.to_csv(index=False).encode("utf-8"),
                    file_name=f"table_{number}.csv",
                    mime="text/csv",
                )

                # Download button for Excel
                st.download_button(
                    label=f"Download Table {number} as Excel",
                    data=_to_excel_bytes(df),
                    file_name=f"table_{number}.xlsx",
                    mime=XLSX_MIME,
                )
        else:
            st.write("No tables found on the selected page.")