Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,39 +1,37 @@
|
|
|
|
|
| 1 |
import fitz # PyMuPDF
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
-
def extract_tables_from_pdf(
|
| 5 |
-
# Open the PDF
|
| 6 |
-
doc = fitz.open(
|
| 7 |
-
|
| 8 |
tables = []
|
| 9 |
|
| 10 |
-
# Iterate through the pages to extract text or structured data
|
| 11 |
for page_num in range(len(doc)):
|
| 12 |
page = doc.load_page(page_num)
|
| 13 |
-
|
| 14 |
-
# Get the text from the page, you can then parse it for tables
|
| 15 |
text = page.get_text("text")
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
# You might need to apply custom parsing depending on the structure of your PDF
|
| 19 |
-
rows = text.split("\n") # Split by newlines
|
| 20 |
-
table_data = [row.split() for row in rows if row] # Split by spaces, or another delimiter
|
| 21 |
-
|
| 22 |
if table_data:
|
| 23 |
tables.append(table_data)
|
| 24 |
|
| 25 |
return tables
|
| 26 |
|
| 27 |
-
def
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
tables = extract_tables_from_pdf(pdf_path)
|
| 38 |
-
excel_output_path = "output.xlsx"
|
| 39 |
-
save_tables_to_excel(tables, excel_output_path)
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
import fitz # PyMuPDF
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
+
def extract_tables_from_pdf(uploaded_file):
    """Extract whitespace-delimited rows of text from each page of a PDF.

    Args:
        uploaded_file: The PDF to read. May be a filesystem path (``str`` or
            ``os.PathLike``), raw PDF bytes, or a binary file-like object
            exposing ``getvalue()`` or ``read()`` (for example an in-memory
            upload).

    Returns:
        A list with one entry per page that produced any text. Each entry is
        a list of rows, and each row is the list of whitespace-separated
        tokens found on one line of that page. Pages without text are
        omitted.
    """
    import os  # local import keeps this block self-contained

    if isinstance(uploaded_file, (str, os.PathLike)):
        # A path on disk can be handed to PyMuPDF directly.
        doc = fitz.open(uploaded_file)
    else:
        if isinstance(uploaded_file, (bytes, bytearray)):
            pdf_bytes = bytes(uploaded_file)
        else:
            # Prefer getvalue(): unlike read(), it does not depend on the
            # current stream position of an already-consumed buffer.
            fetch = getattr(uploaded_file, "getvalue", None) or uploaded_file.read
            pdf_bytes = fetch()
        # In-memory data must be passed as ``stream``; the file type has to
        # be given explicitly because there is no file name to infer it from.
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

    tables = []
    # The context manager closes the document even if a page fails to parse.
    with doc:
        for page in doc:
            text = page.get_text("text")
            # Tokenise each line; whitespace-only lines yield no tokens and
            # are dropped so they do not appear as empty rows.
            table_data = [
                tokens
                for tokens in (line.split() for line in text.splitlines())
                if tokens
            ]
            if table_data:
                tables.append(table_data)

    return tables
|
| 19 |
|
| 20 |
+
def main():
    """Render the Streamlit page: accept a PDF upload and show its rows.

    Each page-level result returned by ``extract_tables_from_pdf`` is
    displayed as its own pandas ``DataFrame``. A fallback message is shown
    when the extraction returns nothing.
    """
    st.title("PDF Table Extraction Tool")

    pdf_upload = st.file_uploader("Upload a PDF file", type=["pdf"])
    # Skip all processing while no file has been provided.
    if pdf_upload is None:
        return

    page_tables = extract_tables_from_pdf(pdf_upload)
    if not page_tables:
        st.write("No tables found in the PDF.")
        return

    st.write("Extracted Tables:")
    for rows in page_tables:
        st.write(pd.DataFrame(rows))


# Run the app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|