AzizWazir committed on
Commit
f4a8154
·
verified ·
1 Parent(s): c775bd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -24
app.py CHANGED
@@ -1,39 +1,37 @@
 
1
  import fitz # PyMuPDF
2
  import pandas as pd
3
 
4
def extract_tables_from_pdf(pdf_path):
    """Extract whitespace-delimited rows of text from every page of a PDF.

    This is a plain-text heuristic rather than real table detection: each
    non-blank line of a page's text becomes one row, split on runs of
    whitespace.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one entry per page that produced at least one row. Each
        entry is a list of rows, and each row is a list of string tokens.
    """
    tables = []

    # The context manager closes the document even if a page fails to parse;
    # previously the file handle was never released.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")

            # Skip whitespace-only lines: they are truthy but split() to [],
            # which would otherwise add empty rows to the table.
            table_data = [
                row.split() for row in text.split("\n") if row.strip()
            ]

            if table_data:
                tables.append(table_data)

    return tables
26
 
27
def save_tables_to_excel(tables, excel_output_path):
    """Write each table to its own sheet of an Excel workbook.

    Sheets are named ``Sheet1``, ``Sheet2``, ... in input order and are
    written without the DataFrame index.

    Args:
        tables: Sequence of tables; each one is passed to ``pd.DataFrame``.
        excel_output_path: Destination path of the ``.xlsx`` file.

    Returns:
        None. When ``tables`` is empty no file is written and a message is
        printed instead.
    """
    if not tables:
        # A workbook with zero sheets cannot be saved by openpyxl, so bail
        # out early rather than fail while closing the writer.
        print(f"No tables to save; Excel file not written: {excel_output_path}")
        return

    with pd.ExcelWriter(excel_output_path, engine="openpyxl") as writer:
        for sheet_number, table in enumerate(tables, start=1):
            df = pd.DataFrame(table)
            df.to_excel(writer, sheet_name=f"Sheet{sheet_number}", index=False)

    print(f"Excel file saved as: {excel_output_path}")
 
 
 
 
 
 
 
 
34
 
35
# Example usage: convert one PDF into an Excel workbook.
# Both paths are placeholders; point them at real files before running.
pdf_path = "your_pdf_file.pdf"
excel_output_path = "output.xlsx"

tables = extract_tables_from_pdf(pdf_path)
save_tables_to_excel(tables, excel_output_path)
 
1
+ import streamlit as st
2
  import fitz # PyMuPDF
3
  import pandas as pd
4
 
5
def extract_tables_from_pdf(uploaded_file):
    """Extract whitespace-delimited rows of text from every page of a PDF.

    This is a plain-text heuristic rather than real table detection: each
    non-blank line of a page's text becomes one row, split on runs of
    whitespace.

    Args:
        uploaded_file: A binary file-like object holding the PDF (such as the
            value returned by ``st.file_uploader``), or a filesystem path.

    Returns:
        A list with one entry per page that produced at least one row. Each
        entry is a list of rows, and each row is a list of string tokens.
    """
    if hasattr(uploaded_file, "read"):
        # fitz.open() interprets its first positional argument as a filename,
        # so in-memory uploads must be handed over as raw bytes via `stream`.
        if hasattr(uploaded_file, "getvalue"):
            # getvalue() does not depend on the current read position.
            pdf_bytes = uploaded_file.getvalue()
        else:
            pdf_bytes = uploaded_file.read()
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    else:
        doc = fitz.open(uploaded_file)

    tables = []

    # Close the document once all pages are processed, even on error.
    with doc:
        for page in doc:
            text = page.get_text("text")

            # Skip whitespace-only lines: they are truthy but split() to [],
            # which would otherwise add empty rows to the table.
            table_data = [
                row.split() for row in text.split("\n") if row.strip()
            ]

            if table_data:
                tables.append(table_data)

    return tables
19
 
20
def main():
    """Render the Streamlit page: upload a PDF and show the extracted rows."""
    st.title("PDF Table Extraction Tool")

    pdf_upload = st.file_uploader("Upload a PDF file", type=["pdf"])
    if pdf_upload is None:
        # Nothing has been uploaded yet; only the title and uploader show.
        return

    extracted = extract_tables_from_pdf(pdf_upload)
    if not extracted:
        st.write("No tables found in the PDF.")
        return

    st.write("Extracted Tables:")
    for page_rows in extracted:
        # One DataFrame per entry returned by the extractor.
        st.write(pd.DataFrame(page_rows))


if __name__ == "__main__":
    main()