Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

AzizWazir committed on Dec 29, 2024

Commit

f4a8154

verified ·

1 Parent(s): c775bd6

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,39 +1,37 @@
 import fitz  # PyMuPDF
 import pandas as pd
-def extract_tables_from_pdf(pdf_path):
-    # Open the PDF
-    doc = fitz.open(pdf_path)
     tables = []
-    # Iterate through the pages to extract text or structured data
     for page_num in range(len(doc)):
         page = doc.load_page(page_num)
-        # Get the text from the page, you can then parse it for tables
         text = page.get_text("text")
-        # Example: Extracting data from text and forming a table
-        # You might need to apply custom parsing depending on the structure of your PDF
-        rows = text.split("\n")  # Split by newlines
-        table_data = [row.split() for row in rows if row]  # Split by spaces, or another delimiter
         if table_data:
             tables.append(table_data)
     return tables
-def save_tables_to_excel(tables, excel_output_path):
-    with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
-        for i, table in enumerate(tables):
-            df = pd.DataFrame(table)  # Create a DataFrame from the table
-            df.to_excel(writer, sheet_name=f"Sheet{i+1}", index=False)
-    print(f"Excel file saved as: {excel_output_path}")
-# Example usage
-pdf_path = "your_pdf_file.pdf"
-tables = extract_tables_from_pdf(pdf_path)
-excel_output_path = "output.xlsx"
-save_tables_to_excel(tables, excel_output_path)

+import streamlit as st
 import fitz  # PyMuPDF
 import pandas as pd
+def extract_tables_from_pdf(uploaded_file):
+    """Extract whitespace-split text rows from every page of a PDF.
+
+    Args:
+        uploaded_file: Either a binary file-like object exposing ``read()``
+            (such as the object returned by ``st.file_uploader``) or a
+            filesystem path to a PDF.
+
+    Returns:
+        A list with one entry per page that produced text. Each entry is a
+        list of rows, and each row is the list of whitespace-separated
+        tokens found on one non-blank line of that page.
+    """
+    if hasattr(uploaded_file, "read"):
+        # PyMuPDF cannot open a file-like object positionally; in-memory
+        # data has to be passed as bytes via ``stream`` with a file type.
+        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
+    else:
+        doc = fitz.open(uploaded_file)
     tables = []
+    # The context manager closes the document even if a page fails to parse.
+    with doc:
+        for page in doc:
+            text = page.get_text("text")
+            # Skip blank / whitespace-only lines so no empty rows are emitted.
+            table_data = [row.split() for row in text.split("\n") if row.strip()]
+            if table_data:
+                tables.append(table_data)
     return tables
+def main():
+    """Render the Streamlit page: upload a PDF and display its extracted rows."""
+    st.title("PDF Table Extraction Tool")
+    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+    # Nothing to process until the user has supplied a file.
+    if uploaded_file is None:
+        return
+    tables = extract_tables_from_pdf(uploaded_file)
+    if not tables:
+        st.write("No tables found in the PDF.")
+        return
+    st.write("Extracted Tables:")
+    # Show each page's rows as its own DataFrame.
+    for page_rows in tables:
+        frame = pd.DataFrame(page_rows)
+        st.write(frame)
+# Run the app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()