Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

AzizWazir committed on Dec 29, 2024

Commit

5432d3d

verified ·

1 Parent(s): ea1977e

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,30 +1,39 @@
 import pandas as pd
-def pdf_to_excel(pdf_path, excel_output_path):
-    # Example: If your PDF has structured data that can be parsed into a table
-    # (You can use libraries like pdfplumber for extracting tables)
-    tables = []  # List to store the extracted tables
-    # Example of extracting a table (this part would depend on your PDF content)
-    # Extract tables using pdfplumber, PyMuPDF, or a similar library
-    # Example with pdfplumber (if tables are present in your PDF)
-    import pdfplumber
-    with pdfplumber.open(pdf_path) as pdf:
-        for page in pdf.pages:
-            table = page.extract_table()
-            if table:
-                tables.append(table)
-    # Write the extracted tables to an Excel file
     with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
         for i, table in enumerate(tables):
-            df = pd.DataFrame(table[1:], columns=table[0])  # Converting to DataFrame
             df.to_excel(writer, sheet_name=f"Sheet{i+1}", index=False)
     print(f"Excel file saved as: {excel_output_path}")
 # Example usage
 pdf_path = "your_pdf_file.pdf"
 excel_output_path = "output.xlsx"
-pdf_to_excel(pdf_path, excel_output_path)

+import fitz  # PyMuPDF
 import pandas as pd
+def extract_tables_from_pdf(pdf_path):
+    """Extract whitespace-delimited text rows from every page of a PDF.
+
+    Each page's plain text is split into lines, and each line into
+    whitespace-separated tokens. This is a simple heuristic, not real table
+    detection; PDFs with other layouts will need custom parsing.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        A list with one entry per page that produced at least one row. Each
+        entry is a list of rows, and each row is a list of string tokens.
+    """
+    tables = []
+    # Open the document as a context manager so the file handle is released
+    # even if text extraction fails part-way through.
+    with fitz.open(pdf_path) as doc:
+        for page in doc:
+            # Plain-text extraction; table structure is inferred from it below.
+            text = page.get_text("text")
+            # Skip blank and whitespace-only lines, which would otherwise
+            # turn into empty rows in the output.
+            table_data = [row.split() for row in text.split("\n") if row.strip()]
+            if table_data:
+                tables.append(table_data)
+    return tables
+def save_tables_to_excel(tables, excel_output_path):
+    """Write each table to its own worksheet of a single Excel workbook.
+
+    Args:
+        tables: Sequence of tables, each convertible to a DataFrame (for
+            example a list of rows). Table number ``i`` (0-based) is written
+            to the sheet named ``Sheet{i+1}``.
+        excel_output_path: Path of the ``.xlsx`` file to create.
+    """
+    # A workbook with no sheets cannot be saved by openpyxl, so return early
+    # instead of crashing when nothing was extracted.
+    if not tables:
+        print("No tables found; Excel file was not created.")
+        return
     with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
         for i, table in enumerate(tables):
+            df = pd.DataFrame(table)  # Create a DataFrame from the table
             df.to_excel(writer, sheet_name=f"Sheet{i+1}", index=False)
     print(f"Excel file saved as: {excel_output_path}")
 # Example usage
 pdf_path = "your_pdf_file.pdf"
+tables = extract_tables_from_pdf(pdf_path)
 excel_output_path = "output.xlsx"
+save_tables_to_excel(tables, excel_output_path)