# PDF-Convertor / app.py
# Page header captured with the file: AzizWazir, "Update app.py", commit 5432d3d (verified), 1.36 kB.
import fitz # PyMuPDF
import pandas as pd
def extract_tables_from_pdf(pdf_path):
    """Extract whitespace-delimited rows of text from each page of a PDF.

    The plain text of every page is split into lines, and every non-blank
    line is split on whitespace into tokens. This is a simple text heuristic,
    not layout-aware table detection; PDFs whose cells contain spaces will
    need custom parsing.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one entry per page that yielded at least one row. Each
        entry is a list of rows, and each row is a list of string tokens.
    """
    tables = []
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            # Skip blank *and* whitespace-only lines: the latter would
            # otherwise turn into empty rows in the resulting table.
            table_data = [
                line.split() for line in text.split("\n") if line.strip()
            ]
            if table_data:
                tables.append(table_data)
    return tables
def save_tables_to_excel(tables, excel_output_path):
    """Write each table to its own sheet of an Excel workbook.

    Sheets are named ``Sheet1``, ``Sheet2``, ... in the order the tables are
    given. Every table is converted to a DataFrame first, so rows of unequal
    length are padded with missing values. A confirmation message is printed
    once the workbook has been written.

    Args:
        tables: Iterable of tables, each a list of rows (lists of cells).
        excel_output_path: Destination path of the ``.xlsx`` file.
    """
    with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
        if not tables:
            # openpyxl cannot save a workbook without any sheet, so emit a
            # single empty one instead of failing when nothing was extracted.
            pd.DataFrame().to_excel(writer, sheet_name="Sheet1", index=False)
        for sheet_number, table in enumerate(tables, start=1):
            frame = pd.DataFrame(table)
            frame.to_excel(
                writer, sheet_name=f"Sheet{sheet_number}", index=False
            )
    print(f"Excel file saved as: {excel_output_path}")
# Example usage: replace the placeholder paths below with real ones.
pdf_path = "your_pdf_file.pdf"
excel_output_path = "output.xlsx"

# Only run the conversion when executed as a script, so importing this module
# does not try to open the placeholder PDF.
if __name__ == "__main__":
    tables = extract_tables_from_pdf(pdf_path)
    save_tables_to_excel(tables, excel_output_path)