Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import pandas as pd | |
def extract_tables_from_pdf(pdf_path):
    """Extract whitespace-delimited rows of text from every page of a PDF.

    This is a naive heuristic rather than real table detection: the plain
    text of each page is split into lines, and every line is split into
    whitespace-separated tokens. PDFs whose cells contain spaces, or whose
    layout is not line-oriented, need custom parsing instead.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one entry per page that yielded any tokens. Each entry
        is a list of rows, and each row is a list of string tokens. Pages
        without any non-blank text are omitted.
    """
    tables = []
    # The context manager closes the document even if extraction fails
    # part-way; previously the file handle was never released.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            # Filter on the stripped line so whitespace-only lines are
            # skipped as well, instead of producing empty rows.
            table_data = [
                row.split() for row in text.split("\n") if row.strip()
            ]
            if table_data:
                tables.append(table_data)
    return tables
def save_tables_to_excel(tables, excel_output_path):
    """Write each table to its own worksheet of a single Excel workbook.

    Sheets are named "Sheet1", "Sheet2", ... in the order of ``tables``.
    Rows of unequal length are padded by pandas with empty cells.

    Args:
        tables: Iterable of tables, each a list of rows (lists of cell
            values), as returned by ``extract_tables_from_pdf``.
        excel_output_path: Destination path of the ``.xlsx`` file.
    """
    with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
        if not tables:
            # openpyxl refuses to save a workbook with no worksheets, so
            # an empty input would otherwise fail with an opaque
            # IndexError when the writer closes. Emit an empty sheet.
            pd.DataFrame().to_excel(writer, sheet_name="Sheet1", index=False)
        for sheet_number, table in enumerate(tables, start=1):
            df = pd.DataFrame(table)
            df.to_excel(writer, sheet_name=f"Sheet{sheet_number}", index=False)
    # Report only after the writer has closed, i.e. once the file is on disk.
    print(f"Excel file saved as: {excel_output_path}")
# Example usage: convert one PDF into an Excel workbook.
# Input and output locations are declared together, ahead of any work.
pdf_path = "your_pdf_file.pdf"
excel_output_path = "output.xlsx"

# Pull the per-page row data out of the PDF, then write it to the workbook.
tables = extract_tables_from_pdf(pdf_path)
save_tables_to_excel(tables, excel_output_path)