Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

Update app.py

5432d3d verified over 1 year ago

1.36 kB

	import fitz # PyMuPDF
	import pandas as pd

	def extract_tables_from_pdf(pdf_path):
	    """Extract whitespace-delimited rows of text from every page of a PDF.

	    Each page's plain text is split into lines, and each line is split on
	    whitespace into cells. This is a naive heuristic rather than real table
	    detection; PDFs with a different layout need custom parsing.

	    Args:
	        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        A list with one entry per page that produced at least one row. Each
	        entry is a list of rows and each row is a list of cell strings.
	        Pages that yield no rows are skipped.
	    """
	    tables = []

	    # The context manager closes the document even if reading a page fails.
	    with fitz.open(pdf_path) as doc:
	        for page in doc:
	            # Plain-text extraction; the table structure is inferred below.
	            text = page.get_text("text")

	            # Split each line on whitespace and drop lines that produce no
	            # cells, so blank or whitespace-only lines never become empty rows.
	            split_lines = (line.split() for line in text.split("\n"))
	            table_data = [cells for cells in split_lines if cells]

	            if table_data:
	                tables.append(table_data)

	    return tables

	def save_tables_to_excel(tables, excel_output_path):
	    """Write each table to its own sheet of an Excel workbook.

	    Sheets are named ``Sheet1``, ``Sheet2``, ... in the order the tables are
	    given. When ``tables`` is empty, a single empty ``Sheet1`` is written so
	    that a valid workbook is still produced.

	    Args:
	        tables: Iterable of tables, each one accepted by ``pd.DataFrame``
	            (for example a list of rows, each row a list of cell values).
	        excel_output_path: Destination path of the ``.xlsx`` file.
	    """
	    with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
	        sheet_count = 0
	        for sheet_count, table in enumerate(tables, start=1):
	            df = pd.DataFrame(table)
	            df.to_excel(writer, sheet_name=f"Sheet{sheet_count}", index=False)

	        if sheet_count == 0:
	            # openpyxl cannot save a workbook that has no sheets, so emit an
	            # empty placeholder instead of failing when the writer closes.
	            pd.DataFrame().to_excel(writer, sheet_name="Sheet1", index=False)

	    print(f"Excel file saved as: {excel_output_path}")

	# Example usage: extract the tables from a PDF and save them to an Excel file.
	pdf_path = "your_pdf_file.pdf"
	excel_output_path = "output.xlsx"
	tables = extract_tables_from_pdf(pdf_path)
	save_tables_to_excel(tables, excel_output_path)