AzizWazir committed on
Commit
5432d3d
·
verified ·
1 Parent(s): ea1977e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -18
app.py CHANGED
@@ -1,30 +1,39 @@
 
1
  import pandas as pd
2
 
3
- def pdf_to_excel(pdf_path, excel_output_path):
4
- # Example: If your PDF has structured data that can be parsed into a table
5
- # (You can use libraries like pdfplumber for extracting tables)
6
-
7
- tables = [] # List to store the extracted tables
8
-
9
- # Example of extracting a table (this part would depend on your PDF content)
10
- # Extract tables using pdfplumber, PyMuPDF, or a similar library
11
- # Example with pdfplumber (if tables are present in your PDF)
12
- import pdfplumber
13
- with pdfplumber.open(pdf_path) as pdf:
14
- for page in pdf.pages:
15
- table = page.extract_table()
16
- if table:
17
- tables.append(table)
 
 
 
 
 
18
 
19
- # Write the extracted tables to an Excel file
 
 
20
  with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
21
  for i, table in enumerate(tables):
22
- df = pd.DataFrame(table[1:], columns=table[0]) # Converting to DataFrame
23
  df.to_excel(writer, sheet_name=f"Sheet{i+1}", index=False)
24
 
25
  print(f"Excel file saved as: {excel_output_path}")
26
 
27
  # Example usage
28
  pdf_path = "your_pdf_file.pdf"
 
29
  excel_output_path = "output.xlsx"
30
- pdf_to_excel(pdf_path, excel_output_path)
 
1
+ import fitz # PyMuPDF
2
  import pandas as pd
3
 
4
+ def extract_tables_from_pdf(pdf_path):
5
+ # Open the PDF
6
+ doc = fitz.open(pdf_path)
7
+
8
+ tables = []
9
+
10
+ # Iterate through the pages to extract text or structured data
11
+ for page_num in range(len(doc)):
12
+ page = doc.load_page(page_num)
13
+
14
+ # Get the text from the page, you can then parse it for tables
15
+ text = page.get_text("text")
16
+
17
+ # Example: Extracting data from text and forming a table
18
+ # You might need to apply custom parsing depending on the structure of your PDF
19
+ rows = text.split("\n") # Split by newlines
20
+ table_data = [row.split() for row in rows if row] # Split by spaces, or another delimiter
21
+
22
+ if table_data:
23
+ tables.append(table_data)
24
 
25
+ return tables
26
+
27
+ def save_tables_to_excel(tables, excel_output_path):
28
  with pd.ExcelWriter(excel_output_path, engine='openpyxl') as writer:
29
  for i, table in enumerate(tables):
30
+ df = pd.DataFrame(table) # Create a DataFrame from the table
31
  df.to_excel(writer, sheet_name=f"Sheet{i+1}", index=False)
32
 
33
  print(f"Excel file saved as: {excel_output_path}")
34
 
35
  # Example usage
36
  pdf_path = "your_pdf_file.pdf"
37
+ tables = extract_tables_from_pdf(pdf_path)
38
  excel_output_path = "output.xlsx"
39
+ save_tables_to_excel(tables, excel_output_path)