# AbhijitClemson's picture
# Upload folder using huggingface_hub
# 1adc2e7 verified
import os
import tempfile

import pandas as pd
import pymupdf
import streamlit as st
import tabula
from tqdm import tqdm
def extract_tables_pymupdf(pdf_path):
    """Extract every table that PyMuPDF detects in a PDF file.

    Each page is scanned with ``page.find_tables()``. For every table
    whose extracted data is non-empty, the first row is used as the
    column header and the remaining rows become the DataFrame body.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of dicts in page order, one per table. Each dict has the
        keys ``'page'`` (1-based page number) and ``'dataframe'`` (a
        ``pandas.DataFrame``). If anything goes wrong, the error is shown
        with ``st.error`` and an empty list is returned.
    """
    try:
        # The context manager closes the document even when extraction
        # raises part-way through; previously it was only closed on the
        # success path.
        with pymupdf.open(pdf_path) as doc:
            all_tables = []
            for page_num, page in enumerate(doc, start=1):
                for table in page.find_tables():
                    table_data = table.extract()
                    if not table_data:
                        continue
                    # First extracted row is treated as the header row.
                    header, *rows = table_data
                    all_tables.append({
                        'page': page_num,
                        'dataframe': pd.DataFrame(rows, columns=header),
                    })
            return all_tables
    except Exception as e:
        # Top-level UI boundary: report the failure to the user and fall
        # back to "no tables" instead of crashing the app.
        st.error(f"Error extracting tables with PyMuPDF: {e}")
        return []
def main():
    """Render the Streamlit page: upload a PDF and display its tables.

    The uploaded file is written to a uniquely named temporary file,
    passed to ``extract_tables_pymupdf``, and every table found is shown
    with its page number. The temporary file is always removed before
    the function returns.
    """
    st.title("PDF Table Extractor")
    st.write("Upload a PDF to extract all tables")

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        return

    # A unique temp file per run: a fixed filename in the working
    # directory would be shared by all concurrent sessions, letting one
    # session overwrite or delete another session's upload.
    # delete=False because the file is reopened by path after closing.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    temp_path = tmp.name
    try:
        # Close the handle before the extractor reopens the file by path.
        with tmp:
            tmp.write(uploaded_file.getbuffer())

        # Using PyMuPDF
        tables = extract_tables_pymupdf(temp_path)
        if tables:
            st.success(f"Found {len(tables)} tables!")
            for number, table_info in enumerate(tables, start=1):
                st.subheader(f"Table {number} (Page {table_info['page']})")
                st.dataframe(table_info['dataframe'], use_container_width=True)
    finally:
        # Clean up the temp file even if extraction or rendering failed.
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            pass