Spaces:
Build error
Build error
import os
import sys
import tempfile

import fitz  # PyMuPDF
import streamlit as st
from huggingface_hub import snapshot_download
from pdf2image import convert_from_path
from PIL import Image
# Streamlit app: upload a PDF, render each page to an image, run the
# PDF-Extract-Kit table recognizer on every page and display what it returns.
#
# Streamlit re-executes this whole script, top to bottom, on every user
# interaction inside one long-lived process, so anything expensive or
# accumulating (model load, sys.path edits, temp files) is guarded below.

# Step 1: Download model if not present
MODEL_DIR = "./pdf-extract-kit"
if not os.path.exists(MODEL_DIR):
    with st.spinner("Downloading model..."):
        snapshot_download(
            repo_id="opendatalab/pdf-extract-kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=20,
        )

# Step 2: Import model logic dynamically
# Only register the inference directory once; without the membership check
# every rerun would append another duplicate entry to sys.path.
INFERENCE_DIR = os.path.join(MODEL_DIR, "inference")
if INFERENCE_DIR not in sys.path:
    sys.path.append(INFERENCE_DIR)

try:
    from table_recognizer import TableRecognizer
except ImportError:
    # NOTE(review): the leading glyph was mis-encoded in the source ("β");
    # restored as a cross mark -- confirm the intended emoji.
    st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
    st.stop()


# Step 3: Set up recognizer
@st.cache_resource
def _load_table_model():
    """Build the table recognizer once and reuse it across script reruns."""
    return TableRecognizer(
        model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
        device="cpu",  # Change to 'cuda' if using GPU
    )


def _pdf_bytes_to_images(pdf_bytes: bytes) -> list:
    """Render the pages of a PDF, given as raw bytes, to a list of images.

    The bytes are written to a temporary ``.pdf`` file because
    ``convert_from_path`` takes a filesystem path. The file is always removed
    afterwards, even when conversion raises.
    """
    # delete=False so the file can be closed before it is handed to
    # convert_from_path by path; cleanup is done explicitly in the finally.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(pdf_bytes)
        tmp_pdf_path = tmp_pdf.name
    try:
        # NOTE(review): assumes the returned images no longer need the PDF on
        # disk once convert_from_path has returned -- confirm.
        return convert_from_path(tmp_pdf_path)
    finally:
        os.remove(tmp_pdf_path)


table_model = _load_table_model()

# NOTE(review): the leading glyph was mis-encoded in the source ("π");
# restored as a document emoji -- confirm the intended emoji.
st.title("📄 PDF Table Extractor")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # getvalue() returns the full upload regardless of the current stream
    # position, unlike read().
    images = _pdf_bytes_to_images(uploaded_file.getvalue())

    for page_number, img in enumerate(images, start=1):
        st.subheader(f"Page {page_number}")
        st.image(img, caption="Original Page", use_column_width=True)

        # Step 4: Run Table Recognizer
        with st.spinner("Extracting tables..."):
            # Assumes the model accepts a PIL image and returns an iterable
            # of per-table results -- TODO confirm against table_recognizer.
            table_results = table_model(img)

        if table_results:
            for table_number, table in enumerate(table_results, start=1):
                st.markdown(f"#### Table {table_number}")
                # Assumes table["data"] is a 2D list or pandas DataFrame.
                st.dataframe(table["data"])
        else:
            st.info("No tables detected on this page.")