Spaces:

jithenderchoudary
/

pdftowxcel

Runtime error

File size: 2,518 Bytes

c78159a
 
62b9228
c78159a
cee176e
c78159a
 
 
 
cee176e
c78159a
 
 
cee176e
c78159a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cee176e
c78159a
cee176e
c78159a
 
cee176e
c78159a
cee176e
c78159a
 
 
 
cee176e
c78159a
 
 
 
 
 
 
 
 
cee176e
c78159a

import streamlit as st
import pdfplumber
import pandas as pd
from io import BytesIO

def is_header(text):
    """Identify headers using common keywords."""
    keywords = ['Purchase Order', 'Supplier Order', 'GSTIN', 'Annexure', 'Terms', 'Currency']
    return any(keyword in text for keyword in keywords)

def extract_cleaned_tables(pdf_file):
    """Extract tables while skipping headers and arranging them by pages."""
    tables = []
    
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text()
            
            # Skip pages with header-heavy content
            if is_header(text):
                continue
            
            page_tables = page.extract_tables()
            for table in page_tables:
                if table:
                    df = pd.DataFrame(table[1:], columns=table[0])
                    # Fix misalignment issues (if 'Unit' in wrong columns, move it)
                    if 'Delivery Date' in df.columns and 'Unit' in df.columns:
                        mask = df['Delivery Date'].str.contains(r'NOS|PCS', na=False)
                        df.loc[mask, 'Unit'] = df.loc[mask, 'Delivery Date']
                        df.loc[mask, 'Delivery Date'] = None
                    
                    tables.append((f"Page_{page_num+1}", df))
    
    return tables

# Streamlit App
st.title("Enhanced PO Extraction Tool")

uploaded_file = st.file_uploader("Upload PO PDF", type=["pdf"])

if uploaded_file:
    try:
        # Extract and clean tables from the uploaded PDF
        extracted_tables = extract_cleaned_tables(uploaded_file)

        if extracted_tables:
            st.success("Tables extracted successfully!")
            
            # Create an Excel file with multiple sheets
            excel_buffer = BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                for sheet_name, df in extracted_tables:
                    df.to_excel(writer, index=False, sheet_name=sheet_name)
            excel_buffer.seek(0)

            # Provide download options
            st.download_button(
                label="Download as Excel",
                data=excel_buffer,
                file_name="po_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.warning("No valid tables found.")
    except Exception as e:
        st.error(f"An error occurred: {e}")