File size: 2,518 Bytes
c78159a
 
62b9228
c78159a
cee176e
c78159a
 
 
 
cee176e
c78159a
 
 
cee176e
c78159a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cee176e
c78159a
cee176e
c78159a
 
cee176e
c78159a
cee176e
c78159a
 
 
 
cee176e
c78159a
 
 
 
 
 
 
 
 
cee176e
c78159a
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import streamlit as st
import pdfplumber
import pandas as pd
from io import BytesIO

def is_header(text):
    """Identify headers using common keywords."""
    keywords = ['Purchase Order', 'Supplier Order', 'GSTIN', 'Annexure', 'Terms', 'Currency']
    return any(keyword in text for keyword in keywords)

def extract_cleaned_tables(pdf_file):
    """Extract tables while skipping headers and arranging them by pages."""
    tables = []
    
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text()
            
            # Skip pages with header-heavy content
            if is_header(text):
                continue
            
            page_tables = page.extract_tables()
            for table in page_tables:
                if table:
                    df = pd.DataFrame(table[1:], columns=table[0])
                    # Fix misalignment issues (if 'Unit' in wrong columns, move it)
                    if 'Delivery Date' in df.columns and 'Unit' in df.columns:
                        mask = df['Delivery Date'].str.contains(r'NOS|PCS', na=False)
                        df.loc[mask, 'Unit'] = df.loc[mask, 'Delivery Date']
                        df.loc[mask, 'Delivery Date'] = None
                    
                    tables.append((f"Page_{page_num+1}", df))
    
    return tables

# Streamlit App
st.title("Enhanced PO Extraction Tool")

uploaded_file = st.file_uploader("Upload PO PDF", type=["pdf"])

if uploaded_file:
    try:
        # Extract and clean tables from the uploaded PDF
        extracted_tables = extract_cleaned_tables(uploaded_file)

        if extracted_tables:
            st.success("Tables extracted successfully!")
            
            # Create an Excel file with multiple sheets
            excel_buffer = BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                for sheet_name, df in extracted_tables:
                    df.to_excel(writer, index=False, sheet_name=sheet_name)
            excel_buffer.seek(0)

            # Provide download options
            st.download_button(
                label="Download as Excel",
                data=excel_buffer,
                file_name="po_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.warning("No valid tables found.")
    except Exception as e:
        st.error(f"An error occurred: {e}")