File size: 2,635 Bytes
b1b241b
 
 
 
 
5a48e80
 
 
 
 
 
 
 
b1b241b
 
5a48e80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1b241b
 
 
 
 
 
 
 
 
5a48e80
 
 
 
 
b1b241b
5a48e80
b1b241b
 
5a48e80
b1b241b
 
 
 
 
 
 
 
 
 
5a48e80
b1b241b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import streamlit as st
import pdfplumber
import pandas as pd
from io import BytesIO

def is_header_row(row):
    """Check if a row contains header-like information to skip."""
    keywords = ['Purchase Order', 'GSTIN', 'Terms', 'Annexure', 'Currency']
    return any(keyword in str(row).strip() for keyword in keywords)

def extract_clean_tables(pdf_file):
    """Extracts tables while skipping headers and aligning columns correctly."""
    all_data = []  # Store all extracted data across pages

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            tables = page.extract_tables()

            for table in tables:
                # Convert the table to DataFrame for easier manipulation
                df = pd.DataFrame(table[1:], columns=table[0])

                # Clean up rows with header-like content
                df = df[~df.apply(is_header_row, axis=1)]

                # Fix column misalignment (if units are shifted to delivery date column)
                if 'Delivery Date' in df.columns and 'Unit' in df.columns:
                    mask = df['Delivery Date'].str.contains(r'NOS|PCS', na=False)
                    df.loc[mask, 'Unit'] = df.loc[mask, 'Delivery Date']
                    df.loc[mask, 'Delivery Date'] = None

                all_data.append(df)

    # Combine all extracted data into one DataFrame
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame()

# Streamlit App
st.title("Enhanced PO Extraction Tool")

uploaded_file = st.file_uploader("Upload PO PDF", type=["pdf"])

if uploaded_file:
    try:
        # Extract and clean tables from the uploaded PDF
        extracted_data = extract_clean_tables(uploaded_file)

        if not extracted_data.empty:
            st.success("Data extracted successfully!")
            st.dataframe(extracted_data)

            # Create Excel file in memory
            excel_buffer = BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                extracted_data.to_excel(writer, index=False, sheet_name="PO Data")
            excel_buffer.seek(0)

            # Provide download options
            st.download_button(
                label="Download as Excel",
                data=excel_buffer,
                file_name="po_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
        else:
            st.warning("No valid tables found in the uploaded PDF.")
    except Exception as e:
        st.error(f"An error occurred: {e}")