# POEXTRACTION / app.py
# Author: jithenderchoudary
# Commit: "Update app.py" (5a48e80, verified)
import streamlit as st
import pdfplumber
import pandas as pd
from io import BytesIO
def is_header_row(row):
    """Return True if any cell of *row* contains a header-like keyword.

    Used to drop boilerplate rows (PO title lines, GSTIN lines, terms,
    annexure and currency notes) from extracted tables.

    Args:
        row: A pandas Series (as passed by ``DataFrame.apply(axis=1)``), a
            list/tuple of cells, a dict of column -> cell, or a single string.

    Returns:
        bool: True when at least one cell contains one of the keywords
        (case-sensitive substring match), otherwise False.
    """
    keywords = ('Purchase Order', 'GSTIN', 'Terms', 'Annexure', 'Currency')

    # Inspect the cell values one by one instead of searching str(row): the
    # string form of a Series also contains its index labels (the column
    # names), so a column called e.g. 'Currency' would flag every data row.
    if isinstance(row, str):
        cells = [row]
    elif isinstance(row, dict):
        cells = row.values()
    else:
        try:
            cells = list(row)
        except TypeError:
            # Not iterable (e.g. None or a scalar): treat it as a single cell.
            cells = [row]

    return any(
        keyword in str(cell)
        for cell in cells
        for keyword in keywords
    )
def _clean_table(table):
    """Convert one raw table (list of rows, first row = header) to a DataFrame.

    Returns None when the table has no data rows, so the caller can skip it.
    """
    # An empty table has no header to index, and a header-only table gives a
    # zero-row frame on which the row-wise filter below is not guaranteed to
    # yield a boolean mask. Neither contributes data, so skip both.
    if not table or len(table) < 2:
        return None

    header, *rows = table
    df = pd.DataFrame(rows, columns=header)

    # Drop rows that carry header-like boilerplate. The mask is coerced to
    # bool so that `~` is always a logical negation, and the result is copied
    # so the .loc assignments below write to an independent frame.
    header_mask = df.apply(is_header_row, axis=1).astype(bool)
    df = df[~header_mask].copy()

    # Fix column misalignment: when a unit (NOS/PCS) ended up in the
    # 'Delivery Date' column, move it to 'Unit' and blank the date.
    if 'Delivery Date' in df.columns and 'Unit' in df.columns:
        shifted = df['Delivery Date'].str.contains(r'NOS|PCS', na=False)
        df.loc[shifted, 'Unit'] = df.loc[shifted, 'Delivery Date']
        df.loc[shifted, 'Delivery Date'] = None

    return df


def extract_clean_tables(pdf_file):
    """Extract every table from a PDF and return them as one cleaned DataFrame.

    Each table's first row is used as its column header. Rows flagged by
    ``is_header_row`` are removed, and unit values that landed in the
    'Delivery Date' column are moved to 'Unit'. Tables without data rows
    are skipped.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts (the app passes the
            uploaded file object).

    Returns:
        pandas.DataFrame: All cleaned tables concatenated with a fresh
        integer index, or an empty DataFrame when no table had data rows.
    """
    cleaned_tables = []  # one cleaned DataFrame per table, across all pages
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                cleaned = _clean_table(table)
                if cleaned is not None:
                    cleaned_tables.append(cleaned)

    if not cleaned_tables:
        return pd.DataFrame()
    return pd.concat(cleaned_tables, ignore_index=True)
# ---------------------------------------------------------------------------
# Streamlit App: upload a PO PDF, show the extracted table, offer an Excel
# download of the same data.
# ---------------------------------------------------------------------------
st.title("Enhanced PO Extraction Tool")
uploaded_file = st.file_uploader("Upload PO PDF", type=["pdf"])

if uploaded_file:
    try:
        # Pull the cleaned tables out of the uploaded PDF.
        extracted_data = extract_clean_tables(uploaded_file)

        if extracted_data.empty:
            st.warning("No valid tables found in the uploaded PDF.")
        else:
            st.success("Data extracted successfully!")
            st.dataframe(extracted_data)

            # Build the workbook entirely in memory.
            excel_buffer = BytesIO()
            with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
                extracted_data.to_excel(
                    writer,
                    sheet_name="PO Data",
                    index=False,
                )
            # Rewind so the download starts at the beginning of the workbook.
            excel_buffer.seek(0)

            st.download_button(
                label="Download as Excel",
                data=excel_buffer,
                file_name="po_data.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing.
        st.error(f"An error occurred: {e}")