import streamlit as st import pdfplumber import pandas as pd from io import BytesIO def is_header_row(row): """Check if a row contains header-like information to skip.""" keywords = ['Purchase Order', 'GSTIN', 'Terms', 'Annexure', 'Currency'] return any(keyword in str(row).strip() for keyword in keywords) def extract_clean_tables(pdf_file): """Extracts tables while skipping headers and aligning columns correctly.""" all_data = [] # Store all extracted data across pages with pdfplumber.open(pdf_file) as pdf: for page in pdf.pages: tables = page.extract_tables() for table in tables: # Convert the table to DataFrame for easier manipulation df = pd.DataFrame(table[1:], columns=table[0]) # Clean up rows with header-like content df = df[~df.apply(is_header_row, axis=1)] # Fix column misalignment (if units are shifted to delivery date column) if 'Delivery Date' in df.columns and 'Unit' in df.columns: mask = df['Delivery Date'].str.contains(r'NOS|PCS', na=False) df.loc[mask, 'Unit'] = df.loc[mask, 'Delivery Date'] df.loc[mask, 'Delivery Date'] = None all_data.append(df) # Combine all extracted data into one DataFrame if all_data: combined_df = pd.concat(all_data, ignore_index=True) return combined_df else: return pd.DataFrame() # Streamlit App st.title("Enhanced PO Extraction Tool") uploaded_file = st.file_uploader("Upload PO PDF", type=["pdf"]) if uploaded_file: try: # Extract and clean tables from the uploaded PDF extracted_data = extract_clean_tables(uploaded_file) if not extracted_data.empty: st.success("Data extracted successfully!") st.dataframe(extracted_data) # Create Excel file in memory excel_buffer = BytesIO() with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer: extracted_data.to_excel(writer, index=False, sheet_name="PO Data") excel_buffer.seek(0) # Provide download options st.download_button( label="Download as Excel", data=excel_buffer, file_name="po_data.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ) else: st.warning("No valid tables found in the uploaded PDF.") except Exception as e: st.error(f"An error occurred: {e}")