Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import pdfplumber | |
| import pandas as pd | |
| from io import BytesIO | |
| def is_header_row(row): | |
| """Check if a row contains header-like information to skip.""" | |
| keywords = ['Purchase Order', 'GSTIN', 'Terms', 'Annexure', 'Currency'] | |
| return any(keyword in str(row).strip() for keyword in keywords) | |
| def extract_clean_tables(pdf_file): | |
| """Extracts tables while skipping headers and aligning columns correctly.""" | |
| all_data = [] # Store all extracted data across pages | |
| with pdfplumber.open(pdf_file) as pdf: | |
| for page in pdf.pages: | |
| tables = page.extract_tables() | |
| for table in tables: | |
| # Convert the table to DataFrame for easier manipulation | |
| df = pd.DataFrame(table[1:], columns=table[0]) | |
| # Clean up rows with header-like content | |
| df = df[~df.apply(is_header_row, axis=1)] | |
| # Fix column misalignment (if units are shifted to delivery date column) | |
| if 'Delivery Date' in df.columns and 'Unit' in df.columns: | |
| mask = df['Delivery Date'].str.contains(r'NOS|PCS', na=False) | |
| df.loc[mask, 'Unit'] = df.loc[mask, 'Delivery Date'] | |
| df.loc[mask, 'Delivery Date'] = None | |
| all_data.append(df) | |
| # Combine all extracted data into one DataFrame | |
| if all_data: | |
| combined_df = pd.concat(all_data, ignore_index=True) | |
| return combined_df | |
| else: | |
| return pd.DataFrame() | |
| # Streamlit App | |
| st.title("Enhanced PO Extraction Tool") | |
| uploaded_file = st.file_uploader("Upload PO PDF", type=["pdf"]) | |
| if uploaded_file: | |
| try: | |
| # Extract and clean tables from the uploaded PDF | |
| extracted_data = extract_clean_tables(uploaded_file) | |
| if not extracted_data.empty: | |
| st.success("Data extracted successfully!") | |
| st.dataframe(extracted_data) | |
| # Create Excel file in memory | |
| excel_buffer = BytesIO() | |
| with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer: | |
| extracted_data.to_excel(writer, index=False, sheet_name="PO Data") | |
| excel_buffer.seek(0) | |
| # Provide download options | |
| st.download_button( | |
| label="Download as Excel", | |
| data=excel_buffer, | |
| file_name="po_data.xlsx", | |
| mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
| ) | |
| else: | |
| st.warning("No valid tables found in the uploaded PDF.") | |
| except Exception as e: | |
| st.error(f"An error occurred: {e}") |