Spaces:

Omarrran
/

PDF_AR_Parser

Sleeping

App Files Files Community

Omarrran commited on Nov 26, 2025

Commit

a373348

verified ·

1 Parent(s): 0e96fb3

Create app.py

Browse files

Files changed (1) hide show

app.py +801 -0

app.py ADDED Viewed

	@@ -0,0 +1,801 @@

+"""
+PDF Table Extractor & AR Aging Analyzer
+A comprehensive tool for extracting tables from PDFs and performing AR aging analysis.
+Built for Hugging Face Spaces with Gradio interface.
+"""
+import gradio as gr
+import pandas as pd
+import numpy as np
+import pdfplumber
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import tempfile
+import os
+from typing import Tuple, List, Optional, Dict, Any
+import io
+# ============================================================================
+# CORE PDF EXTRACTION FUNCTIONS
+# ============================================================================
+def extract_text_from_pdf(pdf_path: str) -> str:
+    """Extract all text from a PDF file."""
+    text_content = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for i, page in enumerate(pdf.pages):
+            page_text = page.extract_text()
+            if page_text:
+                text_content.append(f"--- Page {i + 1} ---\n{page_text}")
+    return "\n\n".join(text_content)
+def extract_tables_from_pdf(pdf_path: str) -> List[pd.DataFrame]:
+    """Extract all tables from a PDF file."""
+    tables = []
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages):
+            page_tables = page.extract_tables()
+            for table_idx, table in enumerate(page_tables):
+                if table and len(table) > 1:
+                    # Clean up the table
+                    cleaned_table = [row for row in table if any(cell for cell in row)]
+                    if cleaned_table:
+                        df = pd.DataFrame(cleaned_table[1:], columns=cleaned_table[0])
+                        df.attrs['source'] = f"Page {page_num + 1}, Table {table_idx + 1}"
+                        tables.append(df)
+    return tables
+def extract_tables_with_settings(
+    pdf_path: str,
+    vertical_strategy: str = "text",
+    horizontal_strategy: str = "text",
+    snap_tolerance: int = 3,
+    join_tolerance: int = 3
+) -> List[pd.DataFrame]:
+    """Extract tables with custom pdfplumber settings."""
+    tables = []
+    table_settings = {
+        "vertical_strategy": vertical_strategy,
+        "horizontal_strategy": horizontal_strategy,
+        "snap_tolerance": snap_tolerance,
+        "join_tolerance": join_tolerance,
+    }
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages):
+            try:
+                table = page.extract_table(table_settings=table_settings)
+                if table and len(table) > 1:
+                    cleaned_table = [row for row in table if any(cell for cell in row if cell)]
+                    if cleaned_table:
+                        df = pd.DataFrame(cleaned_table[1:], columns=cleaned_table[0])
+                        df.attrs['source'] = f"Page {page_num + 1}"
+                        tables.append(df)
+            except Exception as e:
+                continue
+    return tables
+def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]:
+    """Extract metadata from a PDF file."""
+    with pdfplumber.open(pdf_path) as pdf:
+        metadata = {
+            "Number of Pages": len(pdf.pages),
+            "PDF Metadata": pdf.metadata if pdf.metadata else "No metadata available"
+        }
+        # Get page dimensions
+        if pdf.pages:
+            first_page = pdf.pages[0]
+            metadata["Page Width"] = first_page.width
+            metadata["Page Height"] = first_page.height
+    return metadata
+# ============================================================================
+# AR AGING SPECIFIC FUNCTIONS
+# ============================================================================
+def convert_to_float(num: str) -> float:
+    """Convert string number to float, handling commas and errors."""
+    try:
+        if num is None or str(num).strip() == '':
+            return 0.0
+        return float(str(num).replace(',', '').replace('$', '').strip())
+    except (ValueError, AttributeError):
+        return 0.0
+def process_ar_aging(df: pd.DataFrame, name_column: str, amount_columns: List[str]) -> Tuple[pd.DataFrame, Dict]:
+    """Process a dataframe as an AR aging report."""
+    result_df = df.copy()
+    # Convert amount columns to float
+    for col in amount_columns:
+        if col in result_df.columns:
+            result_df[col] = result_df[col].apply(convert_to_float)
+    # Forward fill name column to handle grouped rows
+    if name_column in result_df.columns:
+        result_df[name_column] = result_df[name_column].replace('', np.nan).ffill()
+    # Create pivot table
+    pivot = result_df.pivot_table(
+        index=name_column,
+        values=amount_columns,
+        aggfunc='sum'
+    )
+    # Reorder columns if they exist
+    ordered_cols = [col for col in amount_columns if col in pivot.columns]
+    pivot = pivot[ordered_cols]
+    # Add total column
+    pivot['Total'] = pivot.sum(axis=1)
+    # Add totals row
+    pivot.loc['TOTAL'] = pivot.sum()
+    # Calculate percentages
+    total_amount = pivot.loc['TOTAL', 'Total']
+    if total_amount > 0:
+        perc_row = (pivot.loc['TOTAL'] / total_amount * 100).round(2)
+        pivot.loc['PERCENTAGE'] = perc_row
+    # Prepare summary statistics
+    summary = {
+        "Total AR Amount": f"${total_amount:,.2f}",
+        "Number of Customers": len(pivot) - 2,  # Exclude TOTAL and PERCENTAGE rows
+        "Largest Balance": f"${pivot['Total'][:-2].max():,.2f}" if len(pivot) > 2 else "N/A",
+        "Average Balance": f"${pivot['Total'][:-2].mean():,.2f}" if len(pivot) > 2 else "N/A",
+    }
+    return pivot, summary
+def create_aging_charts(pivot_df: pd.DataFrame) -> Tuple[go.Figure, go.Figure, go.Figure]:
+    """Create visualization charts for AR aging analysis."""
+    # Remove TOTAL and PERCENTAGE rows for customer charts
+    customer_data = pivot_df.iloc[:-2].copy() if len(pivot_df) > 2 else pivot_df.copy()
+    # Chart 1: Aging Distribution Pie Chart
+    if 'TOTAL' in pivot_df.index:
+        totals = pivot_df.loc['TOTAL'].drop('Total', errors='ignore')
+        fig_pie = px.pie(
+            values=totals.values,
+            names=totals.index,
+            title="AR Aging Distribution",
+            hole=0.4,
+            color_discrete_sequence=px.colors.qualitative.Set2
+        )
+        fig_pie.update_traces(textposition='inside', textinfo='percent+label')
+    else:
+        fig_pie = go.Figure()
+        fig_pie.add_annotation(text="No data available", showarrow=False)
+    # Chart 2: Customer Balance Bar Chart (Top 10)
+    if len(customer_data) > 0:
+        top_customers = customer_data.nlargest(10, 'Total')
+        fig_bar = px.bar(
+            top_customers.reset_index(),
+            x=top_customers.index.name or 'Customer',
+            y='Total',
+            title="Top 10 Customer Balances",
+            color='Total',
+            color_continuous_scale='Reds'
+        )
+        fig_bar.update_layout(xaxis_tickangle=-45)
+    else:
+        fig_bar = go.Figure()
+        fig_bar.add_annotation(text="No data available", showarrow=False)
+    # Chart 3: Stacked Bar Chart by Aging Bucket
+    if len(customer_data) > 0:
+        aging_cols = [col for col in customer_data.columns if col != 'Total']
+        top_customers = customer_data.nlargest(10, 'Total')
+        fig_stacked = go.Figure()
+        colors = ['#2ecc71', '#3498db', '#f1c40f', '#e67e22', '#e74c3c']
+        for i, col in enumerate(aging_cols):
+            if col in top_customers.columns:
+                fig_stacked.add_trace(go.Bar(
+                    name=col,
+                    x=top_customers.index,
+                    y=top_customers[col],
+                    marker_color=colors[i % len(colors)]
+                ))
+        fig_stacked.update_layout(
+            barmode='stack',
+            title="AR Aging by Customer (Top 10)",
+            xaxis_tickangle=-45,
+            legend_title="Aging Bucket"
+        )
+    else:
+        fig_stacked = go.Figure()
+        fig_stacked.add_annotation(text="No data available", showarrow=False)
+    return fig_pie, fig_bar, fig_stacked
+# ============================================================================
+# GRADIO INTERFACE FUNCTIONS
+# ============================================================================
+def process_pdf_basic(pdf_file) -> Tuple[str, str, pd.DataFrame, str]:
+    """Basic PDF processing - extract text, metadata, and first table."""
+    if pdf_file is None:
+        return "No file uploaded", "", pd.DataFrame(), ""
+    try:
+        # Extract metadata
+        metadata = get_pdf_metadata(pdf_file.name)
+        metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()])
+        # Extract text
+        text = extract_text_from_pdf(pdf_file.name)
+        # Extract tables
+        tables = extract_tables_from_pdf(pdf_file.name)
+        if tables:
+            first_table = tables[0]
+            table_info = f"Found {len(tables)} table(s). Showing first table from {first_table.attrs.get('source', 'unknown')}."
+        else:
+            first_table = pd.DataFrame()
+            table_info = "No tables found in the PDF."
+        return metadata_str, text[:5000] + "..." if len(text) > 5000 else text, first_table, table_info
+    except Exception as e:
+        return f"Error: {str(e)}", "", pd.DataFrame(), ""
+def process_pdf_advanced(
+    pdf_file,
+    v_strategy: str,
+    h_strategy: str,
+    snap_tol: int,
+    join_tol: int,
+    page_num: int
+) -> Tuple[pd.DataFrame, str, str]:
+    """Advanced PDF table extraction with custom settings."""
+    if pdf_file is None:
+        return pd.DataFrame(), "No file uploaded", ""
+    try:
+        tables = extract_tables_with_settings(
+            pdf_file.name,
+            vertical_strategy=v_strategy,
+            horizontal_strategy=h_strategy,
+            snap_tolerance=snap_tol,
+            join_tolerance=join_tol
+        )
+        if not tables:
+            return pd.DataFrame(), "No tables found with current settings.", ""
+        # Get the requested page's table
+        idx = min(page_num - 1, len(tables) - 1)
+        table = tables[idx]
+        info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
+        columns = ", ".join(table.columns.tolist())
+        return table, info, f"Columns: {columns}"
+    except Exception as e:
+        return pd.DataFrame(), f"Error: {str(e)}", ""
+def process_ar_aging_report(
+    pdf_file,
+    name_col: str,
+    amount_cols: str
+) -> Tuple[pd.DataFrame, str, go.Figure, go.Figure, go.Figure, str]:
+    """Process PDF as AR Aging report with analysis."""
+    if pdf_file is None:
+        empty_fig = go.Figure()
+        return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, "No file uploaded"
+    try:
+        # Extract tables
+        tables = extract_tables_from_pdf(pdf_file.name)
+        if not tables:
+            # Try with text strategy
+            tables = extract_tables_with_settings(
+                pdf_file.name,
+                vertical_strategy="text",
+                horizontal_strategy="text"
+            )
+        if not tables:
+            empty_fig = go.Figure()
+            return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, "No tables found in PDF"
+        # Use the largest table
+        df = max(tables, key=len)
+        # Parse amount columns
+        amount_col_list = [col.strip() for col in amount_cols.split(",")]
+        # Find matching columns (flexible matching)
+        matched_cols = []
+        for col in amount_col_list:
+            for df_col in df.columns:
+                if col.lower() in str(df_col).lower():
+                    matched_cols.append(df_col)
+                    break
+        if not matched_cols:
+            matched_cols = [col for col in df.columns if any(
+                kw in str(col).lower() for kw in ['current', 'amount', '30', '60', '90', 'invoiced', 'balance']
+            )]
+        # Find name column
+        name_column = None
+        for df_col in df.columns:
+            if name_col.lower() in str(df_col).lower():
+                name_column = df_col
+                break
+        if not name_column:
+            name_column = df.columns[0]
+        if not matched_cols:
+            matched_cols = list(df.columns[1:6])  # Use first 5 numeric-looking columns
+        # Process the data
+        pivot, summary = process_ar_aging(df, name_column, matched_cols)
+        # Create charts
+        fig_pie, fig_bar, fig_stacked = create_aging_charts(pivot)
+        # Format summary
+        summary_str = "\n".join([f"**{k}:** {v}" for k, v in summary.items()])
+        return pivot.reset_index(), summary_str, fig_pie, fig_bar, fig_stacked, f"Processed with columns: {', '.join(matched_cols)}"
+    except Exception as e:
+        import traceback
+        empty_fig = go.Figure()
+        return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, f"Error: {str(e)}\n{traceback.format_exc()}"
+def export_to_csv(df: pd.DataFrame) -> str:
+    """Export dataframe to CSV file."""
+    if df is None or df.empty:
+        return None
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', mode='w')
+    df.to_csv(temp_file.name, index=True)
+    return temp_file.name
+def export_to_excel(df: pd.DataFrame) -> str:
+    """Export dataframe to Excel file."""
+    if df is None or df.empty:
+        return None
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
+    df.to_excel(temp_file.name, index=True, engine='openpyxl')
+    return temp_file.name
+# ============================================================================
+# GRADIO UI
+# ============================================================================
+# Create the Gradio interface
+with gr.Blocks() as demo:
+    # Header
+    gr.HTML("""
+        <div class="main-header">
+            <h1>📄 PDF Table Extractor & AR Aging Analyzer</h1>
+            <p>Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel</p>
+        </div>
+    """)
+    with gr.Tabs() as tabs:
+        # ================================================================
+        # TAB 1: Basic Extraction
+        # ================================================================
+        with gr.TabItem("📋 Basic Extraction", id=1):
+            gr.Markdown("""
+            ### Quick PDF Analysis
+            Upload a PDF to extract text, metadata, and tables automatically.
+            """)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    basic_pdf_input = gr.File(
+                        label="Upload PDF",
+                        file_types=[".pdf"],
+                        type="filepath"
+                    )
+                    basic_extract_btn = gr.Button("🔍 Extract Content", variant="primary", size="lg")
+                with gr.Column(scale=2):
+                    basic_metadata = gr.Markdown(label="PDF Metadata")
+            with gr.Row():
+                with gr.Column():
+                    basic_text = gr.Textbox(
+                        label="Extracted Text",
+                        lines=10,
+                        max_lines=20
+                    )
+                with gr.Column():
+                    basic_table_info = gr.Textbox(label="Table Info")
+                    basic_table = gr.Dataframe(
+                        label="Extracted Table",
+                        wrap=True,
+                        height=400
+                    )
+            with gr.Row():
+                basic_csv_btn = gr.Button("📥 Export to CSV")
+                basic_excel_btn = gr.Button("📥 Export to Excel")
+                basic_csv_output = gr.File(label="CSV Download")
+                basic_excel_output = gr.File(label="Excel Download")
+            # Event handlers
+            basic_extract_btn.click(
+                fn=process_pdf_basic,
+                inputs=[basic_pdf_input],
+                outputs=[basic_metadata, basic_text, basic_table, basic_table_info]
+            )
+            basic_csv_btn.click(
+                fn=export_to_csv,
+                inputs=[basic_table],
+                outputs=[basic_csv_output]
+            )
+            basic_excel_btn.click(
+                fn=export_to_excel,
+                inputs=[basic_table],
+                outputs=[basic_excel_output]
+            )
+        # ================================================================
+        # TAB 2: Advanced Extraction
+        # ================================================================
+        with gr.TabItem("⚙️ Advanced Extraction", id=2):
+            gr.Markdown("""
+            ### Advanced Table Extraction Settings
+            Fine-tune the extraction parameters for complex PDFs.
+            """)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    adv_pdf_input = gr.File(
+                        label="Upload PDF",
+                        file_types=[".pdf"],
+                        type="filepath"
+                    )
+                    gr.Markdown("**Extraction Settings**")
+                    adv_v_strategy = gr.Dropdown(
+                        choices=["text", "lines", "lines_strict", "explicit"],
+                        value="text",
+                        label="Vertical Strategy",
+                        info="How to identify column boundaries"
+                    )
+                    adv_h_strategy = gr.Dropdown(
+                        choices=["text", "lines", "lines_strict", "explicit"],
+                        value="text",
+                        label="Horizontal Strategy",
+                        info="How to identify row boundaries"
+                    )
+                    adv_snap_tol = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=3,
+                        step=1,
+                        label="Snap Tolerance",
+                        info="Tolerance for snapping to lines"
+                    )
+                    adv_join_tol = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=3,
+                        step=1,
+                        label="Join Tolerance",
+                        info="Tolerance for joining segments"
+                    )
+                    adv_page_num = gr.Number(
+                        value=1,
+                        minimum=1,
+                        label="Table Number",
+                        info="Which table to display"
+                    )
+                    adv_extract_btn = gr.Button("🔧 Extract with Settings", variant="primary")
+                with gr.Column(scale=2):
+                    adv_info = gr.Textbox(label="Extraction Info")
+                    adv_columns = gr.Textbox(label="Detected Columns")
+                    adv_table = gr.Dataframe(
+                        label="Extracted Table",
+                        wrap=True,
+                        height=500
+                    )
+            with gr.Row():
+                adv_csv_btn = gr.Button("📥 Export to CSV")
+                adv_excel_btn = gr.Button("📥 Export to Excel")
+                adv_csv_output = gr.File(label="CSV Download")
+                adv_excel_output = gr.File(label="Excel Download")
+            # Event handlers
+            adv_extract_btn.click(
+                fn=process_pdf_advanced,
+                inputs=[adv_pdf_input, adv_v_strategy, adv_h_strategy, adv_snap_tol, adv_join_tol, adv_page_num],
+                outputs=[adv_table, adv_info, adv_columns]
+            )
+            adv_csv_btn.click(
+                fn=export_to_csv,
+                inputs=[adv_table],
+                outputs=[adv_csv_output]
+            )
+            adv_excel_btn.click(
+                fn=export_to_excel,
+                inputs=[adv_table],
+                outputs=[adv_excel_output]
+            )
+        # ================================================================
+        # TAB 3: AR Aging Analysis
+        # ================================================================
+        with gr.TabItem("💰 AR Aging Analysis", id=3):
+            gr.Markdown("""
+            ### Accounts Receivable Aging Analysis
+            Upload an AR aging PDF report to extract, analyze, and visualize the data.
+            **Common AR Aging Column Names:**
+            - Customer/Name column: `Name`, `Customer`, `Company`, `Account`
+            - Amount columns: `Current`, `1-30`, `31-60`, `61-90`, `Over 90`, `Not Invoiced`
+            """)
+            with gr.Row():
+                with gr.Column(scale=1):
+                    ar_pdf_input = gr.File(
+                        label="Upload AR Aging PDF",
+                        file_types=[".pdf"],
+                        type="filepath"
+                    )
+                    ar_name_col = gr.Textbox(
+                        value="Name",
+                        label="Customer/Name Column",
+                        info="Part of the column name that identifies customers"
+                    )
+                    ar_amount_cols = gr.Textbox(
+                        value="Not Invoiced, Current, 31-60, 61-90, Over 90",
+                        label="Amount Columns (comma-separated)",
+                        info="Column names for aging buckets"
+                    )
+                    ar_analyze_btn = gr.Button("📊 Analyze AR Aging", variant="primary", size="lg")
+                with gr.Column(scale=2):
+                    ar_summary = gr.Markdown(label="Summary Statistics")
+                    ar_status = gr.Textbox(label="Processing Status")
+            with gr.Row():
+                ar_table = gr.Dataframe(
+                    label="AR Aging Summary by Customer",
+                    wrap=True,
+                    height=400
+                )
+            gr.Markdown("### 📈 Visualizations")
+            with gr.Row():
+                ar_pie_chart = gr.Plot(label="Aging Distribution")
+                ar_bar_chart = gr.Plot(label="Top Customer Balances")
+            with gr.Row():
+                ar_stacked_chart = gr.Plot(label="Aging by Customer")
+            with gr.Row():
+                ar_csv_btn = gr.Button("📥 Export to CSV")
+                ar_excel_btn = gr.Button("📥 Export to Excel")
+                ar_csv_output = gr.File(label="CSV Download")
+                ar_excel_output = gr.File(label="Excel Download")
+            # Event handlers
+            ar_analyze_btn.click(
+                fn=process_ar_aging_report,
+                inputs=[ar_pdf_input, ar_name_col, ar_amount_cols],
+                outputs=[ar_table, ar_summary, ar_pie_chart, ar_bar_chart, ar_stacked_chart, ar_status]
+            )
+            ar_csv_btn.click(
+                fn=export_to_csv,
+                inputs=[ar_table],
+                outputs=[ar_csv_output]
+            )
+            ar_excel_btn.click(
+                fn=export_to_excel,
+                inputs=[ar_table],
+                outputs=[ar_excel_output]
+            )
+        # ================================================================
+        # TAB 4: Batch Processing
+        # ================================================================
+        with gr.TabItem("📁 Batch Processing", id=4):
+            gr.Markdown("""
+            ### Process Multiple PDFs
+            Upload multiple PDF files to extract tables from all of them at once.
+            """)
+            batch_pdf_input = gr.File(
+                label="Upload Multiple PDFs",
+                file_types=[".pdf"],
+                file_count="multiple",
+                type="filepath"
+            )
+            batch_process_btn = gr.Button("🔄 Process All PDFs", variant="primary")
+            batch_results = gr.Textbox(
+                label="Processing Results",
+                lines=10
+            )
+            batch_combined_table = gr.Dataframe(
+                label="Combined Data (All Tables)",
+                wrap=True,
+                height=400
+            )
+            with gr.Row():
+                batch_csv_btn = gr.Button("📥 Export Combined to CSV")
+                batch_csv_output = gr.File(label="CSV Download")
+            def process_batch(files):
+                if not files:
+                    return "No files uploaded", pd.DataFrame()
+                results = []
+                all_tables = []
+                for file in files:
+                    try:
+                        tables = extract_tables_from_pdf(file.name)
+                        results.append(f"✅ {os.path.basename(file.name)}: Found {len(tables)} table(s)")
+                        for table in tables:
+                            table['Source_File'] = os.path.basename(file.name)
+                            all_tables.append(table)
+                    except Exception as e:
+                        results.append(f"❌ {os.path.basename(file.name)}: Error - {str(e)}")
+                if all_tables:
+                    # Try to combine tables with same structure
+                    try:
+                        combined = pd.concat(all_tables, ignore_index=True)
+                    except:
+                        combined = all_tables[0] if all_tables else pd.DataFrame()
+                else:
+                    combined = pd.DataFrame()
+                return "\n".join(results), combined
+            batch_process_btn.click(
+                fn=process_batch,
+                inputs=[batch_pdf_input],
+                outputs=[batch_results, batch_combined_table]
+            )
+            batch_csv_btn.click(
+                fn=export_to_csv,
+                inputs=[batch_combined_table],
+                outputs=[batch_csv_output]
+            )
+        # ================================================================
+        # TAB 5: Help & Documentation
+        # ================================================================
+        with gr.TabItem("❓ Help", id=5):
+            gr.Markdown("""
+            ## 📚 Documentation & Tips
+            ### Overview
+            This application extracts tabular data from PDF files and provides specialized
+            analysis for Accounts Receivable (AR) Aging reports.
+            ---
+            ### 🔧 Extraction Strategies
+            | Strategy | Description | Best For |
+            |----------|-------------|----------|
+            | `text` | Uses text positions to identify boundaries | Most PDFs, especially text-based tables |
+            | `lines` | Uses drawn lines to identify boundaries | PDFs with visible grid lines |
+            | `lines_strict` | Strictly follows drawn lines | Clean, well-formatted tables |
+            | `explicit` | Requires explicit boundary definitions | Complex layouts |
+            ---
+            ### 💡 Tips for Best Results
+            1. **Start with Basic Extraction** - Try the basic tab first to see what's detected
+            2. **Adjust Strategies** - If tables aren't detected correctly:
+               - Try `lines` strategy if your PDF has visible gridlines
+               - Increase tolerance values for loosely formatted tables
+            3. **AR Aging Reports** - For best results:
+               - Ensure column names match your PDF headers
+               - Use partial matches (e.g., "Name" will match "Customer Name")
+            4. **Large PDFs** - Processing may take longer for multi-page documents
+            ---
+            ### 📋 Supported Formats
+            - **Input:** PDF files (.pdf)
+            - **Output:** CSV, Excel (.xlsx)
+            ---
+            ### 🔗 Technology Stack
+            - **pdfplumber** - PDF parsing and table extraction
+            - **pandas** - Data manipulation and analysis
+            - **plotly** - Interactive visualizations
+            - **gradio** - Web interface
+            ---
+            ### ⚠️ Limitations
+            - Scanned PDFs (images) are not supported - use OCR tools first
+            - Very complex table layouts may require manual adjustment
+            - Password-protected PDFs are not supported
+            ---
+            ### 📧 Feedback
+            If you encounter issues or have suggestions, please provide feedback!
+            """)
+    # Footer
+    gr.HTML("""
+        <div style="text-align: center; margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 8px;">
+            <p style="color: #666; margin: 0;">
+                Built with ❤️ using Gradio & pdfplumber |
+                <a href="https://github.com/jsvine/pdfplumber" target="_blank">pdfplumber docs</a>
+            </p>
+        </div>
+    """)
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()