Spaces:
Running
Running
| """ | |
| PDF Table Extractor & AR Aging Analyzer | |
| A comprehensive tool for extracting tables from PDFs and performing AR aging analysis. | |
| Built for Hugging Face Spaces with Gradio interface. | |
| """ | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import pdfplumber | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import tempfile | |
| import os | |
| from typing import Tuple, List, Optional, Dict, Any | |
| import io | |
| # ============================================================================ | |
| # CORE PDF EXTRACTION FUNCTIONS | |
| # ============================================================================ | |
| def extract_text_from_pdf(pdf_path: str) -> str: | |
| """Extract all text from a PDF file.""" | |
| text_content = [] | |
| with pdfplumber.open(pdf_path) as pdf: | |
| for i, page in enumerate(pdf.pages): | |
| page_text = page.extract_text() | |
| if page_text: | |
| text_content.append(f"--- Page {i + 1} ---\n{page_text}") | |
| return "\n\n".join(text_content) | |
| def extract_tables_from_pdf(pdf_path: str) -> List[pd.DataFrame]: | |
| """Extract all tables from a PDF file.""" | |
| tables = [] | |
| with pdfplumber.open(pdf_path) as pdf: | |
| for page_num, page in enumerate(pdf.pages): | |
| page_tables = page.extract_tables() | |
| for table_idx, table in enumerate(page_tables): | |
| if table and len(table) > 1: | |
| # Clean up the table | |
| cleaned_table = [row for row in table if any(cell for cell in row)] | |
| if cleaned_table: | |
| df = pd.DataFrame(cleaned_table[1:], columns=cleaned_table[0]) | |
| df.attrs['source'] = f"Page {page_num + 1}, Table {table_idx + 1}" | |
| tables.append(df) | |
| return tables | |
| def extract_tables_with_settings( | |
| pdf_path: str, | |
| vertical_strategy: str = "text", | |
| horizontal_strategy: str = "text", | |
| snap_tolerance: int = 3, | |
| join_tolerance: int = 3 | |
| ) -> List[pd.DataFrame]: | |
| """Extract tables with custom pdfplumber settings.""" | |
| tables = [] | |
| table_settings = { | |
| "vertical_strategy": vertical_strategy, | |
| "horizontal_strategy": horizontal_strategy, | |
| "snap_tolerance": snap_tolerance, | |
| "join_tolerance": join_tolerance, | |
| } | |
| with pdfplumber.open(pdf_path) as pdf: | |
| for page_num, page in enumerate(pdf.pages): | |
| try: | |
| table = page.extract_table(table_settings=table_settings) | |
| if table and len(table) > 1: | |
| cleaned_table = [row for row in table if any(cell for cell in row if cell)] | |
| if cleaned_table: | |
| df = pd.DataFrame(cleaned_table[1:], columns=cleaned_table[0]) | |
| df.attrs['source'] = f"Page {page_num + 1}" | |
| tables.append(df) | |
| except Exception as e: | |
| continue | |
| return tables | |
| def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]: | |
| """Extract metadata from a PDF file.""" | |
| with pdfplumber.open(pdf_path) as pdf: | |
| metadata = { | |
| "Number of Pages": len(pdf.pages), | |
| "PDF Metadata": pdf.metadata if pdf.metadata else "No metadata available" | |
| } | |
| # Get page dimensions | |
| if pdf.pages: | |
| first_page = pdf.pages[0] | |
| metadata["Page Width"] = first_page.width | |
| metadata["Page Height"] = first_page.height | |
| return metadata | |
| # ============================================================================ | |
| # AR AGING SPECIFIC FUNCTIONS | |
| # ============================================================================ | |
| def convert_to_float(num: str) -> float: | |
| """Convert string number to float, handling commas and errors.""" | |
| try: | |
| if num is None or str(num).strip() == '': | |
| return 0.0 | |
| return float(str(num).replace(',', '').replace('$', '').strip()) | |
| except (ValueError, AttributeError): | |
| return 0.0 | |
| def process_ar_aging(df: pd.DataFrame, name_column: str, amount_columns: List[str]) -> Tuple[pd.DataFrame, Dict]: | |
| """Process a dataframe as an AR aging report.""" | |
| result_df = df.copy() | |
| # Convert amount columns to float | |
| for col in amount_columns: | |
| if col in result_df.columns: | |
| result_df[col] = result_df[col].apply(convert_to_float) | |
| # Forward fill name column to handle grouped rows | |
| if name_column in result_df.columns: | |
| result_df[name_column] = result_df[name_column].replace('', np.nan).ffill() | |
| # Create pivot table | |
| pivot = result_df.pivot_table( | |
| index=name_column, | |
| values=amount_columns, | |
| aggfunc='sum' | |
| ) | |
| # Reorder columns if they exist | |
| ordered_cols = [col for col in amount_columns if col in pivot.columns] | |
| pivot = pivot[ordered_cols] | |
| # Add total column | |
| pivot['Total'] = pivot.sum(axis=1) | |
| # Add totals row | |
| pivot.loc['TOTAL'] = pivot.sum() | |
| # Calculate percentages | |
| total_amount = pivot.loc['TOTAL', 'Total'] | |
| if total_amount > 0: | |
| perc_row = (pivot.loc['TOTAL'] / total_amount * 100).round(2) | |
| pivot.loc['PERCENTAGE'] = perc_row | |
| # Prepare summary statistics | |
| summary = { | |
| "Total AR Amount": f"${total_amount:,.2f}", | |
| "Number of Customers": len(pivot) - 2, # Exclude TOTAL and PERCENTAGE rows | |
| "Largest Balance": f"${pivot['Total'][:-2].max():,.2f}" if len(pivot) > 2 else "N/A", | |
| "Average Balance": f"${pivot['Total'][:-2].mean():,.2f}" if len(pivot) > 2 else "N/A", | |
| } | |
| return pivot, summary | |
| def create_aging_charts(pivot_df: pd.DataFrame) -> Tuple[go.Figure, go.Figure, go.Figure]: | |
| """Create visualization charts for AR aging analysis.""" | |
| # Remove TOTAL and PERCENTAGE rows for customer charts | |
| customer_data = pivot_df.iloc[:-2].copy() if len(pivot_df) > 2 else pivot_df.copy() | |
| # Chart 1: Aging Distribution Pie Chart | |
| if 'TOTAL' in pivot_df.index: | |
| totals = pivot_df.loc['TOTAL'].drop('Total', errors='ignore') | |
| fig_pie = px.pie( | |
| values=totals.values, | |
| names=totals.index, | |
| title="AR Aging Distribution", | |
| hole=0.4, | |
| color_discrete_sequence=px.colors.qualitative.Set2 | |
| ) | |
| fig_pie.update_traces(textposition='inside', textinfo='percent+label') | |
| else: | |
| fig_pie = go.Figure() | |
| fig_pie.add_annotation(text="No data available", showarrow=False) | |
| # Chart 2: Customer Balance Bar Chart (Top 10) | |
| if len(customer_data) > 0: | |
| top_customers = customer_data.nlargest(10, 'Total') | |
| fig_bar = px.bar( | |
| top_customers.reset_index(), | |
| x=top_customers.index.name or 'Customer', | |
| y='Total', | |
| title="Top 10 Customer Balances", | |
| color='Total', | |
| color_continuous_scale='Reds' | |
| ) | |
| fig_bar.update_layout(xaxis_tickangle=-45) | |
| else: | |
| fig_bar = go.Figure() | |
| fig_bar.add_annotation(text="No data available", showarrow=False) | |
| # Chart 3: Stacked Bar Chart by Aging Bucket | |
| if len(customer_data) > 0: | |
| aging_cols = [col for col in customer_data.columns if col != 'Total'] | |
| top_customers = customer_data.nlargest(10, 'Total') | |
| fig_stacked = go.Figure() | |
| colors = ['#2ecc71', '#3498db', '#f1c40f', '#e67e22', '#e74c3c'] | |
| for i, col in enumerate(aging_cols): | |
| if col in top_customers.columns: | |
| fig_stacked.add_trace(go.Bar( | |
| name=col, | |
| x=top_customers.index, | |
| y=top_customers[col], | |
| marker_color=colors[i % len(colors)] | |
| )) | |
| fig_stacked.update_layout( | |
| barmode='stack', | |
| title="AR Aging by Customer (Top 10)", | |
| xaxis_tickangle=-45, | |
| legend_title="Aging Bucket" | |
| ) | |
| else: | |
| fig_stacked = go.Figure() | |
| fig_stacked.add_annotation(text="No data available", showarrow=False) | |
| return fig_pie, fig_bar, fig_stacked | |
| # ============================================================================ | |
| # GRADIO INTERFACE FUNCTIONS | |
| # ============================================================================ | |
| def process_pdf_basic(pdf_file) -> Tuple[str, str, pd.DataFrame, str]: | |
| """Basic PDF processing - extract text, metadata, and first table.""" | |
| if pdf_file is None: | |
| return "No file uploaded", "", pd.DataFrame(), "" | |
| try: | |
| # Extract metadata | |
| metadata = get_pdf_metadata(pdf_file) | |
| metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()]) | |
| # Extract text | |
| text = extract_text_from_pdf(pdf_file) | |
| # Extract tables | |
| tables = extract_tables_from_pdf(pdf_file) | |
| if tables: | |
| first_table = tables[0] | |
| table_info = f"Found {len(tables)} table(s). Showing first table from {first_table.attrs.get('source', 'unknown')}." | |
| else: | |
| first_table = pd.DataFrame() | |
| table_info = "No tables found in the PDF." | |
| return metadata_str, text[:5000] + "..." if len(text) > 5000 else text, first_table, table_info | |
| except Exception as e: | |
| return f"Error: {str(e)}", "", pd.DataFrame(), "" | |
| def process_pdf_advanced( | |
| pdf_file, | |
| v_strategy: str, | |
| h_strategy: str, | |
| snap_tol: int, | |
| join_tol: int, | |
| page_num: int | |
| ) -> Tuple[pd.DataFrame, str, str]: | |
| """Advanced PDF table extraction with custom settings.""" | |
| if pdf_file is None: | |
| return pd.DataFrame(), "No file uploaded", "" | |
| try: | |
| tables = extract_tables_with_settings( | |
| pdf_file, | |
| vertical_strategy=v_strategy, | |
| horizontal_strategy=h_strategy, | |
| snap_tolerance=snap_tol, | |
| join_tolerance=join_tol | |
| ) | |
| if not tables: | |
| return pd.DataFrame(), "No tables found with current settings.", "" | |
| # Get the requested page's table | |
| idx = min(int(page_num) - 1, len(tables) - 1) | |
| idx = max(0, idx) | |
| table = tables[idx] | |
| info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}." | |
| columns = ", ".join(table.columns.tolist()) | |
| return table, info, f"Columns: {columns}" | |
| except Exception as e: | |
| return pd.DataFrame(), f"Error: {str(e)}", "" | |
| def process_ar_aging_report( | |
| pdf_file, | |
| name_col: str, | |
| amount_cols: str | |
| ) -> Tuple[pd.DataFrame, str, go.Figure, go.Figure, go.Figure, str]: | |
| """Process PDF as AR Aging report with analysis.""" | |
| if pdf_file is None: | |
| empty_fig = go.Figure() | |
| return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, "No file uploaded" | |
| try: | |
| # Extract tables | |
| tables = extract_tables_from_pdf(pdf_file) | |
| if not tables: | |
| # Try with text strategy | |
| tables = extract_tables_with_settings( | |
| pdf_file, | |
| vertical_strategy="text", | |
| horizontal_strategy="text" | |
| ) | |
| if not tables: | |
| empty_fig = go.Figure() | |
| return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, "No tables found in PDF" | |
| # Use the largest table | |
| df = max(tables, key=len) | |
| # Parse amount columns | |
| amount_col_list = [col.strip() for col in amount_cols.split(",")] | |
| # Find matching columns (flexible matching) | |
| matched_cols = [] | |
| for col in amount_col_list: | |
| for df_col in df.columns: | |
| if col.lower() in str(df_col).lower(): | |
| matched_cols.append(df_col) | |
| break | |
| if not matched_cols: | |
| matched_cols = [col for col in df.columns if any( | |
| kw in str(col).lower() for kw in ['current', 'amount', '30', '60', '90', 'invoiced', 'balance'] | |
| )] | |
| # Find name column | |
| name_column = None | |
| for df_col in df.columns: | |
| if name_col.lower() in str(df_col).lower(): | |
| name_column = df_col | |
| break | |
| if not name_column: | |
| name_column = df.columns[0] | |
| if not matched_cols: | |
| matched_cols = list(df.columns[1:6]) # Use first 5 numeric-looking columns | |
| # Process the data | |
| pivot, summary = process_ar_aging(df, name_column, matched_cols) | |
| # Create charts | |
| fig_pie, fig_bar, fig_stacked = create_aging_charts(pivot) | |
| # Format summary | |
| summary_str = "\n".join([f"**{k}:** {v}" for k, v in summary.items()]) | |
| return pivot.reset_index(), summary_str, fig_pie, fig_bar, fig_stacked, f"Processed with columns: {', '.join(matched_cols)}" | |
| except Exception as e: | |
| import traceback | |
| empty_fig = go.Figure() | |
| return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, f"Error: {str(e)}\n{traceback.format_exc()}" | |
| def export_to_csv(df: pd.DataFrame) -> str: | |
| """Export dataframe to CSV file.""" | |
| if df is None or df.empty: | |
| return None | |
| temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv', mode='w') | |
| df.to_csv(temp_file.name, index=True) | |
| return temp_file.name | |
| def export_to_excel(df: pd.DataFrame) -> str: | |
| """Export dataframe to Excel file.""" | |
| if df is None or df.empty: | |
| return None | |
| temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') | |
| df.to_excel(temp_file.name, index=True, engine='openpyxl') | |
| return temp_file.name | |
| def process_batch(files): | |
| """Process multiple PDF files.""" | |
| if not files: | |
| return "No files uploaded", pd.DataFrame() | |
| results = [] | |
| all_tables = [] | |
| for file in files: | |
| try: | |
| tables = extract_tables_from_pdf(file) | |
| results.append(f"β {os.path.basename(file)}: Found {len(tables)} table(s)") | |
| for table in tables: | |
| table['Source_File'] = os.path.basename(file) | |
| all_tables.append(table) | |
| except Exception as e: | |
| results.append(f"β {os.path.basename(file)}: Error - {str(e)}") | |
| if all_tables: | |
| try: | |
| combined = pd.concat(all_tables, ignore_index=True) | |
| except: | |
| combined = all_tables[0] if all_tables else pd.DataFrame() | |
| else: | |
| combined = pd.DataFrame() | |
| return "\n".join(results), combined | |
| # ============================================================================ | |
| # GRADIO UI | |
| # ============================================================================ | |
| # Create the Gradio interface | |
| with gr.Blocks() as demo: | |
| # Header | |
| gr.Markdown(""" | |
| # π PDF Table Extractor & AR Aging Analyzer | |
| Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel | |
| """) | |
| with gr.Tabs() as tabs: | |
| # ================================================================ | |
| # TAB 1: Basic Extraction | |
| # ================================================================ | |
| with gr.Tab("π Basic Extraction"): | |
| gr.Markdown(""" | |
| ### Quick PDF Analysis | |
| Upload a PDF to extract text, metadata, and tables automatically. | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| basic_pdf_input = gr.File( | |
| label="Upload PDF", | |
| file_types=[".pdf"] | |
| ) | |
| basic_extract_btn = gr.Button("π Extract Content", variant="primary") | |
| with gr.Column(scale=2): | |
| basic_metadata = gr.Markdown(label="PDF Metadata") | |
| with gr.Row(): | |
| with gr.Column(): | |
| basic_text = gr.Textbox( | |
| label="Extracted Text", | |
| lines=10, | |
| max_lines=20 | |
| ) | |
| with gr.Column(): | |
| basic_table_info = gr.Textbox(label="Table Info") | |
| basic_table = gr.Dataframe( | |
| label="Extracted Table", | |
| wrap=True, | |
| max_height=400 | |
| ) | |
| with gr.Row(): | |
| basic_csv_btn = gr.Button("π₯ Export to CSV") | |
| basic_excel_btn = gr.Button("π₯ Export to Excel") | |
| basic_csv_output = gr.File(label="CSV Download") | |
| basic_excel_output = gr.File(label="Excel Download") | |
| # Event handlers | |
| basic_extract_btn.click( | |
| fn=process_pdf_basic, | |
| inputs=[basic_pdf_input], | |
| outputs=[basic_metadata, basic_text, basic_table, basic_table_info] | |
| ) | |
| basic_csv_btn.click( | |
| fn=export_to_csv, | |
| inputs=[basic_table], | |
| outputs=[basic_csv_output] | |
| ) | |
| basic_excel_btn.click( | |
| fn=export_to_excel, | |
| inputs=[basic_table], | |
| outputs=[basic_excel_output] | |
| ) | |
| # ================================================================ | |
| # TAB 2: Advanced Extraction | |
| # ================================================================ | |
| with gr.Tab("βοΈ Advanced Extraction"): | |
| gr.Markdown(""" | |
| ### Advanced Table Extraction Settings | |
| Fine-tune the extraction parameters for complex PDFs. | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| adv_pdf_input = gr.File( | |
| label="Upload PDF", | |
| file_types=[".pdf"] | |
| ) | |
| gr.Markdown("**Extraction Settings**") | |
| adv_v_strategy = gr.Dropdown( | |
| choices=["text", "lines", "lines_strict", "explicit"], | |
| value="text", | |
| label="Vertical Strategy", | |
| info="How to identify column boundaries" | |
| ) | |
| adv_h_strategy = gr.Dropdown( | |
| choices=["text", "lines", "lines_strict", "explicit"], | |
| value="text", | |
| label="Horizontal Strategy", | |
| info="How to identify row boundaries" | |
| ) | |
| adv_snap_tol = gr.Slider( | |
| minimum=1, | |
| maximum=20, | |
| value=3, | |
| step=1, | |
| label="Snap Tolerance", | |
| info="Tolerance for snapping to lines" | |
| ) | |
| adv_join_tol = gr.Slider( | |
| minimum=1, | |
| maximum=20, | |
| value=3, | |
| step=1, | |
| label="Join Tolerance", | |
| info="Tolerance for joining segments" | |
| ) | |
| adv_page_num = gr.Number( | |
| value=1, | |
| minimum=1, | |
| label="Table Number", | |
| info="Which table to display" | |
| ) | |
| adv_extract_btn = gr.Button("π§ Extract with Settings", variant="primary") | |
| with gr.Column(scale=2): | |
| adv_info = gr.Textbox(label="Extraction Info") | |
| adv_columns = gr.Textbox(label="Detected Columns") | |
| adv_table = gr.Dataframe( | |
| label="Extracted Table", | |
| wrap=True, | |
| max_height=500 | |
| ) | |
| with gr.Row(): | |
| adv_csv_btn = gr.Button("π₯ Export to CSV") | |
| adv_excel_btn = gr.Button("π₯ Export to Excel") | |
| adv_csv_output = gr.File(label="CSV Download") | |
| adv_excel_output = gr.File(label="Excel Download") | |
| # Event handlers | |
| adv_extract_btn.click( | |
| fn=process_pdf_advanced, | |
| inputs=[adv_pdf_input, adv_v_strategy, adv_h_strategy, adv_snap_tol, adv_join_tol, adv_page_num], | |
| outputs=[adv_table, adv_info, adv_columns] | |
| ) | |
| adv_csv_btn.click( | |
| fn=export_to_csv, | |
| inputs=[adv_table], | |
| outputs=[adv_csv_output] | |
| ) | |
| adv_excel_btn.click( | |
| fn=export_to_excel, | |
| inputs=[adv_table], | |
| outputs=[adv_excel_output] | |
| ) | |
| # ================================================================ | |
| # TAB 3: AR Aging Analysis | |
| # ================================================================ | |
| with gr.Tab("π° AR Aging Analysis"): | |
| gr.Markdown(""" | |
| ### Accounts Receivable Aging Analysis | |
| Upload an AR aging PDF report to extract, analyze, and visualize the data. | |
| **Common AR Aging Column Names:** | |
| - Customer/Name column: `Name`, `Customer`, `Company`, `Account` | |
| - Amount columns: `Current`, `1-30`, `31-60`, `61-90`, `Over 90`, `Not Invoiced` | |
| """) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| ar_pdf_input = gr.File( | |
| label="Upload AR Aging PDF", | |
| file_types=[".pdf"] | |
| ) | |
| ar_name_col = gr.Textbox( | |
| value="Name", | |
| label="Customer/Name Column", | |
| info="Part of the column name that identifies customers" | |
| ) | |
| ar_amount_cols = gr.Textbox( | |
| value="Not Invoiced, Current, 31-60, 61-90, Over 90", | |
| label="Amount Columns (comma-separated)", | |
| info="Column names for aging buckets" | |
| ) | |
| ar_analyze_btn = gr.Button("π Analyze AR Aging", variant="primary") | |
| with gr.Column(scale=2): | |
| ar_summary = gr.Markdown(label="Summary Statistics") | |
| ar_status = gr.Textbox(label="Processing Status") | |
| with gr.Row(): | |
| ar_table = gr.Dataframe( | |
| label="AR Aging Summary by Customer", | |
| wrap=True, | |
| max_height=400 | |
| ) | |
| gr.Markdown("### π Visualizations") | |
| with gr.Row(): | |
| ar_pie_chart = gr.Plot(label="Aging Distribution") | |
| ar_bar_chart = gr.Plot(label="Top Customer Balances") | |
| with gr.Row(): | |
| ar_stacked_chart = gr.Plot(label="Aging by Customer") | |
| with gr.Row(): | |
| ar_csv_btn = gr.Button("π₯ Export to CSV") | |
| ar_excel_btn = gr.Button("π₯ Export to Excel") | |
| ar_csv_output = gr.File(label="CSV Download") | |
| ar_excel_output = gr.File(label="Excel Download") | |
| # Event handlers | |
| ar_analyze_btn.click( | |
| fn=process_ar_aging_report, | |
| inputs=[ar_pdf_input, ar_name_col, ar_amount_cols], | |
| outputs=[ar_table, ar_summary, ar_pie_chart, ar_bar_chart, ar_stacked_chart, ar_status] | |
| ) | |
| ar_csv_btn.click( | |
| fn=export_to_csv, | |
| inputs=[ar_table], | |
| outputs=[ar_csv_output] | |
| ) | |
| ar_excel_btn.click( | |
| fn=export_to_excel, | |
| inputs=[ar_table], | |
| outputs=[ar_excel_output] | |
| ) | |
| # ================================================================ | |
| # TAB 4: Batch Processing | |
| # ================================================================ | |
| with gr.Tab("π Batch Processing"): | |
| gr.Markdown(""" | |
| ### Process Multiple PDFs | |
| Upload multiple PDF files to extract tables from all of them at once. | |
| """) | |
| batch_pdf_input = gr.File( | |
| label="Upload Multiple PDFs", | |
| file_types=[".pdf"], | |
| file_count="multiple" | |
| ) | |
| batch_process_btn = gr.Button("π Process All PDFs", variant="primary") | |
| batch_results = gr.Textbox( | |
| label="Processing Results", | |
| lines=10 | |
| ) | |
| batch_combined_table = gr.Dataframe( | |
| label="Combined Data (All Tables)", | |
| wrap=True, | |
| max_height=400 | |
| ) | |
| with gr.Row(): | |
| batch_csv_btn = gr.Button("π₯ Export Combined to CSV") | |
| batch_csv_output = gr.File(label="CSV Download") | |
| batch_process_btn.click( | |
| fn=process_batch, | |
| inputs=[batch_pdf_input], | |
| outputs=[batch_results, batch_combined_table] | |
| ) | |
| batch_csv_btn.click( | |
| fn=export_to_csv, | |
| inputs=[batch_combined_table], | |
| outputs=[batch_csv_output] | |
| ) | |
| # ================================================================ | |
| # TAB 5: Help & Documentation | |
| # ================================================================ | |
| with gr.Tab("β Help"): | |
| gr.Markdown(""" | |
| ## π Documentation & Tips | |
| ### Overview | |
| This application extracts tabular data from PDF files and provides specialized | |
| analysis for Accounts Receivable (AR) Aging reports. | |
| --- | |
| ### π§ Extraction Strategies | |
| | Strategy | Description | Best For | | |
| |----------|-------------|----------| | |
| | `text` | Uses text positions to identify boundaries | Most PDFs, especially text-based tables | | |
| | `lines` | Uses drawn lines to identify boundaries | PDFs with visible grid lines | | |
| | `lines_strict` | Strictly follows drawn lines | Clean, well-formatted tables | | |
| | `explicit` | Requires explicit boundary definitions | Complex layouts | | |
| --- | |
| ### π‘ Tips for Best Results | |
| 1. **Start with Basic Extraction** - Try the basic tab first to see what's detected | |
| 2. **Adjust Strategies** - If tables aren't detected correctly: | |
| - Try `lines` strategy if your PDF has visible gridlines | |
| - Increase tolerance values for loosely formatted tables | |
| 3. **AR Aging Reports** - For best results: | |
| - Ensure column names match your PDF headers | |
| - Use partial matches (e.g., "Name" will match "Customer Name") | |
| 4. **Large PDFs** - Processing may take longer for multi-page documents | |
| --- | |
| ### π Supported Formats | |
| - **Input:** PDF files (.pdf) | |
| - **Output:** CSV, Excel (.xlsx) | |
| --- | |
| ### π Technology Stack | |
| - **pdfplumber** - PDF parsing and table extraction | |
| - **pandas** - Data manipulation and analysis | |
| - **plotly** - Interactive visualizations | |
| - **gradio** - Web interface | |
| --- | |
| ### β οΈ Limitations | |
| - Scanned PDFs (images) are not supported - use OCR tools first | |
| - Very complex table layouts may require manual adjustment | |
| - Password-protected PDFs are not supported | |
| """) | |
| # Footer | |
| gr.Markdown(""" | |
| --- | |
| Built with β€οΈ using Gradio & pdfplumber | [pdfplumber docs](https://github.com/jsvine/pdfplumber) | |
| """) | |
| # Launch the app | |
| if __name__ == "__main__": | |
| demo.launch() |