Spaces:

Omarrran
/

PDF_AR_Parser

Sleeping

App Files Files Community

Omarrran committed on Nov 26, 2025

Commit

5638291

verified ·

1 Parent(s): 8e02dfe

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -73

app.py CHANGED Viewed

@@ -235,14 +235,14 @@ def process_pdf_basic(pdf_file) -> Tuple[str, str, pd.DataFrame, str]:
     try:
         # Extract metadata
-        metadata = get_pdf_metadata(pdf_file.name)
         metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()])
         # Extract text
-        text = extract_text_from_pdf(pdf_file.name)
         # Extract tables
-        tables = extract_tables_from_pdf(pdf_file.name)
         if tables:
             first_table = tables[0]
@@ -271,7 +271,7 @@ def process_pdf_advanced(
     try:
         tables = extract_tables_with_settings(
-            pdf_file.name,
             vertical_strategy=v_strategy,
             horizontal_strategy=h_strategy,
             snap_tolerance=snap_tol,
@@ -282,7 +282,8 @@ def process_pdf_advanced(
             return pd.DataFrame(), "No tables found with current settings.", ""
         # Get the requested page's table
-        idx = min(page_num - 1, len(tables) - 1)
         table = tables[idx]
         info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
@@ -306,12 +307,12 @@ def process_ar_aging_report(
     try:
         # Extract tables
-        tables = extract_tables_from_pdf(pdf_file.name)
         if not tables:
             # Try with text strategy
             tables = extract_tables_with_settings(
-                pdf_file.name,
                 vertical_strategy="text",
                 horizontal_strategy="text"
             )
@@ -389,6 +390,36 @@ def export_to_excel(df: pd.DataFrame) -> str:
     return temp_file.name
 # ============================================================================
 # GRADIO UI
 # ============================================================================
@@ -397,11 +428,10 @@ def export_to_excel(df: pd.DataFrame) -> str:
 with gr.Blocks() as demo:
     # Header
-    gr.HTML("""
-        <div class="main-header">
-            <h1>📄 PDF Table Extractor & AR Aging Analyzer</h1>
-            <p>Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel</p>
-        </div>
     """)
     with gr.Tabs() as tabs:
@@ -409,7 +439,7 @@ with gr.Blocks() as demo:
         # ================================================================
         # TAB 1: Basic Extraction
         # ================================================================
-        with gr.TabItem("📋 Basic Extraction", id=1):
             gr.Markdown("""
             ### Quick PDF Analysis
             Upload a PDF to extract text, metadata, and tables automatically.
@@ -419,10 +449,9 @@ with gr.Blocks() as demo:
                 with gr.Column(scale=1):
                     basic_pdf_input = gr.File(
                         label="Upload PDF",
-                        file_types=[".pdf"],
-                        type="filepath"
                     )
-                    basic_extract_btn = gr.Button("🔍 Extract Content", variant="primary", size="lg")
                 with gr.Column(scale=2):
                     basic_metadata = gr.Markdown(label="PDF Metadata")
@@ -440,7 +469,7 @@ with gr.Blocks() as demo:
                     basic_table = gr.Dataframe(
                         label="Extracted Table",
                         wrap=True,
-                        height=400
                     )
             with gr.Row():
@@ -471,7 +500,7 @@ with gr.Blocks() as demo:
         # ================================================================
         # TAB 2: Advanced Extraction
         # ================================================================
-        with gr.TabItem("⚙️ Advanced Extraction", id=2):
             gr.Markdown("""
             ### Advanced Table Extraction Settings
             Fine-tune the extraction parameters for complex PDFs.
@@ -481,8 +510,7 @@ with gr.Blocks() as demo:
                 with gr.Column(scale=1):
                     adv_pdf_input = gr.File(
                         label="Upload PDF",
-                        file_types=[".pdf"],
-                        type="filepath"
                     )
                     gr.Markdown("**Extraction Settings**")
@@ -534,7 +562,7 @@ with gr.Blocks() as demo:
                     adv_table = gr.Dataframe(
                         label="Extracted Table",
                         wrap=True,
-                        height=500
                     )
             with gr.Row():
@@ -565,7 +593,7 @@ with gr.Blocks() as demo:
         # ================================================================
         # TAB 3: AR Aging Analysis
         # ================================================================
-        with gr.TabItem("💰 AR Aging Analysis", id=3):
             gr.Markdown("""
             ### Accounts Receivable Aging Analysis
             Upload an AR aging PDF report to extract, analyze, and visualize the data.
@@ -579,8 +607,7 @@ with gr.Blocks() as demo:
                 with gr.Column(scale=1):
                     ar_pdf_input = gr.File(
                         label="Upload AR Aging PDF",
-                        file_types=[".pdf"],
-                        type="filepath"
                     )
                     ar_name_col = gr.Textbox(
@@ -595,7 +622,7 @@ with gr.Blocks() as demo:
                         info="Column names for aging buckets"
                     )
-                    ar_analyze_btn = gr.Button("📊 Analyze AR Aging", variant="primary", size="lg")
                 with gr.Column(scale=2):
                     ar_summary = gr.Markdown(label="Summary Statistics")
@@ -605,7 +632,7 @@ with gr.Blocks() as demo:
                 ar_table = gr.Dataframe(
                     label="AR Aging Summary by Customer",
                     wrap=True,
-                    height=400
                 )
             gr.Markdown("### 📈 Visualizations")
@@ -645,7 +672,7 @@ with gr.Blocks() as demo:
         # ================================================================
         # TAB 4: Batch Processing
         # ================================================================
-        with gr.TabItem("📁 Batch Processing", id=4):
             gr.Markdown("""
             ### Process Multiple PDFs
             Upload multiple PDF files to extract tables from all of them at once.
@@ -654,8 +681,7 @@ with gr.Blocks() as demo:
             batch_pdf_input = gr.File(
                 label="Upload Multiple PDFs",
                 file_types=[".pdf"],
-                file_count="multiple",
-                type="filepath"
             )
             batch_process_btn = gr.Button("🔄 Process All PDFs", variant="primary")
@@ -668,42 +694,13 @@ with gr.Blocks() as demo:
             batch_combined_table = gr.Dataframe(
                 label="Combined Data (All Tables)",
                 wrap=True,
-                height=400
             )
             with gr.Row():
                 batch_csv_btn = gr.Button("📥 Export Combined to CSV")
                 batch_csv_output = gr.File(label="CSV Download")
-            def process_batch(files):
-                if not files:
-                    return "No files uploaded", pd.DataFrame()
-                results = []
-                all_tables = []
-                for file in files:
-                    try:
-                        tables = extract_tables_from_pdf(file.name)
-                        results.append(f"✅ {os.path.basename(file.name)}: Found {len(tables)} table(s)")
-                        for table in tables:
-                            table['Source_File'] = os.path.basename(file.name)
-                            all_tables.append(table)
-                    except Exception as e:
-                        results.append(f"❌ {os.path.basename(file.name)}: Error - {str(e)}")
-                if all_tables:
-                    # Try to combine tables with same structure
-                    try:
-                        combined = pd.concat(all_tables, ignore_index=True)
-                    except:
-                        combined = all_tables[0] if all_tables else pd.DataFrame()
-                else:
-                    combined = pd.DataFrame()
-                return "\n".join(results), combined
             batch_process_btn.click(
                 fn=process_batch,
                 inputs=[batch_pdf_input],
@@ -719,7 +716,7 @@ with gr.Blocks() as demo:
         # ================================================================
         # TAB 5: Help & Documentation
         # ================================================================
-        with gr.TabItem("❓ Help", id=5):
             gr.Markdown("""
             ## 📚 Documentation & Tips
@@ -777,22 +774,12 @@ with gr.Blocks() as demo:
             - Scanned PDFs (images) are not supported - use OCR tools first
             - Very complex table layouts may require manual adjustment
             - Password-protected PDFs are not supported
-            ---
-            ### 📧 Feedback
-            If you encounter issues or have suggestions, please provide feedback!
             """)
     # Footer
-    gr.HTML("""
-        <div style="text-align: center; margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 8px;">
-            <p style="color: #666; margin: 0;">
-                Built with ❤️ using Gradio & pdfplumber |
-                <a href="https://github.com/jsvine/pdfplumber" target="_blank">pdfplumber docs</a>
-            </p>
-        </div>
     """)

     try:
         # Extract metadata
+        metadata = get_pdf_metadata(pdf_file)
         metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()])
         # Extract text
+        text = extract_text_from_pdf(pdf_file)
         # Extract tables
+        tables = extract_tables_from_pdf(pdf_file)
         if tables:
             first_table = tables[0]
     try:
         tables = extract_tables_with_settings(
+            pdf_file,
             vertical_strategy=v_strategy,
             horizontal_strategy=h_strategy,
             snap_tolerance=snap_tol,
             return pd.DataFrame(), "No tables found with current settings.", ""
         # Get the requested page's table
+        idx = min(int(page_num) - 1, len(tables) - 1)
+        idx = max(0, idx)
         table = tables[idx]
         info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
     try:
         # Extract tables
+        tables = extract_tables_from_pdf(pdf_file)
         if not tables:
             # Try with text strategy
             tables = extract_tables_with_settings(
+                pdf_file,
                 vertical_strategy="text",
                 horizontal_strategy="text"
             )
     return temp_file.name
def process_batch(files):
    """Extract tables from several PDFs and merge them into one frame.

    Args:
        files: Iterable of PDF file paths, as delivered by the batch
            ``gr.File`` input. ``None`` or an empty list means nothing
            was uploaded.

    Returns:
        A ``(status, combined)`` tuple. ``status`` holds one line per file
        reporting either how many tables were found or the error raised
        while extracting. ``combined`` is the concatenation of every
        extracted table, each tagged with a ``Source_File`` column. If the
        tables cannot be concatenated it falls back to the first table,
        and it is an empty DataFrame when no table was extracted.
    """
    if not files:
        return "No files uploaded", pd.DataFrame()

    results = []
    all_tables = []
    for file in files:
        # Resolve the display name once; it is reused for the status line
        # and for tagging every table that came from this file.
        source_name = os.path.basename(file)
        try:
            tables = extract_tables_from_pdf(file)
            results.append(f"✅ {source_name}: Found {len(tables)} table(s)")
            for table in tables:
                # NOTE(review): assumes each table is a pandas DataFrame;
                # confirm against extract_tables_from_pdf.
                table['Source_File'] = source_name
                all_tables.append(table)
        except Exception as e:
            # One unreadable PDF must not abort the rest of the batch.
            results.append(f"❌ {source_name}: Error - {e}")

    if not all_tables:
        return "\n".join(results), pd.DataFrame()

    try:
        combined = pd.concat(all_tables, ignore_index=True)
    except Exception:
        # Best effort: if the tables cannot be stacked, show the first one.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        combined = all_tables[0]
    return "\n".join(results), combined
 # ============================================================================
 # GRADIO UI
 # ============================================================================
 with gr.Blocks() as demo:
     # Header
+    gr.Markdown("""
+    # 📄 PDF Table Extractor & AR Aging Analyzer
+    Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel
     """)
     with gr.Tabs() as tabs:
         # ================================================================
         # TAB 1: Basic Extraction
         # ================================================================
+        with gr.Tab("📋 Basic Extraction"):
             gr.Markdown("""
             ### Quick PDF Analysis
             Upload a PDF to extract text, metadata, and tables automatically.
                 with gr.Column(scale=1):
                     basic_pdf_input = gr.File(
                         label="Upload PDF",
+                        file_types=[".pdf"]
                     )
+                    basic_extract_btn = gr.Button("🔍 Extract Content", variant="primary")
                 with gr.Column(scale=2):
                     basic_metadata = gr.Markdown(label="PDF Metadata")
                     basic_table = gr.Dataframe(
                         label="Extracted Table",
                         wrap=True,
+                        max_height=400
                     )
             with gr.Row():
         # ================================================================
         # TAB 2: Advanced Extraction
         # ================================================================
+        with gr.Tab("⚙️ Advanced Extraction"):
             gr.Markdown("""
             ### Advanced Table Extraction Settings
             Fine-tune the extraction parameters for complex PDFs.
                 with gr.Column(scale=1):
                     adv_pdf_input = gr.File(
                         label="Upload PDF",
+                        file_types=[".pdf"]
                     )
                     gr.Markdown("**Extraction Settings**")
                     adv_table = gr.Dataframe(
                         label="Extracted Table",
                         wrap=True,
+                        max_height=500
                     )
             with gr.Row():
         # ================================================================
         # TAB 3: AR Aging Analysis
         # ================================================================
+        with gr.Tab("💰 AR Aging Analysis"):
             gr.Markdown("""
             ### Accounts Receivable Aging Analysis
             Upload an AR aging PDF report to extract, analyze, and visualize the data.
                 with gr.Column(scale=1):
                     ar_pdf_input = gr.File(
                         label="Upload AR Aging PDF",
+                        file_types=[".pdf"]
                     )
                     ar_name_col = gr.Textbox(
                         info="Column names for aging buckets"
                     )
+                    ar_analyze_btn = gr.Button("📊 Analyze AR Aging", variant="primary")
                 with gr.Column(scale=2):
                     ar_summary = gr.Markdown(label="Summary Statistics")
                 ar_table = gr.Dataframe(
                     label="AR Aging Summary by Customer",
                     wrap=True,
+                    max_height=400
                 )
             gr.Markdown("### 📈 Visualizations")
         # ================================================================
         # TAB 4: Batch Processing
         # ================================================================
+        with gr.Tab("📁 Batch Processing"):
             gr.Markdown("""
             ### Process Multiple PDFs
             Upload multiple PDF files to extract tables from all of them at once.
             batch_pdf_input = gr.File(
                 label="Upload Multiple PDFs",
                 file_types=[".pdf"],
+                file_count="multiple"
             )
             batch_process_btn = gr.Button("🔄 Process All PDFs", variant="primary")
             batch_combined_table = gr.Dataframe(
                 label="Combined Data (All Tables)",
                 wrap=True,
+                max_height=400
             )
             with gr.Row():
                 batch_csv_btn = gr.Button("📥 Export Combined to CSV")
                 batch_csv_output = gr.File(label="CSV Download")
             batch_process_btn.click(
                 fn=process_batch,
                 inputs=[batch_pdf_input],
         # ================================================================
         # TAB 5: Help & Documentation
         # ================================================================
+        with gr.Tab("❓ Help"):
             gr.Markdown("""
             ## 📚 Documentation & Tips
             - Scanned PDFs (images) are not supported - use OCR tools first
             - Very complex table layouts may require manual adjustment
             - Password-protected PDFs are not supported
             """)
     # Footer
+    gr.Markdown("""
+    ---
+    Built with ❤️ using Gradio & pdfplumber | [pdfplumber docs](https://github.com/jsvine/pdfplumber)
     """)