Omarrran committed on
Commit
a373348
·
verified ·
1 Parent(s): 0e96fb3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +801 -0
app.py ADDED
@@ -0,0 +1,801 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Table Extractor & AR Aging Analyzer
3
+ A comprehensive tool for extracting tables from PDFs and performing AR aging analysis.
4
+ Built for Hugging Face Spaces with Gradio interface.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import numpy as np
10
+ import pdfplumber
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ from plotly.subplots import make_subplots
14
+ import tempfile
15
+ import os
16
+ from typing import Tuple, List, Optional, Dict, Any
17
+ import io
18
+
19
+
20
+ # ============================================================================
21
+ # CORE PDF EXTRACTION FUNCTIONS
22
+ # ============================================================================
23
+
24
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF as labelled sections.

    Each page that yields text contributes a ``--- Page N ---`` header
    (N is 1-based) followed by the page text. Pages for which
    ``extract_text()`` returns nothing are left out. Sections are joined
    with a blank line between them.
    """
    with pdfplumber.open(pdf_path) as document:
        numbered_text = [
            (number, page.extract_text())
            for number, page in enumerate(document.pages, start=1)
        ]

    sections = [
        f"--- Page {number} ---\n{body}"
        for number, body in numbered_text
        if body
    ]
    return "\n\n".join(sections)
33
+
34
+
35
def extract_tables_from_pdf(pdf_path: str) -> List[pd.DataFrame]:
    """Collect every table pdfplumber detects in the PDF as DataFrames.

    The first non-empty row of each table becomes the column header and the
    remaining rows become the data. Tables with fewer than two raw rows are
    ignored. Each frame records where it came from in
    ``df.attrs['source']`` as ``"Page P, Table T"`` (both 1-based).
    """
    frames = []
    with pdfplumber.open(pdf_path) as document:
        for page_index, page in enumerate(document.pages):
            for table_index, raw_rows in enumerate(page.extract_tables()):
                # A usable table needs a header row plus at least one more row.
                if not raw_rows or len(raw_rows) <= 1:
                    continue

                # Discard rows whose cells are all empty/None.
                rows = [row for row in raw_rows if any(row)]
                if not rows:
                    continue

                header, *body = rows
                frame = pd.DataFrame(body, columns=header)
                frame.attrs['source'] = f"Page {page_index + 1}, Table {table_index + 1}"
                frames.append(frame)
    return frames
50
+
51
+
52
def extract_tables_with_settings(
    pdf_path: str,
    vertical_strategy: str = "text",
    horizontal_strategy: str = "text",
    snap_tolerance: int = 3,
    join_tolerance: int = 3
) -> List[pd.DataFrame]:
    """Extract at most one table per page using custom pdfplumber settings.

    Unlike ``extract_tables_from_pdf`` this calls ``page.extract_table``
    (singular), so each page contributes a single table (pdfplumber
    documents it as the largest one on the page).

    Args:
        pdf_path: Path of the PDF to read.
        vertical_strategy: pdfplumber strategy for column boundaries.
        horizontal_strategy: pdfplumber strategy for row boundaries.
        snap_tolerance: pdfplumber ``snap_tolerance`` setting.
        join_tolerance: pdfplumber ``join_tolerance`` setting.

    Returns:
        One DataFrame per page that produced a usable table. The first
        non-empty row is used as the header and ``df.attrs['source']`` is
        ``"Page N"`` (1-based).

    Extraction is best-effort: a page that raises is skipped, but the
    failure is logged instead of being dropped without a trace.
    """
    import logging

    logger = logging.getLogger(__name__)

    table_settings = {
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "snap_tolerance": snap_tolerance,
        "join_tolerance": join_tolerance,
    }

    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            try:
                table = page.extract_table(table_settings=table_settings)
                # Need a header row plus at least one data row.
                if not table or len(table) <= 1:
                    continue

                # Drop rows in which every cell is empty/None.
                cleaned_table = [row for row in table if any(row)]
                if not cleaned_table:
                    continue

                df = pd.DataFrame(cleaned_table[1:], columns=cleaned_table[0])
                df.attrs['source'] = f"Page {page_num}"
                tables.append(df)
            except Exception as exc:
                # Keep going so one bad page (or a strategy that does not
                # apply to it) cannot abort the whole document, but leave a
                # record of why the page is missing from the result.
                logger.warning(
                    "Skipping page %d of %s: table extraction failed (%s)",
                    page_num, pdf_path, exc,
                )
                continue
    return tables
81
+
82
+
83
def get_pdf_metadata(pdf_path: str) -> Dict[str, Any]:
    """Summarise a PDF: page count, document metadata and first-page size.

    The returned dict always has ``"Number of Pages"`` and ``"PDF Metadata"``
    (the placeholder text ``"No metadata available"`` when the document has
    none). ``"Page Width"`` and ``"Page Height"`` are added from the first
    page when the document has at least one page.
    """
    with pdfplumber.open(pdf_path) as document:
        pages = document.pages
        info = {
            "Number of Pages": len(pages),
            "PDF Metadata": document.metadata or "No metadata available",
        }

        # Dimensions are reported for the first page only.
        if pages:
            cover = pages[0]
            info.update({"Page Width": cover.width, "Page Height": cover.height})

    return info
98
+
99
+
100
+ # ============================================================================
101
+ # AR AGING SPECIFIC FUNCTIONS
102
+ # ============================================================================
103
+
104
def convert_to_float(num: str) -> float:
    """Parse a money-like cell value into a float, defaulting to 0.0.

    Thousands separators (``,``), dollar signs and surrounding whitespace
    are ignored. Accounting-style negatives are honoured: a value wrapped in
    parentheses (``(1,234.56)``) or carrying a trailing minus (``75.25-``)
    is returned as a negative number, so credit balances are not lost.

    Args:
        num: Cell content. ``None`` and non-string values are accepted;
            non-strings are converted with ``str()`` first.

    Returns:
        The parsed value, or ``0.0`` when the input is ``None``, blank,
        not a number, or not finite (``nan``/``inf``), so a single bad cell
        cannot poison column sums.
    """
    import math

    if num is None:
        return 0.0

    text = str(num).replace(',', '').replace('$', '').strip()
    if not text:
        return 0.0

    # Accounting notation for negatives: "(123.45)" or "123.45-".
    negative = False
    if text.startswith('(') and text.endswith(')'):
        negative = True
        text = text[1:-1].strip()
    elif text.endswith('-'):
        negative = True
        text = text[:-1].strip()

    try:
        value = float(text)
    except ValueError:
        return 0.0

    if not math.isfinite(value):
        return 0.0

    # `and value` avoids turning a zero into -0.0.
    return -abs(value) if negative and value else value
112
+
113
+
114
def process_ar_aging(df: pd.DataFrame, name_column: str, amount_columns: List[str]) -> Tuple[pd.DataFrame, Dict]:
    """Aggregate a raw AR aging table into per-customer bucket totals.

    Args:
        df: Extracted table; it is copied, never modified.
        name_column: Column holding the customer name. Blank cells are
            forward-filled so continuation rows belong to the customer above.
        amount_columns: Aging-bucket columns to sum. Names missing from
            ``df`` are ignored and duplicates are used once.

    Returns:
        ``(pivot, summary)``. ``pivot`` is indexed by customer with one column
        per bucket (in the order given) plus ``'Total'``, followed by a
        ``'TOTAL'`` row and, only when the grand total is positive, a
        ``'PERCENTAGE'`` row. ``summary`` maps display labels to formatted
        strings and the customer count.

    Raises:
        KeyError: If none of ``amount_columns`` exists in ``df`` (or, from
            pandas, if ``name_column`` is missing).
    """
    result_df = df.copy()

    # Use each requested column once, and only if the table actually has it.
    present_cols = [col for col in dict.fromkeys(amount_columns) if col in result_df.columns]
    if not present_cols:
        raise KeyError(f"None of the amount columns {list(amount_columns)} exist in the table")

    # Convert amount columns to float
    for col in present_cols:
        result_df[col] = result_df[col].apply(convert_to_float)

    # Forward fill name column to handle grouped rows
    if name_column in result_df.columns:
        result_df[name_column] = result_df[name_column].replace('', np.nan).ffill()

    # Create pivot table
    pivot = result_df.pivot_table(
        index=name_column,
        values=present_cols,
        aggfunc='sum'
    )

    # pivot_table sorts columns alphabetically; restore the caller's order.
    ordered_cols = [col for col in present_cols if col in pivot.columns]
    pivot = pivot[ordered_cols]

    # Add total column
    pivot['Total'] = pivot.sum(axis=1)

    # Snapshot the per-customer totals BEFORE summary rows are appended, so
    # the statistics below never depend on how many summary rows exist.
    customer_totals = pivot['Total'].copy()

    # Add totals row
    pivot.loc['TOTAL'] = pivot.sum()

    # Percentages only make sense for a positive grand total.
    total_amount = pivot.loc['TOTAL', 'Total']
    if total_amount > 0:
        pivot.loc['PERCENTAGE'] = (pivot.loc['TOTAL'] / total_amount * 100).round(2)

    # Prepare summary statistics
    has_customers = not customer_totals.empty
    summary = {
        "Total AR Amount": f"${total_amount:,.2f}",
        "Number of Customers": len(customer_totals),
        "Largest Balance": f"${customer_totals.max():,.2f}" if has_customers else "N/A",
        "Average Balance": f"${customer_totals.mean():,.2f}" if has_customers else "N/A",
    }

    return pivot, summary
159
+
160
+
161
def create_aging_charts(pivot_df: pd.DataFrame) -> Tuple[go.Figure, go.Figure, go.Figure]:
    """Build the three AR aging charts from an aggregated pivot table.

    Args:
        pivot_df: Customer-indexed frame with one column per aging bucket and
            a ``'Total'`` column, optionally followed by ``'TOTAL'`` and
            ``'PERCENTAGE'`` summary rows.

    Returns:
        ``(pie, bar, stacked)``: a donut chart of the ``'TOTAL'`` row by
        bucket, a bar chart of the ten largest customer totals, and a stacked
        bar chart of those same customers broken down by bucket. Any chart
        that has nothing to show is a figure annotated "No data available".
    """
    def _no_data_figure() -> go.Figure:
        # Placeholder figure used wherever a chart has no data.
        fig = go.Figure()
        fig.add_annotation(text="No data available", showarrow=False)
        return fig

    # Remove the summary rows by label rather than by position: the
    # PERCENTAGE row is not always present, so slicing off "the last two
    # rows" could drop a real customer or keep TOTAL as if it were one.
    customer_data = pivot_df.drop(index=['TOTAL', 'PERCENTAGE'], errors='ignore')

    # Chart 1: Aging Distribution Pie Chart
    if 'TOTAL' in pivot_df.index:
        totals = pivot_df.loc['TOTAL'].drop('Total', errors='ignore')
        fig_pie = px.pie(
            values=totals.values,
            names=totals.index,
            title="AR Aging Distribution",
            hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Set2
        )
        fig_pie.update_traces(textposition='inside', textinfo='percent+label')
    else:
        fig_pie = _no_data_figure()

    if customer_data.empty or 'Total' not in customer_data.columns:
        return fig_pie, _no_data_figure(), _no_data_figure()

    # Both customer charts show the same top-10 selection; compute it once.
    top_customers = customer_data.nlargest(10, 'Total')

    # Chart 2: Customer Balance Bar Chart (Top 10)
    # Name the index explicitly so the column produced by reset_index()
    # always matches the x= argument, even when the index has no name.
    axis_name = top_customers.index.name or 'Customer'
    fig_bar = px.bar(
        top_customers.rename_axis(axis_name).reset_index(),
        x=axis_name,
        y='Total',
        title="Top 10 Customer Balances",
        color='Total',
        color_continuous_scale='Reds'
    )
    fig_bar.update_layout(xaxis_tickangle=-45)

    # Chart 3: Stacked Bar Chart by Aging Bucket
    fig_stacked = go.Figure()
    colors = ['#2ecc71', '#3498db', '#f1c40f', '#e67e22', '#e74c3c']
    aging_cols = [col for col in top_customers.columns if col != 'Total']

    for i, col in enumerate(aging_cols):
        fig_stacked.add_trace(go.Bar(
            name=col,
            x=top_customers.index,
            y=top_customers[col],
            marker_color=colors[i % len(colors)]  # cycle if > 5 buckets
        ))

    fig_stacked.update_layout(
        barmode='stack',
        title="AR Aging by Customer (Top 10)",
        xaxis_tickangle=-45,
        legend_title="Aging Bucket"
    )

    return fig_pie, fig_bar, fig_stacked
225
+
226
+
227
+ # ============================================================================
228
+ # GRADIO INTERFACE FUNCTIONS
229
+ # ============================================================================
230
+
231
def process_pdf_basic(pdf_file) -> Tuple[str, str, pd.DataFrame, str]:
    """Gradio handler: show metadata, a text preview and the first table.

    Returns ``(metadata_markdown, text_preview, first_table, table_info)``.
    The text preview is cut to 5000 characters followed by ``"..."`` when the
    document text is longer. When no file is given, or anything fails, the
    first element carries the message and the table is an empty DataFrame.
    """
    if pdf_file is None:
        return "No file uploaded", "", pd.DataFrame(), ""

    preview_limit = 5000

    try:
        # Metadata, rendered as one bold "key: value" line per entry.
        details = get_pdf_metadata(pdf_file.name)
        metadata_str = "\n".join(f"**{key}:** {value}" for key, value in details.items())

        text = extract_text_from_pdf(pdf_file.name)
        tables = extract_tables_from_pdf(pdf_file.name)

        if not tables:
            first_table = pd.DataFrame()
            table_info = "No tables found in the PDF."
        else:
            first_table = tables[0]
            origin = first_table.attrs.get('source', 'unknown')
            table_info = f"Found {len(tables)} table(s). Showing first table from {origin}."

        if len(text) > preview_limit:
            preview = text[:preview_limit] + "..."
        else:
            preview = text

        return metadata_str, preview, first_table, table_info

    except Exception as e:
        return f"Error: {str(e)}", "", pd.DataFrame(), ""
258
+
259
+
260
def process_pdf_advanced(
    pdf_file,
    v_strategy: str,
    h_strategy: str,
    snap_tol: int,
    join_tol: int,
    page_num: int
) -> Tuple[pd.DataFrame, str, str]:
    """Gradio handler: extract tables with custom settings and show one.

    Args:
        pdf_file: Uploaded file object (its ``.name`` is the path) or None.
        v_strategy: pdfplumber vertical strategy.
        h_strategy: pdfplumber horizontal strategy.
        snap_tol: pdfplumber snap tolerance.
        join_tol: pdfplumber join tolerance.
        page_num: 1-based position of the table to display within the list
            of extracted tables (despite the name, this is a table number,
            not a PDF page number). Floats are truncated, blank/invalid
            values fall back to 1, and out-of-range values are clamped.

    Returns:
        ``(table, info_message, columns_message)``. On "no file", "no tables"
        or an error, ``table`` is an empty DataFrame and ``info_message``
        explains why.
    """
    if pdf_file is None:
        return pd.DataFrame(), "No file uploaded", ""

    try:
        tables = extract_tables_with_settings(
            pdf_file.name,
            vertical_strategy=v_strategy,
            horizontal_strategy=h_strategy,
            snap_tolerance=snap_tol,
            join_tolerance=join_tol
        )

        if not tables:
            return pd.DataFrame(), "No tables found with current settings.", ""

        # The UI number field can deliver a float or None, neither of which
        # is a valid list index; coerce before using it.
        try:
            requested = int(page_num)
        except (TypeError, ValueError, OverflowError):
            requested = 1

        # Clamp to the available tables; without the lower bound a value of
        # 0 or below would silently wrap around to the last table.
        idx = max(0, min(requested - 1, len(tables) - 1))
        table = tables[idx]

        info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
        # Header cells may be None or non-strings; str() keeps join() safe.
        columns = ", ".join(str(col) for col in table.columns)

        return table, info, f"Columns: {columns}"

    except Exception as e:
        return pd.DataFrame(), f"Error: {str(e)}", ""
295
+
296
+
297
def process_ar_aging_report(
    pdf_file,
    name_col: str,
    amount_cols: str
) -> Tuple[pd.DataFrame, str, go.Figure, go.Figure, go.Figure, str]:
    """Gradio handler: turn an AR aging PDF into a summary table and charts.

    The table with the most rows in the PDF is analysed. Columns are located
    by case-insensitive substring match against the hints supplied.

    Args:
        pdf_file: Uploaded file object (its ``.name`` is the path) or None.
        name_col: Part of the customer-name column header. Falls back to the
            first column when blank or unmatched.
        amount_cols: Comma-separated parts of the aging-bucket headers. When
            nothing matches, columns whose header contains a common aging
            keyword are used, and failing that the first five columns other
            than the name column.

    Returns:
        ``(table, summary_markdown, pie, bar, stacked, status)``. On failure
        the table is empty, the figures are blank and ``status`` carries the
        message (including the traceback for unexpected errors).
    """
    if pdf_file is None:
        empty_fig = go.Figure()
        return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, "No file uploaded"

    try:
        # Extract tables, retrying with the text strategy if the default
        # detection finds nothing.
        tables = extract_tables_from_pdf(pdf_file.name)
        if not tables:
            tables = extract_tables_with_settings(
                pdf_file.name,
                vertical_strategy="text",
                horizontal_strategy="text"
            )

        if not tables:
            empty_fig = go.Figure()
            return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, "No tables found in PDF"

        # Use the largest table
        df = max(tables, key=len)

        # Resolve the name column first so it can be kept out of the amount
        # columns (otherwise its values would be converted to numbers).
        name_hint = (name_col or "").strip().lower()
        name_column = df.columns[0]
        if name_hint:
            for df_col in df.columns:
                if name_hint in str(df_col).lower():
                    name_column = df_col
                    break

        candidates = [col for col in df.columns if col != name_column]

        # Find matching amount columns (flexible matching). Empty hints are
        # skipped: "" is a substring of every header and would match the
        # first column unconditionally.
        hints = [part.strip().lower() for part in (amount_cols or "").split(",")]
        matched_cols = []
        for hint in hints:
            if not hint:
                continue
            for df_col in candidates:
                if hint in str(df_col).lower():
                    if df_col not in matched_cols:
                        matched_cols.append(df_col)
                    break

        if not matched_cols:
            keywords = ['current', 'amount', '30', '60', '90', 'invoiced', 'balance']
            matched_cols = [
                col for col in candidates
                if any(kw in str(col).lower() for kw in keywords)
            ]

        if not matched_cols:
            # Last resort: the first five columns other than the name column.
            matched_cols = candidates[:5]

        if not matched_cols:
            empty_fig = go.Figure()
            return (
                pd.DataFrame(), "", empty_fig, empty_fig, empty_fig,
                "No amount columns could be identified in the table"
            )

        # Process the data
        pivot, summary = process_ar_aging(df, name_column, matched_cols)

        # Create charts
        fig_pie, fig_bar, fig_stacked = create_aging_charts(pivot)

        # Format summary
        summary_str = "\n".join([f"**{k}:** {v}" for k, v in summary.items()])

        # Headers may be None or non-strings; str() keeps join() safe.
        used = ', '.join(str(col) for col in matched_cols)
        return pivot.reset_index(), summary_str, fig_pie, fig_bar, fig_stacked, f"Processed with columns: {used}"

    except Exception as e:
        import traceback
        empty_fig = go.Figure()
        return pd.DataFrame(), "", empty_fig, empty_fig, empty_fig, f"Error: {str(e)}\n{traceback.format_exc()}"
370
+
371
+
372
def export_to_csv(df: pd.DataFrame) -> Optional[str]:
    """Write a DataFrame to a temporary CSV file and return its path.

    Args:
        df: Frame to export.

    Returns:
        Path of the ``.csv`` file (written with the index as the first
        column), or ``None`` when ``df`` is ``None`` or empty. The file is
        created with ``delete=False``, so it is not removed automatically.
    """
    if df is None or df.empty:
        return None

    # Only reserve a unique file name here and close the handle right away:
    # pandas reopens the file by name, and a still-open handle would leak
    # (and blocks the reopen on platforms that lock open files).
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv', mode='w') as temp_file:
        csv_path = temp_file.name

    df.to_csv(csv_path, index=True)
    return csv_path
380
+
381
+
382
def export_to_excel(df: pd.DataFrame) -> Optional[str]:
    """Write a DataFrame to a temporary Excel workbook and return its path.

    Args:
        df: Frame to export.

    Returns:
        Path of the ``.xlsx`` file (written with the index, using the
        ``openpyxl`` engine), or ``None`` when ``df`` is ``None`` or empty.
        The file is created with ``delete=False``, so it is not removed
        automatically.
    """
    if df is None or df.empty:
        return None

    # Only reserve a unique file name here and close the handle right away:
    # pandas reopens the file by name, and a still-open handle would leak
    # (and blocks the reopen on platforms that lock open files).
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as temp_file:
        excel_path = temp_file.name

    df.to_excel(excel_path, index=True, engine='openpyxl')
    return excel_path
390
+
391
+
392
+ # ============================================================================
393
+ # GRADIO UI
394
+ # ============================================================================
395
+
396
+ # Create the Gradio interface
397
+ with gr.Blocks() as demo:
398
+
399
+ # Header
400
+ gr.HTML("""
401
+ <div class="main-header">
402
+ <h1>πŸ“„ PDF Table Extractor & AR Aging Analyzer</h1>
403
+ <p>Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel</p>
404
+ </div>
405
+ """)
406
+
407
+ with gr.Tabs() as tabs:
408
+
409
+ # ================================================================
410
+ # TAB 1: Basic Extraction
411
+ # ================================================================
412
+ with gr.TabItem("πŸ“‹ Basic Extraction", id=1):
413
+ gr.Markdown("""
414
+ ### Quick PDF Analysis
415
+ Upload a PDF to extract text, metadata, and tables automatically.
416
+ """)
417
+
418
+ with gr.Row():
419
+ with gr.Column(scale=1):
420
+ basic_pdf_input = gr.File(
421
+ label="Upload PDF",
422
+ file_types=[".pdf"],
423
+ type="filepath"
424
+ )
425
+ basic_extract_btn = gr.Button("πŸ” Extract Content", variant="primary", size="lg")
426
+
427
+ with gr.Column(scale=2):
428
+ basic_metadata = gr.Markdown(label="PDF Metadata")
429
+
430
+ with gr.Row():
431
+ with gr.Column():
432
+ basic_text = gr.Textbox(
433
+ label="Extracted Text",
434
+ lines=10,
435
+ max_lines=20
436
+ )
437
+
438
+ with gr.Column():
439
+ basic_table_info = gr.Textbox(label="Table Info")
440
+ basic_table = gr.Dataframe(
441
+ label="Extracted Table",
442
+ wrap=True,
443
+ height=400
444
+ )
445
+
446
+ with gr.Row():
447
+ basic_csv_btn = gr.Button("πŸ“₯ Export to CSV")
448
+ basic_excel_btn = gr.Button("πŸ“₯ Export to Excel")
449
+ basic_csv_output = gr.File(label="CSV Download")
450
+ basic_excel_output = gr.File(label="Excel Download")
451
+
452
+ # Event handlers
453
+ basic_extract_btn.click(
454
+ fn=process_pdf_basic,
455
+ inputs=[basic_pdf_input],
456
+ outputs=[basic_metadata, basic_text, basic_table, basic_table_info]
457
+ )
458
+
459
+ basic_csv_btn.click(
460
+ fn=export_to_csv,
461
+ inputs=[basic_table],
462
+ outputs=[basic_csv_output]
463
+ )
464
+
465
+ basic_excel_btn.click(
466
+ fn=export_to_excel,
467
+ inputs=[basic_table],
468
+ outputs=[basic_excel_output]
469
+ )
470
+
471
+ # ================================================================
472
+ # TAB 2: Advanced Extraction
473
+ # ================================================================
474
+ with gr.TabItem("βš™οΈ Advanced Extraction", id=2):
475
+ gr.Markdown("""
476
+ ### Advanced Table Extraction Settings
477
+ Fine-tune the extraction parameters for complex PDFs.
478
+ """)
479
+
480
+ with gr.Row():
481
+ with gr.Column(scale=1):
482
+ adv_pdf_input = gr.File(
483
+ label="Upload PDF",
484
+ file_types=[".pdf"],
485
+ type="filepath"
486
+ )
487
+
488
+ gr.Markdown("**Extraction Settings**")
489
+
490
+ adv_v_strategy = gr.Dropdown(
491
+ choices=["text", "lines", "lines_strict", "explicit"],
492
+ value="text",
493
+ label="Vertical Strategy",
494
+ info="How to identify column boundaries"
495
+ )
496
+
497
+ adv_h_strategy = gr.Dropdown(
498
+ choices=["text", "lines", "lines_strict", "explicit"],
499
+ value="text",
500
+ label="Horizontal Strategy",
501
+ info="How to identify row boundaries"
502
+ )
503
+
504
+ adv_snap_tol = gr.Slider(
505
+ minimum=1,
506
+ maximum=20,
507
+ value=3,
508
+ step=1,
509
+ label="Snap Tolerance",
510
+ info="Tolerance for snapping to lines"
511
+ )
512
+
513
+ adv_join_tol = gr.Slider(
514
+ minimum=1,
515
+ maximum=20,
516
+ value=3,
517
+ step=1,
518
+ label="Join Tolerance",
519
+ info="Tolerance for joining segments"
520
+ )
521
+
522
+ adv_page_num = gr.Number(
523
+ value=1,
524
+ minimum=1,
525
+ label="Table Number",
526
+ info="Which table to display"
527
+ )
528
+
529
+ adv_extract_btn = gr.Button("πŸ”§ Extract with Settings", variant="primary")
530
+
531
+ with gr.Column(scale=2):
532
+ adv_info = gr.Textbox(label="Extraction Info")
533
+ adv_columns = gr.Textbox(label="Detected Columns")
534
+ adv_table = gr.Dataframe(
535
+ label="Extracted Table",
536
+ wrap=True,
537
+ height=500
538
+ )
539
+
540
+ with gr.Row():
541
+ adv_csv_btn = gr.Button("πŸ“₯ Export to CSV")
542
+ adv_excel_btn = gr.Button("πŸ“₯ Export to Excel")
543
+ adv_csv_output = gr.File(label="CSV Download")
544
+ adv_excel_output = gr.File(label="Excel Download")
545
+
546
+ # Event handlers
547
+ adv_extract_btn.click(
548
+ fn=process_pdf_advanced,
549
+ inputs=[adv_pdf_input, adv_v_strategy, adv_h_strategy, adv_snap_tol, adv_join_tol, adv_page_num],
550
+ outputs=[adv_table, adv_info, adv_columns]
551
+ )
552
+
553
+ adv_csv_btn.click(
554
+ fn=export_to_csv,
555
+ inputs=[adv_table],
556
+ outputs=[adv_csv_output]
557
+ )
558
+
559
+ adv_excel_btn.click(
560
+ fn=export_to_excel,
561
+ inputs=[adv_table],
562
+ outputs=[adv_excel_output]
563
+ )
564
+
565
+ # ================================================================
566
+ # TAB 3: AR Aging Analysis
567
+ # ================================================================
568
+ with gr.TabItem("πŸ’° AR Aging Analysis", id=3):
569
+ gr.Markdown("""
570
+ ### Accounts Receivable Aging Analysis
571
+ Upload an AR aging PDF report to extract, analyze, and visualize the data.
572
+
573
+ **Common AR Aging Column Names:**
574
+ - Customer/Name column: `Name`, `Customer`, `Company`, `Account`
575
+ - Amount columns: `Current`, `1-30`, `31-60`, `61-90`, `Over 90`, `Not Invoiced`
576
+ """)
577
+
578
+ with gr.Row():
579
+ with gr.Column(scale=1):
580
+ ar_pdf_input = gr.File(
581
+ label="Upload AR Aging PDF",
582
+ file_types=[".pdf"],
583
+ type="filepath"
584
+ )
585
+
586
+ ar_name_col = gr.Textbox(
587
+ value="Name",
588
+ label="Customer/Name Column",
589
+ info="Part of the column name that identifies customers"
590
+ )
591
+
592
+ ar_amount_cols = gr.Textbox(
593
+ value="Not Invoiced, Current, 31-60, 61-90, Over 90",
594
+ label="Amount Columns (comma-separated)",
595
+ info="Column names for aging buckets"
596
+ )
597
+
598
+ ar_analyze_btn = gr.Button("πŸ“Š Analyze AR Aging", variant="primary", size="lg")
599
+
600
+ with gr.Column(scale=2):
601
+ ar_summary = gr.Markdown(label="Summary Statistics")
602
+ ar_status = gr.Textbox(label="Processing Status")
603
+
604
+ with gr.Row():
605
+ ar_table = gr.Dataframe(
606
+ label="AR Aging Summary by Customer",
607
+ wrap=True,
608
+ height=400
609
+ )
610
+
611
+ gr.Markdown("### πŸ“ˆ Visualizations")
612
+
613
+ with gr.Row():
614
+ ar_pie_chart = gr.Plot(label="Aging Distribution")
615
+ ar_bar_chart = gr.Plot(label="Top Customer Balances")
616
+
617
+ with gr.Row():
618
+ ar_stacked_chart = gr.Plot(label="Aging by Customer")
619
+
620
+ with gr.Row():
621
+ ar_csv_btn = gr.Button("πŸ“₯ Export to CSV")
622
+ ar_excel_btn = gr.Button("πŸ“₯ Export to Excel")
623
+ ar_csv_output = gr.File(label="CSV Download")
624
+ ar_excel_output = gr.File(label="Excel Download")
625
+
626
+ # Event handlers
627
+ ar_analyze_btn.click(
628
+ fn=process_ar_aging_report,
629
+ inputs=[ar_pdf_input, ar_name_col, ar_amount_cols],
630
+ outputs=[ar_table, ar_summary, ar_pie_chart, ar_bar_chart, ar_stacked_chart, ar_status]
631
+ )
632
+
633
+ ar_csv_btn.click(
634
+ fn=export_to_csv,
635
+ inputs=[ar_table],
636
+ outputs=[ar_csv_output]
637
+ )
638
+
639
+ ar_excel_btn.click(
640
+ fn=export_to_excel,
641
+ inputs=[ar_table],
642
+ outputs=[ar_excel_output]
643
+ )
644
+
645
+ # ================================================================
646
+ # TAB 4: Batch Processing
647
+ # ================================================================
648
+ with gr.TabItem("πŸ“ Batch Processing", id=4):
649
+ gr.Markdown("""
650
+ ### Process Multiple PDFs
651
+ Upload multiple PDF files to extract tables from all of them at once.
652
+ """)
653
+
654
+ batch_pdf_input = gr.File(
655
+ label="Upload Multiple PDFs",
656
+ file_types=[".pdf"],
657
+ file_count="multiple",
658
+ type="filepath"
659
+ )
660
+
661
+ batch_process_btn = gr.Button("πŸ”„ Process All PDFs", variant="primary")
662
+
663
+ batch_results = gr.Textbox(
664
+ label="Processing Results",
665
+ lines=10
666
+ )
667
+
668
+ batch_combined_table = gr.Dataframe(
669
+ label="Combined Data (All Tables)",
670
+ wrap=True,
671
+ height=400
672
+ )
673
+
674
+ with gr.Row():
675
+ batch_csv_btn = gr.Button("πŸ“₯ Export Combined to CSV")
676
+ batch_csv_output = gr.File(label="CSV Download")
677
+
678
def process_batch(files):
    """Gradio handler: extract the tables of several PDFs and stack them.

    Args:
        files: Uploaded file objects (each exposing the path as ``.name``),
            or None/empty when nothing was uploaded.

    Returns:
        ``(report, combined)``. ``report`` has one status line per file.
        ``combined`` is every extracted table concatenated row-wise, each
        row tagged with its origin in a ``Source_File`` column. If the
        tables cannot be concatenated, only the first table is returned and
        the report says so.
    """
    if not files:
        return "No files uploaded", pd.DataFrame()

    results = []
    all_tables = []

    for file in files:
        file_label = os.path.basename(file.name)
        try:
            tables = extract_tables_from_pdf(file.name)
            results.append(f"✅ {file_label}: Found {len(tables)} table(s)")

            for table in tables:
                table['Source_File'] = file_label
                all_tables.append(table)
        except Exception as e:
            # One unreadable PDF must not abort the rest of the batch.
            results.append(f"❌ {file_label}: Error - {str(e)}")

    if not all_tables:
        return "\n".join(results), pd.DataFrame()

    try:
        combined = pd.concat(all_tables, ignore_index=True)
    except Exception as e:
        # Fall back to the first table, but tell the user instead of
        # silently presenting partial data as the combined result.
        combined = all_tables[0]
        results.append(f"⚠️ Could not combine tables ({e}); showing only the first table")

    return "\n".join(results), combined
706
+
707
+ batch_process_btn.click(
708
+ fn=process_batch,
709
+ inputs=[batch_pdf_input],
710
+ outputs=[batch_results, batch_combined_table]
711
+ )
712
+
713
+ batch_csv_btn.click(
714
+ fn=export_to_csv,
715
+ inputs=[batch_combined_table],
716
+ outputs=[batch_csv_output]
717
+ )
718
+
719
+ # ================================================================
720
+ # TAB 5: Help & Documentation
721
+ # ================================================================
722
+ with gr.TabItem("❓ Help", id=5):
723
+ gr.Markdown("""
724
+ ## πŸ“š Documentation & Tips
725
+
726
+ ### Overview
727
+ This application extracts tabular data from PDF files and provides specialized
728
+ analysis for Accounts Receivable (AR) Aging reports.
729
+
730
+ ---
731
+
732
+ ### πŸ”§ Extraction Strategies
733
+
734
+ | Strategy | Description | Best For |
735
+ |----------|-------------|----------|
736
+ | `text` | Uses text positions to identify boundaries | Most PDFs, especially text-based tables |
737
+ | `lines` | Uses drawn lines to identify boundaries | PDFs with visible grid lines |
738
+ | `lines_strict` | Strictly follows drawn lines | Clean, well-formatted tables |
739
+ | `explicit` | Requires explicit boundary definitions | Complex layouts |
740
+
741
+ ---
742
+
743
+ ### πŸ’‘ Tips for Best Results
744
+
745
+ 1. **Start with Basic Extraction** - Try the basic tab first to see what's detected
746
+
747
+ 2. **Adjust Strategies** - If tables aren't detected correctly:
748
+ - Try `lines` strategy if your PDF has visible gridlines
749
+ - Increase tolerance values for loosely formatted tables
750
+
751
+ 3. **AR Aging Reports** - For best results:
752
+ - Ensure column names match your PDF headers
753
+ - Use partial matches (e.g., "Name" will match "Customer Name")
754
+
755
+ 4. **Large PDFs** - Processing may take longer for multi-page documents
756
+
757
+ ---
758
+
759
+ ### πŸ“‹ Supported Formats
760
+
761
+ - **Input:** PDF files (.pdf)
762
+ - **Output:** CSV, Excel (.xlsx)
763
+
764
+ ---
765
+
766
+ ### πŸ”— Technology Stack
767
+
768
+ - **pdfplumber** - PDF parsing and table extraction
769
+ - **pandas** - Data manipulation and analysis
770
+ - **plotly** - Interactive visualizations
771
+ - **gradio** - Web interface
772
+
773
+ ---
774
+
775
+ ### ⚠️ Limitations
776
+
777
+ - Scanned PDFs (images) are not supported - use OCR tools first
778
+ - Very complex table layouts may require manual adjustment
779
+ - Password-protected PDFs are not supported
780
+
781
+ ---
782
+
783
+ ### πŸ“§ Feedback
784
+
785
+ If you encounter issues or have suggestions, please provide feedback!
786
+ """)
787
+
788
+ # Footer
789
+ gr.HTML("""
790
+ <div style="text-align: center; margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 8px;">
791
+ <p style="color: #666; margin: 0;">
792
+ Built with ❀️ using Gradio & pdfplumber |
793
+ <a href="https://github.com/jsvine/pdfplumber" target="_blank">pdfplumber docs</a>
794
+ </p>
795
+ </div>
796
+ """)
797
+
798
+
799
# Launch the app only when this file is executed as a script; importing the
# module builds `demo` but does not call `demo.launch()`.
if __name__ == "__main__":
    demo.launch()