Omarrran committed on
Commit
5638291
·
verified ·
1 Parent(s): 8e02dfe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -73
app.py CHANGED
@@ -235,14 +235,14 @@ def process_pdf_basic(pdf_file) -> Tuple[str, str, pd.DataFrame, str]:
235
 
236
  try:
237
  # Extract metadata
238
- metadata = get_pdf_metadata(pdf_file.name)
239
  metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()])
240
 
241
  # Extract text
242
- text = extract_text_from_pdf(pdf_file.name)
243
 
244
  # Extract tables
245
- tables = extract_tables_from_pdf(pdf_file.name)
246
 
247
  if tables:
248
  first_table = tables[0]
@@ -271,7 +271,7 @@ def process_pdf_advanced(
271
 
272
  try:
273
  tables = extract_tables_with_settings(
274
- pdf_file.name,
275
  vertical_strategy=v_strategy,
276
  horizontal_strategy=h_strategy,
277
  snap_tolerance=snap_tol,
@@ -282,7 +282,8 @@ def process_pdf_advanced(
282
  return pd.DataFrame(), "No tables found with current settings.", ""
283
 
284
  # Get the requested page's table
285
- idx = min(page_num - 1, len(tables) - 1)
 
286
  table = tables[idx]
287
 
288
  info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
@@ -306,12 +307,12 @@ def process_ar_aging_report(
306
 
307
  try:
308
  # Extract tables
309
- tables = extract_tables_from_pdf(pdf_file.name)
310
 
311
  if not tables:
312
  # Try with text strategy
313
  tables = extract_tables_with_settings(
314
- pdf_file.name,
315
  vertical_strategy="text",
316
  horizontal_strategy="text"
317
  )
@@ -389,6 +390,36 @@ def export_to_excel(df: pd.DataFrame) -> str:
389
  return temp_file.name
390
 
391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  # ============================================================================
393
  # GRADIO UI
394
  # ============================================================================
@@ -397,11 +428,10 @@ def export_to_excel(df: pd.DataFrame) -> str:
397
  with gr.Blocks() as demo:
398
 
399
  # Header
400
- gr.HTML("""
401
- <div class="main-header">
402
- <h1>📄 PDF Table Extractor & AR Aging Analyzer</h1>
403
- <p>Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel</p>
404
- </div>
405
  """)
406
 
407
  with gr.Tabs() as tabs:
@@ -409,7 +439,7 @@ with gr.Blocks() as demo:
409
  # ================================================================
410
  # TAB 1: Basic Extraction
411
  # ================================================================
412
- with gr.TabItem("📋 Basic Extraction", id=1):
413
  gr.Markdown("""
414
  ### Quick PDF Analysis
415
  Upload a PDF to extract text, metadata, and tables automatically.
@@ -419,10 +449,9 @@ with gr.Blocks() as demo:
419
  with gr.Column(scale=1):
420
  basic_pdf_input = gr.File(
421
  label="Upload PDF",
422
- file_types=[".pdf"],
423
- type="filepath"
424
  )
425
- basic_extract_btn = gr.Button("🔍 Extract Content", variant="primary", size="lg")
426
 
427
  with gr.Column(scale=2):
428
  basic_metadata = gr.Markdown(label="PDF Metadata")
@@ -440,7 +469,7 @@ with gr.Blocks() as demo:
440
  basic_table = gr.Dataframe(
441
  label="Extracted Table",
442
  wrap=True,
443
- height=400
444
  )
445
 
446
  with gr.Row():
@@ -471,7 +500,7 @@ with gr.Blocks() as demo:
471
  # ================================================================
472
  # TAB 2: Advanced Extraction
473
  # ================================================================
474
- with gr.TabItem("⚙️ Advanced Extraction", id=2):
475
  gr.Markdown("""
476
  ### Advanced Table Extraction Settings
477
  Fine-tune the extraction parameters for complex PDFs.
@@ -481,8 +510,7 @@ with gr.Blocks() as demo:
481
  with gr.Column(scale=1):
482
  adv_pdf_input = gr.File(
483
  label="Upload PDF",
484
- file_types=[".pdf"],
485
- type="filepath"
486
  )
487
 
488
  gr.Markdown("**Extraction Settings**")
@@ -534,7 +562,7 @@ with gr.Blocks() as demo:
534
  adv_table = gr.Dataframe(
535
  label="Extracted Table",
536
  wrap=True,
537
- height=500
538
  )
539
 
540
  with gr.Row():
@@ -565,7 +593,7 @@ with gr.Blocks() as demo:
565
  # ================================================================
566
  # TAB 3: AR Aging Analysis
567
  # ================================================================
568
- with gr.TabItem("💰 AR Aging Analysis", id=3):
569
  gr.Markdown("""
570
  ### Accounts Receivable Aging Analysis
571
  Upload an AR aging PDF report to extract, analyze, and visualize the data.
@@ -579,8 +607,7 @@ with gr.Blocks() as demo:
579
  with gr.Column(scale=1):
580
  ar_pdf_input = gr.File(
581
  label="Upload AR Aging PDF",
582
- file_types=[".pdf"],
583
- type="filepath"
584
  )
585
 
586
  ar_name_col = gr.Textbox(
@@ -595,7 +622,7 @@ with gr.Blocks() as demo:
595
  info="Column names for aging buckets"
596
  )
597
 
598
- ar_analyze_btn = gr.Button("📊 Analyze AR Aging", variant="primary", size="lg")
599
 
600
  with gr.Column(scale=2):
601
  ar_summary = gr.Markdown(label="Summary Statistics")
@@ -605,7 +632,7 @@ with gr.Blocks() as demo:
605
  ar_table = gr.Dataframe(
606
  label="AR Aging Summary by Customer",
607
  wrap=True,
608
- height=400
609
  )
610
 
611
  gr.Markdown("### 📈 Visualizations")
@@ -645,7 +672,7 @@ with gr.Blocks() as demo:
645
  # ================================================================
646
  # TAB 4: Batch Processing
647
  # ================================================================
648
- with gr.TabItem("📁 Batch Processing", id=4):
649
  gr.Markdown("""
650
  ### Process Multiple PDFs
651
  Upload multiple PDF files to extract tables from all of them at once.
@@ -654,8 +681,7 @@ with gr.Blocks() as demo:
654
  batch_pdf_input = gr.File(
655
  label="Upload Multiple PDFs",
656
  file_types=[".pdf"],
657
- file_count="multiple",
658
- type="filepath"
659
  )
660
 
661
  batch_process_btn = gr.Button("🔄 Process All PDFs", variant="primary")
@@ -668,42 +694,13 @@ with gr.Blocks() as demo:
668
  batch_combined_table = gr.Dataframe(
669
  label="Combined Data (All Tables)",
670
  wrap=True,
671
- height=400
672
  )
673
 
674
  with gr.Row():
675
  batch_csv_btn = gr.Button("📥 Export Combined to CSV")
676
  batch_csv_output = gr.File(label="CSV Download")
677
 
678
- def process_batch(files):
679
- if not files:
680
- return "No files uploaded", pd.DataFrame()
681
-
682
- results = []
683
- all_tables = []
684
-
685
- for file in files:
686
- try:
687
- tables = extract_tables_from_pdf(file.name)
688
- results.append(f"✅ {os.path.basename(file.name)}: Found {len(tables)} table(s)")
689
-
690
- for table in tables:
691
- table['Source_File'] = os.path.basename(file.name)
692
- all_tables.append(table)
693
- except Exception as e:
694
- results.append(f"❌ {os.path.basename(file.name)}: Error - {str(e)}")
695
-
696
- if all_tables:
697
- # Try to combine tables with same structure
698
- try:
699
- combined = pd.concat(all_tables, ignore_index=True)
700
- except:
701
- combined = all_tables[0] if all_tables else pd.DataFrame()
702
- else:
703
- combined = pd.DataFrame()
704
-
705
- return "\n".join(results), combined
706
-
707
  batch_process_btn.click(
708
  fn=process_batch,
709
  inputs=[batch_pdf_input],
@@ -719,7 +716,7 @@ with gr.Blocks() as demo:
719
  # ================================================================
720
  # TAB 5: Help & Documentation
721
  # ================================================================
722
- with gr.TabItem("❓ Help", id=5):
723
  gr.Markdown("""
724
  ## 📚 Documentation & Tips
725
 
@@ -777,22 +774,12 @@ with gr.Blocks() as demo:
777
  - Scanned PDFs (images) are not supported - use OCR tools first
778
  - Very complex table layouts may require manual adjustment
779
  - Password-protected PDFs are not supported
780
-
781
- ---
782
-
783
- ### 📧 Feedback
784
-
785
- If you encounter issues or have suggestions, please provide feedback!
786
  """)
787
 
788
  # Footer
789
- gr.HTML("""
790
- <div style="text-align: center; margin-top: 20px; padding: 20px; background: #f8f9fa; border-radius: 8px;">
791
- <p style="color: #666; margin: 0;">
792
- Built with ❤️ using Gradio & pdfplumber |
793
- <a href="https://github.com/jsvine/pdfplumber" target="_blank">pdfplumber docs</a>
794
- </p>
795
- </div>
796
  """)
797
 
798
 
 
235
 
236
  try:
237
  # Extract metadata
238
+ metadata = get_pdf_metadata(pdf_file)
239
  metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()])
240
 
241
  # Extract text
242
+ text = extract_text_from_pdf(pdf_file)
243
 
244
  # Extract tables
245
+ tables = extract_tables_from_pdf(pdf_file)
246
 
247
  if tables:
248
  first_table = tables[0]
 
271
 
272
  try:
273
  tables = extract_tables_with_settings(
274
+ pdf_file,
275
  vertical_strategy=v_strategy,
276
  horizontal_strategy=h_strategy,
277
  snap_tolerance=snap_tol,
 
282
  return pd.DataFrame(), "No tables found with current settings.", ""
283
 
284
  # Get the requested page's table
285
+ idx = min(int(page_num) - 1, len(tables) - 1)
286
+ idx = max(0, idx)
287
  table = tables[idx]
288
 
289
  info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
 
307
 
308
  try:
309
  # Extract tables
310
+ tables = extract_tables_from_pdf(pdf_file)
311
 
312
  if not tables:
313
  # Try with text strategy
314
  tables = extract_tables_with_settings(
315
+ pdf_file,
316
  vertical_strategy="text",
317
  horizontal_strategy="text"
318
  )
 
390
  return temp_file.name
391
 
392
 
393
+ def process_batch(files):
394
+ """Process multiple PDF files."""
395
+ if not files:
396
+ return "No files uploaded", pd.DataFrame()
397
+
398
+ results = []
399
+ all_tables = []
400
+
401
+ for file in files:
402
+ try:
403
+ tables = extract_tables_from_pdf(file)
404
+ results.append(f"✅ {os.path.basename(file)}: Found {len(tables)} table(s)")
405
+
406
+ for table in tables:
407
+ table['Source_File'] = os.path.basename(file)
408
+ all_tables.append(table)
409
+ except Exception as e:
410
+ results.append(f"❌ {os.path.basename(file)}: Error - {str(e)}")
411
+
412
+ if all_tables:
413
+ try:
414
+ combined = pd.concat(all_tables, ignore_index=True)
415
+ except:
416
+ combined = all_tables[0] if all_tables else pd.DataFrame()
417
+ else:
418
+ combined = pd.DataFrame()
419
+
420
+ return "\n".join(results), combined
421
+
422
+
423
  # ============================================================================
424
  # GRADIO UI
425
  # ============================================================================
 
428
  with gr.Blocks() as demo:
429
 
430
  # Header
431
+ gr.Markdown("""
432
+ # 📄 PDF Table Extractor & AR Aging Analyzer
433
+
434
+ Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel
 
435
  """)
436
 
437
  with gr.Tabs() as tabs:
 
439
  # ================================================================
440
  # TAB 1: Basic Extraction
441
  # ================================================================
442
+ with gr.Tab("📋 Basic Extraction"):
443
  gr.Markdown("""
444
  ### Quick PDF Analysis
445
  Upload a PDF to extract text, metadata, and tables automatically.
 
449
  with gr.Column(scale=1):
450
  basic_pdf_input = gr.File(
451
  label="Upload PDF",
452
+ file_types=[".pdf"]
 
453
  )
454
+ basic_extract_btn = gr.Button("🔍 Extract Content", variant="primary")
455
 
456
  with gr.Column(scale=2):
457
  basic_metadata = gr.Markdown(label="PDF Metadata")
 
469
  basic_table = gr.Dataframe(
470
  label="Extracted Table",
471
  wrap=True,
472
+ max_height=400
473
  )
474
 
475
  with gr.Row():
 
500
  # ================================================================
501
  # TAB 2: Advanced Extraction
502
  # ================================================================
503
+ with gr.Tab("⚙️ Advanced Extraction"):
504
  gr.Markdown("""
505
  ### Advanced Table Extraction Settings
506
  Fine-tune the extraction parameters for complex PDFs.
 
510
  with gr.Column(scale=1):
511
  adv_pdf_input = gr.File(
512
  label="Upload PDF",
513
+ file_types=[".pdf"]
 
514
  )
515
 
516
  gr.Markdown("**Extraction Settings**")
 
562
  adv_table = gr.Dataframe(
563
  label="Extracted Table",
564
  wrap=True,
565
+ max_height=500
566
  )
567
 
568
  with gr.Row():
 
593
  # ================================================================
594
  # TAB 3: AR Aging Analysis
595
  # ================================================================
596
+ with gr.Tab("💰 AR Aging Analysis"):
597
  gr.Markdown("""
598
  ### Accounts Receivable Aging Analysis
599
  Upload an AR aging PDF report to extract, analyze, and visualize the data.
 
607
  with gr.Column(scale=1):
608
  ar_pdf_input = gr.File(
609
  label="Upload AR Aging PDF",
610
+ file_types=[".pdf"]
 
611
  )
612
 
613
  ar_name_col = gr.Textbox(
 
622
  info="Column names for aging buckets"
623
  )
624
 
625
+ ar_analyze_btn = gr.Button("📊 Analyze AR Aging", variant="primary")
626
 
627
  with gr.Column(scale=2):
628
  ar_summary = gr.Markdown(label="Summary Statistics")
 
632
  ar_table = gr.Dataframe(
633
  label="AR Aging Summary by Customer",
634
  wrap=True,
635
+ max_height=400
636
  )
637
 
638
  gr.Markdown("### 📈 Visualizations")
 
672
  # ================================================================
673
  # TAB 4: Batch Processing
674
  # ================================================================
675
+ with gr.Tab("📁 Batch Processing"):
676
  gr.Markdown("""
677
  ### Process Multiple PDFs
678
  Upload multiple PDF files to extract tables from all of them at once.
 
681
  batch_pdf_input = gr.File(
682
  label="Upload Multiple PDFs",
683
  file_types=[".pdf"],
684
+ file_count="multiple"
 
685
  )
686
 
687
  batch_process_btn = gr.Button("🔄 Process All PDFs", variant="primary")
 
694
  batch_combined_table = gr.Dataframe(
695
  label="Combined Data (All Tables)",
696
  wrap=True,
697
+ max_height=400
698
  )
699
 
700
  with gr.Row():
701
  batch_csv_btn = gr.Button("📥 Export Combined to CSV")
702
  batch_csv_output = gr.File(label="CSV Download")
703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
704
  batch_process_btn.click(
705
  fn=process_batch,
706
  inputs=[batch_pdf_input],
 
716
  # ================================================================
717
  # TAB 5: Help & Documentation
718
  # ================================================================
719
+ with gr.Tab("❓ Help"):
720
  gr.Markdown("""
721
  ## 📚 Documentation & Tips
722
 
 
774
  - Scanned PDFs (images) are not supported - use OCR tools first
775
  - Very complex table layouts may require manual adjustment
776
  - Password-protected PDFs are not supported
 
 
 
 
 
 
777
  """)
778
 
779
  # Footer
780
+ gr.Markdown("""
781
+ ---
782
+ Built with ❤️ using Gradio & pdfplumber | [pdfplumber docs](https://github.com/jsvine/pdfplumber)
 
 
 
 
783
  """)
784
 
785