Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -235,14 +235,14 @@ def process_pdf_basic(pdf_file) -> Tuple[str, str, pd.DataFrame, str]:
|
|
| 235 |
|
| 236 |
try:
|
| 237 |
# Extract metadata
|
| 238 |
-
metadata = get_pdf_metadata(pdf_file
|
| 239 |
metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()])
|
| 240 |
|
| 241 |
# Extract text
|
| 242 |
-
text = extract_text_from_pdf(pdf_file
|
| 243 |
|
| 244 |
# Extract tables
|
| 245 |
-
tables = extract_tables_from_pdf(pdf_file
|
| 246 |
|
| 247 |
if tables:
|
| 248 |
first_table = tables[0]
|
|
@@ -271,7 +271,7 @@ def process_pdf_advanced(
|
|
| 271 |
|
| 272 |
try:
|
| 273 |
tables = extract_tables_with_settings(
|
| 274 |
-
pdf_file
|
| 275 |
vertical_strategy=v_strategy,
|
| 276 |
horizontal_strategy=h_strategy,
|
| 277 |
snap_tolerance=snap_tol,
|
|
@@ -282,7 +282,8 @@ def process_pdf_advanced(
|
|
| 282 |
return pd.DataFrame(), "No tables found with current settings.", ""
|
| 283 |
|
| 284 |
# Get the requested page's table
|
| 285 |
-
idx = min(page_num - 1, len(tables) - 1)
|
|
|
|
| 286 |
table = tables[idx]
|
| 287 |
|
| 288 |
info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
|
|
@@ -306,12 +307,12 @@ def process_ar_aging_report(
|
|
| 306 |
|
| 307 |
try:
|
| 308 |
# Extract tables
|
| 309 |
-
tables = extract_tables_from_pdf(pdf_file
|
| 310 |
|
| 311 |
if not tables:
|
| 312 |
# Try with text strategy
|
| 313 |
tables = extract_tables_with_settings(
|
| 314 |
-
pdf_file
|
| 315 |
vertical_strategy="text",
|
| 316 |
horizontal_strategy="text"
|
| 317 |
)
|
|
@@ -389,6 +390,36 @@ def export_to_excel(df: pd.DataFrame) -> str:
|
|
| 389 |
return temp_file.name
|
| 390 |
|
| 391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
# ============================================================================
|
| 393 |
# GRADIO UI
|
| 394 |
# ============================================================================
|
|
@@ -397,11 +428,10 @@ def export_to_excel(df: pd.DataFrame) -> str:
|
|
| 397 |
with gr.Blocks() as demo:
|
| 398 |
|
| 399 |
# Header
|
| 400 |
-
gr.
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
</div>
|
| 405 |
""")
|
| 406 |
|
| 407 |
with gr.Tabs() as tabs:
|
|
@@ -409,7 +439,7 @@ with gr.Blocks() as demo:
|
|
| 409 |
# ================================================================
|
| 410 |
# TAB 1: Basic Extraction
|
| 411 |
# ================================================================
|
| 412 |
-
with gr.
|
| 413 |
gr.Markdown("""
|
| 414 |
### Quick PDF Analysis
|
| 415 |
Upload a PDF to extract text, metadata, and tables automatically.
|
|
@@ -419,10 +449,9 @@ with gr.Blocks() as demo:
|
|
| 419 |
with gr.Column(scale=1):
|
| 420 |
basic_pdf_input = gr.File(
|
| 421 |
label="Upload PDF",
|
| 422 |
-
file_types=[".pdf"]
|
| 423 |
-
type="filepath"
|
| 424 |
)
|
| 425 |
-
basic_extract_btn = gr.Button("π Extract Content", variant="primary"
|
| 426 |
|
| 427 |
with gr.Column(scale=2):
|
| 428 |
basic_metadata = gr.Markdown(label="PDF Metadata")
|
|
@@ -440,7 +469,7 @@ with gr.Blocks() as demo:
|
|
| 440 |
basic_table = gr.Dataframe(
|
| 441 |
label="Extracted Table",
|
| 442 |
wrap=True,
|
| 443 |
-
|
| 444 |
)
|
| 445 |
|
| 446 |
with gr.Row():
|
|
@@ -471,7 +500,7 @@ with gr.Blocks() as demo:
|
|
| 471 |
# ================================================================
|
| 472 |
# TAB 2: Advanced Extraction
|
| 473 |
# ================================================================
|
| 474 |
-
with gr.
|
| 475 |
gr.Markdown("""
|
| 476 |
### Advanced Table Extraction Settings
|
| 477 |
Fine-tune the extraction parameters for complex PDFs.
|
|
@@ -481,8 +510,7 @@ with gr.Blocks() as demo:
|
|
| 481 |
with gr.Column(scale=1):
|
| 482 |
adv_pdf_input = gr.File(
|
| 483 |
label="Upload PDF",
|
| 484 |
-
file_types=[".pdf"]
|
| 485 |
-
type="filepath"
|
| 486 |
)
|
| 487 |
|
| 488 |
gr.Markdown("**Extraction Settings**")
|
|
@@ -534,7 +562,7 @@ with gr.Blocks() as demo:
|
|
| 534 |
adv_table = gr.Dataframe(
|
| 535 |
label="Extracted Table",
|
| 536 |
wrap=True,
|
| 537 |
-
|
| 538 |
)
|
| 539 |
|
| 540 |
with gr.Row():
|
|
@@ -565,7 +593,7 @@ with gr.Blocks() as demo:
|
|
| 565 |
# ================================================================
|
| 566 |
# TAB 3: AR Aging Analysis
|
| 567 |
# ================================================================
|
| 568 |
-
with gr.
|
| 569 |
gr.Markdown("""
|
| 570 |
### Accounts Receivable Aging Analysis
|
| 571 |
Upload an AR aging PDF report to extract, analyze, and visualize the data.
|
|
@@ -579,8 +607,7 @@ with gr.Blocks() as demo:
|
|
| 579 |
with gr.Column(scale=1):
|
| 580 |
ar_pdf_input = gr.File(
|
| 581 |
label="Upload AR Aging PDF",
|
| 582 |
-
file_types=[".pdf"]
|
| 583 |
-
type="filepath"
|
| 584 |
)
|
| 585 |
|
| 586 |
ar_name_col = gr.Textbox(
|
|
@@ -595,7 +622,7 @@ with gr.Blocks() as demo:
|
|
| 595 |
info="Column names for aging buckets"
|
| 596 |
)
|
| 597 |
|
| 598 |
-
ar_analyze_btn = gr.Button("π Analyze AR Aging", variant="primary"
|
| 599 |
|
| 600 |
with gr.Column(scale=2):
|
| 601 |
ar_summary = gr.Markdown(label="Summary Statistics")
|
|
@@ -605,7 +632,7 @@ with gr.Blocks() as demo:
|
|
| 605 |
ar_table = gr.Dataframe(
|
| 606 |
label="AR Aging Summary by Customer",
|
| 607 |
wrap=True,
|
| 608 |
-
|
| 609 |
)
|
| 610 |
|
| 611 |
gr.Markdown("### π Visualizations")
|
|
@@ -645,7 +672,7 @@ with gr.Blocks() as demo:
|
|
| 645 |
# ================================================================
|
| 646 |
# TAB 4: Batch Processing
|
| 647 |
# ================================================================
|
| 648 |
-
with gr.
|
| 649 |
gr.Markdown("""
|
| 650 |
### Process Multiple PDFs
|
| 651 |
Upload multiple PDF files to extract tables from all of them at once.
|
|
@@ -654,8 +681,7 @@ with gr.Blocks() as demo:
|
|
| 654 |
batch_pdf_input = gr.File(
|
| 655 |
label="Upload Multiple PDFs",
|
| 656 |
file_types=[".pdf"],
|
| 657 |
-
file_count="multiple"
|
| 658 |
-
type="filepath"
|
| 659 |
)
|
| 660 |
|
| 661 |
batch_process_btn = gr.Button("π Process All PDFs", variant="primary")
|
|
@@ -668,42 +694,13 @@ with gr.Blocks() as demo:
|
|
| 668 |
batch_combined_table = gr.Dataframe(
|
| 669 |
label="Combined Data (All Tables)",
|
| 670 |
wrap=True,
|
| 671 |
-
|
| 672 |
)
|
| 673 |
|
| 674 |
with gr.Row():
|
| 675 |
batch_csv_btn = gr.Button("📥 Export Combined to CSV")
|
| 676 |
batch_csv_output = gr.File(label="CSV Download")
|
| 677 |
|
| 678 |
-
def process_batch(files):
|
| 679 |
-
if not files:
|
| 680 |
-
return "No files uploaded", pd.DataFrame()
|
| 681 |
-
|
| 682 |
-
results = []
|
| 683 |
-
all_tables = []
|
| 684 |
-
|
| 685 |
-
for file in files:
|
| 686 |
-
try:
|
| 687 |
-
tables = extract_tables_from_pdf(file.name)
|
| 688 |
-
results.append(f"✅ {os.path.basename(file.name)}: Found {len(tables)} table(s)")
|
| 689 |
-
|
| 690 |
-
for table in tables:
|
| 691 |
-
table['Source_File'] = os.path.basename(file.name)
|
| 692 |
-
all_tables.append(table)
|
| 693 |
-
except Exception as e:
|
| 694 |
-
results.append(f"❌ {os.path.basename(file.name)}: Error - {str(e)}")
|
| 695 |
-
|
| 696 |
-
if all_tables:
|
| 697 |
-
# Try to combine tables with same structure
|
| 698 |
-
try:
|
| 699 |
-
combined = pd.concat(all_tables, ignore_index=True)
|
| 700 |
-
except:
|
| 701 |
-
combined = all_tables[0] if all_tables else pd.DataFrame()
|
| 702 |
-
else:
|
| 703 |
-
combined = pd.DataFrame()
|
| 704 |
-
|
| 705 |
-
return "\n".join(results), combined
|
| 706 |
-
|
| 707 |
batch_process_btn.click(
|
| 708 |
fn=process_batch,
|
| 709 |
inputs=[batch_pdf_input],
|
|
@@ -719,7 +716,7 @@ with gr.Blocks() as demo:
|
|
| 719 |
# ================================================================
|
| 720 |
# TAB 5: Help & Documentation
|
| 721 |
# ================================================================
|
| 722 |
-
with gr.
|
| 723 |
gr.Markdown("""
|
| 724 |
## π Documentation & Tips
|
| 725 |
|
|
@@ -777,22 +774,12 @@ with gr.Blocks() as demo:
|
|
| 777 |
- Scanned PDFs (images) are not supported - use OCR tools first
|
| 778 |
- Very complex table layouts may require manual adjustment
|
| 779 |
- Password-protected PDFs are not supported
|
| 780 |
-
|
| 781 |
-
---
|
| 782 |
-
|
| 783 |
-
### π§ Feedback
|
| 784 |
-
|
| 785 |
-
If you encounter issues or have suggestions, please provide feedback!
|
| 786 |
""")
|
| 787 |
|
| 788 |
# Footer
|
| 789 |
-
gr.
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
Built with ❤️ using Gradio & pdfplumber |
|
| 793 |
-
<a href="https://github.com/jsvine/pdfplumber" target="_blank">pdfplumber docs</a>
|
| 794 |
-
</p>
|
| 795 |
-
</div>
|
| 796 |
""")
|
| 797 |
|
| 798 |
|
|
|
|
| 235 |
|
| 236 |
try:
|
| 237 |
# Extract metadata
|
| 238 |
+
metadata = get_pdf_metadata(pdf_file)
|
| 239 |
metadata_str = "\n".join([f"**{k}:** {v}" for k, v in metadata.items()])
|
| 240 |
|
| 241 |
# Extract text
|
| 242 |
+
text = extract_text_from_pdf(pdf_file)
|
| 243 |
|
| 244 |
# Extract tables
|
| 245 |
+
tables = extract_tables_from_pdf(pdf_file)
|
| 246 |
|
| 247 |
if tables:
|
| 248 |
first_table = tables[0]
|
|
|
|
| 271 |
|
| 272 |
try:
|
| 273 |
tables = extract_tables_with_settings(
|
| 274 |
+
pdf_file,
|
| 275 |
vertical_strategy=v_strategy,
|
| 276 |
horizontal_strategy=h_strategy,
|
| 277 |
snap_tolerance=snap_tol,
|
|
|
|
| 282 |
return pd.DataFrame(), "No tables found with current settings.", ""
|
| 283 |
|
| 284 |
# Get the requested page's table
|
| 285 |
+
idx = min(int(page_num) - 1, len(tables) - 1)
|
| 286 |
+
idx = max(0, idx)
|
| 287 |
table = tables[idx]
|
| 288 |
|
| 289 |
info = f"Extracted {len(tables)} table(s). Showing table {idx + 1}."
|
|
|
|
| 307 |
|
| 308 |
try:
|
| 309 |
# Extract tables
|
| 310 |
+
tables = extract_tables_from_pdf(pdf_file)
|
| 311 |
|
| 312 |
if not tables:
|
| 313 |
# Try with text strategy
|
| 314 |
tables = extract_tables_with_settings(
|
| 315 |
+
pdf_file,
|
| 316 |
vertical_strategy="text",
|
| 317 |
horizontal_strategy="text"
|
| 318 |
)
|
|
|
|
| 390 |
return temp_file.name
|
| 391 |
|
| 392 |
|
| 393 |
+
def process_batch(files):
|
| 394 |
+
"""Process multiple PDF files."""
|
| 395 |
+
if not files:
|
| 396 |
+
return "No files uploaded", pd.DataFrame()
|
| 397 |
+
|
| 398 |
+
results = []
|
| 399 |
+
all_tables = []
|
| 400 |
+
|
| 401 |
+
for file in files:
|
| 402 |
+
try:
|
| 403 |
+
tables = extract_tables_from_pdf(file)
|
| 404 |
+
results.append(f"✅ {os.path.basename(file)}: Found {len(tables)} table(s)")
|
| 405 |
+
|
| 406 |
+
for table in tables:
|
| 407 |
+
table['Source_File'] = os.path.basename(file)
|
| 408 |
+
all_tables.append(table)
|
| 409 |
+
except Exception as e:
|
| 410 |
+
results.append(f"❌ {os.path.basename(file)}: Error - {str(e)}")
|
| 411 |
+
|
| 412 |
+
if all_tables:
|
| 413 |
+
try:
|
| 414 |
+
combined = pd.concat(all_tables, ignore_index=True)
|
| 415 |
+
except:
|
| 416 |
+
combined = all_tables[0] if all_tables else pd.DataFrame()
|
| 417 |
+
else:
|
| 418 |
+
combined = pd.DataFrame()
|
| 419 |
+
|
| 420 |
+
return "\n".join(results), combined
|
| 421 |
+
|
| 422 |
+
|
| 423 |
# ============================================================================
|
| 424 |
# GRADIO UI
|
| 425 |
# ============================================================================
|
|
|
|
| 428 |
with gr.Blocks() as demo:
|
| 429 |
|
| 430 |
# Header
|
| 431 |
+
gr.Markdown("""
|
| 432 |
+
# π PDF Table Extractor & AR Aging Analyzer
|
| 433 |
+
|
| 434 |
+
Extract tables from PDFs, analyze AR aging reports, and export to CSV/Excel
|
|
|
|
| 435 |
""")
|
| 436 |
|
| 437 |
with gr.Tabs() as tabs:
|
|
|
|
| 439 |
# ================================================================
|
| 440 |
# TAB 1: Basic Extraction
|
| 441 |
# ================================================================
|
| 442 |
+
with gr.Tab("π Basic Extraction"):
|
| 443 |
gr.Markdown("""
|
| 444 |
### Quick PDF Analysis
|
| 445 |
Upload a PDF to extract text, metadata, and tables automatically.
|
|
|
|
| 449 |
with gr.Column(scale=1):
|
| 450 |
basic_pdf_input = gr.File(
|
| 451 |
label="Upload PDF",
|
| 452 |
+
file_types=[".pdf"]
|
|
|
|
| 453 |
)
|
| 454 |
+
basic_extract_btn = gr.Button("π Extract Content", variant="primary")
|
| 455 |
|
| 456 |
with gr.Column(scale=2):
|
| 457 |
basic_metadata = gr.Markdown(label="PDF Metadata")
|
|
|
|
| 469 |
basic_table = gr.Dataframe(
|
| 470 |
label="Extracted Table",
|
| 471 |
wrap=True,
|
| 472 |
+
max_height=400
|
| 473 |
)
|
| 474 |
|
| 475 |
with gr.Row():
|
|
|
|
| 500 |
# ================================================================
|
| 501 |
# TAB 2: Advanced Extraction
|
| 502 |
# ================================================================
|
| 503 |
+
with gr.Tab("βοΈ Advanced Extraction"):
|
| 504 |
gr.Markdown("""
|
| 505 |
### Advanced Table Extraction Settings
|
| 506 |
Fine-tune the extraction parameters for complex PDFs.
|
|
|
|
| 510 |
with gr.Column(scale=1):
|
| 511 |
adv_pdf_input = gr.File(
|
| 512 |
label="Upload PDF",
|
| 513 |
+
file_types=[".pdf"]
|
|
|
|
| 514 |
)
|
| 515 |
|
| 516 |
gr.Markdown("**Extraction Settings**")
|
|
|
|
| 562 |
adv_table = gr.Dataframe(
|
| 563 |
label="Extracted Table",
|
| 564 |
wrap=True,
|
| 565 |
+
max_height=500
|
| 566 |
)
|
| 567 |
|
| 568 |
with gr.Row():
|
|
|
|
| 593 |
# ================================================================
|
| 594 |
# TAB 3: AR Aging Analysis
|
| 595 |
# ================================================================
|
| 596 |
+
with gr.Tab("💰 AR Aging Analysis"):
|
| 597 |
gr.Markdown("""
|
| 598 |
### Accounts Receivable Aging Analysis
|
| 599 |
Upload an AR aging PDF report to extract, analyze, and visualize the data.
|
|
|
|
| 607 |
with gr.Column(scale=1):
|
| 608 |
ar_pdf_input = gr.File(
|
| 609 |
label="Upload AR Aging PDF",
|
| 610 |
+
file_types=[".pdf"]
|
|
|
|
| 611 |
)
|
| 612 |
|
| 613 |
ar_name_col = gr.Textbox(
|
|
|
|
| 622 |
info="Column names for aging buckets"
|
| 623 |
)
|
| 624 |
|
| 625 |
+
ar_analyze_btn = gr.Button("π Analyze AR Aging", variant="primary")
|
| 626 |
|
| 627 |
with gr.Column(scale=2):
|
| 628 |
ar_summary = gr.Markdown(label="Summary Statistics")
|
|
|
|
| 632 |
ar_table = gr.Dataframe(
|
| 633 |
label="AR Aging Summary by Customer",
|
| 634 |
wrap=True,
|
| 635 |
+
max_height=400
|
| 636 |
)
|
| 637 |
|
| 638 |
gr.Markdown("### π Visualizations")
|
|
|
|
| 672 |
# ================================================================
|
| 673 |
# TAB 4: Batch Processing
|
| 674 |
# ================================================================
|
| 675 |
+
with gr.Tab("π Batch Processing"):
|
| 676 |
gr.Markdown("""
|
| 677 |
### Process Multiple PDFs
|
| 678 |
Upload multiple PDF files to extract tables from all of them at once.
|
|
|
|
| 681 |
batch_pdf_input = gr.File(
|
| 682 |
label="Upload Multiple PDFs",
|
| 683 |
file_types=[".pdf"],
|
| 684 |
+
file_count="multiple"
|
|
|
|
| 685 |
)
|
| 686 |
|
| 687 |
batch_process_btn = gr.Button("π Process All PDFs", variant="primary")
|
|
|
|
| 694 |
batch_combined_table = gr.Dataframe(
|
| 695 |
label="Combined Data (All Tables)",
|
| 696 |
wrap=True,
|
| 697 |
+
max_height=400
|
| 698 |
)
|
| 699 |
|
| 700 |
with gr.Row():
|
| 701 |
batch_csv_btn = gr.Button("📥 Export Combined to CSV")
|
| 702 |
batch_csv_output = gr.File(label="CSV Download")
|
| 703 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 704 |
batch_process_btn.click(
|
| 705 |
fn=process_batch,
|
| 706 |
inputs=[batch_pdf_input],
|
|
|
|
| 716 |
# ================================================================
|
| 717 |
# TAB 5: Help & Documentation
|
| 718 |
# ================================================================
|
| 719 |
+
with gr.Tab("β Help"):
|
| 720 |
gr.Markdown("""
|
| 721 |
## π Documentation & Tips
|
| 722 |
|
|
|
|
| 774 |
- Scanned PDFs (images) are not supported - use OCR tools first
|
| 775 |
- Very complex table layouts may require manual adjustment
|
| 776 |
- Password-protected PDFs are not supported
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
""")
|
| 778 |
|
| 779 |
# Footer
|
| 780 |
+
gr.Markdown("""
|
| 781 |
+
---
|
| 782 |
+
Built with ❤️ using Gradio & pdfplumber | [pdfplumber docs](https://github.com/jsvine/pdfplumber)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
""")
|
| 784 |
|
| 785 |
|