Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import os | |
| import pandas as pd | |
| import pdfplumber | |
| import gradio as gr | |
| import time | |
| from pathlib import Path | |
| import shutil | |
# Function to extract content from a single PDF
def extract_pdf_content(file_path):
    """Extract text, tables and embedded images from a single PDF.

    Text and image data are read with PyMuPDF (``fitz``); tables are read
    with ``pdfplumber``. Every extracted image is written to the
    ``temp_images`` directory, which is created on demand.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A 3-tuple ``(text, tables, image_paths)``:

        * ``text`` - the text of every page, each prefixed with a
          ``--- Page N ---`` header, joined with newlines.
        * ``tables`` - a list with one ``pandas.DataFrame`` per table
          found, in page order.
        * ``image_paths`` - paths of the image files written to disk.
    """
    # Ensure images directory exists; exist_ok avoids the check-then-create
    # race when two calls run at the same time.
    images_dir = "temp_images"
    os.makedirs(images_dir, exist_ok=True)

    # Store extracted content
    all_text = []
    all_tables = []
    images_list = []
    image_paths = []

    # Open each reader exactly once (pdfplumber used to be re-opened for every
    # page) and let the context managers close both even if extraction raises.
    with fitz.open(file_path) as pdf_file, pdfplumber.open(file_path) as plumber_pdf:
        # Extract text, tables, and images page by page
        for page_num, page_content in enumerate(pdf_file):
            # Extract text
            text = page_content.get_text("text")
            all_text.append(f"--- Page {page_num + 1} ---\n{text}")

            # Extract tables using pdfplumber
            tables = plumber_pdf.pages[page_num].extract_tables()
            all_tables.extend(pd.DataFrame(table) for table in tables)

            # Collect image references; the bytes are pulled out below
            images_list.extend(page_content.get_images(full=True))

        # Save extracted images while the document is still open
        for i, image in enumerate(images_list, start=1):
            xref = image[0]  # first item of each image entry is its xref
            base_image = pdf_file.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            # Timestamp plus running index keeps names unique across calls
            image_name = f"{images_dir}/image_{time.time()}_{i}.{image_ext}"
            image_paths.append(image_name)
            with open(image_name, "wb") as image_file:
                image_file.write(image_bytes)

    return "\n".join(all_text), all_tables, image_paths
# Function to handle multiple PDFs
def process_multiple_pdfs(files, progress=gr.Progress()):
    """Extract content from several uploaded PDFs and merge the results.

    Args:
        files: Uploaded files. Each item is either an object exposing the
            file path as ``.name`` or a plain path string. May be ``None``
            or empty when nothing is selected.
        progress: Gradio progress tracker, called once per file with the
            completed fraction and a description.

    Returns:
        A 3-tuple ``(text, table_html, image_paths)``:

        * ``text`` - the text of every file, each prefixed with a
          ``=== File: <name> ===`` header, joined with newlines.
        * ``table_html`` - every extracted table rendered as HTML, each
          preceded by a ``<h3>Table N</h3>`` heading.
        * ``image_paths`` - paths of all extracted image files.
    """
    # Nothing selected (e.g. the upload was cleared): return empty outputs
    # instead of failing on len(None).
    if not files:
        return "", "", []

    aggregated_text = []
    aggregated_tables = []
    aggregated_images = []
    total_files = len(files)

    for idx, file in enumerate(files):
        # Accept both upload objects (path in .name) and plain path strings
        file_path = getattr(file, "name", file)
        progress(idx / total_files, desc=f"Processing PDF {idx + 1}/{total_files}")
        text, tables, images = extract_pdf_content(file_path)
        aggregated_text.append(f"=== File: {Path(file_path).name} ===\n{text}")
        aggregated_tables.extend(tables)
        aggregated_images.extend(images)

    # Convert tables to HTML; the CSS classes are styled by custom_css
    table_html = "".join(
        f"<h3>Table {number}</h3>"
        + table.to_html(index=False, border=1, classes="table table-striped table-bordered")
        for number, table in enumerate(aggregated_tables, start=1)
    )

    # Return outputs
    return "\n".join(aggregated_text), table_html, aggregated_images
# Custom CSS passed to gr.Blocks(css=...) below.
# - .table / .table-striped / .table-bordered style the HTML tables, which
#   process_multiple_pdfs renders with exactly these class names.
# - .scrollable and .center are attached to components via elem_classes.
# - The remaining selectors (.gradio-container, .gallery, .row, .column) are
#   not referenced by name in this file; presumably they target markup that
#   Gradio itself generates - TODO confirm.
custom_css = """
.gradio-container {
max-width: 1200px;
margin: auto;
}
.table {
width: 100%;
margin-bottom: 1rem;
color: #212529;
}
.table-striped tbody tr:nth-of-type(odd) {
background-color: rgba(0, 0, 0, 0.05);
}
.table-bordered {
border: 1px solid #dee2e6;
}
.table-bordered th,
.table-bordered td {
border: 1px solid #dee2e6;
}
.gallery {
display: flex;
flex-wrap: wrap;
gap: 10px;
}
.gallery img {
max-width: 100%;
height: auto;
border-radius: 5px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
}
.scrollable {
max-height: 400px; /* Fixed height for vertical scrolling */
max-width: 100%; /* Ensure the width is constrained */
overflow-y: auto; /* Enable vertical scrolling */
overflow-x: auto; /* Enable horizontal scrolling */
white-space: pre-wrap; /* Preserve whitespace and wrap text */
word-wrap: break-word; /* Break long words if necessary */
border: 1px solid #ddd;
padding: 10px;
border-radius: 5px;
}
.row {
display: flex;
gap: 20px;
margin-bottom: 20px;
}
.column {
flex: 1;
}
.center {
text-align: center;
margin: auto;
width: 80%;
}
"""
# Create Gradio Interface
# NOTE(review): the original indentation was lost; the nesting below (text and
# image columns sharing one row, tables in a row of their own, the event
# binding inside the Blocks context) is reconstructed - confirm against the
# deployed layout.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Advanced PDF Content Extractor")
    # Upload widget: accepts several files, restricted to the .pdf extension
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Text")
            # Read-only text area for the concatenated page text
            text_output = gr.Textbox(
                label="Text",
                lines=15,
                interactive=False,
                elem_classes="scrollable" # Apply scrollable class
            )
        with gr.Column():
            gr.Markdown("### Extracted Images")
            # Gallery fed with the image file paths returned by the handler
            image_gallery = gr.Gallery(
                label="Images",
                columns=4,
                height="auto",
                elem_classes="scrollable"
            )
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Extracted Tables")
            # Raw HTML container for the rendered tables
            table_output = gr.HTML(
                label="Tables",
                elem_classes="scrollable center"
            )
    # Main function call: re-run extraction whenever the upload changes.
    # The order of `outputs` matches the handler's return tuple
    # (text, table HTML, image paths).
    pdf_input.change(
        fn=process_multiple_pdfs,
        inputs=pdf_input,
        outputs=[text_output, table_output, image_gallery]
    )
# Launch the Gradio app
demo.launch()