Spaces:

Gopikanth123
/

pdf_content_extraction

Sleeping

App Files Files Community

Gopikanth123 committed on Mar 10, 2025

Commit

7eaea00

verified ·

1 Parent(s): da708c5

Create app.py

Browse files

Files changed (1) hide show

app.py +165 -0

app.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import fitz  # PyMuPDF
+import os
+import pandas as pd
+import pdfplumber
+import gradio as gr
+import time
+from pathlib import Path
+import shutil
+# Function to extract content from PDF
+def extract_pdf_content(file_path):
+    # Open the PDF
+    pdf_file = fitz.open(file_path)
+    page_nums = len(pdf_file)
+    # Ensure images directory exists
+    images_dir = "temp_images"
+    if os.path.exists(images_dir):
+        shutil.rmtree(images_dir)  # Clean up previous images
+    os.makedirs(images_dir)
+    # Store extracted content
+    all_text = []
+    all_tables = []
+    images_list = []
+    # Extract text, tables, and images
+    for page_num in range(page_nums):
+        page_content = pdf_file[page_num]
+        # Extract text
+        text = page_content.get_text("text")
+        all_text.append(f"--- Page {page_num + 1} ---\n{text}")
+        # Extract tables using pdfplumber
+        with pdfplumber.open(file_path) as pdf:
+            tables = pdf.pages[page_num].extract_tables()
+            for table in tables:
+                df = pd.DataFrame(table)
+                all_tables.append(df)
+        # Extract images
+        images_list.extend(page_content.get_images(full=True))
+    # Save extracted images
+    image_paths = []
+    if images_list:
+        for i, image in enumerate(images_list, start=1):
+            xref = image[0]
+            base_image = pdf_file.extract_image(xref)
+            image_bytes = base_image["image"]
+            image_ext = base_image["ext"]
+            image_name = f"{images_dir}/image_{i}.{image_ext}"
+            image_paths.append(image_name)
+            with open(image_name, "wb") as image_file:
+                image_file.write(image_bytes)
+    # Close the PDF file
+    pdf_file.close()
+    return "\n".join(all_text), all_tables, image_paths
+# Gradio Interface
+def display_pdf_content(file_path, progress=gr.Progress()):
+    # Extract content with progress updates
+    progress(0, desc="Starting extraction...")
+    time.sleep(1)
+    progress(0.25, desc="Extracting text...")
+    text, tables, images = extract_pdf_content(file_path)
+    progress(0.5, desc="Extracting tables...")
+    time.sleep(1)
+    progress(0.75, desc="Extracting images...")
+    time.sleep(1)
+    progress(1.0, desc="Extraction complete!")
+    # Convert tables to HTML with advanced styling
+    table_html = ""
+    for idx, table in enumerate(tables):
+        table_html += f"<h3>Table {idx + 1}</h3>"
+        table_html += table.to_html(index=False, border=1, classes="table table-striped table-bordered")
+    # Return outputs
+    return text, table_html, images
+# CSS handed to gr.Blocks(css=...): container width, the "table*" classes requested from DataFrame.to_html, and the "scrollable"/"center" classes attached via elem_classes. NOTE(review): ".gallery", ".row" and ".column" are not referenced by any elem_classes in this file - confirm they match Gradio's own markup.
+custom_css = """
+    .gradio-container {
+        max-width: 1200px;
+        margin: auto;
+    }
+    .table {
+        width: 100%;
+        margin-bottom: 1rem;
+        color: #212529;
+    }
+    .table-striped tbody tr:nth-of-type(odd) {
+        background-color: rgba(0, 0, 0, 0.05);
+    }
+    .table-bordered {
+        border: 1px solid #dee2e6;
+    }
+    .table-bordered th,
+    .table-bordered td {
+        border: 1px solid #dee2e6;
+    }
+    .gallery {
+        display: flex;
+        flex-wrap: wrap;
+        gap: 10px;
+    }
+    .gallery img {
+        max-width: 100%;
+        height: auto;
+        border-radius: 5px;
+        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+    }
+    .scrollable {
+        max-height: 400px;
+        overflow-y: auto;
+        border: 1px solid #ddd;
+        padding: 10px;
+        border-radius: 5px;
+    }
+    .row {
+        display: flex;
+        gap: 20px;
+        margin-bottom: 20px;
+    }
+    .column {
+        flex: 1;
+    }
+    .center {
+        text-align: center;
+        margin: auto;
+        width: 80%;
+    }
+"""
+# Create Gradio Interface
+with gr.Blocks(css=custom_css) as demo:
+    gr.Markdown("# Advanced PDF Content Extractor")
+    with gr.Row():
+        pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Extracted Text")
+            text_output = gr.Textbox(label="Text", lines=15, interactive=False, elem_classes="scrollable")
+        with gr.Column():
+            gr.Markdown("### Extracted Images")
+            image_gallery = gr.Gallery(label="Images", columns=4, height="auto", elem_classes="scrollable")
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Extracted Tables")
+            table_output = gr.HTML(label="Tables", elem_classes="scrollable center")
+    # Main function call
+    pdf_input.change(
+        fn=display_pdf_content,
+        inputs=pdf_input,
+        outputs=[text_output, table_output, image_gallery]
+    )
+# Launch the Gradio app
+demo.launch()