Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 16, 2024

Commit

9f42776

verified ·

1 Parent(s): ff0f9fc

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -65

app.py CHANGED Viewed

@@ -1,83 +1,58 @@
 import gradio as gr
 import pandas as pd
-import time
-import os
-from langchain_community.document_loaders import UnstructuredPDFLoader
-from PyPDF2 import PdfReader
-def extract_text_by_page(pdf_file_path):
-    """Extract text from each page of the PDF and return as a list of dictionaries."""
-    # Initialize PDF reader
-    reader = PdfReader(pdf_file_path)
-    num_pages = len(reader.pages)
-    doc_name = os.path.basename(pdf_file_path)
     extracted_data = []
-    for page_num in range(1, num_pages + 1):
-        print(f"Processing Page {page_num}...")
-        # Initialize the loader for the specific page
-        loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1])  # Zero-based indexing
-        documents = loader.load()
-        if not documents:
-            print(f"No content found on Page {page_num}.")
-            continue
-        for doc in documents:
-            paragraphs = doc.page_content.split("\n\n")  # Split text into paragraphs
-            for para in paragraphs:
-                if para.strip():  # Skip empty paragraphs
-                    extracted_data.append({
-                        "Document": doc_name,
-                        "Page": page_num,
-                        "Paragraph": para.strip()
-                    })
-        time.sleep(1)  # Optional: Introduce a small delay between pages
-    return extracted_data
-def save_to_csv(data, output_filename="extracted_content.csv"):
-    """Save extracted data to a CSV file."""
-    df = pd.DataFrame(data)
     df.to_csv(output_filename, index=False)
     return output_filename
-def extract_and_save(pdf_file):
-    """Main function to extract text and save to CSV."""
-    if pdf_file is None:
-        return "No file uploaded."
-    # Extract text by page
-    extracted_data = extract_text_by_page(pdf_file.name)
-    if not extracted_data:
-        return "No text extracted from the PDF."
-    # Save to CSV
-    csv_path = save_to_csv(extracted_data)
-    return csv_path
-# Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("# PDF Text Extractor with Page Tracking and CSV Export")
     with gr.Row():
-        pdf_input = gr.File(label="Upload PDF", type="filepath")
     with gr.Row():
         extract_button = gr.Button("Extract and Download CSV")
     with gr.Row():
-        download_csv = gr.File(label="Download Extracted CSV")
-    extract_button.click(
-        fn=extract_and_save,
-        inputs=pdf_input,
-        outputs=download_csv
-    )
 # Launch the Gradio app
 demo.queue().launch()

 import gradio as gr
 import pandas as pd
+from langchain_community.document_loaders import UnstructuredFileLoader
+def extract_text_with_langchain_pdf(pdf_file):
+    """Extract text from a PDF page by page using LangChain's UnstructuredFileLoader."""
+    loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
+    documents = loader.load()
+    # Initialize an empty list to collect all extracted paragraphs
     extracted_data = []
+    # Extract content for each page, split into paragraphs, and collect metadata
+    doc_name = pdf_file.split("/")[-1]  # Get the document name
+    for doc in documents:
+        page_num = doc.metadata.get("page_number", "Unknown")  # Get the page number if available
+        paragraphs = doc.page_content.split("\n\n")  # Split content by paragraphs
+        for paragraph in paragraphs:
+            if paragraph.strip():  # Skip empty paragraphs
+                extracted_data.append({
+                    "Document": doc_name,
+                    "Page": page_num,
+                    "Paragraph": paragraph.strip()
+                })
+    # Convert the extracted data to a DataFrame
+    df = pd.DataFrame(extracted_data)
+    return df
+def save_df_to_csv(df, output_filename="extracted_content.csv"):
+    """Save the DataFrame to a CSV file."""
     df.to_csv(output_filename, index=False)
     return output_filename
 with gr.Blocks() as demo:
     with gr.Row():
+        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
+    with gr.Row():
+        pdf_file = gr.File(label="Upload PDF", type="filepath")
     with gr.Row():
         extract_button = gr.Button("Extract and Download CSV")
     with gr.Row():
+        download_button = gr.File(label="Download Extracted CSV")
+    def on_extract(pdf_file):
+        """Callback function to extract text, store in a DataFrame, and return a downloadable CSV."""
+        df = extract_text_with_langchain_pdf(pdf_file)
+        csv_path = save_df_to_csv(df)
+        return csv_path
+    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 # Launch the Gradio app
 demo.queue().launch()