Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 15, 2024

Commit

de88355

verified ·

1 Parent(s): d2db20e

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -57

app.py CHANGED Viewed

@@ -1,78 +1,45 @@
 import gradio as gr
-import pandas as pd
-import time
-from langchain_community.document_loaders import UnstructuredFileLoader  # Updated import
 def extract_text_with_langchain_pdf(pdf_file):
     """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
-    loader = UnstructuredFileLoader(pdf_file)  # Pass the filepath directly
     documents = loader.load()
-    # Collect text per page and return as a list of tuples (page_num, paragraph)
-    extracted_data = []
     for doc in documents:
-        page_num = doc.metadata.get("page_number", "Unknown")  # Extract page number if available
-        paragraphs = doc.page_content.split("\n\n")  # Split text by paragraphs
-        for para in paragraphs:
-            if para.strip():  # Skip empty paragraphs
-                extracted_data.append((page_num, para.strip()))
-    return extracted_data
-def process_pdf_with_batches(pdf_file, batch_size, wait_time):
-    """Extract text, split into batches, and store in a DataFrame."""
-    extracted_data = extract_text_with_langchain_pdf(pdf_file)
-    doc_name = pdf_file.split("/")[-1]
-    # Create a DataFrame from the extracted data
-    df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
-    df["Document"] = doc_name  # Add document name as a column
-    # Split the DataFrame into batches for display
-    batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]
-    output = []
-    for idx, batch in enumerate(batches):
-        output.append(f"Batch {idx + 1}:\n{batch.to_string(index=False)}")
-        time.sleep(wait_time)  # Wait between batches
-    return df, "\n\n".join(output)
-def save_csv(df):
-    """Save the extracted DataFrame to a CSV file."""
-    output_path = "extracted_content.csv"
-    df.to_csv(output_path, index=False)
-    return output_path
 with gr.Blocks() as demo:
     with gr.Row():
-        gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
     with gr.Row():
-        pdf_file = gr.File(label="Upload PDF", type="filepath")  # Updated type to 'filepath'
     with gr.Row():
-        batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
-        wait_time = gr.Slider(label="Wait Time (seconds)", value=2, minimum=0, maximum=10, step=0.5)
     with gr.Row():
-        extract_button = gr.Button("Extract and Save CSV")
-    with gr.Row():
-        output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
-        download_button = gr.File(label="Download Extracted CSV")
-    def on_extract(pdf_file, batch_size, wait_time):
-        """Callback function to extract text, display batches, and save CSV."""
-        df, batch_output = process_pdf_with_batches(pdf_file, int(batch_size), wait_time)
-        csv_path = save_csv(df)
-        return batch_output, csv_path
-    extract_button.click(
-        on_extract,
-        inputs=[pdf_file, batch_size, wait_time],
-        outputs=[output_text, download_button]
-    )
-# Launch the Gradio app
-demo.queue().launch()

 import gradio as gr
+from langchain_community.document_loaders import UnstructuredFileLoader
 def extract_text_with_langchain_pdf(pdf_file):
     """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
+    loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
     documents = loader.load()
+    # Concatenate the content from all pages with page numbers
+    pdf_content = ""
     for doc in documents:
+        page_num = doc.metadata.get("page_number", "Unknown")  # Get the page number if available
+        pdf_content += f"\n\n--- Page {page_num} ---\n{doc.page_content.strip()}\n"
+    return pdf_content
+def save_text_to_file(text, output_filename="extracted_content.txt"):
+    """Save extracted text to a .txt file."""
+    with open(output_filename, "w", encoding="utf-8") as f:
+        f.write(text)
+    return output_filename
 with gr.Blocks() as demo:
     with gr.Row():
+        gr.Markdown("# PDF Text Extractor with Page Numbers")
     with gr.Row():
+        pdf_file = gr.File(label="Upload PDF", type="filepath")
     with gr.Row():
+        extract_button = gr.Button("Extract and Download Text")
     with gr.Row():
+        download_button = gr.File(label="Download Extracted Text")
+    def on_extract(pdf_file):
+        """Callback function to extract text with page numbers and return a downloadable .txt file."""
+        extracted_text = extract_text_with_langchain_pdf(pdf_file)
+        txt_path = save_text_to_file(extracted_text)
+        return txt_path
+    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
+# Launch the Gradio app
+demo.queue().launch()