Spaces:

sadickam
/

PDF-text-extra

Running

App Files Files Community

sadickam committed on Oct 15, 2024

Commit

c0ce244

verified ·

1 Parent(s): de88355

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -19

app.py CHANGED Viewed

@@ -1,45 +1,58 @@
 import gradio as gr
 from langchain_community.document_loaders import UnstructuredFileLoader
 def extract_text_with_langchain_pdf(pdf_file):
     """Extract the text of a PDF, with a page banner before each loaded document.

     Args:
         pdf_file: Path to the PDF on disk. A falsy value (``None`` or ``""``,
             e.g. when nothing was uploaded) yields an empty string.

     Returns:
         One string in which every document returned by the loader is rendered
         as ``--- Page N ---`` followed by its stripped text.
     """
     if not pdf_file:
         # Nothing to load; do not hand an empty path to the loader.
         return ""
     loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
     documents = loader.load()
     # Collect the pieces and join once instead of growing a string with +=.
     sections = []
     for doc in documents:
         # Falls back to "Unknown" when the metadata carries no page number.
         page_num = doc.metadata.get("page_number", "Unknown")
         sections.append(f"\n\n--- Page {page_num} ---\n{doc.page_content.strip()}\n")
     return "".join(sections)
def save_text_to_file(text, output_filename="extracted_content.txt"):
    """Write *text* to a UTF-8 encoded file and return the path written.

    Args:
        text: The string to store.
        output_filename: Destination path. Missing parent directories are
            created so a nested target does not fail with FileNotFoundError.

    Returns:
        ``output_filename``, unchanged, so it can be handed to a download widget.
    """
    import os

    parent_dir = os.path.dirname(output_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
    return output_filename
 # Build the UI: title, PDF upload, trigger button and a file slot for the result.
 with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("# PDF Text Extractor with Page Numbers")
     with gr.Row():
         pdf_file = gr.File(label="Upload PDF", type="filepath")
     with gr.Row():
         extract_button = gr.Button("Extract and Download Text")
     with gr.Row():
         download_button = gr.File(label="Download Extracted Text")
     def on_extract(pdf_file):
         """Extract the uploaded PDF's text and return the path of the .txt file.

         Raises:
             gr.Error: If the button is clicked before a PDF was uploaded.
         """
         if not pdf_file:
             # Show a readable message in the UI instead of a loader traceback.
             raise gr.Error("Please upload a PDF file before extracting.")
         extracted_text = extract_text_with_langchain_pdf(pdf_file)
         txt_path = save_text_to_file(extracted_text)
         return txt_path
     extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 # Launch the Gradio app
 demo.queue().launch()

 import gradio as gr
+import pandas as pd
 from langchain_community.document_loaders import UnstructuredFileLoader
 def extract_text_with_langchain_pdf(pdf_file):
     """Extract a PDF into a DataFrame with one row per non-empty paragraph.

     Args:
         pdf_file: Path to the PDF on disk. A falsy value (``None`` or ``""``,
             e.g. when nothing was uploaded) yields an empty frame.

     Returns:
         A ``pandas.DataFrame`` with the columns ``Document`` (file name),
         ``Page`` (page number from the loader metadata, or ``"Unknown"``) and
         ``Paragraph`` (stripped paragraph text). The columns are present even
         when no paragraph was found, so a CSV export always has a header.
     """
     import os

     columns = ["Document", "Page", "Paragraph"]
     if not pdf_file:
         # Nothing to load; do not hand an empty path to the loader.
         return pd.DataFrame(columns=columns)
     loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
     documents = loader.load()
     # basename handles the platform's separator and path-like objects,
     # unlike splitting on "/".
     doc_name = os.path.basename(pdf_file)
     extracted_data = []
     for doc in documents:
         # Falls back to "Unknown" when the metadata carries no page number.
         page_num = doc.metadata.get("page_number", "Unknown")
         # Paragraphs are separated by a blank line in the extracted text.
         for paragraph in doc.page_content.split("\n\n"):
             cleaned = paragraph.strip()
             if cleaned:  # Skip empty paragraphs
                 extracted_data.append({
                     "Document": doc_name,
                     "Page": page_num,
                     "Paragraph": cleaned,
                 })
     # Passing the columns explicitly keeps the schema stable for empty results.
     return pd.DataFrame(extracted_data, columns=columns)
def save_df_to_csv(df, output_filename="extracted_content.csv"):
    """Write *df* to a CSV file without the index column and return the path.

    Args:
        df: The DataFrame to export.
        output_filename: Destination path. Missing parent directories are
            created so a nested target does not fail with an OSError.

    Returns:
        ``output_filename``, unchanged, so it can be handed to a download widget.
    """
    import os

    parent_dir = os.path.dirname(output_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(output_filename, index=False)
    return output_filename
 # Build the UI: title, PDF upload, trigger button and a file slot for the result.
 with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
     with gr.Row():
         pdf_file = gr.File(label="Upload PDF", type="filepath")
     with gr.Row():
         extract_button = gr.Button("Extract and Download CSV")
     with gr.Row():
         download_button = gr.File(label="Download Extracted CSV")
     def on_extract(pdf_file):
         """Extract the uploaded PDF into a DataFrame and return the CSV path.

         Raises:
             gr.Error: If the button is clicked before a PDF was uploaded.
         """
         if not pdf_file:
             # Show a readable message in the UI instead of a loader traceback.
             raise gr.Error("Please upload a PDF file before extracting.")
         df = extract_text_with_langchain_pdf(pdf_file)
         csv_path = save_df_to_csv(df)
         return csv_path
     extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 # Launch the Gradio app
 demo.queue().launch()