Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 15, 2024

Commit

3e9e19b

verified ·

1 Parent(s): c6d5ac7

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -5

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import gradio as gr
 import pandas as pd
 import time
-from langchain.document_loaders import UnstructuredFileLoader
 import spaces
 def extract_text_with_langchain_pdf(pdf_file):
     """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
-    loader = UnstructuredFileLoader(pdf_file.name)
     documents = loader.load()
     # Collect text per page and return as a list of tuples (page_num, paragraph)
@@ -23,7 +23,7 @@ def extract_text_with_langchain_pdf(pdf_file):
 def process_pdf_with_batches(pdf_file, batch_size, wait_time):
     """Extract text, split into batches, and store in a DataFrame."""
     extracted_data = extract_text_with_langchain_pdf(pdf_file)
-    doc_name = pdf_file.name.split("/")[-1]
     # Create a DataFrame from the extracted data
     df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
@@ -50,7 +50,7 @@ with gr.Blocks() as demo:
         gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
     with gr.Row():
-        pdf_file = gr.File(label="Upload PDF", type="file")
     with gr.Row():
         batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
@@ -62,7 +62,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
         download_button = gr.File(label="Download Extracted CSV")
     @spaces.GPU
     def on_extract(pdf_file, batch_size, wait_time):
         """Callback function to extract text, display batches, and save CSV."""

 import gradio as gr
 import pandas as pd
 import time
+from langchain_community.document_loaders import UnstructuredFileLoader  # Updated import
 import spaces
 def extract_text_with_langchain_pdf(pdf_file):
     """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
+    loader = UnstructuredFileLoader(pdf_file)  # Pass the filepath directly
     documents = loader.load()
     # Collect text per page and return as a list of tuples (page_num, paragraph)
 def process_pdf_with_batches(pdf_file, batch_size, wait_time):
     """Extract text, split into batches, and store in a DataFrame."""
     extracted_data = extract_text_with_langchain_pdf(pdf_file)
+    doc_name = pdf_file.split("/")[-1]
     # Create a DataFrame from the extracted data
     df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
         gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
     with gr.Row():
+        pdf_file = gr.File(label="Upload PDF", type="filepath")  # Updated type to 'filepath'
     with gr.Row():
         batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
     with gr.Row():
         output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
         download_button = gr.File(label="Download Extracted CSV")
     @spaces.GPU
     def on_extract(pdf_file, batch_size, wait_time):
         """Callback function to extract text, display batches, and save CSV."""