Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 16, 2024

Commit

7ae30ee

verified ·

1 Parent(s): 7f934fa

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -10

app.py CHANGED Viewed

@@ -1,8 +1,13 @@
 import gradio as gr
 import pandas as pd
 import io
 from langchain_community.document_loaders import UnstructuredFileLoader
 def extract_text_with_langchain_pdf(pdf_file_path):
     """
     Extract text from a PDF page by page using LangChain's UnstructuredFileLoader.
@@ -18,7 +23,7 @@ def extract_text_with_langchain_pdf(pdf_file_path):
         documents = loader.load()
         extracted_data = []
-        doc_name = pdf_file_path.split("/")[-1]  # Extract document name
         # Concatenate all page contents into a single string
         pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
@@ -80,12 +85,12 @@ def text_to_txt_bytes(text):
 def on_extract(pdf_file_path):
     """
     Callback function to extract text from PDF and return CSV and TXT data.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
     Returns:
-        tuple: CSV download object, TXT download object, Status message.
     """
     if not pdf_file_path:
         return None, None, "No file uploaded."
@@ -96,16 +101,28 @@ def on_extract(pdf_file_path):
         # Convert DataFrame to CSV bytes
         csv_bytes = df_to_csv_bytes(df)
-        csv_filename = f"{pdf_file_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}_extracted.csv"
         # Convert full text to TXT bytes
         txt_bytes = text_to_txt_bytes(full_text)
-        txt_filename = f"{pdf_file_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}_full_text.txt"
-        # Return CSV and TXT files along with a success message
         return (
-            (csv_bytes, csv_filename),
-            (txt_bytes, txt_filename),
             "Extraction successful!"
         )
     except Exception as e:
@@ -118,7 +135,7 @@ with gr.Blocks() as demo:
         pdf_input = gr.File(
             label="Upload PDF",
             file_types=[".pdf"],
-            type="filepath",  # Using "filepath" as per Gradio's valid options
             interactive=True
         )

 import gradio as gr
 import pandas as pd
 import io
+import tempfile
+import os
 from langchain_community.document_loaders import UnstructuredFileLoader
+# Create a temporary directory for storing download files
+temp_dir = tempfile.TemporaryDirectory()
 def extract_text_with_langchain_pdf(pdf_file_path):
     """
     Extract text from a PDF page by page using LangChain's UnstructuredFileLoader.
         documents = loader.load()
         extracted_data = []
+        doc_name = os.path.basename(pdf_file_path)  # Extract document name
         # Concatenate all page contents into a single string
         pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
 def on_extract(pdf_file_path):
     """
     Callback function to extract text from PDF and return CSV and TXT data.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
     Returns:
+        tuple: Paths to CSV and TXT files, Status message.
     """
     if not pdf_file_path:
         return None, None, "No file uploaded."
         # Convert DataFrame to CSV bytes
         csv_bytes = df_to_csv_bytes(df)
+        csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_extracted.csv"
         # Convert full text to TXT bytes
         txt_bytes = text_to_txt_bytes(full_text)
+        txt_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_full_text.txt"
+        # Define full paths within the temporary directory
+        csv_tmp_path = os.path.join(temp_dir.name, csv_filename)
+        txt_tmp_path = os.path.join(temp_dir.name, txt_filename)
+        # Write CSV bytes to temporary file
+        with open(csv_tmp_path, 'wb') as csv_tmp:
+            csv_tmp.write(csv_bytes)
+        # Write TXT bytes to temporary file
+        with open(txt_tmp_path, 'wb') as txt_tmp:
+            txt_tmp.write(txt_bytes)
+        # Return the paths to the temporary files and a success message
         return (
+            csv_tmp_path,
+            txt_tmp_path,
             "Extraction successful!"
         )
     except Exception as e:
         pdf_input = gr.File(
             label="Upload PDF",
             file_types=[".pdf"],
+            type="filepath",  # Ensure type is set to "filepath"
             interactive=True
         )