Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 16, 2024

Commit

ba8b960

verified ·

1 Parent(s): 49bcd81

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -63

app.py CHANGED Viewed

@@ -4,6 +4,11 @@ import io
 import tempfile
 import os
 from langchain_community.document_loaders import PyPDFLoader
 # Create a temporary directory for storing download files
 temp_dir = tempfile.TemporaryDirectory()
@@ -11,14 +16,16 @@ temp_dir = tempfile.TemporaryDirectory()
 def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
     """
     Extract text from a PDF page by page using LangChain's PyPDFLoader.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         start_page (int, optional): The starting page number for extraction (1-based index).
         end_page (int, optional): The ending page number for extraction (1-based index).
     Returns:
-        tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
     """
     try:
         # Initialize the loader
@@ -49,29 +56,37 @@ def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=Non
             start_page = 1
             end_page = total_pages
-        # Concatenate selected page contents into a single string
-        pdf_pages_content = '\n'.join(doc.page_content for doc in selected_docs)
-        extracted_data = []
         for idx, doc in enumerate(selected_docs, start=start_page):
-            # Assign the actual page number
             page_num = idx
-            # Split content into paragraphs
-            paragraphs = doc.page_content.split("\n\n")  # Split into paragraphs
-            for paragraph in paragraphs:
-                clean_para = paragraph.strip()
-                if clean_para:
-                    extracted_data.append({
                         "Document": doc_name,
                         "Page": page_num,
-                        "Paragraph": clean_para
                     })
-        df = pd.DataFrame(extracted_data)
-        return df, pdf_pages_content
     except Exception as e:
         raise RuntimeError(f"Error during PDF extraction: {e}")
@@ -95,34 +110,21 @@ def df_to_csv_bytes(df):
     except Exception as e:
         raise RuntimeError(f"Error during CSV conversion: {e}")
-def text_to_txt_bytes(text):
-    """
-    Convert text to TXT in bytes.
-    Args:
-        text (str): The text to convert.
-    Returns:
-        bytes: TXT data in bytes.
-    """
-    try:
-        txt_data = text.encode('utf-8')
-        return txt_data
-    except Exception as e:
-        raise RuntimeError(f"Error during TXT conversion: {e}")
 def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
     """
-    Callback function to extract text from PDF and return CSV and TXT data.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         extraction_mode (str): "All Pages" or "Range of Pages".
         start_page (float): Starting page number for extraction.
         end_page (float): Ending page number for extraction.
     Returns:
-        tuple: Paths to CSV and TXT files, Status message.
     """
     if not pdf_file_path:
         return None, None, "No file uploaded."
@@ -136,37 +138,36 @@ def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
             selected_start = start_page
             selected_end = end_page
-        # Extract text and create DataFrame
-        df, full_text = extract_text_with_py_pdf_loader(
             pdf_file_path,
             start_page=selected_start,
             end_page=selected_end
         )
-        # Convert DataFrame to CSV bytes
-        csv_bytes = df_to_csv_bytes(df)
-        csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_extracted.csv"
-        # Convert full text to TXT bytes
-        txt_bytes = text_to_txt_bytes(full_text)
-        txt_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_full_text.txt"
         # Define full paths within the temporary directory
-        csv_tmp_path = os.path.join(temp_dir.name, csv_filename)
-        txt_tmp_path = os.path.join(temp_dir.name, txt_filename)
-        # Write CSV bytes to temporary file
-        with open(csv_tmp_path, 'wb') as csv_tmp:
-            csv_tmp.write(csv_bytes)
-        # Write TXT bytes to temporary file
-        with open(txt_tmp_path, 'wb') as txt_tmp:
-            txt_tmp.write(txt_bytes)
-        # Return the paths to the temporary files and a success message
         return (
-            csv_tmp_path,
-            txt_tmp_path,
             "Extraction successful!"
         )
     except Exception as e:
@@ -221,12 +222,12 @@ with gr.Blocks() as demo:
         extract_button = gr.Button("Extract and Download")
     with gr.Row():
-        csv_download = gr.File(
-            label="Download Extracted CSV",
             interactive=False
         )
-        txt_download = gr.File(
-            label="Download Full Text",
             interactive=False
         )
@@ -240,7 +241,7 @@ with gr.Blocks() as demo:
     extract_button.click(
         fn=on_extract,
         inputs=[pdf_input, extraction_mode, start_page, end_page],
-        outputs=[csv_download, txt_download, status_output]
     )
     gr.Markdown("""

 import tempfile
 import os
 from langchain_community.document_loaders import PyPDFLoader
+import nltk
+from nltk.tokenize import sent_tokenize
+# Download NLTK's punkt tokenizer if not already downloaded
+nltk.download('punkt')
 # Create a temporary directory for storing download files
 temp_dir = tempfile.TemporaryDirectory()
 def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
     """
     Extract text from a PDF page by page using LangChain's PyPDFLoader.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         start_page (int, optional): The starting page number for extraction (1-based index).
         end_page (int, optional): The ending page number for extraction (1-based index).
     Returns:
+        tuple:
+            - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
+            - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
     """
     try:
         # Initialize the loader
             start_page = 1
             end_page = total_pages
+        # Initialize lists to store data
+        page_data = []
+        sentence_data = []
         for idx, doc in enumerate(selected_docs, start=start_page):
             page_num = idx
+            text = doc.page_content.strip()
+            # Append page-wise data
+            page_data.append({
+                "Document": doc_name,
+                "Page": page_num,
+                "Text": text
+            })
+            # Sentence tokenization
+            sentences = sent_tokenize(text)
+            for sentence in sentences:
+                sentence = sentence.strip()
+                if sentence:
+                    sentence_data.append({
                         "Document": doc_name,
                         "Page": page_num,
+                        "Sentence": sentence
                     })
+        # Create DataFrames
+        page_df = pd.DataFrame(page_data)
+        sentence_df = pd.DataFrame(sentence_data)
+        return page_df, sentence_df
     except Exception as e:
         raise RuntimeError(f"Error during PDF extraction: {e}")
     except Exception as e:
         raise RuntimeError(f"Error during CSV conversion: {e}")
 def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
     """
+    Callback function to extract text from PDF and return CSV data.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         extraction_mode (str): "All Pages" or "Range of Pages".
         start_page (float): Starting page number for extraction.
         end_page (float): Ending page number for extraction.
     Returns:
+        tuple:
+            - page_csv_path (str): Path to the page-wise CSV file.
+            - sentence_csv_path (str): Path to the sentence-wise CSV file.
+            - status_message (str): Status of the extraction process.
     """
     if not pdf_file_path:
         return None, None, "No file uploaded."
             selected_start = start_page
             selected_end = end_page
+        # Extract text and create DataFrames
+        page_df, sentence_df = extract_text_with_py_pdf_loader(
             pdf_file_path,
             start_page=selected_start,
             end_page=selected_end
         )
+        # Convert DataFrames to CSV bytes
+        page_csv_bytes = df_to_csv_bytes(page_df)
+        sentence_csv_bytes = df_to_csv_bytes(sentence_df)
+        # Define CSV filenames
+        page_csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_pages.csv"
+        sentence_csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_sentences.csv"
         # Define full paths within the temporary directory
+        page_csv_path = os.path.join(temp_dir.name, page_csv_filename)
+        sentence_csv_path = os.path.join(temp_dir.name, sentence_csv_filename)
+        # Write CSV bytes to temporary files
+        with open(page_csv_path, 'wb') as page_csv_file:
+            page_csv_file.write(page_csv_bytes)
+        with open(sentence_csv_path, 'wb') as sentence_csv_file:
+            sentence_csv_file.write(sentence_csv_bytes)
+        # Return the paths to the temporary CSV files and a success message
         return (
+            page_csv_path,
+            sentence_csv_path,
             "Extraction successful!"
         )
     except Exception as e:
         extract_button = gr.Button("Extract and Download")
     with gr.Row():
+        page_csv_download = gr.File(
+            label="Download Page-wise CSV",
             interactive=False
         )
+        sentence_csv_download = gr.File(
+            label="Download Sentence-wise CSV",
             interactive=False
         )
     extract_button.click(
         fn=on_extract,
         inputs=[pdf_input, extraction_mode, start_page, end_page],
+        outputs=[page_csv_download, sentence_csv_download, status_output]
     )
     gr.Markdown("""