Spaces:

sadickam
/

PDF-text-extra

Running

App Files Files Community

sadickam committed on Oct 16, 2024

Commit

e56c8a4

verified ·

1 Parent(s): 9847598

Update app.py

Browse files

Files changed (1) hide show

app.py +9 -9

app.py CHANGED Viewed

@@ -3,14 +3,14 @@ import pandas as pd
 import io
 import tempfile
 import os
-from langchain_community.document_loaders import UnstructuredPDFLoader
 # Create a temporary directory for storing download files
 temp_dir = tempfile.TemporaryDirectory()
-def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=None):
     """
-    Extract text from a PDF page by page using LangChain's UnstructuredPDFLoader.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
@@ -21,9 +21,9 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
         tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
     """
     try:
-        # Initialize the loader with split_pages=True to ensure each page is a separate document
-        loader = UnstructuredPDFLoader(pdf_file_path, split_pages=True)
-        documents = loader.load()
         total_pages = len(documents)
         doc_name = os.path.basename(pdf_file_path)  # Extract document name
@@ -54,9 +54,9 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
         extracted_data = []
-        for idx, doc in enumerate(selected_docs, start=1):
             # Assign the actual page number
-            page_num = start_page + idx - 1
             # Split content into paragraphs
             paragraphs = doc.page_content.split("\n\n")  # Split into paragraphs
@@ -137,7 +137,7 @@ def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
             selected_end = end_page
         # Extract text and create DataFrame
-        df, full_text = extract_text_with_langchain_pdf(
             pdf_file_path,
             start_page=selected_start,
             end_page=selected_end

 import io
 import tempfile
 import os
+from langchain.document_loaders import PyPDFLoader  # Updated import
 # Create a temporary directory for storing download files
 temp_dir = tempfile.TemporaryDirectory()
+def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
     """
+    Extract text from a PDF page by page using LangChain's PyPDFLoader.
     Args:
         pdf_file_path (str): The file path to the uploaded PDF.
         tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
     """
     try:
+        # Initialize the loader
+        loader = PyPDFLoader(pdf_file_path)
+        documents = loader.load_and_split()  # Each document corresponds to a single page
         total_pages = len(documents)
         doc_name = os.path.basename(pdf_file_path)  # Extract document name
         extracted_data = []
+        for idx, doc in enumerate(selected_docs, start=start_page):
             # Assign the actual page number
+            page_num = idx
             # Split content into paragraphs
             paragraphs = doc.page_content.split("\n\n")  # Split into paragraphs
             selected_end = end_page
         # Extract text and create DataFrame
+        df, full_text = extract_text_with_py_pdf_loader(
             pdf_file_path,
             start_page=selected_start,
             end_page=selected_end