Spaces:

abakerdp
/

RAGtimeSearch

Sleeping

abakerdp committed on Nov 11, 2024

Commit

597fa2d

verified ·

1 Parent(s): 84f5641

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -34,7 +34,27 @@ index = pc.Index('pdf-index')
 def process_pdf(file):
     # Read PDF content
     pdf_path = file.name
     pdf_file = io.BytesIO(pdf_content)
     reader = PyPDF2.PdfReader(pdf_file)

 def process_pdf(file):
     # Read PDF content
+# Function to extract text from the PDF file using PyPDF2
+def process_pdf(file):
+    # Get the file path from the 'file' attribute (Gradio passes file as a temporary file)
     pdf_path = file.name
+    # Open the PDF file in read-binary mode
+    with open(pdf_path, 'rb') as f:
+        # Create a PdfReader object
+        pdf_reader = PyPDF2.PdfReader(f)
+        # Initialize an empty string to hold the extracted text
+        pdf_content = ""
+        # Loop through all pages in the PDF and extract text
+        for page_num in range(len(pdf_reader.pages)):
+            page = pdf_reader.pages[page_num]
+            pdf_content += page.extract_text()  # Extract text from each page
+    return pdf_content
     pdf_file = io.BytesIO(pdf_content)
     reader = PyPDF2.PdfReader(pdf_file)