Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,7 +34,27 @@ index = pc.Index('pdf-index')
|
|
| 34 |
|
| 35 |
def process_pdf(file):
|
| 36 |
# Read PDF content
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
pdf_path = file.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
pdf_file = io.BytesIO(pdf_content)
|
| 39 |
reader = PyPDF2.PdfReader(pdf_file)
|
| 40 |
|
|
|
|
| 34 |
|
| 35 |
def process_pdf(file):
|
| 36 |
# Read PDF content
|
| 37 |
+
|
| 38 |
+
# Function to extract text from the PDF file using PyPDF2
|
| 39 |
+
def process_pdf(file):
|
| 40 |
+
# Get the file path from the 'file' attribute (Gradio passes file as a temporary file)
|
| 41 |
pdf_path = file.name
|
| 42 |
+
|
| 43 |
+
# Open the PDF file in read-binary mode
|
| 44 |
+
with open(pdf_path, 'rb') as f:
|
| 45 |
+
# Create a PdfReader object
|
| 46 |
+
pdf_reader = PyPDF2.PdfReader(f)
|
| 47 |
+
|
| 48 |
+
# Initialize an empty string to hold the extracted text
|
| 49 |
+
pdf_content = ""
|
| 50 |
+
|
| 51 |
+
# Loop through all pages in the PDF and extract text
|
| 52 |
+
for page_num in range(len(pdf_reader.pages)):
|
| 53 |
+
page = pdf_reader.pages[page_num]
|
| 54 |
+
pdf_content += page.extract_text() # Extract text from each page
|
| 55 |
+
|
| 56 |
+
return pdf_content
|
| 57 |
+
|
| 58 |
pdf_file = io.BytesIO(pdf_content)
|
| 59 |
reader = PyPDF2.PdfReader(pdf_file)
|
| 60 |
|