abakerdp committed on
Commit
597fa2d
·
verified ·
1 Parent(s): 84f5641

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -0
app.py CHANGED
@@ -34,7 +34,27 @@ index = pc.Index('pdf-index')
34
 
35
  def process_pdf(file):
36
  # Read PDF content
 
 
 
 
37
  pdf_path = file.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  pdf_file = io.BytesIO(pdf_content)
39
  reader = PyPDF2.PdfReader(pdf_file)
40
 
 
34
 
35
  def process_pdf(file):
36
  # Read PDF content
37
+
38
+ # Function to extract text from the PDF file using PyPDF2
39
+ def process_pdf(file):
40
+ # Get the file path from the 'name' attribute of 'file' (Gradio passes the upload as a temporary file)
41
  pdf_path = file.name
42
+
43
+ # Open the PDF file in read-binary mode
44
+ with open(pdf_path, 'rb') as f:
45
+ # Create a PdfReader object
46
+ pdf_reader = PyPDF2.PdfReader(f)
47
+
48
+ # Initialize an empty string to hold the extracted text
49
+ pdf_content = ""
50
+
51
+ # Loop through all pages in the PDF and extract text
52
+ for page_num in range(len(pdf_reader.pages)):
53
+ page = pdf_reader.pages[page_num]
54
+ pdf_content += page.extract_text() # Extract text from each page
55
+
56
+ return pdf_content
57
+
58
  pdf_file = io.BytesIO(pdf_content)
59
  reader = PyPDF2.PdfReader(pdf_file)
60