kxx-kkk committed on
Commit
79de481
·
verified ·
1 Parent(s): e530c33

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -8,22 +8,24 @@ from PIL import Image
8
 
9
 
10
  def extract_text(file_path):
 
11
  with open(file_path, "rb") as pdf_file:
12
  pdf_reader = PyPDF2.PdfReader(pdf_file)
13
  num_pages = len(pdf_reader.pages)
14
 
15
  for page_number in range(num_pages):
16
- st.write(f"Page {page_number + 1}")
17
  page = pdf_reader.pages[page_number]
18
  text = page.extract_text()
19
 
20
  images = convert_from_path(file_path) # Convert PDF pages to images
21
  for i, image in enumerate(images):
22
- st.write(f"Page {i + 1}")
23
  image_text = pytesseract.image_to_string(image)
24
-
25
- text += image_text
26
- st.write(text) # Display the extracted text from the image
 
27
 
28
  def main():
29
  st.title("PDF Text Extractor")
 
8
 
9
 
10
  def extract_text(file_path):
11
+ text = ''
12
  with open(file_path, "rb") as pdf_file:
13
  pdf_reader = PyPDF2.PdfReader(pdf_file)
14
  num_pages = len(pdf_reader.pages)
15
 
16
  for page_number in range(num_pages):
17
+ # st.write(f"Page {page_number + 1}")
18
  page = pdf_reader.pages[page_number]
19
  text = page.extract_text()
20
 
21
  images = convert_from_path(file_path) # Convert PDF pages to images
22
  for i, image in enumerate(images):
23
+ # st.write(f"Page {i + 1}")
24
  image_text = pytesseract.image_to_string(image)
25
+
26
+ text = text + image_text
27
+
28
+ st.write(text) # Display the extracted text from the image
29
 
30
  def main():
31
  st.title("PDF Text Extractor")