Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -8,22 +8,24 @@ from PIL import Image
|
|
| 8 |
|
| 9 |
|
| 10 |
def extract_text(file_path):
|
|
|
|
| 11 |
with open(file_path, "rb") as pdf_file:
|
| 12 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 13 |
num_pages = len(pdf_reader.pages)
|
| 14 |
|
| 15 |
for page_number in range(num_pages):
|
| 16 |
-
st.write(f"Page {page_number + 1}")
|
| 17 |
page = pdf_reader.pages[page_number]
|
| 18 |
text = page.extract_text()
|
| 19 |
|
| 20 |
images = convert_from_path(file_path) # Convert PDF pages to images
|
| 21 |
for i, image in enumerate(images):
|
| 22 |
-
st.write(f"Page {i + 1}")
|
| 23 |
image_text = pytesseract.image_to_string(image)
|
| 24 |
-
|
| 25 |
-
text
|
| 26 |
-
|
|
|
|
| 27 |
|
| 28 |
def main():
|
| 29 |
st.title("PDF Text Extractor")
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
def extract_text(file_path):
|
| 11 |
+
text = ''
|
| 12 |
with open(file_path, "rb") as pdf_file:
|
| 13 |
pdf_reader = PyPDF2.PdfReader(pdf_file)
|
| 14 |
num_pages = len(pdf_reader.pages)
|
| 15 |
|
| 16 |
for page_number in range(num_pages):
|
| 17 |
+
# st.write(f"Page {page_number + 1}")
|
| 18 |
page = pdf_reader.pages[page_number]
|
| 19 |
text = page.extract_text()
|
| 20 |
|
| 21 |
images = convert_from_path(file_path) # Convert PDF pages to images
|
| 22 |
for i, image in enumerate(images):
|
| 23 |
+
# st.write(f"Page {i + 1}")
|
| 24 |
image_text = pytesseract.image_to_string(image)
|
| 25 |
+
|
| 26 |
+
text = text + image_text
|
| 27 |
+
|
| 28 |
+
st.write(text) # Display the extracted text from the image
|
| 29 |
|
| 30 |
def main():
|
| 31 |
st.title("PDF Text Extractor")
|