# Spaces: Sleeping  (appears to be status text captured from the hosting page; not part of the program)
| import streamlit as st | |
| import tempfile | |
| import pytesseract | |
| import PyPDF2 | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
def extract_text(file_path):
    """Extract text from a PDF via its text layer and via OCR, then display it.

    The PDF is processed twice: PyPDF2 pulls the embedded text of every
    page, and pdf2image renders the pages to images that pytesseract OCRs.
    The OCR text is appended after the embedded text and the combined
    result is written to the Streamlit page.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The combined text (embedded text followed by OCR text).
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps a page that yields no text from breaking the join.
        embedded_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

    # Render every PDF page to an image and run OCR on each one.
    ocr_text = "".join(
        pytesseract.image_to_string(page_image)
        for page_image in convert_from_path(file_path)
    )

    text = embedded_text + ocr_text
    # NOTE(review): "plus" looks like leftover debug output; kept because it
    # is user-visible and removing it would change what the page shows.
    st.write("plus")
    st.write(text)
    return text
def _extract_uploaded_pdf(pdf_bytes):
    """Save `pdf_bytes` to a short-lived temp file and run `extract_text` on it."""
    import os  # local import so this block does not depend on file-level edits

    # The directory (and the PDF inside it) is removed when the block exits,
    # so no temporary files are left behind after extraction.
    with tempfile.TemporaryDirectory() as temp_dir:
        pdf_path = os.path.join(temp_dir, "upload.pdf")
        with open(pdf_path, "wb") as temp_file:
            temp_file.write(pdf_bytes)
        extract_text(pdf_path)


def main():
    """Render the uploader page and wire the extract button to the PDF.

    Shows a title and a PDF file uploader. Once a file is uploaded, a
    success message and an "Extract Text" button are displayed; clicking the
    button runs the extraction as an `on_click` callback.
    """
    st.title("PDF Text Extractor")
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    st.success("File successfully uploaded. Click below to extract text.")
    # Pass the raw bytes rather than a pre-written temp path: the script is
    # re-executed on every interaction, so writing a file here would create
    # a new, never-deleted temp file on each rerun. `getvalue()` returns the
    # whole upload regardless of the current stream position.
    st.button(
        "Extract Text",
        on_click=_extract_uploaded_pdf,
        args=(uploaded_file.getvalue(),),
    )
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()