Spaces:
Sleeping
Sleeping
import os
import tempfile

import docx
import pdf2image
import pytesseract
import streamlit as st
from PIL import Image
# Point pytesseract at the usual Windows install location, but only when that
# executable actually exists. On any other machine the hard-coded path would
# break OCR, so pytesseract's existing command setting is left untouched.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD
def extract_text_from_image_pdf(pdf_file):
    """Extract text from a PDF by converting it to images and running OCR.

    Args:
        pdf_file: Either a filesystem path to the PDF, or a binary file-like
            object exposing ``read()`` (e.g. an in-memory upload buffer).

    Returns:
        The OCR text of every converted image, concatenated in the order the
        images were produced, with a newline appended after each image's
        text. An empty string when the conversion yields no images.
    """
    if hasattr(pdf_file, "read"):
        # Already an open binary stream: consume it directly.
        pdf_bytes = pdf_file.read()
    else:
        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()

    # Rasterize the PDF, then OCR each resulting image.
    page_images = pdf2image.convert_from_bytes(pdf_bytes)

    # join() avoids repeated string reallocation; the trailing newline per
    # image keeps the output identical to appending text + '\n' in a loop.
    return ''.join(
        pytesseract.image_to_string(page_image) + '\n'
        for page_image in page_images
    )
def main():
    """Streamlit app for converting PDF images to text.

    Renders the page title, a PDF uploader, the OCR result for the uploaded
    file, and a button that lets the user download the result as a .txt file.
    """
    # Title and description
    st.title("PDF to Text Converter")
    st.subheader("Convert your PDF images to editable text documents.")

    # Upload PDF file
    uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
    if uploaded_file is None:
        return

    # The upload is held in memory; uploaded_file.name is only the original
    # file name, not a path that can be opened here. Write the bytes to a
    # temporary file so the path-based extractor can read them.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with tmp:
            tmp.write(uploaded_file.getvalue())
        # The temp file is closed before it is reopened by the extractor,
        # because an open NamedTemporaryFile cannot be reopened on Windows.
        extracted_text = extract_text_from_image_pdf(tmp.name)
    finally:
        os.unlink(tmp.name)

    # Display extracted text
    st.success("Text extracted from PDF:")
    st.write(extracted_text)

    # Send the text to the user's browser instead of writing a file into the
    # server's working directory, which the user could never retrieve.
    if st.download_button(
        "Download text as .txt file",
        data=extracted_text,
        file_name="extracted_text.txt",
        mime="text/plain",
    ):
        st.success("Text downloaded!")
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()