"""Streamlit app that OCRs scanned (image-only) PDFs into plain text."""

import os

import docx
import pdf2image
import pytesseract
import streamlit as st
from PIL import Image

# Default install location of the Tesseract binary on Windows.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Only point pytesseract at the Windows path when that binary actually
# exists; otherwise keep pytesseract's default so a `tesseract` found on
# PATH (Linux, macOS, custom installs) keeps working.
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD


def _read_pdf_bytes(pdf_file):
    """Return the raw bytes of *pdf_file* (bytes, file-like object, or path)."""
    if isinstance(pdf_file, (bytes, bytearray)):
        return bytes(pdf_file)
    # Prefer getvalue(): it returns the whole buffer regardless of the
    # current stream position, so it is safe across Streamlit reruns.
    if hasattr(pdf_file, "getvalue"):
        return pdf_file.getvalue()
    if hasattr(pdf_file, "read"):
        return pdf_file.read()
    with open(pdf_file, "rb") as f:
        return f.read()


def extract_text_from_image_pdf(pdf_file) -> str:
    """Extract text from a PDF by rendering its pages and running OCR.

    Args:
        pdf_file: A filesystem path to the PDF, the PDF content as bytes,
            or a binary file-like object (for example the object returned
            by ``st.file_uploader``).

    Returns:
        The OCR text of every page, in page order, each page followed by
        a newline. An empty string if the PDF yields no pages.
    """
    pdf_bytes = _read_pdf_bytes(pdf_file)

    # Render each PDF page to an image so Tesseract can read it.
    pages = pdf2image.convert_from_bytes(pdf_bytes)

    # Trailing newline after every page keeps page boundaries readable.
    return "".join(
        pytesseract.image_to_string(page) + "\n" for page in pages
    )


def main():
    """Run the Streamlit UI: upload a PDF, show its OCR text, offer a download."""
    # Title and description
    st.title("PDF to Text Converter")
    st.subheader("Convert your PDF images to editable text documents.")

    # Upload PDF file
    uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
    if uploaded_file is None:
        return

    # The upload lives in memory, not on disk, so hand over the file
    # object itself rather than its name.
    extracted_text = extract_text_from_image_pdf(uploaded_file)

    # Display extracted text
    st.success("Text extracted from PDF:")
    st.write(extracted_text)

    # Send the text to the user's browser instead of writing a file into
    # the server's working directory.
    downloaded = st.download_button(
        label="Download text as .txt file",
        data=extracted_text,
        file_name="extracted_text.txt",
        mime="text/plain",
    )
    if downloaded:
        st.success("Text downloaded!")


if __name__ == "__main__":
    main()