"""Streamlit app that OCRs an uploaded PDF and offers the text for download.

Each page of the PDF is rasterised with pdf2image and passed to Tesseract
(via pytesseract); the recognised text is shown in the page and exposed as a
downloadable ``.txt`` link.
"""

import base64
import html
import os
import tempfile

import pytesseract
import streamlit as st
from pdf2image import convert_from_path

# Set page config (must run before any other Streamlit command).
st.set_page_config(page_title="PDF Text Extractor", layout="wide")


def pdf_to_images(pdf_path, dpi=300):
    """Convert every page of a PDF file into an image.

    Args:
        pdf_path: Filesystem path of the PDF to rasterise.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for the file
        (a list with one image per page).
    """
    return convert_from_path(pdf_path, dpi=dpi)


def extract_text_from_images(images, progress_bar):
    """Run OCR on each image and return the combined, page-labelled text.

    Args:
        images: Sized sequence of page images accepted by
            ``pytesseract.image_to_string``.
        progress_bar: Object exposing ``progress(fraction)``; it is updated
            after every page with the completed fraction in ``(0, 1]``.

    Returns:
        A single string made of ``"Page N:\\n<text>\\n\\n"`` sections, one per
        image, in order. Empty when ``images`` is empty.
    """
    total = len(images)
    sections = []
    for page_number, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image)
        sections.append(f"Page {page_number}:\n{text}\n\n")
        progress_bar.progress(page_number / total)
    # Join once instead of repeatedly concatenating onto a growing string.
    return "".join(sections)


def get_download_link(text, filename, link_text):
    """Build an HTML anchor that downloads ``text`` as a plain-text file.

    The content is embedded in the link itself as a base64 ``data:`` URI, so
    no server-side file is needed. The returned markup is raw HTML; ``main``
    renders it with ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        text: The text content to offer for download.
        filename: Suggested file name for the browser's save dialog.
        link_text: Visible label of the link.

    Returns:
        An ``<a>`` element as a string.
    """
    b64 = base64.b64encode(text.encode("utf-8")).decode("ascii")
    # Escape caller-supplied values so they cannot break out of the
    # attribute / element they are interpolated into.
    safe_filename = html.escape(filename, quote=True)
    safe_label = html.escape(link_text)
    return (
        f'<a href="data:text/plain;charset=utf-8;base64,{b64}" '
        f'download="{safe_filename}">{safe_label}</a>'
    )


def main():
    """Render the app: upload a PDF, OCR it, show and offer the text."""
    st.title("PDF Text Extractor")

    st.sidebar.header("Settings")
    dpi = st.sidebar.slider(
        "DPI (Higher for better quality, slower processing)", 100, 600, 300, 50
    )

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        return

    st.success("File successfully uploaded!")

    # pdf2image needs a real path, so persist the upload to a temporary file.
    # delete=False keeps it on disk after the handle is closed; it is removed
    # explicitly in the ``finally`` clause below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name

    try:
        with st.spinner("Converting PDF to images..."):
            images = pdf_to_images(tmp_file_path, dpi)

        st.info(f"Total pages: {len(images)}")

        progress_bar = st.progress(0)
        status_text = st.empty()

        with st.spinner("Extracting text from images..."):
            status_text.text("Extracting text... (This may take a while)")
            extracted_text = extract_text_from_images(images, progress_bar)

        status_text.text("Text extraction complete!")

        st.subheader("Extracted Text")
        st.text_area("", extracted_text, height=300)

        st.markdown(
            get_download_link(
                extracted_text, "extracted_text.txt", "Download Extracted Text"
            ),
            unsafe_allow_html=True,
        )
    except Exception as e:
        # Top-level UI boundary: surface any failure to the user instead of
        # letting the script crash with a traceback.
        st.error(f"An error occurred: {e}")
    finally:
        # Clean up the temporary file
        os.unlink(tmp_file_path)


if __name__ == "__main__":
    main()