# Spaces: Build error
| import streamlit as st | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| import os | |
| import tempfile | |
| import base64 | |
# Page-level settings: title and wide layout.
st.set_page_config(
    page_title="PDF Text Extractor",
    layout="wide",
)
def pdf_to_images(pdf_path, dpi=300):
    """Convert the pages of the PDF at ``pdf_path`` to images.

    Delegates to ``convert_from_path``, forwarding ``dpi`` as a keyword
    argument, and returns that call's result unchanged.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return page_images
def extract_text_from_images(images, progress_bar=None):
    """Run OCR on every image and return the combined, page-labelled text.

    Args:
        images: Sized iterable of page images; each one is passed to
            ``pytesseract.image_to_string``.
        progress_bar: Optional object exposing ``progress(fraction)``. It is
            called after each page with the completed fraction
            (``pages_done / total_pages``). Pass ``None`` to skip reporting.

    Returns:
        A single string with one section per image, in input order. Each
        section is a "Page N:" header line, the recognised text, and a
        trailing blank line. Returns an empty string when ``images`` is empty.
    """
    total_pages = len(images)  # loop-invariant, so computed once
    sections = []
    for page_number, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image)
        sections.append(f"Page {page_number}:\n{text}\n\n")
        if progress_bar is not None:
            progress_bar.progress(page_number / total_pages)
    # Join once at the end instead of growing a string with += per page.
    return "".join(sections)
def get_download_link(text, filename, link_text):
    """Build an HTML anchor that downloads ``text`` as a plain-text file.

    The text is UTF-8 encoded, base64 encoded, and embedded in a ``data:``
    URL, so no server round-trip is needed for the download.

    Args:
        text: The string content of the file to download.
        filename: Value for the anchor's ``download`` attribute.
        link_text: Visible label of the link. It is treated as plain text
            and HTML-escaped, not interpreted as markup.

    Returns:
        An ``<a>`` element as a string, intended for rendering as raw HTML.
    """
    import html  # function-scope import keeps this block self-contained

    # Encode explicitly as UTF-8 and declare the same charset in the URL so
    # non-ASCII text is decoded correctly by the browser.
    b64 = base64.b64encode(text.encode("utf-8")).decode("ascii")
    # Escape both values: the result is rendered as raw HTML, so a quote in
    # the filename or markup in the label would otherwise break the anchor.
    safe_filename = html.escape(filename)
    safe_label = html.escape(link_text)
    return (
        f'<a href="data:text/plain;charset=utf-8;base64,{b64}" '
        f'download="{safe_filename}">{safe_label}</a>'
    )
def main():
    """Run the Streamlit app: upload a PDF, OCR its pages, show the text.

    Flow: read the DPI setting from the sidebar, wait for an uploaded PDF,
    write it to a temporary file, convert it to images, extract text from
    the images, then display the text together with a download link. Any
    exception raised during processing is reported with ``st.error``. The
    temporary file is removed in every case.
    """
    st.title("PDF Text Extractor")
    st.sidebar.header("Settings")
    dpi = st.sidebar.slider(
        "DPI (Higher for better quality, slower processing)", 100, 600, 300, 50
    )

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        # Nothing to process until the user provides a file.
        return

    st.success("File successfully uploaded!")

    # delete=False because the file is reopened by path after it is closed;
    # removal is therefore done explicitly in the ``finally`` below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_file_path = tmp_file.name
    try:
        # The write happens inside the try so that a failed write still
        # reaches the cleanup; ``with`` closes the handle before conversion.
        with tmp_file:
            tmp_file.write(uploaded_file.getvalue())

        with st.spinner("Converting PDF to images..."):
            images = pdf_to_images(tmp_file_path, dpi)
        st.info(f"Total pages: {len(images)}")

        progress_bar = st.progress(0)
        status_text = st.empty()
        with st.spinner("Extracting text from images..."):
            status_text.text("Extracting text... (This may take a while)")
            extracted_text = extract_text_from_images(images, progress_bar)
        status_text.text("Text extraction complete!")

        st.subheader("Extracted Text")
        st.text_area("", extracted_text, height=300)
        st.markdown(
            get_download_link(
                extracted_text, "extracted_text.txt", "Download Extracted Text"
            ),
            unsafe_allow_html=True,
        )
    except Exception as e:
        # Top-level UI boundary: show the failure to the user.
        st.error(f"An error occurred: {str(e)}")
    finally:
        # Clean up the temporary file
        os.unlink(tmp_file_path)


if __name__ == "__main__":
    main()