Spaces:

ositamiles
/

Pdf-Text-Extractor

Build error

File size: 2,484 Bytes

9afab0a

import base64
import html
import os
import tempfile

import pytesseract
import streamlit as st
from pdf2image import convert_from_path

# Configure the page at import time, before main() renders anything:
# sets the page title and selects the full-width ("wide") layout.
st.set_page_config(page_title="PDF Text Extractor", layout="wide")

# Function to convert PDF pages to images
def pdf_to_images(pdf_path, dpi=300):
    """Render the PDF stored at ``pdf_path`` into page images.

    Args:
        pdf_path: Filesystem path of the PDF to convert.
        dpi: Rendering resolution, forwarded unchanged to
            ``convert_from_path``.

    Returns:
        The value produced by ``pdf2image.convert_from_path`` for the
        document (callers in this file treat it as a sized, iterable
        collection of page images).
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return page_images

# Function to extract text from images
def extract_text_from_images(images, progress_bar):
    """OCR every page image and return the combined, page-labelled text.

    Args:
        images: Sized sequence of page images, each passed to
            ``pytesseract.image_to_string``.
        progress_bar: Object exposing ``progress(fraction)``; it is advanced
            after each page with ``pages_done / total_pages``.

    Returns:
        A single string made of one section per page, in page order. Each
        section is a ``Page N:`` header line, the OCR text of that page, and
        two trailing newlines. An empty ``images`` returns ``""`` and never
        touches ``progress_bar``.
    """
    total_pages = len(images)  # loop-invariant; computed once
    sections = []
    for page_number, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image)
        sections.append(f"Page {page_number}:\n{text}\n\n")
        progress_bar.progress(page_number / total_pages)
    # Join once at the end instead of growing a string with += on every page.
    return "".join(sections)

# Function to create a download link
def get_download_link(text, filename, link_text):
    """Build an HTML anchor that downloads ``text`` as a file via a data URI.

    Args:
        text: String content of the file; encoded as UTF-8, then base64.
        filename: Suggested file name placed in the ``download`` attribute.
            It is HTML-escaped so quotes or ampersands cannot break out of
            the attribute.
        link_text: Inner content of the anchor, inserted as-is.

    Returns:
        An ``<a>`` element string whose ``href`` is a base64-encoded
        ``text/plain`` data URI carrying ``text``.
    """
    payload = base64.b64encode(text.encode("utf-8")).decode("ascii")
    safe_filename = html.escape(filename, quote=True)
    # "file/txt" is not a registered media type; declare plain text together
    # with the charset that matches the encoding used for the payload.
    href = f"data:text/plain;charset=utf-8;base64,{payload}"
    return f'<a href="{href}" download="{safe_filename}">{link_text}</a>'

# Main Streamlit app
def main():
    """Render the app: upload a PDF, OCR its pages, show and offer the text.

    The uploaded bytes are written to a temporary ``.pdf`` file, converted to
    page images at the DPI chosen in the sidebar, and passed through OCR. The
    result is shown in a text area together with a download link. Any
    exception raised while writing or processing the file is reported with
    ``st.error``; the temporary file is removed in every case.
    """
    st.title("PDF Text Extractor")

    st.sidebar.header("Settings")
    # Positional slider arguments: min 100, max 600, initial value 300, step 50.
    dpi = st.sidebar.slider("DPI (Higher for better quality, slower processing)", 100, 600, 300, 50)

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    # Nothing to do until the user has provided a file.
    if uploaded_file is None:
        return

    st.success("File successfully uploaded!")

    # delete=False because the file is closed and then reopened by path for
    # conversion; it is removed explicitly in the ``finally`` clause below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_file_path = tmp_file.name

    try:
        # Write inside the try so a failed write is reported to the user and
        # the temporary file is still cleaned up instead of being leaked.
        with tmp_file:
            tmp_file.write(uploaded_file.getvalue())

        with st.spinner("Converting PDF to images..."):
            images = pdf_to_images(tmp_file_path, dpi)

        st.info(f"Total pages: {len(images)}")

        progress_bar = st.progress(0)
        status_text = st.empty()

        with st.spinner("Extracting text from images..."):
            status_text.text("Extracting text... (This may take a while)")
            extracted_text = extract_text_from_images(images, progress_bar)

        status_text.text("Text extraction complete!")

        st.subheader("Extracted Text")
        st.text_area("", extracted_text, height=300)

        # The link is raw HTML, so it must be rendered with unsafe_allow_html.
        st.markdown(
            get_download_link(extracted_text, "extracted_text.txt", "Download Extracted Text"),
            unsafe_allow_html=True,
        )

    except Exception as e:
        # Top-level UI boundary: surface the failure instead of crashing the app.
        st.error(f"An error occurred: {e}")

    finally:
        # Clean up the temporary file
        os.unlink(tmp_file_path)

# Start the app only when this file is run as the main module, not when it is
# imported by another module.
if __name__ == "__main__":
    main()