# Page header carried over from the hosting site: ositamiles — "Create app.py" (commit 9afab0a, verified).
import base64
import html
import os
import tempfile

import pytesseract
import streamlit as st
from pdf2image import convert_from_path
# Configure the browser tab title and opt into the full-width page layout.
st.set_page_config(
    page_title="PDF Text Extractor",
    layout="wide",
)
# Function to convert PDF pages to images
def pdf_to_images(pdf_path, dpi=300):
    """Rasterize the PDF at ``pdf_path`` by delegating to ``convert_from_path``.

    Args:
        pdf_path: Filesystem path of the PDF, forwarded unchanged.
        dpi: Rendering resolution, forwarded as the ``dpi`` keyword
            (defaults to 300).

    Returns:
        Whatever ``convert_from_path`` returns; callers in this file take its
        ``len()`` and iterate over it as page images.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return page_images
# Function to extract text from images
def extract_text_from_images(images, progress_bar):
    """Run OCR on every page image and return the combined, page-labelled text.

    Args:
        images: Sized sequence of page images; each element is passed to
            ``pytesseract.image_to_string``.
        progress_bar: Object exposing ``progress(fraction)``. It is called once
            after each page with the completed fraction (``1/n`` .. ``1.0``)
            and is never touched when ``images`` is empty.

    Returns:
        One ``"Page N:\\n<text>\\n\\n"`` section per image, concatenated in
        page order (N starts at 1). An empty string when ``images`` is empty.
    """
    total_pages = len(images)  # loop-invariant, so computed once
    sections = []
    for page_number, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image)
        sections.append(f"Page {page_number}:\n{text}\n\n")
        progress_bar.progress(page_number / total_pages)
    # Join once at the end instead of growing a string with += on every page.
    return "".join(sections)
# Function to create a download link
def get_download_link(text, filename, link_text):
    """Build an HTML anchor that downloads ``text`` as a file via a data URI.

    Args:
        text: String content of the file. It is encoded as UTF-8 and then
            base64-encoded into the link's ``href``.
        filename: Suggested file name, placed in the anchor's ``download``
            attribute. It is HTML-escaped so quotes or ``&`` cannot break the
            attribute.
        link_text: Visible content of the link. Inserted verbatim, so it may
            contain markup; escape it beforehand if it is not trusted.

    Returns:
        The complete ``<a ...>...</a>`` element as a string.
    """
    b64 = base64.b64encode(text.encode("utf-8")).decode("ascii")
    safe_filename = html.escape(filename, quote=True)
    # text/plain with an explicit charset is the registered media type for this
    # payload and tells the browser how to decode non-ASCII characters.
    href = f"data:text/plain;charset=utf-8;base64,{b64}"
    return f'<a href="{href}" download="{safe_filename}">{link_text}</a>'
# Main Streamlit app
def main():
    """Render the page: upload a PDF, OCR every page, show and offer the text.

    Flow:
        1. Draw the title, the sidebar DPI slider and the PDF uploader.
        2. Once a file is uploaded, write it to a temporary ``.pdf`` file and
           convert it to page images with ``pdf_to_images``.
        3. OCR the pages with ``extract_text_from_images`` while updating a
           progress bar, then show the text and a download link.

    Any exception raised while saving, converting or extracting is reported
    with ``st.error`` instead of propagating. The temporary file is removed in
    every case.
    """
    st.title("PDF Text Extractor")

    st.sidebar.header("Settings")
    dpi = st.sidebar.slider("DPI (Higher for better quality, slower processing)", 100, 600, 300, 50)

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        # Nothing to process until the user picks a file.
        return

    st.success("File successfully uploaded!")

    tmp_file_path = None
    try:
        # The file must outlive this ``with`` block because it is handed to
        # pdf_to_images by path, hence delete=False and the manual cleanup
        # below. Creating and writing it inside the ``try`` means a failed
        # write is reported to the user and still cleaned up.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(uploaded_file.getvalue())

        with st.spinner("Converting PDF to images..."):
            images = pdf_to_images(tmp_file_path, dpi)

        st.info(f"Total pages: {len(images)}")

        progress_bar = st.progress(0)
        status_text = st.empty()

        with st.spinner("Extracting text from images..."):
            status_text.text("Extracting text... (This may take a while)")
            extracted_text = extract_text_from_images(images, progress_bar)

        status_text.text("Text extraction complete!")

        st.subheader("Extracted Text")
        # The subheader already titles this widget, so the label is hidden
        # rather than left empty.
        st.text_area(
            "Extracted Text",
            extracted_text,
            height=300,
            label_visibility="collapsed",
        )

        st.markdown(
            get_download_link(extracted_text, "extracted_text.txt", "Download Extracted Text"),
            unsafe_allow_html=True,
        )
    except Exception as e:
        # Top-level UI boundary: show the failure in the page.
        st.error(f"An error occurred: {e}")
    finally:
        # Clean up the temporary file if it was created.
        if tmp_file_path is not None:
            os.unlink(tmp_file_path)
# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()