Spaces:

ositamiles
/

Pdf-Text-Extractor

Build error

App Files Files Community

Pdf-Text-Extractor / app.py

ositamiles

Create app.py

9afab0a verified over 1 year ago

raw

history blame contribute delete

2.48 kB

	import streamlit as st
	import pytesseract
	from pdf2image import convert_from_path
	import os
	import tempfile
	import base64

	# Page-level configuration: sets the page title to "PDF Text Extractor" and
	# selects the "wide" layout. This is the first st.* call in the script.
	st.set_page_config(page_title="PDF Text Extractor", layout="wide")

	# Function to convert PDF pages to images
	def pdf_to_images(pdf_path, dpi=300):
	    """Render the pages of a PDF file as images.

	    Args:
	        pdf_path: Path of the PDF file; passed straight to
	            ``convert_from_path``.
	        dpi: Rendering resolution, forwarded as the ``dpi`` keyword
	            (defaults to 300).

	    Returns:
	        Whatever ``convert_from_path`` returns for the file. The caller in
	        this script takes its ``len`` and iterates over it page by page.
	    """
	    page_images = convert_from_path(pdf_path, dpi=dpi)
	    return page_images

	# Function to extract text from images
	def extract_text_from_images(images, progress_bar):
	    """OCR every page image and return the text as one labelled string.

	    Args:
	        images: Sequence of page images; it must support ``len`` and
	            iteration. Each item is handed to
	            ``pytesseract.image_to_string``.
	        progress_bar: Object exposing ``progress(fraction)``. It is called
	            once per page with the fraction of pages completed so far.

	    Returns:
	        One string made of a section per page: a ``Page N:`` heading line
	        (N starts at 1), the OCR text for that page, then a blank line.
	        An empty string when ``images`` is empty.
	    """
	    total_pages = len(images)  # loop-invariant, so computed once
	    page_sections = []
	    for page_number, image in enumerate(images, start=1):
	        text = pytesseract.image_to_string(image)
	        page_sections.append(f"Page {page_number}:\n{text}\n\n")
	        progress_bar.progress(page_number / total_pages)
	    # Join once at the end instead of growing a string with += per page.
	    return "".join(page_sections)

	# Function to create a download link
	def get_download_link(text, filename, link_text):
	    """Build an HTML anchor that downloads ``text`` as a file.

	    The content travels inside the link itself: it is encoded to bytes with
	    ``str.encode()`` (UTF-8 by default), base64-encoded, and embedded in a
	    ``data:`` URI, so no server-side file is needed.

	    Args:
	        text: String content of the file to download.
	        filename: Name suggested for the saved file. It is HTML-escaped
	            because it is placed inside a double-quoted attribute; a raw
	            ``"`` or ``&`` would otherwise produce broken markup.
	        link_text: Visible text of the link. It is inserted verbatim, so it
	            may contain markup; do not pass untrusted input here.

	    Returns:
	        The ``<a ...>...</a>`` element as a string.
	    """
	    import html  # local import: only this function needs it

	    b64 = base64.b64encode(text.encode()).decode()
	    safe_filename = html.escape(filename, quote=True)
	    return f'<a href="data:file/txt;base64,{b64}" download="{safe_filename}">{link_text}</a>'

	# Main Streamlit app
	def main():
	    """Render the PDF Text Extractor page and OCR an uploaded PDF.

	    Steps, in order:
	      1. Draw the title, the DPI slider in the sidebar, and the uploader.
	      2. Once a file is uploaded, copy it to a temporary ``.pdf`` file,
	         because ``pdf_to_images`` takes a path.
	      3. Convert the PDF to images, OCR them with a progress bar, then show
	         the text in a text area together with a download link.
	      4. Report any failure with ``st.error`` and always delete the
	         temporary file.

	    Returns:
	        None. All output goes to the Streamlit page.
	    """
	    st.title("PDF Text Extractor")

	    st.sidebar.header("Settings")
	    dpi = st.sidebar.slider("DPI (Higher for better quality, slower processing)", 100, 600, 300, 50)

	    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
	    if uploaded_file is None:
	        # Nothing to process until the user picks a file.
	        return

	    st.success("File successfully uploaded!")

	    tmp_file_path = None
	    try:
	        # Save uploaded file temporarily. The path is recorded before the
	        # write so the finally block can remove the file even when the write
	        # itself fails (delete=False means nothing else will clean it up).
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
	            tmp_file_path = tmp_file.name
	            tmp_file.write(uploaded_file.getvalue())

	        with st.spinner("Converting PDF to images..."):
	            images = pdf_to_images(tmp_file_path, dpi)

	        st.info(f"Total pages: {len(images)}")

	        progress_bar = st.progress(0)
	        status_text = st.empty()

	        with st.spinner("Extracting text from images..."):
	            status_text.text("Extracting text... (This may take a while)")
	            extracted_text = extract_text_from_images(images, progress_bar)

	        status_text.text("Text extraction complete!")

	        st.subheader("Extracted Text")
	        st.text_area("", extracted_text, height=300)

	        st.markdown(get_download_link(extracted_text, "extracted_text.txt", "Download Extracted Text"), unsafe_allow_html=True)

	    except Exception as e:
	        # Top-level UI boundary: show the failure on the page rather than
	        # letting the script run abort.
	        st.error(f"An error occurred: {str(e)}")

	    finally:
	        # Clean up the temporary file (skipped if it was never created).
	        if tmp_file_path is not None:
	            os.unlink(tmp_file_path)

	# Start the app only when this file is run as the main script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()