Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

PDF-Convertor / app.py

Update app.py

2df8377 verified about 1 year ago

1.53 kB

	import os
	import tempfile

	import docx
	import pdf2image
	import pytesseract
	import streamlit as st
	from PIL import Image

	# Default install location of the Tesseract binary on Windows.
	WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

	# Only override pytesseract's command when that binary is actually present.
	# On any other host (Linux containers, macOS, a non-default Windows install)
	# forcing this path would make every OCR call fail, so pytesseract's own
	# default command is left untouched there.
	if os.path.isfile(WINDOWS_TESSERACT_CMD):
	    pytesseract.pytesseract.tesseract_cmd = WINDOWS_TESSERACT_CMD

	def extract_text_from_image_pdf(pdf_file):
	    """Extract text from a PDF by converting it to images and running OCR.

	    Args:
	        pdf_file: The PDF to read. May be a filesystem path, the raw PDF
	            content as ``bytes``/``bytearray``, or a binary file-like object
	            exposing ``read()`` (for example an in-memory upload).

	    Returns:
	        The OCR text of every image produced from the PDF, in order, each
	        followed by a newline. An empty string if no images are produced.

	    Raises:
	        OSError: If ``pdf_file`` is a path that cannot be opened.
	    """
	    # Normalise every accepted input form to the raw PDF bytes.
	    if isinstance(pdf_file, (bytes, bytearray)):
	        pdf_bytes = bytes(pdf_file)
	    elif hasattr(pdf_file, 'read'):
	        pdf_bytes = pdf_file.read()
	    else:
	        with open(pdf_file, 'rb') as f:
	            pdf_bytes = f.read()

	    # Convert the PDF into images that the OCR engine can consume.
	    images = pdf2image.convert_from_bytes(pdf_bytes)

	    # OCR each image; the trailing newline keeps the text of consecutive
	    # images visually separated in the combined result.
	    return ''.join(
	        pytesseract.image_to_string(image) + '\n' for image in images
	    )

	def main():
	    """Streamlit app for converting PDF images to text.

	    Renders the page title, accepts a PDF upload, runs OCR on it via
	    ``extract_text_from_image_pdf``, shows the extracted text and offers it
	    as a ``.txt`` download.
	    """
	    # Title and description
	    st.title("PDF to Text Converter")
	    st.subheader("Convert your PDF images to editable text documents.")

	    # Upload PDF file
	    uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
	    if uploaded_file is None:
	        return

	    # The upload lives in memory; ``uploaded_file.name`` is only the
	    # client-side filename, not a path that exists on this machine. Write
	    # the bytes to a temporary file so the extractor gets a real path.
	    # ``delete=False`` plus an explicit unlink lets the file be reopened by
	    # name after this handle is closed.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
	        tmp.write(uploaded_file.getvalue())
	        tmp_path = tmp.name
	    try:
	        extracted_text = extract_text_from_image_pdf(tmp_path)
	    finally:
	        os.unlink(tmp_path)

	    # Display extracted text
	    st.success("Text extracted from PDF:")
	    st.write(extracted_text)

	    # Download option: hand the text to the user's browser. Writing a file
	    # with open() would only create it on the machine running the app.
	    if st.download_button(
	        "Download text as .txt file",
	        data=extracted_text,
	        file_name="extracted_text.txt",
	        mime="text/plain",
	    ):
	        st.success("Text downloaded!")

	# Script entry point: build the UI only when this file is executed as the
	# main module, not when it is imported.
	if __name__ == "__main__":
	    main()