PDF-Editor

Build error

App Files Files Community

PDF-Editor / app.py

Tassawar

Update app.py

880593d verified 11 months ago

raw

history blame contribute delete

3.53 kB

	import streamlit as st
	from PyPDF2 import PdfReader
	from docx import Document
	from io import BytesIO
	from pdf2image import convert_from_bytes
	import pytesseract
	import time

	# Configure Tesseract path (if needed)
	# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'

	def pdf_to_word(pdf_file, password=None):
	    """Convert an uploaded PDF into an in-memory Word (.docx) document.

	    Each page's text is extracted with ``PdfReader``; pages that yield no
	    usable text are rendered with ``convert_from_bytes`` and run through
	    ``pytesseract`` OCR. Progress is reported via a Streamlit progress bar
	    and a status placeholder.

	    Args:
	        pdf_file: Seekable binary file-like object that also exposes a
	            ``type`` attribute holding its MIME type (presumably a Streamlit
	            ``UploadedFile`` -- confirm against caller).
	        password: Optional password for an encrypted PDF. ``None`` or an
	            empty string means "no password supplied".

	    Returns:
	        A ``BytesIO`` rewound to offset 0 that contains the ``.docx`` data.

	    Raises:
	        ValueError: If the upload is not a PDF, the PDF is encrypted and the
	            password is missing or wrong, or any other step fails. The
	            triggering exception is chained as ``__cause__``.
	    """
	    try:
	        # Ensure the file is a valid PDF
	        if pdf_file.type != "application/pdf":
	            raise ValueError("Invalid file type. Please upload a PDF file.")

	        # Snapshot the raw bytes BEFORE any parser touches the stream.
	        # Reading after PdfReader has consumed the stream (as the previous
	        # code did) can return empty/truncated data, which broke the OCR
	        # fallback.
	        pdf_file.seek(0)
	        pdf_bytes = pdf_file.read()
	        reader = PdfReader(BytesIO(pdf_bytes))

	        # Password forwarded to the rasteriser for OCR; stays None unless we
	        # know the supplied password is the *user* password.
	        ocr_password = None

	        # Decrypt the PDF if it's encrypted
	        if reader.is_encrypted:
	            if not password:
	                raise ValueError("The PDF is encrypted. Please provide a password.")
	            try:
	                decrypt_result = reader.decrypt(password)
	            except Exception as e:
	                raise ValueError("Failed to decrypt the PDF. Check the password.") from e
	            # decrypt() signals a wrong password by returning 0 instead of
	            # raising, so the result has to be checked explicitly.
	            if not decrypt_result:
	                raise ValueError("Failed to decrypt the PDF. Check the password.")
	            # NOTE(review): a result of 1 is taken to mean "user password
	            # matched" (2 = owner password) -- confirm against the installed
	            # PyPDF2 version.
	            if decrypt_result == 1:
	                ocr_password = password

	        # Create a Word document
	        document = Document()

	        total_pages = len(reader.pages)
	        progress_bar = st.progress(0)
	        status_text = st.empty()

	        for i, page in enumerate(reader.pages):
	            status_text.text(f"Processing page {i + 1} of {total_pages}...")
	            progress_bar.progress((i + 1) / total_pages)

	            # Try extracting text directly; whitespace-only output is
	            # treated as "nothing extracted" so such pages still get OCR.
	            text = page.extract_text()
	            if text and text.strip():
	                document.add_paragraph(_xml_safe(text))
	                continue

	            # Use OCR for non-extractable pages (pdf2image pages are 1-based)
	            images = convert_from_bytes(
	                pdf_bytes,
	                first_page=i + 1,
	                last_page=i + 1,
	                userpw=ocr_password,
	            )
	            for image in images:
	                ocr_text = _xml_safe(pytesseract.image_to_string(image))
	                if ocr_text.strip():
	                    document.add_paragraph(ocr_text)
	                else:
	                    document.add_paragraph("[This page contains non-extractable content or images]")

	        # Save the Word document to a BytesIO object
	        word_file = BytesIO()
	        document.save(word_file)
	        word_file.seek(0)

	        return word_file

	    except Exception as e:
	        # Single ValueError surface for the caller; keep the cause chained
	        # so the real traceback is not lost.
	        raise ValueError(f"An error occurred: {e}") from e


	# Translation table that deletes C0 control characters other than tab,
	# newline and carriage return.
	_XML_ILLEGAL_CONTROL_CHARS = dict.fromkeys(
	    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
	)


	def _xml_safe(text):
	    """Return *text* without control characters that XML cannot carry.

	    Extracted/OCR text may contain characters such as form feed (``\\x0c``)
	    or NUL; presumably the .docx writer rejects these as not XML
	    compatible -- verify against python-docx/lxml.
	    """
	    return text.translate(_XML_ILLEGAL_CONTROL_CHARS)

	# Page setup: title, icon and layout for the browser tab / page body.
	st.set_page_config(
	    page_title="PDF to Word Converter",
	    page_icon="🖋",
	    layout="centered",
	)

	# Heading and a one-line description of what the app does.
	st.title("📄 PDF to Word Converter")
	st.write("Upload a PDF file to convert it into an editable Word document.")

	# File picker restricted to the "pdf" type.
	uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

	if uploaded_file is not None:
	    # The password box is only rendered once a file has been chosen; its
	    # value is handed to pdf_to_word as-is.
	    password = st.text_input("Enter PDF password (if encrypted)", type="password")
	    convert_requested = st.button("Convert to Word")

	    if convert_requested:
	        try:
	            # Run the conversion while showing a spinner.
	            with st.spinner("Converting PDF to Word..."):
	                word_file = pdf_to_word(uploaded_file, password)

	            # Report success and offer the generated document for download.
	            st.success("Conversion successful!")
	            st.download_button(
	                "Download Word file",
	                data=word_file,
	                file_name="converted.docx",
	                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	            )
	        except Exception as e:
	            # Any failure from the steps above is shown as an error message.
	            st.error(f"Error: {e}")