Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Files Community

PDF-Convertor / app.py

AzizWazir

Update app.py

b1cf141 verified 12 months ago

raw

history blame contribute delete

2.69 kB

	import os

	import pandas as pd
	import pytesseract
	import streamlit as st
	from docx import Document
	from pdf2image import convert_from_path

	# Path to your PDF file
	pdf_path = "path_to_your_pdf.pdf"

	# Path to Poppler binary (optional if already in PATH)
	poppler_path = r"C:\path\to\poppler\bin"  # Update this path as needed

	# Module-level check that the PDF at `pdf_path` can be rendered with the
	# configured Poppler directory. The outcome is only printed; any failure is
	# caught so that the rest of the module still loads.
	# NOTE(review): `pdf_path` above is a placeholder, so this reports an error
	# until it points at a real file.
	try:
	    # Convert PDF to images
	    images = convert_from_path(pdf_path, poppler_path=poppler_path)
	    print(f"Converted {len(images)} pages to images successfully!")
	except Exception as e:
	    print(f"An error occurred: {e}")


	# Function to extract text from an image-based PDF
	def extract_text_from_image_pdf(pdf_path):
	    """Run OCR over every page of a PDF and return the combined text.

	    Each page is rendered to an image with ``convert_from_path`` and passed
	    to ``pytesseract.image_to_string``. Every page's text is prefixed with a
	    ``Page N:`` header (numbered from 1) and the pages are joined with
	    newlines.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        One string containing the text of all pages; an empty string when
	        the PDF yields no pages.

	    Raises:
	        Any exception raised by ``convert_from_path`` or ``pytesseract`` is
	        propagated to the caller unchanged.
	    """
	    # Use the module-level `poppler_path` setting, but only when it names a
	    # directory that actually exists. Otherwise pass None so that pdf2image
	    # looks Poppler up on PATH instead of failing on a bogus location.
	    poppler_dir = poppler_path if poppler_path and os.path.isdir(poppler_path) else None
	    images = convert_from_path(pdf_path, poppler_path=poppler_dir)
	    return "\n".join(
	        f"Page {page_num}:\n{pytesseract.image_to_string(image)}"
	        for page_num, image in enumerate(images, start=1)
	    )

	# Function to save extracted text to a Word file
	def save_text_to_word(text, output_path):
	    """Write *text* into a new Word document saved at *output_path*.

	    The whole text is added as a single paragraph.

	    Args:
	        text: The text to store.
	        output_path: Destination path of the ``.docx`` file.
	    """
	    # XML 1.0 cannot represent control characters other than tab, line feed
	    # and carriage return. OCR output commonly contains a form feed (\x0c),
	    # which is expected to make the document write fail, so drop such
	    # characters before handing the text over.
	    xml_illegal = dict.fromkeys(
	        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
	    )
	    doc = Document()
	    doc.add_paragraph(text.translate(xml_illegal))
	    doc.save(output_path)

	# Function to save extracted text to an Excel file
	def save_text_to_excel(text, output_path):
	    """Write *text* to an Excel file, one line of text per row.

	    The text is split on ``"\\n"`` and stored in a single column named
	    ``Text``; the DataFrame index is not written.

	    Args:
	        text: The text to store.
	        output_path: Destination path of the Excel file.
	    """
	    # Control characters other than tab, line feed and carriage return (for
	    # example the form feed that OCR output commonly contains) are not valid
	    # in xlsx cell values, so remove them before building the rows.
	    xml_illegal = dict.fromkeys(
	        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
	    )
	    rows = text.translate(xml_illegal).split("\n")
	    pd.DataFrame({"Text": rows}).to_excel(output_path, index=False)

	def _export_bytes(saver, text, filename):
	    """Run *saver(text, path)* in a throwaway directory and return the file's bytes."""
	    import tempfile  # local import: only the export/upload helpers need it

	    with tempfile.TemporaryDirectory() as workdir:
	        target = os.path.join(workdir, filename)
	        saver(text, target)
	        with open(target, "rb") as fh:
	            return fh.read()


	def main():
	    """Render the Streamlit page: upload a PDF, OCR it, and offer downloads.

	    The uploaded PDF is written to a temporary file, passed to
	    ``extract_text_from_image_pdf``, and the resulting text is shown in a
	    text area. Two buttons let the user generate a Word or Excel export,
	    each followed by a download button for the generated file. Any error
	    during processing is shown with ``st.error``; the temporary PDF is
	    always removed afterwards.
	    """
	    import tempfile  # local import: only the export/upload helpers need it

	    st.title("PDF Image to Text Converter")
	    st.write("Upload an image-based PDF to extract text and save as text, Word, or Excel format.")

	    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
	    if uploaded_file is None:
	        return

	    with st.spinner("Processing..."):
	        # A uniquely named temporary file keeps concurrent sessions from
	        # overwriting each other's upload. getvalue() returns the full
	        # content regardless of the buffer's current read position.
	        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
	            tmp.write(uploaded_file.getvalue())
	            tmp_file_path = tmp.name

	        try:
	            extracted_text = extract_text_from_image_pdf(tmp_file_path)
	            st.success("Text extracted successfully!")
	            st.text_area("Extracted Text", extracted_text, height=300)

	            # Options to download text in different formats. The export is
	            # read into memory so no file handle is left open and nothing
	            # is left behind in the working directory.
	            if st.button("Download as Word"):
	                word_bytes = _export_bytes(save_text_to_word, extracted_text, "output.docx")
	                st.download_button("Download Word File", word_bytes, "output.docx")
	            if st.button("Download as Excel"):
	                excel_bytes = _export_bytes(save_text_to_excel, extracted_text, "output.xlsx")
	                st.download_button("Download Excel File", excel_bytes, "output.xlsx")

	        except Exception as e:
	            # UI boundary: report the problem to the user instead of crashing.
	            st.error(f"An error occurred: {e}")

	        finally:
	            if os.path.exists(tmp_file_path):
	                os.remove(tmp_file_path)

	# Run the app only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()