# PDF-Convertor / app.py
# AzizWazir's picture
# Update app.py
# b1cf141 verified
import os
import streamlit as st
from pdf2image import convert_from_path
# Path to your PDF file
pdf_path = "path_to_your_pdf.pdf"
# Path to Poppler binary (optional if already in PATH)
poppler_path = r"C:\path\to\poppler\bin"  # Update this path as needed
# Upper-case alias, so code that refers to the setting as POPPLER_PATH
# resolves to the same location instead of raising NameError.
POPPLER_PATH = poppler_path

# Start-up conversion check. It is attempted only when pdf_path points at a
# real file, so the unedited placeholder above does not trigger a Poppler
# call that can only fail each time the script runs.
if os.path.isfile(pdf_path):
    try:
        # Convert PDF to images
        images = convert_from_path(pdf_path, poppler_path=poppler_path)
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        print(f"Converted {len(images)} pages to images successfully!")
# Function to extract text from an image-based PDF
def extract_text_from_image_pdf(pdf_path):
    """Run OCR over every page of an image-based PDF and return the text.

    Args:
        pdf_path: Path of the PDF file to convert to page images.

    Returns:
        A single string with one section per page. Each section starts with
        a "Page N:" line (N counted from 1) followed by the text recognised
        on that page; sections are joined with newlines.
    """
    # Imported locally: the module's import block does not provide it.
    import pytesseract

    # Use the module-level `poppler_path` setting only when it names an
    # existing directory. Passing None makes pdf2image look for the Poppler
    # tools on PATH, which is what the setting's comment describes as the
    # alternative to configuring a path.
    if poppler_path and os.path.isdir(poppler_path):
        poppler_dir = poppler_path
    else:
        poppler_dir = None

    images = convert_from_path(pdf_path, poppler_path=poppler_dir)
    return "\n".join(
        f"Page {page_num}:\n{pytesseract.image_to_string(image)}"
        for page_num, image in enumerate(images, start=1)
    )
# Function to save extracted text to a Word file
def save_text_to_word(text, output_path):
    """Save text as a single paragraph in a new Word document.

    Args:
        text: The text to store in the document.
        output_path: Destination path of the .docx file.
    """
    # Imported locally: the module's import block does not provide them.
    import re

    from docx import Document

    # A .docx body is XML, and XML 1.0 allows no control characters other
    # than tab, line feed and carriage return. Drop the rest so they cannot
    # abort the save.
    # NOTE(review): OCR output is expected to contain form feeds (\x0c) as
    # page separators - confirm against the Tesseract version in use.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)

    doc = Document()
    doc.add_paragraph(cleaned)
    doc.save(output_path)
# Function to save extracted text to an Excel file
def save_text_to_excel(text, output_path):
    """Save text to an Excel file, one line of text per row.

    Args:
        text: The text to store; it is split on "\\n" into rows.
        output_path: Destination path of the .xlsx file.

    The sheet has a single column named "Text" and no index column.
    """
    # Imported locally: the module's import block does not provide them.
    import re

    import pandas as pd

    # Remove control characters that the openpyxl writer refuses to store in
    # a cell (everything below 0x20 except tab, line feed, carriage return).
    # NOTE(review): OCR output is expected to contain form feeds (\x0c) as
    # page separators - confirm against the Tesseract version in use.
    cleaned = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)

    df = pd.DataFrame({"Text": cleaned.split("\n")})
    df.to_excel(output_path, index=False)
def _export_to_bytes(writer, text, suffix):
    """Call ``writer(text, path)`` on a throw-away file and return its bytes."""
    import tempfile

    # A private temporary directory keeps concurrent sessions from
    # overwriting each other's output and is removed when the block exits.
    with tempfile.TemporaryDirectory() as work_dir:
        out_path = os.path.join(work_dir, f"output{suffix}")
        writer(text, out_path)
        with open(out_path, "rb") as fh:
            return fh.read()


def main():
    """Streamlit page: upload an image-based PDF, OCR it, offer downloads.

    The uploaded PDF is written to a uniquely named temporary file, passed
    to extract_text_from_image_pdf(), and deleted afterwards. The extracted
    text is shown in a text area; the Word and Excel buttons build the
    corresponding file and expose it through a download button. Any
    exception raised while processing is reported with st.error().
    """
    import tempfile

    st.title("PDF Image to Text Converter")
    st.write("Upload an image-based PDF to extract text and save as text, Word, or Excel format.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file is None:
        return

    tmp_file_path = None
    try:
        with st.spinner("Processing..."):
            # A unique name per run instead of a fixed file in the working
            # directory, so simultaneous uploads cannot clobber each other.
            # The file is closed before the extractor opens it by path.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
                tmp_file_path = tmp_file.name
                # getvalue() returns the whole upload regardless of the
                # current read position of the buffer.
                tmp_file.write(uploaded_file.getvalue())
            extracted_text = extract_text_from_image_pdf(tmp_file_path)

        st.success("Text extracted successfully!")
        st.text_area("Extracted Text", extracted_text, height=300)

        # Options to download text in different formats. The data is handed
        # over as bytes so no file handle is left open.
        if st.button("Download as Word"):
            word_bytes = _export_to_bytes(save_text_to_word, extracted_text, ".docx")
            st.download_button("Download Word File", word_bytes, "output.docx")
        if st.button("Download as Excel"):
            excel_bytes = _export_to_bytes(save_text_to_excel, extracted_text, ".xlsx")
            st.download_button("Download Excel File", excel_bytes, "output.xlsx")
    except Exception as e:
        st.error(f"An error occurred: {e}")
    finally:
        if tmp_file_path and os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)


if __name__ == "__main__":
    main()