AzizWazir's picture
Update app.py
1822e3f verified
import os
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from docx import Document
import tempfile
# Function to convert PDF to image with poppler_path configuration
def pdf_to_image(pdf_path, dpi=500, poppler_path=None):
    """Render every page of a PDF file as an image.

    Args:
        pdf_path: Path of the PDF file on disk.
        dpi: Rendering resolution handed to ``convert_from_path``.
        poppler_path: Directory containing the Poppler binaries. When left
            as ``None`` the well-known Windows install directory below is
            used if it exists; otherwise ``None`` is forwarded so that
            pdf2image looks Poppler up on ``PATH``.

    Returns:
        The list of page images, or ``None`` when the conversion failed
        (the failure is reported in the Streamlit UI).
    """
    if poppler_path is None:
        # Only force the Windows location when it is really there; passing a
        # non-existent directory would make the conversion fail on every
        # other machine (e.g. a Linux host with Poppler on PATH).
        windows_default = r"C:\Program Files\poppler-24.08.0\Library\bin"
        if os.path.isdir(windows_default):
            poppler_path = windows_default
    try:
        return convert_from_path(pdf_path, dpi, poppler_path=poppler_path)
    except Exception as e:
        # UI boundary: show the problem to the user instead of crashing the app.
        st.error(f"Error during PDF to image conversion: {str(e)}")
        return None
# Function to extract text from an image using pytesseract
def image_to_text(image):
    """Run OCR on a single image and return the recognised text.

    Any exception raised by the OCR call is reported through ``st.error``
    and the function returns ``None`` instead.
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as err:
        message = f"Error during image to text conversion: {str(err)}"
        st.error(message)
    return None
# Function to save text to a Word document
def save_to_word(text, file_name):
    """Write ``text`` into a new .docx file in the temp directory.

    Args:
        text: The text to store as a single paragraph.
        file_name: Used as the prefix of the generated temp file name; only
            its final path component is used.

    Returns:
        The path of the created .docx file. The file is not deleted
        automatically; the caller owns it.
    """
    # XML 1.0 cannot represent control characters other than tab, line feed
    # and carriage return. OCR output may contain some (e.g. a form feed
    # between pages), so drop them before they reach the document body.
    xml_illegal = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )
    doc = Document()
    doc.add_paragraph(text.translate(xml_illegal))

    # Reserve a unique file name, then close the handle right away so it is
    # not leaked and the document can be written to the path afterwards.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix='.docx', prefix=os.path.basename(file_name)
    ) as temp_file:
        docx_path = temp_file.name
    doc.save(docx_path)
    return docx_path
# Streamlit UI
st.title("PDF to Word Converter")
st.write("Upload a PDF to convert it to a Word document")

# File upload feature
uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

if uploaded_files:
    for index, uploaded_file in enumerate(uploaded_files):
        # Save the uploaded PDF to a temporary file. NamedTemporaryFile
        # creates the file itself, avoiding the name race of tempfile.mktemp.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_file.getbuffer())
            temp_pdf_path = temp_pdf.name

        # Convert PDF to images, then discard the temporary PDF.
        try:
            images = pdf_to_image(temp_pdf_path)
        finally:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless

        if not images:
            # pdf_to_image has already reported the failure in the UI.
            continue

        # Extract text from images, one trailing newline per page with text.
        page_texts = [text for text in map(image_to_text, images) if text]
        extracted_text = "".join(f"{text}\n" for text in page_texts)
        if not extracted_text:
            st.warning(f"No text could be extracted from {uploaded_file.name}.")
            continue

        # Save the extracted text to Word and load the file's bytes: the
        # download button needs the document content, not the file path.
        word_file = save_to_word(extracted_text, uploaded_file.name)
        with open(word_file, "rb") as docx_file:
            docx_bytes = docx_file.read()
        try:
            os.remove(word_file)
        except OSError:
            pass  # best-effort cleanup; the bytes are already in memory

        st.success(f"Conversion of {uploaded_file.name} complete! Download the Word file below.")
        st.download_button(
            f"Download {uploaded_file.name} as Word",
            docx_bytes,
            file_name=f"{uploaded_file.name}.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            # Explicit key so two uploads with the same name do not collide.
            key=f"download_{index}",
        )
else:
    st.write("Please upload PDF files to convert.")