# app.py -- "Update app.py" by AzizWazir (commit d57bc5f, verified), 1.47 kB.
# (Header recovered from the hosting page's file viewer: raw / history / blame.)
import io
import os
import re
import tempfile
import zipfile
from xml.sax.saxutils import escape

import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
# Ensure Poppler is in the PATH (you might need to adjust this for your server).
# The directory is appended only when it is not already listed: Streamlit
# re-executes this script on every interaction, and an unconditional append
# would make PATH grow on each rerun.
_poppler_dir = "/usr/local/bin"  # Update this with the correct Poppler path if needed
_current_path = os.environ.get("PATH", "")  # tolerate an unset PATH
if _poppler_dir not in _current_path.split(os.pathsep):
    # os.pathsep keeps the separator correct on platforms that do not use ":".
    os.environ["PATH"] = (
        f"{_current_path}{os.pathsep}{_poppler_dir}" if _current_path else _poppler_dir
    )
def pdf_to_text(pdf_path):
    """Extract text from a PDF by converting it to images and running OCR.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR output of every image concatenated in the order returned by
        ``convert_from_path``, or ``None`` if conversion or OCR failed. On
        failure the error is shown in the app via ``st.error``.
    """
    # The broad catches are deliberate: this is the UI boundary, and any
    # failure should surface as an on-page message instead of a traceback.
    # The two stages are guarded separately so the message names the stage
    # that actually failed.
    try:
        # Convert PDF to images
        images = convert_from_path(pdf_path)
    except Exception as e:
        st.error(f"Error during PDF to image conversion: {e}")
        return None

    try:
        # join() avoids repeated string reallocation for many-page documents.
        return "".join(pytesseract.image_to_string(image) for image in images)
    except Exception as e:
        st.error(f"Error during OCR text extraction: {e}")
        return None
# Characters that XML 1.0 forbids (control characters, lone surrogates,
# non-characters). OCR output may contain some of them, and a single one
# makes the generated document unreadable.
_XML_ILLEGAL_CHARS = re.compile(
    r"[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]"
)

_DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)

# Fixed package parts every .docx needs besides the document body.
_DOCX_CONTENT_TYPES = (
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
    '<Default Extension="rels" '
    'ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
    '<Default Extension="xml" ContentType="application/xml"/>'
    '<Override PartName="/word/document.xml" ContentType="'
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
    '"/>'
    "</Types>"
)
_DOCX_RELS = (
    '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
    '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/'
    'officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
    "</Relationships>"
)


def _text_to_docx_bytes(text):
    """Return *text* packaged as a minimal Word (.docx) document, as bytes.

    Each line of *text* becomes one paragraph. Markup characters are escaped
    and characters that are illegal in XML are dropped.
    """
    # Split first: splitlines() treats form feeds as line boundaries, so page
    # breaks in the OCR output still separate paragraphs before being removed.
    paragraphs = "".join(
        '<w:p><w:r><w:t xml:space="preserve">'
        f"{escape(_XML_ILLEGAL_CHARS.sub('', line))}"
        "</w:t></w:r></w:p>"
        for line in text.splitlines()
    )
    document_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<w:document xmlns:w="http://schemas.openxmlformats.org/'
        'wordprocessingml/2006/main">'
        f"<w:body>{paragraphs}</w:body>"
        "</w:document>"
    )

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("[Content_Types].xml", _DOCX_CONTENT_TYPES)
        archive.writestr("_rels/.rels", _DOCX_RELS)
        archive.writestr("word/document.xml", document_xml)
    return buffer.getvalue()


def main():
    """Render the app: upload a PDF, show its OCR text, offer a .docx download."""
    st.title("PDF to Text Converter")

    # Upload PDF file
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        return

    # Save the upload to a unique temporary file (pdf_to_text needs a path).
    # A fixed filename in the working directory would be shared by concurrent
    # sessions and would never be cleaned up.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        with tmp:  # closed before it is read back by path
            tmp.write(uploaded_file.getbuffer())
        st.text("Converting PDF to text...")
        text = pdf_to_text(tmp.name)
    finally:
        os.remove(tmp.name)

    if text:
        st.text_area("Extracted Text", text, height=300)
        # Create downloadable Word file. The document bytes are passed as the
        # download payload; passing a filename string here would make the
        # user download that literal string instead of the document.
        st.download_button(
            "Download Word File",
            data=_text_to_docx_bytes(text),
            file_name="output.docx",
            mime=_DOCX_MIME,
        )
    elif text is not None:
        # pdf_to_text already reported real errors (None); an empty string
        # means OCR ran but found nothing.
        st.warning("No text could be extracted from this PDF.")
# Entry point: build the UI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()