Spaces:
Build error
Build error
File size: 2,467 Bytes
d57bc5f a856839 a4fb3b6 d57bc5f a4fb3b6 a856839 72c095c c050171 1822e3f 72c095c c050171 1822e3f 72c095c d57bc5f c050171 72c095c d57bc5f 72c095c a856839 72c095c a856839 72c095c 00bba6b a856839 00bba6b a856839 72c095c 00bba6b 72c095c a856839 72c095c 00bba6b 72c095c 00bba6b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import os
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from docx import Document
import tempfile
# Function to convert PDF to image with poppler_path configuration
def pdf_to_image(pdf_path, dpi=500, poppler_path=None):
    """Render every page of a PDF file as an image.

    Args:
        pdf_path: Filesystem path of the PDF to render.
        dpi: Rendering resolution handed to ``convert_from_path``.
            Defaults to 500, the value that used to be hard-coded.
        poppler_path: Directory containing the Poppler binaries. When
            omitted, the ``POPPLER_PATH`` environment variable is consulted,
            then the legacy Windows install directory (only if it exists);
            otherwise ``None`` is passed so pdf2image looks Poppler up on
            ``PATH``.

    Returns:
        The list of page images, or ``None`` when conversion fails (the
        error is shown in the Streamlit UI instead of being raised).
    """
    if poppler_path is None:
        # A fixed Windows directory makes conversion fail on every other
        # machine, so only fall back to it when it is actually present.
        legacy_windows_dir = r"C:\Program Files\poppler-24.08.0\Library\bin"
        poppler_path = os.environ.get("POPPLER_PATH") or (
            legacy_windows_dir if os.path.isdir(legacy_windows_dir) else None
        )

    try:
        return convert_from_path(pdf_path, dpi, poppler_path=poppler_path)
    except Exception as e:
        # UI boundary: report the problem to the user and let the caller
        # skip this file rather than crash the whole app.
        st.error(f"Error during PDF to image conversion: {str(e)}")
        return None
# Function to extract text from an image using pytesseract
def image_to_text(image):
    """Extract the text of one image with ``pytesseract.image_to_string``.

    Args:
        image: The image to run OCR on.

    Returns:
        The recognized text, or ``None`` when OCR raises (the error is
        shown in the Streamlit UI instead of being propagated).
    """
    try:
        recognized = pytesseract.image_to_string(image)
    except Exception as exc:
        st.error(f"Error during image to text conversion: {str(exc)}")
        return None
    else:
        return recognized
# Function to save text to a Word document
def save_to_word(text, file_name):
    """Write *text* into a new temporary ``.docx`` file and return its path.

    Args:
        text: The text to store as a single paragraph.
        file_name: Used as the prefix of the temporary file's name. Only
            the final path component is used, because a prefix containing
            a directory separator cannot be created by ``tempfile``.

    Returns:
        The path of the saved document. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # XML (and therefore .docx) cannot carry control characters other than
    # tab, line feed and carriage return. OCR output may contain some, e.g.
    # form feeds used as page separators, so drop them before building the
    # document instead of letting the write fail.
    illegal_controls = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )
    clean_text = text.translate(illegal_controls)

    doc = Document()
    doc.add_paragraph(clean_text)

    # Reserve a unique name, then close the handle before saving so no
    # file descriptor is leaked.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".docx", prefix=os.path.basename(file_name)
    ) as temp_file:
        docx_path = temp_file.name

    doc.save(docx_path)
    return docx_path
# Streamlit UI
st.title("PDF to Word Converter")
st.write("Upload a PDF to convert it to a Word document")

# File upload feature
uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

# MIME type of .docx files, so browsers treat the download as a Word document.
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

if uploaded_files:
    for index, uploaded_file in enumerate(uploaded_files):
        # Save the uploaded PDF to a temporary file. NamedTemporaryFile
        # creates the file atomically, unlike the deprecated, race-prone
        # tempfile.mktemp.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_file.getbuffer())
            temp_pdf_path = temp_pdf.name

        # Convert PDF to images; the temporary PDF is no longer needed once
        # the pages have been rendered, whether or not conversion succeeded.
        try:
            images = pdf_to_image(temp_pdf_path)
        finally:
            os.remove(temp_pdf_path)

        if not images:
            # pdf_to_image has already reported the failure in the UI.
            continue

        # Extract text from images, skipping pages where OCR failed or
        # found nothing; each kept page is followed by a newline.
        page_texts = [text for text in map(image_to_text, images) if text]
        extracted_text = "".join(f"{text}\n" for text in page_texts)

        if not extracted_text:
            st.warning(f"No text could be extracted from {uploaded_file.name}.")
            continue

        # Save the extracted text to Word, then load the document into
        # memory so the temporary file can be removed immediately.
        word_file = save_to_word(extracted_text, uploaded_file.name)
        try:
            with open(word_file, "rb") as docx_file:
                docx_bytes = docx_file.read()
        finally:
            os.remove(word_file)

        st.success(f"Conversion of {uploaded_file.name} complete! Download the Word file below.")
        # download_button treats a str as the literal file contents, so it
        # must be given the document's bytes rather than its path. The key
        # keeps buttons distinct when two uploads share a file name.
        st.download_button(
            f"Download {uploaded_file.name} as Word",
            docx_bytes,
            file_name=f"{uploaded_file.name}.docx",
            mime=DOCX_MIME,
            key=f"download-{index}",
        )
else:
    st.write("Please upload PDF files to convert.")
|