File size: 2,467 Bytes
d57bc5f
a856839
a4fb3b6
d57bc5f
a4fb3b6
a856839
72c095c
c050171
1822e3f
72c095c
c050171
1822e3f
 
 
72c095c
 
 
 
 
 
 
 
 
d57bc5f
c050171
72c095c
d57bc5f
 
72c095c
 
a856839
 
72c095c
 
 
a856839
72c095c
 
00bba6b
 
 
 
a856839
00bba6b
 
 
 
 
 
a856839
72c095c
00bba6b
 
72c095c
 
 
 
 
 
 
a856839
72c095c
 
00bba6b
 
 
72c095c
00bba6b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from docx import Document
import tempfile

# Function to convert PDF to image with poppler_path configuration
def pdf_to_image(pdf_path, dpi=500, poppler_path=None):
    """Render every page of a PDF file as an image.

    Args:
        pdf_path: Path of the PDF file on disk.
        dpi: Rendering resolution handed to ``convert_from_path``.
            Defaults to 500, the value this function has always used.
        poppler_path: Directory containing the Poppler binaries. When
            omitted, the ``POPPLER_PATH`` environment variable is used if
            set; otherwise the legacy Windows install directory is used
            if it exists; otherwise ``None`` is passed so that pdf2image
            looks Poppler up on the system ``PATH``.

    Returns:
        The list of page images produced by ``convert_from_path``, or
        ``None`` if the conversion raised (the error is shown in the
        Streamlit UI via ``st.error``).
    """
    if poppler_path is None:
        # An empty environment value is treated the same as "not set".
        poppler_path = os.environ.get("POPPLER_PATH") or None
    if poppler_path is None:
        # Keep the historical default working on machines that have it,
        # without forcing a Windows-only path onto every other machine.
        legacy_poppler_dir = r"C:\Program Files\poppler-24.08.0\Library\bin"
        if os.path.isdir(legacy_poppler_dir):
            poppler_path = legacy_poppler_dir

    try:
        return convert_from_path(pdf_path, dpi, poppler_path=poppler_path)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        st.error(f"Error during PDF to image conversion: {str(e)}")
        return None

# Function to extract text from an image using pytesseract
def image_to_text(image):
    """Run OCR on a single image and return the recognised text.

    Args:
        image: The image to pass to ``pytesseract.image_to_string``.

    Returns:
        The string produced by pytesseract, or ``None`` if the call
        raised (the error is shown in the Streamlit UI via ``st.error``).
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as ocr_error:
        st.error(f"Error during image to text conversion: {str(ocr_error)}")
    return None

# Function to save text to a Word document
def save_to_word(text, file_name):
    """Write *text* into a new Word document stored in a temporary file.

    Args:
        text: The text to place in the document as a single paragraph.
        file_name: Name used as the prefix of the temporary file. Only
            its base name is used, so a value containing directory
            components cannot redirect or break temp-file creation.

    Returns:
        The path of the saved ``.docx`` file. The file is not deleted
        automatically; the caller is responsible for removing it.
    """
    # Drop control characters that XML 1.0 forbids (everything below 0x20
    # except tab, newline and carriage return). OCR output commonly
    # contains form feeds (\x0c) as page separators, and python-docx is
    # expected to reject such characters when building the document XML.
    xml_illegal_controls = dict.fromkeys(
        code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    )
    clean_text = text.translate(xml_illegal_controls)

    doc = Document()
    doc.add_paragraph(clean_text)

    # Reserve a unique path, then close the handle before python-docx
    # reopens the file by name: this avoids leaking the descriptor and
    # avoids writing to a file that is still held open.
    safe_prefix = os.path.basename(file_name)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.docx', prefix=safe_prefix) as temp_file:
        docx_path = temp_file.name

    doc.save(docx_path)
    return docx_path

# Best-effort removal of the temporary files created while converting.
def _discard_temp_file(path):
    """Delete *path*, ignoring OS errors (cleanup must never break the UI)."""
    try:
        os.remove(path)
    except OSError:
        pass


# Streamlit UI
st.title("PDF to Word Converter")
st.write("Upload a PDF to convert it to a Word document")

# File upload feature
uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

if uploaded_files:
    for file_index, uploaded_file in enumerate(uploaded_files):
        # pdf_to_image() needs a path on disk. NamedTemporaryFile creates
        # the file atomically, unlike the deprecated and race-prone
        # tempfile.mktemp(), which only returns a name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_file.getbuffer())
            temp_pdf_path = temp_pdf.name

        # Convert PDF to images; the temporary PDF is no longer needed
        # once conversion has finished, whether it succeeded or not.
        try:
            images = pdf_to_image(temp_pdf_path)
        finally:
            _discard_temp_file(temp_pdf_path)

        if not images:
            # Conversion failures are reported by pdf_to_image() itself.
            continue

        # Extract text from images, skipping pages that produced nothing.
        page_texts = [text for text in map(image_to_text, images) if text]
        extracted_text = "".join(f"{text}\n" for text in page_texts)

        if not extracted_text:
            st.warning(f"No text could be extracted from {uploaded_file.name}.")
            continue

        # Save the extracted text to Word, then load the document bytes:
        # st.download_button serves the *content* it is given, so passing
        # the path string would offer a file containing only that path.
        word_file = save_to_word(extracted_text, uploaded_file.name)
        try:
            with open(word_file, "rb") as docx_file:
                docx_bytes = docx_file.read()
        finally:
            _discard_temp_file(word_file)

        # Offer "report.docx" rather than "report.pdf.docx".
        download_name = f"{os.path.splitext(uploaded_file.name)[0]}.docx"

        st.success(f"Conversion of {uploaded_file.name} complete! Download the Word file below.")
        st.download_button(
            f"Download {uploaded_file.name} as Word",
            docx_bytes,
            file_name=download_name,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            # A per-upload key keeps widgets distinct even when two
            # uploads share the same file name (and thus the same label).
            key=f"download_{file_index}",
        )
else:
    st.write("Please upload PDF files to convert.")