File size: 2,685 Bytes
1168986
bdc3ab9
1168986
b1cf141
 
 
 
 
 
 
 
 
 
 
 
 
 
bdc3ab9
1168986
 
 
 
 
 
 
 
bdc3ab9
1168986
 
 
 
 
bdc3ab9
1168986
 
 
 
 
bdc3ab9
 
1168986
 
bdc3ab9
1168986
 
 
 
 
 
 
 
 
 
 
bdc3ab9
1168986
 
 
 
 
 
 
bdc3ab9
1168986
 
bdc3ab9
1168986
 
 
bdc3ab9
 
1168986
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import tempfile

import pandas as pd
import pytesseract
import streamlit as st
from docx import Document
from pdf2image import convert_from_path

# Path to your PDF file
pdf_path = "path_to_your_pdf.pdf"

# Path to the Poppler binaries (optional if Poppler is already on PATH)
poppler_path = r"C:\path\to\poppler\bin"  # Update this path as needed

# Constant-style alias of the same setting, so code that spells the name
# ``POPPLER_PATH`` resolves to the value configured above instead of failing
# with a NameError.
POPPLER_PATH = poppler_path

# NOTE(review): this sample conversion runs every time the module is loaded.
# Failures are reported and swallowed on purpose so that the definitions
# below are still created when the placeholder path does not exist.
try:
    # Convert PDF to images
    images = convert_from_path(pdf_path, poppler_path=poppler_path)
    print(f"Converted {len(images)} pages to images successfully!")
except Exception as e:
    print(f"An error occurred: {e}")


# Function to extract text from an image-based PDF
def extract_text_from_image_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is converted to one image per page with ``convert_from_path``
    and each image is passed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string made of one section per page, joined by newlines.
        Each section is ``"Page N:"`` (N counts from 1), a newline, and the
        text recognized on that page. An empty string if there are no pages.
    """
    # Read the Poppler location from the module-level ``poppler_path``
    # setting, which is the name the file actually defines.
    images = convert_from_path(pdf_path, poppler_path=poppler_path)
    return "\n".join(
        f"Page {page_num}:\n{pytesseract.image_to_string(image)}"
        for page_num, image in enumerate(images, start=1)
    )

# Function to save extracted text to a Word file
# Code points that XML 1.0 does not allow in character data: C0 control
# characters other than tab, line feed and carriage return, the surrogate
# range, and U+FFFE / U+FFFF. Used as a ``str.translate`` deletion table.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [c for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)]
    + list(range(0xD800, 0xE000))
    + [0xFFFE, 0xFFFF]
)


def save_text_to_word(text, output_path):
    """Write ``text`` as a single paragraph of a new Word document.

    A .docx file is XML underneath, and OCR output can contain control
    characters (for example a form feed, ``\\x0c``, between pages) that XML
    cannot represent; python-docx fails with ``ValueError`` when given them.
    Those characters are removed before the paragraph is added; everything
    else is written unchanged.

    Args:
        text: The text to store in the document.
        output_path: Destination passed to ``Document.save``.
    """
    doc = Document()
    doc.add_paragraph(text.translate(_XML_ILLEGAL_CHARS))
    doc.save(output_path)

# Function to save extracted text to an Excel file
# C0 control characters other than tab, line feed and carriage return.
# The openpyxl writer refuses cell values containing them
# (IllegalCharacterError). Used as a ``str.translate`` deletion table.
_EXCEL_ILLEGAL_CHARS = dict.fromkeys(
    c for c in range(0x20) if c not in (0x09, 0x0A, 0x0D)
)


def save_text_to_excel(text, output_path):
    """Write ``text`` to an Excel sheet, one row per line.

    The text is split on ``"\\n"`` and stored in a single column named
    ``"Text"``; the DataFrame index is not written. Control characters that
    a worksheet cell cannot hold (for example the form feed, ``\\x0c``, that
    OCR output can contain) are removed first so the export does not fail.

    Args:
        text: The text to export.
        output_path: Destination passed to ``DataFrame.to_excel``.
    """
    clean_text = text.translate(_EXCEL_ILLEGAL_CHARS)
    df = pd.DataFrame({"Text": clean_text.split("\n")})
    df.to_excel(output_path, index=False)

def _read_bytes(path):
    """Return the full contents of ``path`` as bytes, closing the file."""
    with open(path, "rb") as fh:
        return fh.read()


def main():
    """Render the Streamlit page: upload a PDF, OCR it, offer downloads.

    The uploaded PDF is written to a scratch directory, passed to
    ``extract_text_from_image_pdf``, and the result is shown in a text area.
    Two buttons generate a Word or an Excel export of the text and expose it
    through a download button. Any exception raised while extracting or
    exporting is shown with ``st.error`` instead of propagating.
    """
    st.title("PDF Image to Text Converter")
    st.write("Upload an image-based PDF to extract text and save as text, Word, or Excel format.")

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file is None:
        return

    with st.spinner("Processing..."):
        # A private scratch directory per run: a fixed file name in the
        # working directory would be overwritten by concurrent sessions, and
        # the generated exports would be left behind. Everything inside is
        # removed when the ``with`` block exits, including on errors.
        with tempfile.TemporaryDirectory() as work_dir:
            tmp_file_path = os.path.join(work_dir, "uploaded_file.pdf")
            with open(tmp_file_path, "wb") as f:
                # getvalue() returns the whole upload regardless of the
                # current read position of the in-memory buffer.
                f.write(uploaded_file.getvalue())

            try:
                extracted_text = extract_text_from_image_pdf(tmp_file_path)
                st.success("Text extracted successfully!")
                st.text_area("Extracted Text", extracted_text, height=300)

                # Options to download text in different formats. The exports
                # are handed to Streamlit as bytes so no file handle stays
                # open and the scratch files can be deleted afterwards.
                if st.button("Download as Word"):
                    docx_path = os.path.join(work_dir, "output.docx")
                    save_text_to_word(extracted_text, docx_path)
                    st.download_button("Download Word File", _read_bytes(docx_path), "output.docx")
                if st.button("Download as Excel"):
                    xlsx_path = os.path.join(work_dir, "output.xlsx")
                    save_text_to_excel(extracted_text, xlsx_path)
                    st.download_button("Download Excel File", _read_bytes(xlsx_path), "output.xlsx")

            except Exception as e:
                # Top-level UI boundary: report the failure to the user.
                st.error(f"An error occurred: {e}")

# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()