File size: 1,528 Bytes
f4a8154
c8e848b
 
2df8377
 
529e2f8
2df8377
 
5432d3d
2df8377
 
c8e848b
2df8377
 
 
 
 
 
 
 
 
 
 
 
 
 
c8e848b
 
2df8377
c8e848b
2df8377
 
 
c8e848b
2df8377
 
c8e848b
2df8377
 
 
c8e848b
2df8377
 
 
c8e848b
2df8377
 
 
 
 
da16d5a
f4a8154
2df8377
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import tempfile

import docx
import pdf2image
import pytesseract
import streamlit as st
from PIL import Image

# Default install location of the Tesseract binary on Windows.
WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Override pytesseract's command only when that executable really exists.
# On other machines (Linux/macOS, or a different install location) the
# library default is kept, so `tesseract` is resolved from PATH instead of
# being pointed at a file that is not there.
if os.path.isfile(WINDOWS_TESSERACT_CMD):
  pytesseract.pytesseract.tesseract_cmd = WINDOWS_TESSERACT_CMD

def extract_text_from_image_pdf(pdf_file):
  """Extract text from a PDF by rendering its pages to images and running OCR.

  Args:
    pdf_file: Either a filesystem path to the PDF, or a binary file-like
      object exposing ``read()`` (for example an in-memory upload). A
      file-like object is read from its current position.

  Returns:
    The OCR text of every page, in page order, each page followed by a
    newline. An empty string if the PDF yields no page images.
  """

  # Get the raw PDF bytes, from the stream if given one, otherwise from disk.
  if hasattr(pdf_file, 'read'):
    pdf_bytes = pdf_file.read()
  else:
    with open(pdf_file, 'rb') as f:
      pdf_bytes = f.read()

  # Render every page of the PDF to an image.
  images = pdf2image.convert_from_bytes(pdf_bytes)

  # OCR each page; the trailing newline keeps pages visually separated.
  return ''.join(pytesseract.image_to_string(image) + '\n' for image in images)

def main():
  """Streamlit app for converting PDF images to text.

  Shows a PDF uploader; once a file is uploaded, runs OCR over it, displays
  the extracted text and offers it to the user as a .txt download.
  """

  # Title and description
  st.title("PDF to Text Converter")
  st.subheader("Convert your PDF images to editable text documents.")

  # Upload PDF file
  uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
  if uploaded_file is None:
    return

  # The upload is held in memory; `uploaded_file.name` is only the client's
  # filename and does not point to a file on this machine. Spool the bytes
  # to a temporary file so the extractor gets a path it can actually open.
  with tempfile.TemporaryDirectory() as tmp_dir:
    pdf_path = os.path.join(tmp_dir, "upload.pdf")
    with open(pdf_path, "wb") as f:
      f.write(uploaded_file.getvalue())
    extracted_text = extract_text_from_image_pdf(pdf_path)

  # Display extracted text
  st.success("Text extracted from PDF:")
  st.write(extracted_text)

  # Send the text to the user's browser. Writing a file with open() would
  # only create it in the server's working directory, not on the user's
  # machine.
  st.download_button(
    "Download text as .txt file",
    data=extracted_text,
    file_name="extracted_text.txt",
    mime="text/plain",
  )

# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
  main()