Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

File size: 1,528 Bytes

f4a8154
c8e848b
 
2df8377
 
529e2f8
2df8377
 
5432d3d
2df8377
 
c8e848b
2df8377
 
 
 
 
 
 
 
 
 
 
 
 
 
c8e848b
 
2df8377
c8e848b
2df8377
 
 
c8e848b
2df8377
 
c8e848b
2df8377
 
 
c8e848b
2df8377
 
 
c8e848b
2df8377
 
 
 
 
da16d5a
f4a8154
2df8377

import os
import tempfile

import docx
import pdf2image
import pytesseract
import streamlit as st
from PIL import Image

# Default Windows install location of the Tesseract binary. Only override
# pytesseract's default (the `tesseract` command resolved via PATH) when this
# file actually exists, so OCR keeps working on Linux/macOS hosts and on
# Windows machines where Tesseract is installed elsewhere but is on PATH.
_WINDOWS_TESSERACT_PATH = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_PATH):
  pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_PATH

def extract_text_from_image_pdf(pdf_file):
  """Extract text from a PDF by rendering its pages to images and running OCR.

  Args:
    pdf_file: The PDF to process. May be a filesystem path (``str`` or
      path-like), the raw PDF content as ``bytes``/``bytearray``, or a binary
      file-like object exposing ``getvalue()`` or ``read()`` (for example an
      in-memory upload).

  Returns:
    The OCR text of every page in page order, each page's text followed by a
    newline. An empty string if the PDF yields no page images.
  """
  # Normalise every supported input kind to the raw PDF bytes.
  if isinstance(pdf_file, (bytes, bytearray)):
    pdf_bytes = bytes(pdf_file)
  elif hasattr(pdf_file, 'getvalue'):
    # BytesIO-like objects: getvalue() returns the whole buffer regardless of
    # the current stream position, so a previously-read upload still works.
    pdf_bytes = pdf_file.getvalue()
  elif hasattr(pdf_file, 'read'):
    pdf_bytes = pdf_file.read()
  else:
    with open(pdf_file, 'rb') as f:
      pdf_bytes = f.read()

  # Render each PDF page to a PIL image.
  images = pdf2image.convert_from_bytes(pdf_bytes)

  # OCR each page; the trailing newline after every page keeps page
  # boundaries readable in the combined text.
  return ''.join(pytesseract.image_to_string(image) + '\n' for image in images)

def main():
  """Run the Streamlit app that converts an uploaded image PDF to text.

  Renders the page title and a PDF uploader. Once a file is uploaded, its
  content is OCR'd with ``extract_text_from_image_pdf``, the resulting text is
  shown on the page, and a button offers it to the user as a ``.txt`` download.
  """

  # Title and description
  st.title("PDF to Text Converter")
  st.subheader("Convert your PDF images to editable text documents.")

  # Upload PDF file
  uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
  if uploaded_file is None:
    return

  # The upload lives in memory; `uploaded_file.name` is only the client-side
  # file name, not a path on this machine. Spool the bytes to a temporary file
  # so the extractor can open a real path. `delete=False` plus an explicit
  # remove is used because a still-open NamedTemporaryFile cannot be reopened
  # by name on Windows.
  tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
  try:
    with tmp:
      tmp.write(uploaded_file.getvalue())
    extracted_text = extract_text_from_image_pdf(tmp.name)
  finally:
    os.remove(tmp.name)

  # Display extracted text
  st.success("Text extracted from PDF:")
  st.write(extracted_text)

  # Offer the text as a real browser download instead of writing a file into
  # the server's working directory, where the user could never retrieve it.
  downloaded = st.download_button(
    "Download text as .txt file",
    data=extracted_text,
    file_name="extracted_text.txt",
    mime="text/plain",
  )
  if downloaded:
    st.success("Text downloaded!")

# Script entry point: build the app by calling main() only when this module is
# executed as the main program, not when it is imported.
if __name__ == "__main__":
  main()