import os
import tempfile

import docx
import pdf2image
import pytesseract
import streamlit as st
from PIL import Image

# Point pytesseract at the standard Windows install location, but only when
# that executable actually exists. On any other machine the override is
# skipped so pytesseract keeps its own default command (`tesseract`, looked
# up on PATH) instead of being forced onto a path that is not there.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
  pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD

def extract_text_from_image_pdf(pdf_file):
  """Extract text from a PDF by rasterising its pages and running OCR on them.

  Args:
    pdf_file: Either a filesystem path to a PDF, or a binary file-like
      object holding the PDF (for example an in-memory upload or an open
      file handle).

  Returns:
    The OCR text of every page, in page order, each page followed by a
    newline. An empty string if the PDF yields no page images.

  Raises:
    OSError: If `pdf_file` is a path that cannot be opened for reading.
  """
  if hasattr(pdf_file, 'getvalue'):
    # In-memory buffers: getvalue() returns the whole content regardless of
    # the current stream position, so a previously-read buffer still works.
    pdf_bytes = pdf_file.getvalue()
  elif hasattr(pdf_file, 'read'):
    pdf_bytes = pdf_file.read()
  else:
    # Anything else is treated as a path, as before.
    with open(pdf_file, 'rb') as f:
      pdf_bytes = f.read()

  # One image per PDF page.
  images = pdf2image.convert_from_bytes(pdf_bytes)

  # OCR each page; the trailing newline keeps page texts visually separated.
  return ''.join(pytesseract.image_to_string(image) + '\n' for image in images)

def main():
  """Streamlit app: upload a PDF, OCR it, show the text and offer it as a download.

  Renders the page title, a PDF uploader and, once a file is uploaded, the
  extracted text together with a button that downloads it as a .txt file.
  """
  # Title and description
  st.title("PDF to Text Converter")
  st.subheader("Convert your PDF images to editable text documents.")

  # Upload PDF file
  uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
  if uploaded_file is None:
    return

  # The upload lives in memory; `uploaded_file.name` is only the file name
  # reported by the browser, not a path on this machine. Spool the bytes to a
  # temporary file so the extractor receives a path that really exists.
  # delete=False + explicit removal lets the file be reopened by name after
  # it is closed, which is required on Windows.
  temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
  try:
    with temp_pdf:
      temp_pdf.write(uploaded_file.getvalue())
    extracted_text = extract_text_from_image_pdf(temp_pdf.name)
  finally:
    os.remove(temp_pdf.name)

  # Display extracted text. st.text shows it verbatim; st.write would render
  # the OCR output as Markdown and mangle characters such as '#', '*' or '_'.
  st.success("Text extracted from PDF:")
  st.text(extracted_text)

  # Send the text to the user's browser. Writing a file with open() would
  # only create it on the machine running the app, not on the user's machine.
  downloaded = st.download_button(
    "Download text as .txt file",
    data=extracted_text,
    file_name="extracted_text.txt",
    mime="text/plain",
  )
  if downloaded:
    st.success("Text downloaded!")

# Start the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
  main()