Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Files Community

AzizWazir committed on Dec 29, 2024

Commit

03dbb03

verified ·

1 Parent(s): 46a8f59

Create app.py

Browse files

Files changed (1) hide show

app.py +53 -0

app.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import os
+import shutil
+import tempfile
+
+import docx
+import pdf2image
+import pytesseract
+import streamlit as st
+from PIL import Image
+# Set Tesseract path if not set already
+pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+def extract_text_from_image_pdf(pdf_file):
+  """Extracts text from a PDF by converting it to images and performing OCR."""
+  # Read the PDF file
+  with open(pdf_file, 'rb') as f:
+    pdf_bytes = f.read()
+  # Extract images from the PDF
+  images = pdf2image.convert_from_bytes(pdf_bytes)
+  # Perform OCR on each image and combine the text
+  extracted_text = ''
+  for image in images:
+    text = pytesseract.image_to_string(image)
+    extracted_text += text + '\n'  # Add newline for better readability
+  return extracted_text
+def main():
+  """Streamlit app for converting PDF images to text."""
+  # Title and description
+  st.title("PDF to Text Converter")
+  st.subheader("Convert your PDF images to editable text documents.")
+  # Upload PDF file
+  uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
+  if uploaded_file is not None:
+    # Extract text from the PDF
+    extracted_text = extract_text_from_image_pdf(uploaded_file.name)
+    # Display extracted text
+    st.success("Text extracted from PDF:")
+    st.write(extracted_text)
+    # Download option (optional)
+    if st.button("Download text as .txt file"):
+      with open("extracted_text.txt", "w") as f:
+        f.write(extracted_text)
+      st.success("Text downloaded!")
+if __name__ == "__main__":  # Only start the app when this file is executed as a script, not when imported.
+  main()  # Build the page: title, uploader, OCR output and download control.