Spaces:

AzizWazir
/

PDF-Convertor

Sleeping

App Files Files Community

AzizWazir committed on Dec 29, 2024

Commit

1168986

verified ·

1 Parent(s): bdc3ab9

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -38

app.py CHANGED Viewed

@@ -1,53 +1,66 @@
 import streamlit as st
 import pytesseract
 from PIL import Image
-import docx
-import pdf2image
-# Set Tesseract path if not set already
-pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
-def extract_text_from_image_pdf(pdf_file):
-  """Extracts text from a PDF by converting it to images and performing OCR."""
-  # Read the PDF file
-  with open(pdf_file, 'rb') as f:
-    pdf_bytes = f.read()
-  # Extract images from the PDF
-  images = pdf2image.convert_from_bytes(pdf_bytes)
-  # Perform OCR on each image and combine the text
-  extracted_text = ''
-  for image in images:
-    text = pytesseract.image_to_string(image)
-    extracted_text += text + '\n'  # Add newline for better readability
-  return extracted_text
 def main():
-  """Streamlit app for converting PDF images to text."""
-  # Title and description
-  st.title("PDF to Text Converter")
-  st.subheader("Convert your PDF images to editable text documents.")
-  # Upload PDF file
-  uploaded_file = st.file_uploader("Choose a PDF file to convert:", type="pdf")
-  if uploaded_file is not None:
-    # Extract text from the PDF
-    extracted_text = extract_text_from_image_pdf(uploaded_file.name)
-    # Display extracted text
-    st.success("Text extracted from PDF:")
-    st.write(extracted_text)
-    # Download option (optional)
-    if st.button("Download text as .txt file"):
-      with open("extracted_text.txt", "w") as f:
-        f.write(extracted_text)
-      st.success("Text downloaded!")
 if __name__ == "__main__":
-  main()

+import os
 import streamlit as st
+from pdf2image import convert_from_path
 import pytesseract
 from PIL import Image
+import pandas as pd
+from docx import Document
+# Paths to the poppler and tesseract binaries; adjust these to match your environment
+POPPLER_PATH = "/usr/bin"
+pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+# Function to extract text from an image-based PDF
+def extract_text_from_image_pdf(pdf_path):
+    images = convert_from_path(pdf_path, poppler_path=POPPLER_PATH)
+    extracted_text = []
+    for page_num, image in enumerate(images, start=1):
+        text = pytesseract.image_to_string(image)
+        extracted_text.append(f"Page {page_num}:\n{text}")
+    return "\n".join(extracted_text)
+# Function to save extracted text to a Word file
+def save_text_to_word(text, output_path):
+    doc = Document()
+    doc.add_paragraph(text)
+    doc.save(output_path)
+# Function to save extracted text to an Excel file
+def save_text_to_excel(text, output_path):
+    data = {"Text": text.split("\n")}
+    df = pd.DataFrame(data)
+    df.to_excel(output_path, index=False)
 def main():
+    st.title("PDF Image to Text Converter")
+    st.write("Upload an image-based PDF to extract text and save as text, Word, or Excel format.")
+    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
+    if uploaded_file is not None:
+        with st.spinner("Processing..."):
+            tmp_file_path = "uploaded_file.pdf"
+            with open(tmp_file_path, "wb") as f:
+                f.write(uploaded_file.read())
+            try:
+                extracted_text = extract_text_from_image_pdf(tmp_file_path)
+                st.success("Text extracted successfully!")
+                st.text_area("Extracted Text", extracted_text, height=300)
+                # Options to download text in different formats
+                if st.button("Download as Word"):
+                    save_text_to_word(extracted_text, "output.docx")
+                    st.download_button("Download Word File", open("output.docx", "rb"), "output.docx")
+                if st.button("Download as Excel"):
+                    save_text_to_excel(extracted_text, "output.xlsx")
+                    st.download_button("Download Excel File", open("output.xlsx", "rb"), "output.xlsx")
+            except Exception as e:
+                st.error(f"An error occurred: {e}")
+            finally:
+                if os.path.exists(tmp_file_path):
+                    os.remove(tmp_file_path)
 if __name__ == "__main__":
+    main()