Spaces:

AzizWazir
/

PDF_image-Convertor

Build error

App Files Files Community

AzizWazir committed on Jan 5, 2025

Commit

a856839

verified ·

1 Parent(s): 401483c

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -13

app.py CHANGED Viewed

@@ -1,18 +1,21 @@
-import streamlit as st
 import os
 from pdf2image import convert_from_path
 from PIL import Image
 import pytesseract
-# Ensure Poppler is in the PATH (you might need to adjust this for your server)
-os.environ["PATH"] += ":/usr/local/bin"  # Update this with the correct Poppler path if needed
 def pdf_to_text(pdf_path):
     try:
         # Convert PDF to images
-        images = convert_from_path(pdf_path)
         text = ""
         for image in images:
             text += pytesseract.image_to_string(image)
@@ -21,30 +24,36 @@ def pdf_to_text(pdf_path):
         st.error(f"Error during PDF to image conversion: {e}")
         return None
 def main():
     st.title("PDF to Text Converter")
     # Upload PDF file
     uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
     if uploaded_file is not None:
         # Save uploaded file temporarily
         with open("uploaded_file.pdf", "wb") as f:
             f.write(uploaded_file.getbuffer())
         st.text("Converting PDF to text...")
         text = pdf_to_text("uploaded_file.pdf")
         if text:
             st.text_area("Extracted Text", text, height=300)
             # Create downloadable Word file
             word_file = "output.docx"
-            with open(word_file, "w") as f:
-                f.write(text)
             st.download_button("Download Word File", word_file)
 if __name__ == "__main__":
     main()

 import os
+import streamlit as st
 from pdf2image import convert_from_path
 from PIL import Image
 import pytesseract
+from docx import Document
+# Ensure Poppler's path is correct
+# Set the full path to Poppler's 'bin' directory (update this path according to your system)
+poppler_path = r'C:\poppler\bin'  # Update this with your actual Poppler path
 def pdf_to_text(pdf_path):
     try:
         # Convert PDF to images
+        images = convert_from_path(pdf_path, poppler_path=poppler_path)
         text = ""
+        # Extract text from each image using pytesseract
         for image in images:
             text += pytesseract.image_to_string(image)
         st.error(f"Error during PDF to image conversion: {e}")
         return None
+def save_text_to_word(text, filename="output.docx"):
+    # Create a Word document and write the text to it
+    doc = Document()
+    doc.add_paragraph(text)
+    doc.save(filename)
 def main():
     st.title("PDF to Text Converter")
     # Upload PDF file
     uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
     if uploaded_file is not None:
         # Save uploaded file temporarily
         with open("uploaded_file.pdf", "wb") as f:
             f.write(uploaded_file.getbuffer())
         st.text("Converting PDF to text...")
+        # Convert PDF to text
         text = pdf_to_text("uploaded_file.pdf")
         if text:
             st.text_area("Extracted Text", text, height=300)
             # Create downloadable Word file
             word_file = "output.docx"
+            save_text_to_word(text, word_file)
             st.download_button("Download Word File", word_file)
 if __name__ == "__main__":
     main()