Spaces:

AzizWazir
/

PDF_image-Convertor

Build error

App Files Community

AzizWazir committed on Jan 5, 2025

Commit

72c095c

verified ·

1 Parent(s): 757beef

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -39

app.py CHANGED Viewed

@@ -4,56 +4,63 @@ from pdf2image import convert_from_path
 from PIL import Image
 import pytesseract
 from docx import Document
-# Ensure Poppler's path is correct
-# Set the full path to Poppler's 'bin' directory (update this path according to your system)
-poppler_path = r'C:\poppler\bin'  # Update this with your actual Poppler path
-def pdf_to_text(pdf_path):
     try:
-        # Convert PDF to images
-        images = convert_from_path(pdf_path, poppler_path=poppler_path)
-        text = ""
-        # Extract text from each image using pytesseract
-        for image in images:
-            text += pytesseract.image_to_string(image)
         return text
     except Exception as e:
-        st.error(f"Error during PDF to image conversion: {e}")
         return None
-def save_text_to_word(text, filename="output.docx"):
-    # Create a Word document and write the text to it
     doc = Document()
     doc.add_paragraph(text)
-    doc.save(filename)
-def main():
-    st.title("PDF to Text Converter")
-    # Upload PDF file
-    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
-    if uploaded_file is not None:
-        # Save uploaded file temporarily
-        with open("uploaded_file.pdf", "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        st.text("Converting PDF to text...")
-        # Convert PDF to text
-        text = pdf_to_text("uploaded_file.pdf")
-        if text:
-            st.text_area("Extracted Text", text, height=300)
-            # Create downloadable Word file
-            word_file = "output.docx"
-            save_text_to_word(text, word_file)
-            st.download_button("Download Word File", word_file)
-if __name__ == "__main__":
-    main()

 from PIL import Image
 import pytesseract
 from docx import Document
+import tempfile
+# Folder path for PDFs
+pdf_folder_path = "D:/General"
+# Function to convert PDF to image
+def pdf_to_image(pdf_path):
     try:
+        images = convert_from_path(pdf_path, 500)
+        return images
+    except Exception as e:
+        st.error(f"Error during PDF to image conversion: {str(e)}")
+        return None
+# Function to extract text from an image using pytesseract
+def image_to_text(image):
+    try:
+        text = pytesseract.image_to_string(image)
         return text
     except Exception as e:
+        st.error(f"Error during image to text conversion: {str(e)}")
         return None
+# Function to save text to a Word document
+def save_to_word(text, file_name):
     doc = Document()
     doc.add_paragraph(text)
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx', prefix=file_name)
+    doc.save(temp_file.name)
+    return temp_file.name
+# Streamlit UI
+st.title("PDF to Word Converter")
+st.write("Converting PDFs from the D:/General folder")
+# Get all PDFs in the specified folder
+pdf_files = [f for f in os.listdir(pdf_folder_path) if f.lower().endswith('.pdf')]
+if pdf_files:
+    for pdf_file in pdf_files:
+        pdf_path = os.path.join(pdf_folder_path, pdf_file)
+        # Convert PDF to images
+        images = pdf_to_image(pdf_path)
+        if images:
+            # Extract text from images
+            extracted_text = ""
+            for img in images:
+                text = image_to_text(img)
+                if text:
+                    extracted_text += text + "\n"
+            # Save the extracted text to Word
+            if extracted_text:
+                word_file = save_to_word(extracted_text, pdf_file)
+                st.success(f"Conversion of {pdf_file} complete! Download the Word file below.")
+                st.download_button(f"Download {pdf_file} as Word", word_file, file_name=f"{pdf_file}.docx")
+else:
+    st.write("No PDFs found in the specified folder.")