Spaces:
Build error
Build error
| import os | |
| import streamlit as st | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import pytesseract | |
| from docx import Document | |
| import tempfile | |
def pdf_to_image(pdf_path, dpi=500, poppler_path=None):
    """Render the pages of a PDF file to images.

    Args:
        pdf_path: Filesystem path of the PDF to render.
        dpi: Rendering resolution handed to ``convert_from_path``.
        poppler_path: Directory containing the Poppler binaries. When
            omitted, the ``POPPLER_PATH`` environment variable is used;
            failing that, the legacy Windows install directory is used if
            it exists; otherwise Poppler is looked up on ``PATH``.

    Returns:
        The list of images produced by ``convert_from_path``, or ``None``
        if the conversion failed (the error is shown via ``st.error``).
    """
    # Kept only as a fallback so existing Windows setups behave as before.
    legacy_windows_path = r"C:\Program Files\poppler-24.08.0\Library\bin"

    if poppler_path is None:
        poppler_path = os.environ.get("POPPLER_PATH") or None
    if poppler_path is None and os.path.isdir(legacy_windows_path):
        poppler_path = legacy_windows_path

    try:
        # poppler_path=None lets pdf2image find the Poppler tools on PATH,
        # which is what Linux/macOS hosts need.
        return convert_from_path(pdf_path, dpi, poppler_path=poppler_path)
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        st.error(f"Error during PDF to image conversion: {str(e)}")
        return None
def image_to_text(image):
    """Extract text from an image with pytesseract.

    Returns the string produced by ``pytesseract.image_to_string``. If that
    call raises, the error is shown via ``st.error`` and ``None`` is
    returned instead.
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as exc:
        st.error(f"Error during image to text conversion: {str(exc)}")
    return None
def save_to_word(text, file_name):
    """Write ``text`` into a new temporary .docx file and return its path.

    Args:
        text: Text to store as a single paragraph in the document.
        file_name: Name used as the prefix of the temporary file. Only its
            final path component is used.

    Returns:
        Path of the saved .docx file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    if text:
        # OCR output commonly carries control characters (e.g. the form feed
        # emitted after a page); these are not legal in XML and are expected
        # to make python-docx raise. Tab, newline and carriage return stay.
        illegal_controls = {
            code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
        }
        text = text.translate(illegal_controls)

    doc = Document()
    doc.add_paragraph(text)

    # A prefix containing a path separator would make tempfile target a
    # directory that does not exist, so keep only the base name.
    prefix = None
    if file_name is not None:
        prefix = os.path.basename(str(file_name).replace("\\", "/"))

    # Reserve a unique path, then close the handle before python-docx writes
    # to it so no file descriptor is leaked.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".docx", prefix=prefix
    ) as temp_file:
        output_path = temp_file.name

    doc.save(output_path)
    return output_path
# Streamlit UI: upload one or more PDFs, OCR every page, and offer the
# recognised text as a Word document download.
st.title("PDF to Word Converter")
st.write("Upload a PDF to convert it to a Word document")

# File upload feature
uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)

if uploaded_files:
    for index, uploaded_file in enumerate(uploaded_files):
        # Save the upload to a temporary file. NamedTemporaryFile creates the
        # file atomically; tempfile.mktemp is deprecated because another
        # process can claim the name before it is opened.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_file.getbuffer())
            temp_pdf_path = temp_pdf.name

        # Convert PDF to images, then drop the temporary PDF either way.
        try:
            images = pdf_to_image(temp_pdf_path)
        finally:
            try:
                os.remove(temp_pdf_path)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless

        if not images:
            # pdf_to_image has already reported any conversion error.
            continue

        # Extract text page by page, skipping pages where OCR returned nothing.
        page_texts = (image_to_text(img) for img in images)
        extracted_text = "".join(f"{text}\n" for text in page_texts if text)

        if not extracted_text:
            st.warning(f"No text could be extracted from {uploaded_file.name}.")
            continue

        # Save the extracted text to Word and load the document bytes.
        # st.download_button treats a str as the file *content*, so passing
        # the path would download a file containing only that path.
        word_file = save_to_word(extracted_text, uploaded_file.name)
        try:
            with open(word_file, "rb") as docx_file:
                docx_bytes = docx_file.read()
        finally:
            try:
                os.remove(word_file)
            except OSError:
                pass  # best-effort cleanup; a leftover temp file is harmless

        st.success(f"Conversion of {uploaded_file.name} complete! Download the Word file below.")
        st.download_button(
            f"Download {uploaded_file.name} as Word",
            docx_bytes,
            file_name=f"{uploaded_file.name}.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            # Explicit key keeps widgets distinct when two uploads share a name.
            key=f"download_{index}",
        )
else:
    st.write("Please upload PDF files to convert.")