Spaces:
Build error
Build error
| import streamlit as st | |
| from PyPDF2 import PdfReader | |
| from docx import Document | |
| from io import BytesIO | |
| from pdf2image import convert_from_bytes | |
| import pytesseract | |
| import time | |
| # Configure Tesseract path (if needed) | |
| # pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract' | |
class _PdfConversionError(ValueError):
    """Conversion failure whose message is meant to be shown to the user as-is."""


# Code points below U+0020 that XML 1.0 text may not contain; tab, line feed
# and carriage return are the only permitted ones.  Mapping to None makes
# str.translate() delete them.
_XML_ILLEGAL_CONTROLS = dict.fromkeys(
    code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
)


def _xml_safe(text):
    """Return *text* with the control characters XML cannot store removed."""
    return text.translate(_XML_ILLEGAL_CONTROLS)


def pdf_to_word(pdf_file, password=None):
    """Convert an uploaded PDF into a Word document, using OCR where needed.

    Each page's text layer is copied into its own paragraph.  A page with no
    extractable text is rendered to an image and passed through Tesseract; if
    OCR finds nothing either, a placeholder paragraph is written instead.
    Progress is reported through a Streamlit progress bar and status line.

    Args:
        pdf_file: Seekable binary file-like object exposing a ``type``
            attribute that holds its MIME type (the object returned by
            ``st.file_uploader`` is the intended input).
        password: Password for an encrypted PDF.  ``None`` or an empty string
            means "no password supplied".

    Returns:
        A ``BytesIO`` positioned at offset 0 that contains the ``.docx`` data.

    Raises:
        ValueError: If the upload is not a PDF, if it is encrypted and the
            password is missing or wrong, or if any step of the conversion
            fails.  Unexpected failures are reported as
            ``"An error occurred: ..."`` with the original exception chained
            as ``__cause__``.
    """
    try:
        if pdf_file.type != "application/pdf":
            raise _PdfConversionError("Invalid file type. Please upload a PDF file.")

        # Snapshot the raw bytes *before* parsing: the PDF parser moves the
        # stream position, so reading afterwards would hand the OCR fallback
        # truncated (or empty) data.  Both consumers work from this snapshot.
        pdf_file.seek(0)
        pdf_bytes = pdf_file.read()
        reader = PdfReader(BytesIO(pdf_bytes))

        ocr_options = {}
        if reader.is_encrypted:
            if not password:
                raise _PdfConversionError(
                    "The PDF is encrypted. Please provide a password."
                )
            try:
                unlocked = reader.decrypt(password)
            except Exception as e:
                raise _PdfConversionError(
                    "Failed to decrypt the PDF. Check the password."
                ) from e
            # decrypt() reports a wrong password through a falsy return value
            # rather than an exception, so the result has to be checked.
            if not unlocked:
                raise _PdfConversionError(
                    "Failed to decrypt the PDF. Check the password."
                )
            # The rasteriser opens the PDF on its own and needs the password too.
            ocr_options["userpw"] = password

        document = Document()
        total_pages = len(reader.pages)
        progress_bar = st.progress(0)
        status_text = st.empty()

        for page_number, page in enumerate(reader.pages, start=1):
            status_text.text(f"Processing page {page_number} of {total_pages}...")
            progress_bar.progress(page_number / total_pages)

            # Prefer the embedded text layer; it is faster and more accurate than OCR.
            text = page.extract_text()
            if text:
                document.add_paragraph(_xml_safe(text))
                continue

            # No text layer: render just this page and run OCR on it.
            images = convert_from_bytes(
                pdf_bytes,
                first_page=page_number,
                last_page=page_number,
                **ocr_options,
            )
            for image in images:
                # OCR output can carry control characters (e.g. a trailing
                # form feed) that the XML-based .docx format rejects.
                ocr_text = _xml_safe(pytesseract.image_to_string(image))
                if ocr_text.strip():
                    document.add_paragraph(ocr_text)
                else:
                    document.add_paragraph(
                        "[This page contains non-extractable content or images]"
                    )

        # Serialise to memory and rewind so the caller can read from the start.
        word_file = BytesIO()
        document.save(word_file)
        word_file.seek(0)
        return word_file
    except _PdfConversionError:
        # Already a clear, user-facing message; do not wrap it a second time.
        raise
    except Exception as e:
        raise ValueError(f"An error occurred: {e}") from e
# ---------------------------------------------------------------------------
# Streamlit page: upload a PDF, optionally supply its password, convert it and
# offer the resulting .docx for download.
# ---------------------------------------------------------------------------
# NOTE(review): the "π" used as page icon and title prefix looks like a
# mis-encoded emoji - confirm the intended glyph against the original file.
st.set_page_config(
    page_title="PDF to Word Converter",
    page_icon="π",
    layout="centered",
)

# Heading and one-line instructions.
st.title("π PDF to Word Converter")
st.write("Upload a PDF file to convert it into an editable Word document.")

# File picker restricted to PDFs; yields None until the user selects a file.
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

file_was_uploaded = uploaded_file is not None
if file_was_uploaded:
    # The password box is always shown; leaving it empty means "no password".
    password = st.text_input("Enter PDF password (if encrypted)", type="password")

    convert_requested = st.button("Convert to Word")
    if convert_requested:
        try:
            # Show a spinner for the duration of the conversion.
            with st.spinner("Converting PDF to Word..."):
                word_file = pdf_to_word(uploaded_file, password)

            # Conversion finished: announce it and expose the result.
            st.success("Conversion successful!")
            st.download_button(
                label="Download Word file",
                data=word_file,
                file_name="converted.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )
        except Exception as exc:
            # Report any failure in the page instead of crashing the app.
            st.error(f"Error: {exc}")