"""Streamlit app that OCRs uploaded PDF files and offers the text as Word documents."""

import os
import re
import tempfile

import pytesseract
import streamlit as st
from docx import Document
from pdf2image import convert_from_path
from PIL import Image

# Characters that are not allowed in XML 1.0 documents. OCR output may contain
# some of them (for example a form feed used as a page separator), and a .docx
# file is XML underneath, so they are removed before the text is written.
_XML_ILLEGAL_CHARS = re.compile(
    "[^\x09\x0a\x0d\x20-\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]"
)

_DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)


def pdf_to_image(pdf_path, dpi=500):
    """Render every page of a PDF as an image.

    Args:
        pdf_path: Path of the PDF file on disk.
        dpi: Rendering resolution handed to ``convert_from_path``. Defaults to
            500, the value this app has always used.

    Returns:
        The images returned by ``convert_from_path``, or ``None`` when the
        conversion raised (the error is reported through ``st.error``).
    """
    try:
        return convert_from_path(pdf_path, dpi)
    except Exception as e:  # UI boundary: report the failure instead of crashing the app.
        st.error(f"Error during PDF to image conversion: {e}")
        return None


def image_to_text(image):
    """Run OCR on one image.

    Args:
        image: Image accepted by ``pytesseract.image_to_string``.

    Returns:
        The recognized text, or ``None`` when OCR raised (the error is
        reported through ``st.error``).
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as e:  # UI boundary: report the failure instead of crashing the app.
        st.error(f"Error during image to text conversion: {e}")
        return None


def save_to_word(text, file_name):
    """Write ``text`` into a new temporary ``.docx`` file.

    Args:
        text: Text to store as a single paragraph. Characters that are not
            legal in XML are stripped first.
        file_name: Name used as the prefix of the temporary file. Only its
            final path component is used, so a name containing directory
            separators cannot redirect the file elsewhere.

    Returns:
        Path of the created ``.docx`` file. The file is not deleted
        automatically; the caller is responsible for removing it.
    """
    doc = Document()
    doc.add_paragraph(_XML_ILLEGAL_CHARS.sub("", text))

    # Reserve a unique path, then close the handle before saving: the handle
    # must not leak, and an open temporary file cannot be reopened by name on
    # every platform.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".docx", prefix=os.path.basename(file_name)
    ) as temp_file:
        docx_path = temp_file.name

    doc.save(docx_path)
    return docx_path


def _remove_quietly(path):
    """Delete ``path``, ignoring the error if it is already gone or locked."""
    try:
        os.remove(path)
    except OSError:
        pass  # Best-effort cleanup of a temporary file.


# Streamlit UI
st.title("PDF to Word Converter")
st.write("Upload a PDF to convert it to a Word document")

# File upload feature
uploaded_files = st.file_uploader(
    "Choose PDF files", type="pdf", accept_multiple_files=True
)

if uploaded_files:
    for index, uploaded_file in enumerate(uploaded_files):
        # Save the uploaded PDF to a securely created temporary file so that
        # convert_from_path can read it from disk.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_file.getbuffer())
            temp_pdf_path = temp_pdf.name

        # Convert PDF to images; the temporary PDF is not needed afterwards.
        try:
            images = pdf_to_image(temp_pdf_path)
        finally:
            _remove_quietly(temp_pdf_path)

        if not images:
            continue

        # Extract text from images, skipping pages whose OCR failed or was empty.
        page_texts = (image_to_text(img) for img in images)
        extracted_text = "".join(text + "\n" for text in page_texts if text)

        if not extracted_text:
            st.warning(f"No text could be extracted from {uploaded_file.name}.")
            continue

        # Save the extracted text to Word, then load the bytes so the download
        # button serves the document itself rather than its path.
        word_file = save_to_word(extracted_text, uploaded_file.name)
        try:
            with open(word_file, "rb") as docx_handle:
                word_bytes = docx_handle.read()
        finally:
            _remove_quietly(word_file)

        st.success(
            f"Conversion of {uploaded_file.name} complete! Download the Word file below."
        )
        st.download_button(
            f"Download {uploaded_file.name} as Word",
            word_bytes,
            file_name=f"{uploaded_file.name}.docx",
            mime=_DOCX_MIME,
            # Explicit key keeps the buttons distinct even if two uploads share a name.
            key=f"download_{index}_{uploaded_file.name}",
        )
else:
    st.write("Please upload PDF files to convert.")