"""Streamlit app that OCRs an image-based PDF and exports the text.

The uploaded PDF is rasterised page by page with pdf2image (Poppler), each
page image is passed through Tesseract via pytesseract, and the combined text
can be downloaded as a Word or an Excel file.

No conversion work is done at import time: Streamlit re-executes this script
on every widget interaction, so all processing lives inside ``main``.
"""

import io
import os
import re
import tempfile
from typing import BinaryIO, Union

import pandas as pd
import pytesseract
import streamlit as st
from docx import Document
from pdf2image import convert_from_path

# Directory containing the Poppler binaries. Set the POPPLER_PATH environment
# variable (or edit the default below) when Poppler is not on the system PATH.
_POPPLER_DIR = os.environ.get("POPPLER_PATH", r"C:\path\to\poppler\bin")
# pdf2image searches the system PATH when poppler_path is None, so fall back
# to that instead of passing a directory that does not exist.
POPPLER_PATH = _POPPLER_DIR if os.path.isdir(_POPPLER_DIR) else None

# Control characters that XML 1.0 forbids. Tesseract output usually contains
# form feeds (its default page separator), which python-docx and openpyxl
# reject when writing a document.
_ILLEGAL_XML_CHARS = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")

# Destination accepted by the save_* helpers: a filesystem path or an open
# binary stream.
_Destination = Union[str, "os.PathLike[str]", BinaryIO]


def _strip_illegal_xml_chars(text: str) -> str:
    """Return *text* without control characters that XML cannot represent."""
    return _ILLEGAL_XML_CHARS.sub("", text)


def extract_text_from_image_pdf(pdf_path):
    """Run OCR over every page of an image-based PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string containing, for each page in order, a ``Page N:`` header
        line followed by the text Tesseract recognised on that page. Pages
        are separated by a newline. An empty PDF yields an empty string.

    Raises:
        Whatever pdf2image or pytesseract raise, e.g. when Poppler or
        Tesseract is not installed or the file is not a readable PDF.
    """
    images = convert_from_path(pdf_path, poppler_path=POPPLER_PATH)
    return "\n".join(
        f"Page {page_num}:\n{pytesseract.image_to_string(image)}"
        for page_num, image in enumerate(images, start=1)
    )


def save_text_to_word(text: str, output_path: _Destination) -> None:
    """Write *text* as a single paragraph of a new Word document.

    Args:
        text: Text to store. Control characters that are illegal in XML are
            removed first, because they make the document writer fail.
        output_path: Destination path, or a writable binary file-like object.
    """
    doc = Document()
    doc.add_paragraph(_strip_illegal_xml_chars(text))
    doc.save(output_path)


def save_text_to_excel(text: str, output_path: _Destination) -> None:
    """Write *text* to an Excel sheet, one row per line, in a "Text" column.

    Args:
        text: Text to store. Control characters that are illegal in XML are
            removed first, because they make the workbook writer fail.
        output_path: Destination path, or a writable binary file-like object.
    """
    lines = _strip_illegal_xml_chars(text).split("\n")
    pd.DataFrame({"Text": lines}).to_excel(output_path, index=False)


def main():
    """Render the page: upload a PDF, OCR it, and offer Word/Excel downloads.

    NOTE(review): Streamlit reruns this function after every button click, so
    the OCR step is repeated on each interaction; consider caching the
    extracted text if large PDFs make that too slow.
    """
    st.title("PDF Image to Text Converter")
    st.write(
        "Upload an image-based PDF to extract text and save as text, Word, "
        "or Excel format."
    )

    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
    if uploaded_file is None:
        return

    tmp_file_path = None
    try:
        with st.spinner("Processing..."):
            # pdf2image needs a real file on disk. A unique temporary name
            # keeps concurrent sessions from overwriting each other's upload.
            with tempfile.NamedTemporaryFile(
                suffix=".pdf", delete=False
            ) as tmp_file:
                tmp_file_path = tmp_file.name
                # getvalue() returns the full upload regardless of the
                # stream's current read position.
                tmp_file.write(uploaded_file.getvalue())
            extracted_text = extract_text_from_image_pdf(tmp_file_path)

        st.success("Text extracted successfully!")
        st.text_area("Extracted Text", extracted_text, height=300)

        # Options to download text in different formats. The files are built
        # in memory so no shared output file or open handle is left behind.
        if st.button("Download as Word"):
            word_file = io.BytesIO()
            save_text_to_word(extracted_text, word_file)
            st.download_button(
                "Download Word File", word_file.getvalue(), "output.docx"
            )
        if st.button("Download as Excel"):
            excel_file = io.BytesIO()
            save_text_to_excel(extracted_text, excel_file)
            st.download_button(
                "Download Excel File", excel_file.getvalue(), "output.xlsx"
            )
    except Exception as e:
        # Top-level UI boundary: report the failure on the page instead of
        # showing a raw traceback.
        st.error(f"An error occurred: {e}")
    finally:
        if tmp_file_path is not None and os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)


if __name__ == "__main__":
    main()