# Streamlit app: upload a PDF, run OCR on every page, show the recognised
# data as a table and offer it as an Excel (.xlsx) download.

import base64
from io import BytesIO

import pandas as pd
import pytesseract
import streamlit as st
from pdf2image import convert_from_bytes
from pytesseract import Output

# MIME type browsers expect for .xlsx workbooks.
XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


def to_excel(df: pd.DataFrame) -> bytes:
    """Serialize *df* to an in-memory .xlsx workbook and return its bytes.

    The frame is written without its index to a single sheet named
    ``Sheet1`` using the ``openpyxl`` engine.
    """
    output = BytesIO()
    # The context manager finalises the workbook even if to_excel() raises.
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')
    return output.getvalue()


def get_table_download_link(df: pd.DataFrame) -> str:
    """Return an HTML anchor that downloads *df* as an Excel file.

    The workbook is embedded in the link as a base64 ``data:`` URI, so the
    returned markup must be rendered with ``unsafe_allow_html=True``.
    """
    val = to_excel(df)
    # b64encode returns bytes; decode so the f-string does not embed "b'...'".
    b64 = base64.b64encode(val).decode("ascii")
    return (
        f'<a href="data:{XLSX_MIME};base64,{b64}" '
        f'download="extracted_data.xlsx">Download Excel file</a>'
    )


# Set up the Streamlit app
st.title("PDF to Excel Converter with OCR")
combined_data = pd.DataFrame()

# Upload the PDF file
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Convert the PDF to images and use OCR to extract the text
if uploaded_file is not None:
    with st.spinner("Converting PDF to images..."):
        # getvalue() returns the whole buffer regardless of the current
        # stream position, unlike read().
        images = convert_from_bytes(uploaded_file.getvalue())
    st.success("PDF converted to images successfully!")

    with st.spinner("Extracting text from images using OCR..."):
        # One DataFrame of OCR output per page image.
        page_frames = [
            pd.DataFrame(pytesseract.image_to_data(img, output_type=Output.DICT))
            for img in images
        ]
        # pd.concat raises ValueError on an empty list, so guard it.
        if page_frames:
            combined_data = pd.concat(page_frames, ignore_index=True)

    if page_frames:
        st.success("Text extracted successfully!")
    else:
        st.warning("The PDF contained no pages to process.")

# Display the extracted text and create a download button for the Excel file
st.write(combined_data)

# Only offer a download once there is something to download.
if not combined_data.empty:
    st.markdown(get_table_download_link(combined_data), unsafe_allow_html=True)