Spaces:
Build error
Build error
| import streamlit as st | |
| import pandas as pd | |
| from pdf2image import convert_from_bytes | |
| import pytesseract | |
| from pytesseract import Output | |
| from io import BytesIO | |
| import base64 | |
# Set up the Streamlit app
st.title("PDF to Excel Converter with OCR")

# Start from an empty frame so the display code below has something to
# work with before any PDF has been uploaded.
combined_data = pd.DataFrame()

# Upload the PDF file
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Convert the PDF to images and use OCR to extract the text
if uploaded_file is not None:
    with st.spinner("Converting PDF to images..."):
        # getvalue() returns the full upload regardless of the buffer's
        # current read position, unlike read(), which returns nothing once
        # the stream has already been consumed.
        images = convert_from_bytes(uploaded_file.getvalue())
    st.success("PDF converted to images successfully!")

    with st.spinner("Extracting text from images using OCR..."):
        # One DataFrame per page image, built from Tesseract's dict output.
        page_frames = [
            pd.DataFrame(pytesseract.image_to_data(img, output_type=Output.DICT))
            for img in images
        ]
        # pd.concat raises ValueError on an empty list, so keep the empty
        # placeholder frame when the PDF produced no page images.
        if page_frames:
            combined_data = pd.concat(page_frames, ignore_index=True)
    st.success("Text extracted successfully!")

# Display the extracted text and create a download button for the Excel file
st.write(combined_data)
def to_excel(df):
    """Serialize *df* to an in-memory ``.xlsx`` workbook and return its bytes.

    The frame is written without its index to a single sheet named
    ``Sheet1`` using the ``openpyxl`` engine.

    Args:
        df: The ``pandas.DataFrame`` to export.

    Returns:
        The raw bytes of the generated workbook.
    """
    output = BytesIO()
    # The context manager finalizes the workbook on exit and also closes the
    # writer if ``df.to_excel`` raises, which a manual close() would skip.
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
    return output.getvalue()
def get_table_download_link(df):
    """Return an HTML anchor that downloads *df* as ``output.xlsx``.

    The frame is converted to workbook bytes with ``to_excel``, base64
    encoded, and embedded in the anchor's ``href`` as a ``data:`` URI.

    Args:
        df: The ``pandas.DataFrame`` to offer for download.

    Returns:
        The anchor markup as a string.
    """
    workbook_bytes = to_excel(df)
    encoded = base64.b64encode(workbook_bytes).decode()
    return (
        f'<a href="data:application/octet-stream;base64,{encoded}" '
        'download="output.xlsx">Download Excel file</a>'
    )
# Offer the download only once there are rows to export; until a PDF has
# been processed, combined_data is still the empty placeholder frame and a
# link would hand the user an empty workbook.
if not combined_data.empty:
    st.markdown(get_table_download_link(combined_data), unsafe_allow_html=True)