Spaces:

joehare
/

ocr-to-data

Build error

ocr-to-data / app.py

Joe Hare

First version tryna load a model

2ea29e7 over 2 years ago

1.61 kB

	import streamlit as st
	import pandas as pd
	from pdf2image import convert_from_bytes
	import pytesseract
	from pytesseract import Output
	from io import BytesIO
	import base64

	# Set up the Streamlit app
	st.title("PDF to Excel Converter with OCR")

	# Start with an empty frame so the display code below always has something
	# to render, even before a file has been uploaded.
	combined_data = pd.DataFrame()

	# Upload the PDF file
	uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

	# Convert the PDF to images and use OCR to extract the text
	if uploaded_file is not None:
	    with st.spinner("Converting PDF to images..."):
	        # getvalue() returns the whole buffer regardless of its current
	        # read position, whereas read() only returns what is left after it.
	        images = convert_from_bytes(uploaded_file.getvalue())

	    st.success("PDF converted to images successfully!")

	    with st.spinner("Extracting text from images using OCR..."):
	        # One DataFrame per page image, built from pytesseract's dict output.
	        page_frames = [
	            pd.DataFrame(pytesseract.image_to_data(img, output_type=Output.DICT))
	            for img in images
	        ]

	    if page_frames:
	        combined_data = pd.concat(page_frames, ignore_index=True)
	        st.success("Text extracted successfully!")
	    else:
	        # pd.concat raises ValueError on an empty list, so a PDF that yields
	        # no page images is reported instead of crashing the app.
	        st.warning("No pages found in the uploaded PDF.")

	# Display the extracted text and create a download button for the Excel file
	st.write(combined_data)

	def to_excel(df: pd.DataFrame) -> bytes:
	    """Serialize *df* to an in-memory .xlsx workbook and return its bytes.

	    The frame is written without its index to a single sheet named
	    ``Sheet1`` using the openpyxl engine.
	    """
	    output = BytesIO()
	    # The context manager closes (and thereby saves) the writer even if
	    # df.to_excel raises, so the workbook handle is never left open.
	    with pd.ExcelWriter(output, engine='openpyxl') as writer:
	        df.to_excel(writer, index=False, sheet_name='Sheet1')
	    return output.getvalue()

	def get_table_download_link(df):
	    """Build an HTML anchor that downloads *df* as ``output.xlsx``.

	    The workbook bytes produced by ``to_excel`` are base64-encoded and
	    embedded in a ``data:`` URI, so the returned string is self-contained
	    HTML.
	    """
	    workbook_bytes = to_excel(df)
	    encoded = base64.b64encode(workbook_bytes).decode()
	    return f'<a href="data:application/octet-stream;base64,{encoded}" download="output.xlsx">Download Excel file</a>'

	# Render the download anchor; unsafe_allow_html is required because the link is raw HTML.
	download_link_html = get_table_download_link(combined_data)
	st.markdown(download_link_html, unsafe_allow_html=True)