Spaces:

joehare
/

ocr-to-data

Build error

App Files Files Community

Joe Hare committed on Jun 16, 2023

Commit

2ea29e7

1 Parent(s): 9614a6f

First version tryna load a model

Browse files

Files changed (1) hide show

app.py +49 -0

app.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import streamlit as st
+import pandas as pd
+from pdf2image import convert_from_bytes
+import pytesseract
+from pytesseract import Output
+from io import BytesIO
+import base64
+# Set up the Streamlit app
+st.title("PDF to Excel Converter with OCR")
+# Default to an empty frame so the display step below always has a
+# DataFrame to show, even before a file has been uploaded.
+combined_data = pd.DataFrame()
+# Upload the PDF file
+uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+# Convert the PDF to images and use OCR to extract the text
+if uploaded_file is not None:
+    with st.spinner("Converting PDF to images..."):
+        # getvalue() returns the whole upload regardless of the buffer's
+        # current read position, unlike read().
+        images = convert_from_bytes(uploaded_file.getvalue())
+    st.success("PDF converted to images successfully!")
+    with st.spinner("Extracting text from images using OCR..."):
+        # One DataFrame of OCR results per page image.
+        data = [
+            pd.DataFrame(pytesseract.image_to_data(img, output_type=Output.DICT))
+            for img in images
+        ]
+        if data:
+            combined_data = pd.concat(data, ignore_index=True)
+            st.success("Text extracted successfully!")
+        else:
+            # pd.concat raises ValueError on an empty list, so report the
+            # empty document instead of crashing the app.
+            st.warning("No pages were found in the uploaded PDF.")
+# Display the extracted text and create a download button for the Excel file
+st.write(combined_data)
+def to_excel(df: pd.DataFrame) -> bytes:
+    """Serialize *df* to an in-memory Excel workbook.
+
+    Args:
+        df: Frame to write; its index is not included in the sheet.
+
+    Returns:
+        The raw ``.xlsx`` bytes, with the data on a sheet named ``Sheet1``.
+    """
+    output = BytesIO()
+    # The context manager closes the writer even if to_excel raises
+    # part-way through, so the workbook handle is never leaked.
+    with pd.ExcelWriter(output, engine='openpyxl') as writer:
+        df.to_excel(writer, index=False, sheet_name='Sheet1')
+    return output.getvalue()
+def get_table_download_link(df):
+    """Return an HTML anchor that downloads *df* as ``output.xlsx``.
+
+    The workbook bytes produced by ``to_excel`` are base64-encoded and
+    embedded in a ``data:`` URI, so the link needs no server-side file.
+    """
+    encoded = base64.b64encode(to_excel(df)).decode()
+    return f'<a href="data:application/octet-stream;base64,{encoded}" download="output.xlsx">Download Excel file</a>'
+# Only offer the download once OCR has produced rows; before an upload
+# combined_data is empty and the link would serve an empty workbook.
+if not combined_data.empty:
+    st.markdown(get_table_download_link(combined_data), unsafe_allow_html=True)