Joe Hare committed on
Commit
2ea29e7
·
1 Parent(s): 9614a6f

First version tryna load a model

Browse files
Files changed (1) hide show
  1. app.py +49 -0
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from pdf2image import convert_from_bytes
4
+ import pytesseract
5
+ from pytesseract import Output
6
+ from io import BytesIO
7
+ import base64
8
+
9
# Streamlit front end: upload a PDF, rasterize its pages, OCR each page with
# Tesseract, and show the combined OCR table.
# NOTE(review): indentation was lost in this copy of the file; the nesting
# below is reconstructed from the statement order -- confirm against the
# original app.py.
st.title("PDF to Excel Converter with OCR")

# Empty placeholder so the display code below has a frame to show before
# anything has been uploaded.
combined_data = pd.DataFrame()

# Upload the PDF file
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# Convert the PDF to images and use OCR to extract the text
if uploaded_file is not None:
    with st.spinner("Converting PDF to images..."):
        images = convert_from_bytes(uploaded_file.read())

    st.success("PDF converted to images successfully!")

    with st.spinner("Extracting text from images using OCR..."):
        # One DataFrame per page image, built from the dict that
        # image_to_data returns for Output.DICT.
        page_frames = [
            pd.DataFrame(
                pytesseract.image_to_data(img, output_type=Output.DICT)
            )
            for img in images
        ]

        if page_frames:
            combined_data = pd.concat(page_frames, ignore_index=True)

    if page_frames:
        st.success("Text extracted successfully!")
    else:
        # pd.concat raises ValueError on an empty list, so report the
        # situation to the user instead of crashing the app.
        st.warning("No pages were found in the uploaded PDF.")

# Display the extracted OCR data (an empty frame until a PDF is processed)
st.write(combined_data)
35
+
36
def to_excel(df):
    """Serialize a DataFrame to an in-memory ``.xlsx`` workbook.

    Args:
        df: The pandas DataFrame to export.

    Returns:
        bytes: The workbook contents. The frame is written without its
        index to a single sheet named ``Sheet1`` using the openpyxl engine.
    """
    output = BytesIO()
    # The context manager saves and closes the writer even when
    # df.to_excel() raises, which a trailing manual close() does not.
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="Sheet1")
    return output.getvalue()
43
+
44
def get_table_download_link(df):
    """Build an HTML anchor that downloads ``df`` as ``output.xlsx``.

    The workbook bytes produced by ``to_excel`` are base64-encoded and
    embedded in a ``data:`` URI, so the link carries the file itself.

    Args:
        df: The pandas DataFrame to export.

    Returns:
        str: An ``<a>`` tag whose ``href`` is the base64 data URI.
    """
    workbook_bytes = to_excel(df)
    encoded = base64.b64encode(workbook_bytes).decode()
    return f'<a href="data:application/octet-stream;base64,{encoded}" download="output.xlsx">Download Excel file</a>'
48
+
49
# Offer the download link only once there are rows to export; before an
# upload combined_data is still the empty placeholder frame, and the link
# would serve an empty workbook.
if not combined_data.empty:
    # unsafe_allow_html is required so the raw <a> tag is rendered as HTML.
    st.markdown(get_table_download_link(combined_data), unsafe_allow_html=True)