darthPanda committed on
Commit
121d037
·
1 Parent(s): 837517c

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +71 -0
  2. requirements.txt +5 -0
  3. temp/temp.pdf +0 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pdf2jpg import pdf2jpg
3
+ import shutil
4
+ import os
5
+ from ultralytics import YOLO
6
+ import shutil
7
+ import os
8
+ from tabula import read_pdf
9
+ import pandas as pd
10
+ import gdown
11
+
12
+
13
# Whenever this script runs, drop the annotated images a previous run saved
# under ``prediction`` so that only freshly produced results are displayed.
if os.path.isdir('prediction'):
    shutil.rmtree('prediction')

# Where the YOLO weights live locally and where they are fetched from.
MODEL_DIR = 'model'
MODEL_PATH = os.path.join(MODEL_DIR, 'best.pt')
MODEL_URL = "https://drive.google.com/uc?id=1zv3VDW-LXuesKLrTm6xSdKGrycutFdHb"

# Test for the weights file itself rather than its folder: if an earlier
# download was interrupted, the folder exists but the weights do not, and
# checking only the folder would skip the download forever.
if not os.path.isfile(MODEL_PATH):
    os.makedirs(MODEL_DIR, exist_ok=True)
    gdown.download(MODEL_URL, MODEL_PATH, quiet=False)
    if not os.path.isfile(MODEL_PATH):
        # Fail here with a clear message instead of letting the model
        # constructor below fail on a missing file.
        raise RuntimeError(
            f"Could not download model weights from {MODEL_URL} to {MODEL_PATH}"
        )

# Path the uploaded PDF is written to before it is processed.
temp_file_path = os.path.join('temp', 'temp.pdf')

# Detection model used by main(); loaded once per script run.
model = YOLO(MODEL_PATH)
27
+
28
def _page_order(filename):
    """Return a sort key that orders page images by a leading numeric index.

    Names such as ``10_x.jpg`` sort after ``2_x.jpg``. Names without a numeric
    prefix sort after all numbered ones, alphabetically.
    NOTE(review): assumes pdf2jpg names pages ``<index>_<pdf name>.jpg`` --
    confirm against the installed version; the fallback keeps the order
    deterministic either way.
    """
    prefix = filename.split('_', 1)[0]
    if prefix.isdecimal():
        return (0, int(prefix), filename)
    return (1, 0, filename)


def main():
    """Render the Streamlit page: upload a PDF, detect and extract its tables.

    The uploaded file is written to ``temp_file_path``, converted to page
    images with pdf2jpg, each image is passed to the module-level ``model``
    and the saved prediction image is displayed. Finally the tables returned
    by ``read_pdf`` are shown as dataframes.
    """
    # Set the title of the app
    st.title("Table detection")

    # Create a file uploader to upload PDF files
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:
        # Nothing to process until the user provides a file.
        return

    # Make sure the target folder exists before writing the upload to disk.
    temp_dir = os.path.dirname(temp_file_path)
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)
    with open(temp_file_path, 'wb') as f:
        f.write(uploaded_file.getbuffer())

    inputpath = temp_file_path
    outputpath = ""
    # Directory listed for page images after conversion.
    images_dir = 'temp.pdf_dir'

    # Start from an empty image directory so pages left over from an earlier
    # upload cannot be picked up by the listing below.
    if os.path.isdir(images_dir):
        shutil.rmtree(images_dir)

    with st.spinner('Converting pdf to images...'):
        pdf2jpg.convert_pdf2jpg(inputpath, outputpath, pages="ALL")

    st.markdown('### Images of detected tables')
    with st.spinner('Detecting table in images...'):
        # os.listdir() returns entries in arbitrary order; sort so that pages
        # are processed and displayed in document order.
        page_images = sorted(os.listdir(images_dir), key=_page_order)
        for index, entry in enumerate(page_images):
            # Construct the full file path
            full_path = os.path.join(images_dir, entry)
            print(full_path)
            model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
            # Show the image saved for this page under prediction/image_<index>/.
            st.image(os.path.join('prediction', f'image_{index}', entry))

    st.markdown('### Extracted data from tables')

    with st.spinner('Performing OCR on tables to extract images...'):
        tables = read_pdf(inputpath, pages='all', multiple_tables=True)
        for i, table in enumerate(tables):
            print(f"Table {i+1}")
            print(table)
            st.dataframe(table)

    st.success('Processing Completed!')
66
+
67
+ # st.image(os.listdir('temp.pdf_dir'))
68
+
69
# Entry point: build the page only when this file is executed as a script.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit==1.25.0
2
+ ultralytics
3
+ pdf2jpg
4
+ tabula-py[jpype]
5
+ gdown
temp/temp.pdf ADDED
Binary file (26.4 kB). View file