Spaces:

darthPanda
/

table_detector

Runtime error

App Files Files Community

darthPanda committed on Dec 4, 2023

Commit

121d037

1 Parent(s): 837517c

Upload 3 files

Browse files

Files changed (3) hide show

app.py +71 -0
requirements.txt +5 -0
temp/temp.pdf +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import streamlit as st
+from pdf2jpg import pdf2jpg
+import shutil
+import os
+from ultralytics import YOLO
+import shutil
+import os
+from tabula import read_pdf
+import pandas as pd
+import gdown
+if os.path.exists('prediction') and os.path.isdir('prediction'):
+    shutil.rmtree('prediction')
+# Check if the directory exists
+if not os.path.exists('model'):
+    # If it does not exist, create it
+    os.makedirs('model')
+    url = "https://drive.google.com/uc?id=1zv3VDW-LXuesKLrTm6xSdKGrycutFdHb"
+    output = "model//best.pt"
+    gdown.download(url, output, quiet=False)
+temp_file_path = 'temp//temp.pdf'
+model = YOLO('model//best.pt')
+def main():
+    # Set the title of the app
+    st.title("Table detection")
+    # Create a file uploader to upload PDF files
+    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+    if uploaded_file is not None:
+        # Create a temporary directory
+        with open(temp_file_path, 'wb') as f:
+            f.write(uploaded_file.getbuffer())
+        inputpath = "temp//temp.pdf"
+        outputpath = ""
+        with st.spinner('Converting pdf to images...'):
+            result = pdf2jpg.convert_pdf2jpg(inputpath,outputpath, pages="ALL")
+        st.markdown('### Images of detected tables')
+        with st.spinner('Detecting table in images...'):
+            for index, entry in enumerate(os.listdir('temp.pdf_dir')):
+                # Construct the full file path
+                full_path = os.path.join('temp.pdf_dir', entry)
+                print(full_path)
+                results = model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
+                st.image(os.path.join(f'prediction//image_{index}',entry))
+        st.markdown('### Extracted data from tables')
+        with st.spinner('Performing OCR on tables to extract images...'):
+            tables = read_pdf(inputpath, pages='all', multiple_tables=True)
+            for i, table in enumerate(tables):
+                print(f"Table {i+1}")
+                print(table)
+                st.dataframe(table)
+        st.success('Processing Completed!')
+        # st.image(os.listdir('temp.pdf_dir'))
+# Run the app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+streamlit==1.25.0
+ultralytics
+pdf2jpg
+tabula-py[jpype]
+gdown

temp/temp.pdf ADDED Viewed

Binary file (26.4 kB). View file