Spaces:

darthPanda
/

table_detector

Runtime error

App Files Files Community

darthPanda committed on Dec 4, 2023

Commit

53e11fa

1 Parent(s): 340ec84

Upload 2 files

Browse files

Files changed (2) hide show

app.py +25 -16
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -1,11 +1,9 @@
 import streamlit as st
-from pdf2jpg import pdf2jpg
 import shutil
 import os
 from ultralytics import YOLO
 import shutil
-import os
-from tabula import read_pdf
 import pandas as pd
 import gdown
 import camelot
@@ -14,10 +12,13 @@ import camelot
 if os.path.exists('prediction') and os.path.isdir('prediction'):
     shutil.rmtree('prediction')
 # Check if the directory exists
-if not os.path.exists('temp.pdf_dir'):
     # If it does not exist, create it
-    os.makedirs('temp.pdf_dir')
     print('not_found')
 else:
     print('found')
@@ -49,16 +50,30 @@ def main():
         inputpath = "temp//temp.pdf"
-        outputpath = ""
         with st.spinner('Converting pdf to images...'):
-            result = pdf2jpg.convert_pdf2jpg(inputpath,outputpath, pages="ALL")
-        st.markdown('### Images of detected tables')
         with st.spinner('Detecting table in images...'):
-            for index, entry in enumerate(os.listdir('./temp.pdf_dir')):
                 print(entry)
                 # Construct the full file path
-                full_path = os.path.join('temp.pdf_dir', entry)
                 print(full_path)
                 results = model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
                 st.image(os.path.join(f'prediction//image_{index}',entry))
@@ -72,12 +87,6 @@ def main():
             # Print the tables or convert them to a different format like CSV
             for i, table in enumerate(tables):
                 st.dataframe(table.df)
-                # table.to_csv(f'table_{i}.csv') # Save to CSV
-            # tables = read_pdf(inputpath, pages='all', multiple_tables=True)
-            # for i, table in enumerate(tables):
-            #     print(f"Table {i+1}")
-            #     print(table)
-            #     st.dataframe(table)
         st.success('Processing Completed!')

 import streamlit as st
 import shutil
 import os
 from ultralytics import YOLO
 import shutil
+import fitz
 import pandas as pd
 import gdown
 import camelot
 if os.path.exists('prediction') and os.path.isdir('prediction'):
     shutil.rmtree('prediction')
+if os.path.exists('temp_pdf') and os.path.isdir('temp_pdf'):
+    shutil.rmtree('temp_pdf')
 # Check if the directory exists
+if not os.path.exists('temp_pdf'):
     # If it does not exist, create it
+    os.makedirs('temp_pdf')
     print('not_found')
 else:
     print('found')
         inputpath = "temp//temp.pdf"
+        st.markdown('### Images of detected tables')
         with st.spinner('Converting pdf to images...'):
+            doc = fitz.open(inputpath)
+            zoom = 4
+            mat = fitz.Matrix(zoom, zoom)
+            count = 0
+            for p in doc:
+                count += 1
+            for i in range(count):
+                val = f"image_{i}.png"
+                val = os.path.join('temp_pdf',val)
+                page = doc.load_page(i)
+                pix = page.get_pixmap(matrix=mat)
+                pix.save(val)
+            doc.close()
         with st.spinner('Detecting table in images...'):
+            # for index, entry in enumerate(os.listdir('./temp.pdf_dir')):
+            for index, entry in enumerate(os.listdir('./temp_pdf')):
                 print(entry)
                 # Construct the full file path
+                full_path = os.path.join('temp_pdf', entry)
                 print(full_path)
                 results = model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
                 st.image(os.path.join(f'prediction//image_{index}',entry))
             # Print the tables or convert them to a different format like CSV
             for i, table in enumerate(tables):
                 st.dataframe(table.df)
         st.success('Processing Completed!')

requirements.txt CHANGED Viewed

@@ -1,7 +1,6 @@
 streamlit==1.25.0
 ultralytics
-pdf2jpg
-tabula-py[jpype]
 gdown
 camelot-py[cv]
 PyPDF2<3.0

 streamlit==1.25.0
 ultralytics
+PyMuPDF
 gdown
 camelot-py[cv]
 PyPDF2<3.0