darthPanda committed on
Commit
53e11fa
·
1 Parent(s): 340ec84

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +25 -16
  2. requirements.txt +1 -2
app.py CHANGED
@@ -1,11 +1,9 @@
1
  import streamlit as st
2
- from pdf2jpg import pdf2jpg
3
  import shutil
4
  import os
5
  from ultralytics import YOLO
6
  import shutil
7
- import os
8
- from tabula import read_pdf
9
  import pandas as pd
10
  import gdown
11
  import camelot
@@ -14,10 +12,13 @@ import camelot
14
  if os.path.exists('prediction') and os.path.isdir('prediction'):
15
  shutil.rmtree('prediction')
16
 
 
 
 
17
  # Check if the directory exists
18
- if not os.path.exists('temp.pdf_dir'):
19
  # If it does not exist, create it
20
- os.makedirs('temp.pdf_dir')
21
  print('not_found')
22
  else:
23
  print('found')
@@ -49,16 +50,30 @@ def main():
49
 
50
 
51
  inputpath = "temp//temp.pdf"
52
- outputpath = ""
53
  with st.spinner('Converting pdf to images...'):
54
- result = pdf2jpg.convert_pdf2jpg(inputpath,outputpath, pages="ALL")
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- st.markdown('### Images of detected tables')
57
  with st.spinner('Detecting table in images...'):
58
- for index, entry in enumerate(os.listdir('./temp.pdf_dir')):
 
 
59
  print(entry)
60
  # Construct the full file path
61
- full_path = os.path.join('temp.pdf_dir', entry)
62
  print(full_path)
63
  results = model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
64
  st.image(os.path.join(f'prediction//image_{index}',entry))
@@ -72,12 +87,6 @@ def main():
72
  # Print the tables or convert them to a different format like CSV
73
  for i, table in enumerate(tables):
74
  st.dataframe(table.df)
75
- # table.to_csv(f'table_{i}.csv') # Save to CSV
76
- # tables = read_pdf(inputpath, pages='all', multiple_tables=True)
77
- # for i, table in enumerate(tables):
78
- # print(f"Table {i+1}")
79
- # print(table)
80
- # st.dataframe(table)
81
 
82
  st.success('Processing Completed!')
83
 
 
1
  import streamlit as st
 
2
  import shutil
3
  import os
4
  from ultralytics import YOLO
5
  import shutil
6
+ import fitz
 
7
  import pandas as pd
8
  import gdown
9
  import camelot
 
12
  if os.path.exists('prediction') and os.path.isdir('prediction'):
13
  shutil.rmtree('prediction')
14
 
15
+ if os.path.exists('temp_pdf') and os.path.isdir('temp_pdf'):
16
+ shutil.rmtree('temp_pdf')
17
+
18
  # Check if the directory exists
19
+ if not os.path.exists('temp_pdf'):
20
  # If it does not exist, create it
21
+ os.makedirs('temp_pdf')
22
  print('not_found')
23
  else:
24
  print('found')
 
50
 
51
 
52
  inputpath = "temp//temp.pdf"
53
+ st.markdown('### Images of detected tables')
54
  with st.spinner('Converting pdf to images...'):
55
+ doc = fitz.open(inputpath)
56
+ zoom = 4
57
+ mat = fitz.Matrix(zoom, zoom)
58
+ count = 0
59
+ for p in doc:
60
+ count += 1
61
+ for i in range(count):
62
+ val = f"image_{i}.png"
63
+ val = os.path.join('temp_pdf',val)
64
+ page = doc.load_page(i)
65
+ pix = page.get_pixmap(matrix=mat)
66
+ pix.save(val)
67
+ doc.close()
68
 
69
+
70
  with st.spinner('Detecting table in images...'):
71
+
72
+ # for index, entry in enumerate(os.listdir('./temp.pdf_dir')):
73
+ for index, entry in enumerate(os.listdir('./temp_pdf')):
74
  print(entry)
75
  # Construct the full file path
76
+ full_path = os.path.join('temp_pdf', entry)
77
  print(full_path)
78
  results = model.predict(full_path, save=True, project="prediction", name=f'image_{index}')
79
  st.image(os.path.join(f'prediction//image_{index}',entry))
 
87
  # Print the tables or convert them to a different format like CSV
88
  for i, table in enumerate(tables):
89
  st.dataframe(table.df)
 
 
 
 
 
 
90
 
91
  st.success('Processing Completed!')
92
 
requirements.txt CHANGED
@@ -1,7 +1,6 @@
1
  streamlit==1.25.0
2
  ultralytics
3
- pdf2jpg
4
- tabula-py[jpype]
5
  gdown
6
  camelot-py[cv]
7
  PyPDF2<3.0
 
1
  streamlit==1.25.0
2
  ultralytics
3
+ PyMuPDF
 
4
  gdown
5
  camelot-py[cv]
6
  PyPDF2<3.0