darthPanda committed on
Commit
340ec84
·
1 Parent(s): 6d63784

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +13 -5
  2. requirements.txt +3 -1
app.py CHANGED
@@ -8,6 +8,7 @@ import os
8
  from tabula import read_pdf
9
  import pandas as pd
10
  import gdown
 
11
 
12
 
13
  if os.path.exists('prediction') and os.path.isdir('prediction'):
@@ -48,7 +49,7 @@ def main():
48
 
49
 
50
  inputpath = "temp//temp.pdf"
51
- outputpath = "temp_pdf"
52
  with st.spinner('Converting pdf to images...'):
53
  result = pdf2jpg.convert_pdf2jpg(inputpath,outputpath, pages="ALL")
54
 
@@ -65,11 +66,18 @@ def main():
65
  st.markdown('### Extracted data from tables')
66
 
67
  with st.spinner('Performing OCR on tables to extract images...'):
68
- tables = read_pdf(inputpath, pages='all', multiple_tables=True)
 
 
 
69
  for i, table in enumerate(tables):
70
- print(f"Table {i+1}")
71
- print(table)
72
- st.dataframe(table)
 
 
 
 
73
 
74
  st.success('Processing Completed!')
75
 
 
8
  from tabula import read_pdf
9
  import pandas as pd
10
  import gdown
11
+ import camelot
12
 
13
 
14
  if os.path.exists('prediction') and os.path.isdir('prediction'):
 
49
 
50
 
51
  inputpath = "temp//temp.pdf"
52
+ outputpath = ""
53
  with st.spinner('Converting pdf to images...'):
54
  result = pdf2jpg.convert_pdf2jpg(inputpath,outputpath, pages="ALL")
55
 
 
66
  st.markdown('### Extracted data from tables')
67
 
68
  with st.spinner('Performing OCR on tables to extract images...'):
69
+ # Extract tables from the PDF
70
+ tables = camelot.read_pdf(inputpath, pages='all', flavor='stream')
71
+
72
+ # Print the tables or convert them to a different format like CSV
73
  for i, table in enumerate(tables):
74
+ st.dataframe(table.df)
75
+ # table.to_csv(f'table_{i}.csv') # Save to CSV
76
+ # tables = read_pdf(inputpath, pages='all', multiple_tables=True)
77
+ # for i, table in enumerate(tables):
78
+ # print(f"Table {i+1}")
79
+ # print(table)
80
+ # st.dataframe(table)
81
 
82
  st.success('Processing Completed!')
83
 
requirements.txt CHANGED
@@ -2,4 +2,6 @@ streamlit==1.25.0
2
  ultralytics
3
  pdf2jpg
4
  tabula-py[jpype]
5
- gdown
 
 
 
2
  ultralytics
3
  pdf2jpg
4
  tabula-py[jpype]
5
+ gdown
6
+ camelot-py[cv]
7
+ PyPDF2<3.0