tjxj committed on
Commit
224e0a3
·
1 Parent(s): 6e9c9e4
.history/app_20220621074625.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# The select box yields the *strings* 'True' / 'False'. It is converted to a
# real boolean below because the non-empty string 'False' is truthy.
process_background = st.selectbox("表格线条是否隐藏", ('True', 'False'))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default flavor;
    # process_background asks it to also process background lines.
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=(process_background == 'True'),
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621074627.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# The select box yields the *strings* 'True' / 'False'. It is converted to a
# real boolean below because the non-empty string 'False' is truthy.
process_background = st.selectbox("表格线条是否隐藏", ('True', 'False'))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default flavor;
    # process_background asks it to also process background lines.
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=(process_background == 'True'),
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621075004.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# NOTE(review): this selection is not forwarded to camelot in this revision;
# the widget is kept so the page layout stays the same.
process_background = st.selectbox("表格线条是否隐藏", ('True', 'False'))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default settings.
    tables = cam.read_pdf("input.pdf", pages=page_number.strip())

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621075036.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# The select box yields the *strings* 'True' / 'False'. It is converted to a
# real boolean below because the non-empty string 'False' is truthy.
process_background = st.selectbox("表格线条是否隐藏", ('True', 'False'))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default flavor;
    # process_background asks it to also process background lines.
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=(process_background == 'True'),
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621075144.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# The select box yields the *strings* 'True' / 'False'. It is converted to a
# real boolean below because the non-empty string 'False' is truthy.
process_background = st.selectbox("表格线条是否隐藏", ('True', 'False'))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default flavor;
    # process_background asks it to also process background lines.
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=(process_background == 'True'),
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621075237.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# The select box yields the *strings* 'True' / 'False'. It is converted to a
# real boolean below because the non-empty string 'False' is truthy.
background = st.selectbox("表格线条是否隐藏", ('True', 'False'))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default flavor;
    # process_background asks it to also process background lines.
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=(background == 'True'),
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621075718.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# The select box yields the *strings* 'True' / 'False'. It is converted to a
# real boolean below because the non-empty string 'False' is truthy.
background = st.selectbox("表格线条是否隐藏", ('True', 'False'))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default flavor;
    # process_background asks it to also process background lines.
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=(background == 'True'),
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621075907.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background.
background = st.selectbox("表格线条是否隐藏", (True, False))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # Parse the requested page(s) with camelot's default flavor;
    # process_background asks it to also process background lines.
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")
.history/app_20220621095309.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background.
background = st.selectbox("表格线条是否隐藏", (True, False))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    # camelot takes the *string* "all"; the bare name `all` is the builtin.
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")
.history/app_20220621095310.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background.
background = st.selectbox("表格线条是否隐藏", (True, False))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    # camelot takes the *string* "all"; the bare name `all` is the builtin.
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")
.history/app_20220621095327.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background.
background = st.selectbox("表格线条是否隐藏", (True, False))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    # camelot takes the *string* "all"; the bare name `all` is the builtin.
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")
.history/app_20220621095508.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    # camelot takes the *string* "all"; the bare name `all` is the builtin.
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")
.history/app_20220621095552.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    # Closing the writer (via the context manager) is what puts the workbook
    # on disk before it is reopened below.
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")
.history/app_20220621100215.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    # Closing the writer (via the context manager) is what puts the workbook
    # on disk before it is reopened below.
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")


# Placeholder buttons: they only echo a message and are not yet connected
# to the extraction code above.
if st.button('单页抽取'):
    st.write('Why hello there')
if st.button('全文抽取'):
    st.write('全文抽取')
.history/app_20220621100535.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    # Closing the writer (via the context manager) is what puts the workbook
    # on disk before it is reopened below.
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")


# Two content columns separated by narrow spacer columns.
row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
# The column bodies are still empty in this revision; `pass` keeps the
# otherwise empty `with` blocks syntactically valid.
with row9_1:
    pass
with row9_2:
    pass

# Placeholder buttons: they only echo a message and are not yet connected
# to the extraction code above.
if st.button('单页抽取'):
    st.write('Why hello there')
if st.button('全文抽取'):
    st.write('全文抽取')
.history/app_20220621100545.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    # Closing the writer (via the context manager) is what puts the workbook
    # on disk before it is reopened below.
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")


# Two content columns separated by narrow spacer columns; the column bodies
# are intentionally empty placeholders in this revision.
row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
with row9_1:
    pass
with row9_2:
    pass

# Placeholder buttons: they only echo a message and are not yet connected
# to the extraction code above.
if st.button('单页抽取'):
    st.write('Why hello there')
if st.button('全文抽取'):
    st.write('全文抽取')
.history/app_20220621100559.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, then offer two Excel downloads: the first
# table on the chosen page, and every table found in the whole document.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document: one sheet per detected table ---
    # NOTE(review): the sheet-writing loop was commented out in this revision
    # although the tables were still parsed and the workbook still offered
    # for download; it is restored so the download has content. Confirm.
    tables_all = cam.read_pdf("input.pdf", pages="all", process_background=background)
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter') as writer_all:
        for sheet_index, table in enumerate(tables_all):
            table.df.to_excel(writer_all, sheet_name=str(sheet_index), index=False)

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")


# Two content columns separated by narrow spacer columns; the column bodies
# are intentionally empty placeholders in this revision.
row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
with row9_1:
    pass
with row9_2:
    pass

# Placeholder buttons: they only echo a message and are not yet connected
# to the extraction code above.
if st.button('单页抽取'):
    st.write('Why hello there')
if st.button('全文抽取'):
    st.write('全文抽取')
.history/app_20220621100625.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download. Whole-document
# extraction is switched off in this revision.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document (disabled) ---
    # NOTE(review): parsing of all pages is switched off here, so no tables
    # are written to this workbook. The writer is still closed so that
    # result_all.xlsx exists on disk when it is reopened below.
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter'):
        pass

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")


# Two content columns separated by narrow spacer columns; the column bodies
# are intentionally empty placeholders in this revision.
row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
with row9_1:
    pass
with row9_2:
    pass

# Placeholder buttons: they only echo a message and are not yet connected
# to the extraction code above.
if st.button('单页抽取'):
    st.write('Why hello there')
if st.button('全文抽取'):
    st.write('全文抽取')
.history/app_20220621100808.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Streamlit page: upload a PDF, extract the first table found on the chosen
# page with camelot, and offer it as an Excel download. Whole-document
# extraction is switched off in this revision.
st.title("PDF Table Extractor")

input_pdf = st.file_uploader(label="", type='pdf')

# Page selector handed to camelot, which expects a string such as "3".
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
# Boolean option forwarded to camelot as process_background; False is
# listed first so it is the default selection.
background = st.selectbox("表格线条是否隐藏", (False, True))

if input_pdf is not None:
    # camelot reads from a file path, so persist the upload to disk first.
    # getvalue() returns the whole buffer regardless of the stream position.
    with open("input.pdf", "wb") as pdf_file:
        pdf_file.write(input_pdf.getvalue())

    # --- Single page: first table on the requested page(s) ---
    tables = cam.read_pdf(
        "input.pdf",
        pages=page_number.strip(),
        process_background=background,
    )

    if len(tables) == 0:
        # tables[0] below would raise IndexError on an empty result.
        st.warning("未在指定页码检测到表格")
    else:
        # The context manager saves and closes the workbook, so result.xlsx
        # is completely written before it is reopened for the download.
        with pd.ExcelWriter('result.xlsx', engine='xlsxwriter') as writer:
            tables[0].df.to_excel(writer, index=False)

        with open('result.xlsx', 'rb') as f:
            st.download_button('提取完成,点击下载!', f, file_name='result.xlsx',
                               mime="application/vnd.ms-excel")

    # --- Whole document (disabled) ---
    # NOTE(review): parsing of all pages is switched off here, so no tables
    # are written to this workbook. The writer is still closed so that
    # result_all.xlsx exists on disk when it is reopened below.
    with pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter'):
        pass

    with open('result_all.xlsx', 'rb') as f:
        st.download_button('一件抽取完成,点击下载!', f, file_name='result_all.xlsx',
                           mime="application/vnd.ms-excel")


# Two content columns separated by narrow spacer columns, each holding a
# placeholder button that only echoes its own label.
row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
with row9_1:
    if st.button('单页抽取'):
        st.write('单页抽取')
with row9_2:
    if st.button('全文抽取'):
        st.write('全文抽取')
50
+
51
+
.history/app_20220621101549.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
13
+ background = st.selectbox("表格线条是否隐藏",(False,True))
14
+ background = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
15
+
16
+ if input_pdf is not None:
17
+ # byte object into a PDF file
18
+ with open("input.pdf", "wb") as f:
19
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
20
+ f.write(base64.b64decode(base64_pdf))
21
+ f.close()
22
+
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[0].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+
35
+ #tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ # for i in range(0,len(tables_all)):
38
+ # table = tables_all[i].df
39
+ # sheetname = str(i)
40
+ # table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('一件抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
43
+
44
+
45
+ row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
46
+ with row9_1:
47
+ if st.button('单页抽取'):
48
+ st.write('单页抽取')
49
+ with row9_2:
50
+ if st.button('全文抽取'):
51
+ st.write('全文抽取')
52
+
53
+
.history/app_20220621102045.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[0].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+
35
+ #tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ # for i in range(0,len(tables_all)):
38
+ # table = tables_all[i].df
39
+ # sheetname = str(i)
40
+ # table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('一件抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
43
+
44
+
45
+ row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
46
+ with row9_1:
47
+ if st.button('单页抽取'):
48
+ st.write('单页抽取')
49
+ with row9_2:
50
+ if st.button('全文抽取'):
51
+ st.write('全文抽取')
52
+
53
+
.history/app_20220621102127.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[0].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+ if extractor_mode == "全文抽取":
35
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ for i in range(0,len(tables_all)):
38
+ table = tables_all[i].df
39
+ sheetname = str(i)
40
+ table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('一件抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
43
+
44
+
45
+ row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
46
+ with row9_1:
47
+ if st.button('单页抽取'):
48
+ st.write('单页抽取')
49
+ with row9_2:
50
+ if st.button('全文抽取'):
51
+ st.write('全文抽取')
52
+
53
+
.history/app_20220621102131.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[0].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+ if extractor_mode == "全文抽取":
35
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ for i in range(0,len(tables_all)):
38
+ table = tables_all[i].df
39
+ sheetname = str(i)
40
+ table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
43
+
44
+
45
+ row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
46
+ with row9_1:
47
+ if st.button('单页抽取'):
48
+ st.write('单页抽取')
49
+ with row9_2:
50
+ if st.button('全文抽取'):
51
+ st.write('全文抽取')
52
+
53
+
.history/app_20220621102203.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[0].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+ if extractor_mode == "全文抽取":
35
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ for i in range(0,len(tables_all)):
38
+ table = tables_all[i].df
39
+ sheetname = str(i)
40
+ table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
43
+
44
+
45
+ row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
46
+ with row9_1:
47
+ if st.button('单页抽取'):
48
+ st.write('单页抽取')
49
+ with row9_2:
50
+ if st.button('全文抽取'):
51
+ st.write('全文抽取')
52
+
53
+
.history/app_20220621102204.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[0].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+ if extractor_mode == "全文抽取":
35
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ for i in range(0,len(tables_all)):
38
+ table = tables_all[i].df
39
+ sheetname = str(i)
40
+ table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
43
+
44
+
45
+ row9_spacer1, row9_1, row9_spacer2, row9_2, row9_spacer3 = st.columns((.2, 2.3, .4, 4.4, .2))
46
+ with row9_1:
47
+ if st.button('单页抽取'):
48
+ st.write('单页抽取')
49
+ with row9_2:
50
+ if st.button('全文抽取'):
51
+ st.write('全文抽取')
52
+
53
+
.history/app_20220621102210.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[0].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+ if extractor_mode == "全文抽取":
35
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ for i in range(0,len(tables_all)):
38
+ table = tables_all[i].df
39
+ sheetname = str(i)
40
+ table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
.history/app_20220621102938.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ tables[1].to_excel(result,index=False)
27
+ # for i in range(0,len(tables)):
28
+ # table = tables[i].df
29
+ # sheetname = str(i)
30
+ # table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+ if extractor_mode == "全文抽取":
35
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ for i in range(0,len(tables_all)):
38
+ table = tables_all[i].df
39
+ sheetname = str(i)
40
+ table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
.history/app_20220621104301.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ if input_pdf is not None:
16
+ # byte object into a PDF file
17
+ with open("input.pdf", "wb") as f:
18
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
19
+ f.write(base64.b64decode(base64_pdf))
20
+ f.close()
21
+ if extractor_mode == "单页抽取":
22
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
23
+ # read the pdf and parse it using stream
24
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
25
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
26
+ #tables[1].to_excel(result,index=False)
27
+ for i in range(0,len(tables)):
28
+ table = tables[i].df
29
+ sheetname = str(i)
30
+ table.to_excel(result, sheetname,index=False)
31
+
32
+ with open('result.xlsx','rb') as f:
33
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
34
+ if extractor_mode == "全文抽取":
35
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
36
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
37
+ for i in range(0,len(tables_all)):
38
+ table = tables_all[i].df
39
+ sheetname = str(i)
40
+ table.to_excel(result_all, sheetname,index=False)
41
+ with open('result_all.xlsx','rb') as f:
42
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
.history/app_20220621104703.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ def extractor(page,result_name):
16
+ tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
17
+ result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
18
+ for i in range(0,len(tables_all)):
19
+ table = tables_all[i].df
20
+ sheetname = str(i)
21
+ table.to_excel(result_all, sheetname,index=False)
22
+ with open('result_all.xlsx','rb') as f:
23
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
24
+
25
+
26
+ if input_pdf is not None:
27
+ # byte object into a PDF file
28
+ with open("input.pdf", "wb") as f:
29
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
30
+ f.write(base64.b64decode(base64_pdf))
31
+ f.close()
32
+ if extractor_mode == "单页抽取":
33
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
34
+ # read the pdf and parse it using stream
35
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
36
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
37
+ #tables[1].to_excel(result,index=False)
38
+ for i in range(0,len(tables)):
39
+ table = tables[i].df
40
+ sheetname = str(i)
41
+ table.to_excel(result, sheetname,index=False)
42
+
43
+ with open('result.xlsx','rb') as f:
44
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
45
+ if extractor_mode == "全文抽取":
46
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
47
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
48
+ for i in range(0,len(tables_all)):
49
+ table = tables_all[i].df
50
+ sheetname = str(i)
51
+ table.to_excel(result_all, sheetname,index=False)
52
+ with open('result_all.xlsx','rb') as f:
53
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
.history/app_20220621104714.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ def extractor(page,result_name):
16
+ tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
17
+ result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
18
+ for i in range(0,len(tables_all)):
19
+ table = tables_all[i].df
20
+ sheetname = str(i)
21
+ table.to_excel(result_all, sheetname,index=False)
22
+ with open(result_name,'rb') as f:
23
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
24
+
25
+
26
+ if input_pdf is not None:
27
+ # byte object into a PDF file
28
+ with open("input.pdf", "wb") as f:
29
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
30
+ f.write(base64.b64decode(base64_pdf))
31
+ f.close()
32
+ if extractor_mode == "单页抽取":
33
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
34
+ # read the pdf and parse it using stream
35
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
36
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
37
+ #tables[1].to_excel(result,index=False)
38
+ for i in range(0,len(tables)):
39
+ table = tables[i].df
40
+ sheetname = str(i)
41
+ table.to_excel(result, sheetname,index=False)
42
+
43
+ with open('result.xlsx','rb') as f:
44
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
45
+ if extractor_mode == "全文抽取":
46
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
47
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
48
+ for i in range(0,len(tables_all)):
49
+ table = tables_all[i].df
50
+ sheetname = str(i)
51
+ table.to_excel(result_all, sheetname,index=False)
52
+ with open('result_all.xlsx','rb') as f:
53
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
.history/app_20220621104723.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ def extractor(page,result_name):
16
+ tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
17
+ result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
18
+ for i in range(0,len(tables_all)):
19
+ table = tables_all[i].df
20
+ sheetname = str(i)
21
+ table.to_excel(result_all, sheetname,index=False)
22
+ with open(result_name,'rb') as f:
23
+ st.download_button('抽取完成,点击下载!', f,file_name=result_name,mime="application/vnd.ms-excel")
24
+
25
+
26
+ if input_pdf is not None:
27
+ # byte object into a PDF file
28
+ with open("input.pdf", "wb") as f:
29
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
30
+ f.write(base64.b64decode(base64_pdf))
31
+ f.close()
32
+ if extractor_mode == "单页抽取":
33
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
34
+ # read the pdf and parse it using stream
35
+ tables = cam.read_pdf("input.pdf", pages=page_number, process_background=background)
36
+ result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
37
+ #tables[1].to_excel(result,index=False)
38
+ for i in range(0,len(tables)):
39
+ table = tables[i].df
40
+ sheetname = str(i)
41
+ table.to_excel(result, sheetname,index=False)
42
+
43
+ with open('result.xlsx','rb') as f:
44
+ st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
45
+ if extractor_mode == "全文抽取":
46
+ tables_all= cam.read_pdf("input.pdf", pages="all", process_background=background)
47
+ result_all = pd.ExcelWriter('result_all.xlsx', engine='xlsxwriter')
48
+ for i in range(0,len(tables_all)):
49
+ table = tables_all[i].df
50
+ sheetname = str(i)
51
+ table.to_excel(result_all, sheetname,index=False)
52
+ with open('result_all.xlsx','rb') as f:
53
+ st.download_button('抽取完成,点击下载!', f,file_name='result_all.xlsx',mime="application/vnd.ms-excel")
.history/app_20220621104931.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ def extractor(page,result_name):
16
+ tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
17
+ result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
18
+ for i in range(0,len(tables_all)):
19
+ table = tables_all[i].df
20
+ sheetname = str(i)
21
+ table.to_excel(result_all, sheetname,index=False)
22
+ with open(result_name,'rb') as f:
23
+ st.download_button('抽取完成,点击下载!', f,file_name=result_name,mime="application/vnd.ms-excel")
24
+
25
+
26
+ if input_pdf is not None:
27
+ # byte object into a PDF file
28
+ with open("input.pdf", "wb") as f:
29
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
30
+ f.write(base64.b64decode(base64_pdf))
31
+ f.close()
32
+ if extractor_mode == "单页抽取":
33
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
34
+ extractor(page_number,"result.xlsx")
35
+ if extractor_mode == "全文抽取":
36
+ extractor("all","result_all.xlsx")
.history/app_20220621105543.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
+ st.title("PDF Table Extractor")
9
+
10
+ input_pdf = st.file_uploader(label = "", type = 'pdf')
11
+
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ def extractor(page,result_name):
16
+ tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
17
+ result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
18
+ for i in range(0,len(tables_all)):
19
+ table = tables_all[i].df
20
+ sheetname = str(i)
21
+ table.to_excel(result_all, sheetname,index=False)
22
+ result_all.save()
23
+ with open(result_name,'rb') as f:
24
+ st.download_button('抽取完成, 点击下载!', f,file_name=result_name,mime="application/vnd.ms-excel")
25
+
26
+
27
+ if input_pdf is not None:
28
+ # byte object into a PDF file
29
+ with open("input.pdf", "wb") as f:
30
+ base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
31
+ f.write(base64.b64decode(base64_pdf))
32
+ f.close()
33
+ if extractor_mode == "单页抽取":
34
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
35
+ extractor(page_number,"result.xlsx")
36
+ if extractor_mode == "全文抽取":
37
+ extractor("all","result_all.xlsx")
app.py CHANGED
@@ -9,7 +9,20 @@ st.title("PDF Table Extractor")
9
 
10
  input_pdf = st.file_uploader(label = "", type = 'pdf')
11
 
12
- page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  if input_pdf is not None:
15
  # byte object into a PDF file
@@ -17,15 +30,8 @@ if input_pdf is not None:
17
  base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
18
  f.write(base64.b64decode(base64_pdf))
19
  f.close()
20
-
21
- # read the pdf and parse it using stream
22
- tables = cam.read_pdf("input.pdf", pages=page_number)
23
- result = pd.ExcelWriter('result.xlsx', engine='xlsxwriter')
24
- tables[0].to_excel(result,index=False)
25
- # for i in range(0,len(tables)):
26
- # table = tables[i].df
27
- # sheetname = str(i)
28
- # table.to_excel(result, sheetname,index=False)
29
-
30
- with open('result.xlsx','rb') as f:
31
- st.download_button('提取完成,点击下载!', f,file_name='result.xlsx',mime="application/vnd.ms-excel")
 
9
 
10
  input_pdf = st.file_uploader(label = "", type = 'pdf')
11
 
12
+ background = st.selectbox("表格线条是否隐藏",(False,True))
13
+ extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
+
15
+ def extractor(page,result_name):
16
+ tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
17
+ result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
18
+ for i in range(0,len(tables_all)):
19
+ table = tables_all[i].df
20
+ sheetname = str(i)
21
+ table.to_excel(result_all, sheetname,index=False)
22
+ result_all.save()
23
+ with open(result_name,'rb') as f:
24
+ st.download_button('抽取完成, 点击下载!', f,file_name=result_name,mime="application/vnd.ms-excel")
25
+
26
 
27
  if input_pdf is not None:
28
  # byte object into a PDF file
 
30
  base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
31
  f.write(base64.b64decode(base64_pdf))
32
  f.close()
33
+ if extractor_mode == "单页抽取":
34
+ page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
35
+ extractor(page_number,"result.xlsx")
36
+ if extractor_mode == "全文抽取":
37
+ extractor("all","result_all.xlsx")
 
 
 
 
 
 
 
input.pdf CHANGED
Binary files a/input.pdf and b/input.pdf differ
 
result.xlsx CHANGED
Binary files a/result.xlsx and b/result.xlsx differ
 
result_all.xlsx ADDED
File without changes