Spaces:
Runtime error
Runtime error
tjxj
committed on
Commit
·
9e7a9da
1
Parent(s):
8482f1b
1.0
Browse files
- .history/app_20220621141455.py +27 -0
- .history/app_20220621141456.py +27 -0
- .history/app_20220621141530.py +27 -0
- .history/test_20220621134654.py +0 -0
- .history/test_20220621134738.py +34 -0
- .history/test_20220621135222.py +37 -0
- .history/test_20220621135234.py +37 -0
- .history/test_20220621135258.py +31 -0
- .history/test_20220621135452.py +28 -0
- .history/test_20220621135519.py +28 -0
- .history/test_20220621135601.py +28 -0
- .history/test_20220621135627.py +29 -0
- .history/test_20220621135757.py +28 -0
- .history/test_20220621135808.py +27 -0
- app.py +10 -20
- input.pdf +0 -0
- result.xlsx +0 -0
- result_all.xlsx +0 -0
.history/app_20220621141455.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3, 1-3, 2-end, all", value = 1)
|
| 12 |
+
|
| 13 |
+
if input_pdf is not None:
|
| 14 |
+
# byte object into a PDF file
|
| 15 |
+
with open("input.pdf", "wb") as f:
|
| 16 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 17 |
+
f.write(base64.b64decode(base64_pdf))
|
| 18 |
+
f.close()
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
.history/app_20220621141456.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3, 1-3, 2-end, all", value = 1)
|
| 12 |
+
|
| 13 |
+
if input_pdf is not None:
|
| 14 |
+
# byte object into a PDF file
|
| 15 |
+
with open("input.pdf", "wb") as f:
|
| 16 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 17 |
+
f.write(base64.b64decode(base64_pdf))
|
| 18 |
+
f.close()
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
.history/app_20220621141530.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3, 1-3, 2-end, all", value = 1)
|
| 12 |
+
|
| 13 |
+
if input_pdf is not None:
|
| 14 |
+
# byte object into a PDF file
|
| 15 |
+
with open("input.pdf", "wb") as f:
|
| 16 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 17 |
+
f.write(base64.b64decode(base64_pdf))
|
| 18 |
+
f.close()
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
.history/test_20220621134654.py
ADDED
|
File without changes
|
.history/test_20220621134738.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
|
| 10 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 11 |
+
|
| 12 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 13 |
+
#extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
|
| 14 |
+
|
| 15 |
+
def extractor(page,result_name):
|
| 16 |
+
tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
|
| 17 |
+
result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
|
| 18 |
+
for i in range(0,len(tables_all)):
|
| 19 |
+
table = tables_all[i].df
|
| 20 |
+
sheetname = str(i)
|
| 21 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 22 |
+
result_all.save()
|
| 23 |
+
with open(result_name,'rb') as f:
|
| 24 |
+
st.download_button('抽取完成, 点击下载!', f,file_name=result_name,mime="application/vnd.ms-excel")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
if input_pdf is not None:
|
| 28 |
+
# byte object into a PDF file
|
| 29 |
+
with open("input.pdf", "wb") as f:
|
| 30 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 31 |
+
f.write(base64.b64decode(base64_pdf))
|
| 32 |
+
f.close()
|
| 33 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
|
| 34 |
+
extractor(page_number,"result.xlsx")
|
.history/test_20220621135222.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
|
| 10 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 11 |
+
|
| 12 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 13 |
+
|
| 14 |
+
def extractor(page,result_name):
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
if input_pdf is not None:
|
| 19 |
+
# byte object into a PDF file
|
| 20 |
+
with open("input.pdf", "wb") as f:
|
| 21 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 22 |
+
f.write(base64.b64decode(base64_pdf))
|
| 23 |
+
f.close()
|
| 24 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
|
| 25 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 26 |
+
result_all = pd.ExcelWriter(result.xlsx, engine='xlsxwriter')
|
| 27 |
+
for i in range(0,len(tables_all)):
|
| 28 |
+
table = tables_all[i].df
|
| 29 |
+
sheetname = str(i)
|
| 30 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 31 |
+
result_all.save()
|
| 32 |
+
with open(result_name,'rb') as f:
|
| 33 |
+
st.download_button('抽取完成, 点击下载!', f,file_name=result.xlsx,mime="application/vnd.ms-excel")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
extractor(page_number,"result.xlsx")
|
.history/test_20220621135234.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
|
| 10 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 11 |
+
|
| 12 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 13 |
+
|
| 14 |
+
def extractor(page,result_name):
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
if input_pdf is not None:
|
| 19 |
+
# byte object into a PDF file
|
| 20 |
+
with open("input.pdf", "wb") as f:
|
| 21 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 22 |
+
f.write(base64.b64decode(base64_pdf))
|
| 23 |
+
f.close()
|
| 24 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
|
| 25 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 26 |
+
result_all = pd.ExcelWriter(result.xlsx, engine='xlsxwriter')
|
| 27 |
+
for i in range(0,len(tables_all)):
|
| 28 |
+
table = tables_all[i].df
|
| 29 |
+
sheetname = str(i)
|
| 30 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 31 |
+
result_all.save()
|
| 32 |
+
with open(result_name,'rb') as f:
|
| 33 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
extractor(page_number,"result.xlsx")
|
.history/test_20220621135258.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
|
| 12 |
+
if input_pdf is not None:
|
| 13 |
+
# byte object into a PDF file
|
| 14 |
+
with open("input.pdf", "wb") as f:
|
| 15 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 16 |
+
f.write(base64.b64decode(base64_pdf))
|
| 17 |
+
f.close()
|
| 18 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter(result.xlsx, engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_name,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
extractor(page_number,"result.xlsx")
|
.history/test_20220621135452.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
|
| 12 |
+
if input_pdf is not None:
|
| 13 |
+
# byte object into a PDF file
|
| 14 |
+
with open("input.pdf", "wb") as f:
|
| 15 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 16 |
+
f.write(base64.b64decode(base64_pdf))
|
| 17 |
+
f.close()
|
| 18 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter(result.xlsx, engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
| 28 |
+
|
.history/test_20220621135519.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
|
| 12 |
+
if input_pdf is not None:
|
| 13 |
+
# byte object into a PDF file
|
| 14 |
+
with open("input.pdf", "wb") as f:
|
| 15 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 16 |
+
f.write(base64.b64decode(base64_pdf))
|
| 17 |
+
f.close()
|
| 18 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value = 1)
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
| 28 |
+
|
.history/test_20220621135601.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
|
| 12 |
+
if input_pdf is not None:
|
| 13 |
+
# byte object into a PDF file
|
| 14 |
+
with open("input.pdf", "wb") as f:
|
| 15 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 16 |
+
f.write(base64.b64decode(base64_pdf))
|
| 17 |
+
f.close()
|
| 18 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3,1-3,2-end,all", value = 1)
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
| 28 |
+
|
.history/test_20220621135627.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3,1-3,2-end,all", value = 1)
|
| 12 |
+
|
| 13 |
+
if input_pdf is not None:
|
| 14 |
+
# byte object into a PDF file
|
| 15 |
+
with open("input.pdf", "wb") as f:
|
| 16 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 17 |
+
f.write(base64.b64decode(base64_pdf))
|
| 18 |
+
f.close()
|
| 19 |
+
|
| 20 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 21 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 22 |
+
for i in range(0,len(tables_all)):
|
| 23 |
+
table = tables_all[i].df
|
| 24 |
+
sheetname = str(i)
|
| 25 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 26 |
+
result_all.save()
|
| 27 |
+
with open(result_all,'rb') as f:
|
| 28 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
| 29 |
+
|
.history/test_20220621135757.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3, 1-3, 2-end, all", value = 1)
|
| 12 |
+
|
| 13 |
+
if input_pdf is not None:
|
| 14 |
+
# byte object into a PDF file
|
| 15 |
+
with open("input.pdf", "wb") as f:
|
| 16 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 17 |
+
f.write(base64.b64decode(base64_pdf))
|
| 18 |
+
f.close()
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
| 28 |
+
|
.history/test_20220621135808.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
st.title("PDF Table Extractor")
|
| 9 |
+
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 10 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3, 1-3, 2-end, all", value = 1)
|
| 12 |
+
|
| 13 |
+
if input_pdf is not None:
|
| 14 |
+
# byte object into a PDF file
|
| 15 |
+
with open("input.pdf", "wb") as f:
|
| 16 |
+
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 17 |
+
f.write(base64.b64decode(base64_pdf))
|
| 18 |
+
f.close()
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
app.py
CHANGED
|
@@ -6,23 +6,9 @@ import pandas as pd
|
|
| 6 |
import camelot as cam # extracting tables from PDFs
|
| 7 |
|
| 8 |
st.title("PDF Table Extractor")
|
| 9 |
-
|
| 10 |
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 11 |
-
|
| 12 |
background = st.selectbox("表格线条是否透明",(False,True))
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def extractor(page,result_name):
|
| 16 |
-
tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
|
| 17 |
-
result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
|
| 18 |
-
for i in range(0,len(tables_all)):
|
| 19 |
-
table = tables_all[i].df
|
| 20 |
-
sheetname = str(i)
|
| 21 |
-
table.to_excel(result_all, sheetname,index=False)
|
| 22 |
-
result_all.save()
|
| 23 |
-
with open(result_name,'rb') as f:
|
| 24 |
-
st.download_button('抽取完成, 点击下载!', f,file_name=result_name,mime="application/vnd.ms-excel")
|
| 25 |
-
|
| 26 |
|
| 27 |
if input_pdf is not None:
|
| 28 |
# byte object into a PDF file
|
|
@@ -30,8 +16,12 @@ if input_pdf is not None:
|
|
| 30 |
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 31 |
f.write(base64.b64decode(base64_pdf))
|
| 32 |
f.close()
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
import camelot as cam # extracting tables from PDFs
|
| 7 |
|
| 8 |
st.title("PDF Table Extractor")
|
|
|
|
| 9 |
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
|
|
|
| 10 |
background = st.selectbox("表格线条是否透明",(False,True))
|
| 11 |
+
page_number = st.text_input("请填写表格所在PDF页码,eg: 3, 1-3, 2-end, all", value = 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
if input_pdf is not None:
|
| 14 |
# byte object into a PDF file
|
|
|
|
| 16 |
base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
|
| 17 |
f.write(base64.b64decode(base64_pdf))
|
| 18 |
f.close()
|
| 19 |
+
tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
|
| 20 |
+
result_all = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
|
| 21 |
+
for i in range(0,len(tables_all)):
|
| 22 |
+
table = tables_all[i].df
|
| 23 |
+
sheetname = str(i)
|
| 24 |
+
table.to_excel(result_all, sheetname,index=False)
|
| 25 |
+
result_all.save()
|
| 26 |
+
with open(result_all,'rb') as f:
|
| 27 |
+
st.download_button('抽取完成, 点击下载!', f,file_name="result.xlsx",mime="application/vnd.ms-excel")
|
input.pdf
DELETED
|
Binary file (85.2 kB)
|
|
|
result.xlsx
DELETED
|
Binary file (5.85 kB)
|
|
|
result_all.xlsx
DELETED
|
Binary file (5.85 kB)
|
|
|