Spaces:
Runtime error
Runtime error
tjxj
committed on
Commit
·
8482f1b
1
Parent(s):
224e0a3
1.0
Browse files- .history/app_20220621111527.py +37 -0
- app.py +1 -1
- result.xlsx +0 -0
- result_all.xlsx +0 -0
.history/app_20220621111527.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#-*- coding : utf-8-*-
|
| 2 |
+
import base64
|
| 3 |
+
from subprocess import STDOUT
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import camelot as cam # extracting tables from PDFs
|
| 7 |
+
|
| 8 |
+
# Page heading shown at the top of the app.
st.title(body="PDF Table Extractor")

# Upload widget restricted to PDF files; evaluates to None until a file
# has been provided.
input_pdf = st.file_uploader(
    label="",
    type='pdf',
)

# Boolean choice that extractor() forwards to Camelot as
# `process_background`.
background = st.selectbox(
    label="表格线条是否透明",
    options=(False, True),
)

# Extraction scope: a single page or the whole document.
extractor_mode = st.selectbox(
    label="单页抽取 OR 全文抽取",
    options=("单页抽取", "全文抽取"),
)
|
| 14 |
+
|
| 15 |
+
def extractor(page, result_name):
    """Extract tables from ``input.pdf`` and offer them as an Excel download.

    Every table Camelot detects on the requested page(s) is written to its
    own worksheet of the workbook at ``result_name``; worksheets are named
    "0", "1", ... in detection order. A Streamlit download button serving
    the finished workbook is then rendered.

    Args:
        page: Page selector forwarded to ``cam.read_pdf`` as ``pages``
            (the callers in this file pass a page-number string or "all").
        result_name: Path of the ``.xlsx`` file to write; also used as the
            suggested file name for the download.
    """
    # `background` is the module-level selectbox value (True/False).
    tables_all = cam.read_pdf(
        "input.pdf", pages=page, process_background=background
    )

    # The context manager finalises and closes the workbook on exit.
    # ExcelWriter.save() was removed in pandas 2.0, so it must not be used.
    with pd.ExcelWriter(result_name, engine='xlsxwriter') as writer:
        for i, table in enumerate(tables_all):
            # sheet_name is passed by keyword: positional use is deprecated.
            table.df.to_excel(writer, sheet_name=str(i), index=False)

    with open(result_name, 'rb') as f:
        st.download_button(
            '抽取完成, 点击下载!',
            f,
            file_name=result_name,
            mime="application/vnd.ms-excel",
        )
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
if input_pdf is not None:
    # Persist the upload to disk because extractor() reads "input.pdf" by
    # path. getvalue() returns the full buffer regardless of the current
    # read position, so a Streamlit rerun cannot write an empty file.
    # (The former base64 encode/decode round-trip was a no-op, and the
    # explicit close() was redundant with the `with` block.)
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())

    if extractor_mode == "单页抽取":
        # Single-page mode: ask which page holds the table (defaults to 1).
        page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value=1)
        extractor(page_number, "result.xlsx")
    elif extractor_mode == "全文抽取":
        # Whole-document mode: let Camelot scan every page.
        extractor("all", "result_all.xlsx")
|
app.py
CHANGED
|
@@ -9,7 +9,7 @@ st.title("PDF Table Extractor")
|
|
| 9 |
|
| 10 |
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 11 |
|
| 12 |
-
background = st.selectbox("
|
| 13 |
extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
|
| 14 |
|
| 15 |
def extractor(page,result_name):
|
|
|
|
| 9 |
|
| 10 |
input_pdf = st.file_uploader(label = "", type = 'pdf')
|
| 11 |
|
| 12 |
+
background = st.selectbox("表格线条是否透明",(False,True))
|
| 13 |
extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
|
| 14 |
|
| 15 |
def extractor(page,result_name):
|
result.xlsx
CHANGED
|
Binary files a/result.xlsx and b/result.xlsx differ
|
|
|
result_all.xlsx
CHANGED
|
Binary files a/result_all.xlsx and b/result_all.xlsx differ
|
|
|