tjxj committed on
Commit
8482f1b
·
1 Parent(s): 224e0a3
Files changed (4) hide show
  1. .history/app_20220621111527.py +37 -0
  2. app.py +1 -1
  3. result.xlsx +0 -0
  4. result_all.xlsx +0 -0
.history/app_20220621111527.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding : utf-8-*-
2
+ import base64
3
+ from subprocess import STDOUT
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import camelot as cam # extracting tables from PDFs
7
+
8
# Page heading displayed at the top of the Streamlit app.
st.title("PDF Table Extractor")

# Upload widget restricted to PDF files; the script below only proceeds
# when this is not None.
input_pdf = st.file_uploader(label="", type="pdf")

# UI label: "Are the table lines transparent?" -- the chosen boolean is
# passed to camelot as `process_background` inside extractor().
background = st.selectbox(
    "表格线条是否透明",
    (False, True),
)

# UI label: "Single-page extraction OR full-document extraction".
extractor_mode = st.selectbox(
    "单页抽取 OR 全文抽取",
    ("单页抽取", "全文抽取"),
)
14
+
15
def extractor(page, result_name):
    """Extract tables from ``input.pdf`` into an Excel workbook and offer it for download.

    Each table camelot finds is written to its own worksheet, named by the
    table's zero-based position ("0", "1", ...). The module-level
    ``background`` selection is forwarded to camelot as ``process_background``.

    Args:
        page: Page selection forwarded to camelot's ``pages`` argument
            (the callers pass a page number string or ``"all"``).
        result_name: Path of the ``.xlsx`` file to write; also used as the
            suggested file name of the download.
    """
    tables = cam.read_pdf("input.pdf", pages=page, process_background=background)

    # Use the writer as a context manager so the workbook is saved and closed
    # even if writing a sheet fails. ExcelWriter.save() was deprecated in
    # pandas 1.5 and removed in 2.0.
    with pd.ExcelWriter(result_name, engine="xlsxwriter") as writer:
        for index, table in enumerate(tables):
            table.df.to_excel(writer, sheet_name=str(index), index=False)

    with open(result_name, "rb") as workbook:
        st.download_button(
            "抽取完成, 点击下载!",
            workbook,
            file_name=result_name,
            # The file is an .xlsx workbook, so advertise the OOXML MIME type
            # rather than the legacy .xls one.
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
25
+
26
+
27
if input_pdf is not None:
    # Persist the upload to disk because extractor() reads the fixed path
    # "input.pdf". getvalue() returns the full buffer regardless of the
    # current stream position, so it stays correct across Streamlit reruns.
    with open("input.pdf", "wb") as f:
        f.write(input_pdf.getvalue())

    if extractor_mode == "单页抽取":  # single-page extraction
        # Prompt: "Enter the PDF page number that contains the table, eg: 3".
        page_number = st.text_input("请填写表格所在PDF页码,eg: 3", value="1")
        extractor(page_number, "result.xlsx")
    elif extractor_mode == "全文抽取":  # full-document extraction
        extractor("all", "result_all.xlsx")
app.py CHANGED
@@ -9,7 +9,7 @@ st.title("PDF Table Extractor")
9
 
10
  input_pdf = st.file_uploader(label = "", type = 'pdf')
11
 
12
- background = st.selectbox("表格线条是否隐藏",(False,True))
13
  extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
 
15
  def extractor(page,result_name):
 
9
 
10
  input_pdf = st.file_uploader(label = "", type = 'pdf')
11
 
12
+ background = st.selectbox("表格线条是否透明",(False,True))
13
  extractor_mode = st.selectbox("单页抽取 OR 全文抽取",("单页抽取","全文抽取"))
14
 
15
  def extractor(page,result_name):
result.xlsx CHANGED
Binary files a/result.xlsx and b/result.xlsx differ
 
result_all.xlsx CHANGED
Binary files a/result_all.xlsx and b/result_all.xlsx differ