Spaces:

beihai
/

PDF-Table-Extractor

Runtime error

tjxj committed on Jun 21, 2022

Commit

8482f1b

1 Parent(s): 224e0a3

1.0

Files changed (4) hide show

.history/app_20220621111527.py ADDED Viewed

+# -*- coding: utf-8 -*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")  # page heading
+input_pdf = st.file_uploader(label = "", type = 'pdf')  # stays None until a PDF is uploaded (checked before extraction)
+background = st.selectbox("表格线条是否透明",(False,True))  # label: "are the table lines transparent?"; forwarded to cam.read_pdf as process_background
+extractor_mode = st.selectbox("单页抽取  OR   全文抽取",("单页抽取","全文抽取"))  # "single-page" vs "full-document" extraction; selects which branch runs after upload
+def extractor(page,result_name):
+        tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
+        result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
+        for i in range(0,len(tables_all)):
+            table = tables_all[i].df
+            sheetname = str(i)
+            table.to_excel(result_all, sheetname,index=False)
+        result_all.save()
+        with open(result_name,'rb') as f:
+           st.download_button('抽取完成, 点击下载！', f,file_name=result_name,mime="application/vnd.ms-excel")
+if input_pdf is not None:
+    # byte object into a PDF file
+    with open("input.pdf", "wb") as f:
+        base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
+        f.write(base64.b64decode(base64_pdf))
+    f.close()
+    if extractor_mode == "单页抽取":
+        page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
+        extractor(page_number,"result.xlsx")
+    if extractor_mode == "全文抽取":
+        extractor("all","result_all.xlsx")

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ st.title("PDF Table Extractor")
 input_pdf = st.file_uploader(label = "", type = 'pdf')
-background = st.selectbox("表格线条是否隐藏",(False,True))
 extractor_mode = st.selectbox("单页抽取  OR   全文抽取",("单页抽取","全文抽取"))
 def extractor(page,result_name):

 input_pdf = st.file_uploader(label = "", type = 'pdf')
+background = st.selectbox("表格线条是否透明",(False,True))
 extractor_mode = st.selectbox("单页抽取  OR   全文抽取",("单页抽取","全文抽取"))
 def extractor(page,result_name):

result.xlsx CHANGED Viewed

Binary files a/result.xlsx and b/result.xlsx differ

result_all.xlsx CHANGED Viewed

Binary files a/result_all.xlsx and b/result_all.xlsx differ