Spaces:

beihai
/

PDF-Table-Extractor

Runtime error

App Files Files Community

tjxj committed on Jun 21, 2022

Commit

9e7a9da

1 Parent(s): 8482f1b

1.0

Browse files

Files changed (18) hide show

.history/app_20220621141455.py +27 -0
.history/app_20220621141456.py +27 -0
.history/app_20220621141530.py +27 -0
.history/test_20220621134654.py +0 -0
.history/test_20220621134738.py +34 -0
.history/test_20220621135222.py +37 -0
.history/test_20220621135234.py +37 -0
.history/test_20220621135258.py +31 -0
.history/test_20220621135452.py +28 -0
.history/test_20220621135519.py +28 -0
.history/test_20220621135601.py +28 -0
.history/test_20220621135627.py +29 -0
.history/test_20220621135757.py +28 -0
.history/test_20220621135808.py +27 -0
app.py +10 -20
input.pdf +0 -0
result.xlsx +0 -0
result_all.xlsx +0 -0

.history/app_20220621141455.py ADDED Viewed

	@@ -0,0 +1,27 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+# Page specification handed to camelot unchanged (e.g. "3", "1-3", "2-end", "all").
+page_number = st.text_input("请填写表格所在PDF页码，eg: 3, 1-3, 2-end, all", value = 1)
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/app_20220621141456.py ADDED Viewed

	@@ -0,0 +1,27 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+# Page specification handed to camelot unchanged (e.g. "3", "1-3", "2-end", "all").
+page_number = st.text_input("请填写表格所在PDF页码，eg: 3, 1-3, 2-end, all", value = 1)
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/app_20220621141530.py ADDED Viewed

	@@ -0,0 +1,27 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+# Page specification handed to camelot unchanged (e.g. "3", "1-3", "2-end", "all").
+page_number = st.text_input("请填写表格所在PDF页码，eg: 3, 1-3, 2-end, all", value = 1)
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/test_20220621134654.py ADDED Viewed

File without changes

.history/test_20220621134738.py ADDED Viewed

	@@ -0,0 +1,34 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+# Disabled mode selector, kept as-is:
+#extractor_mode = st.selectbox("单页抽取  OR   全文抽取",("单页抽取","全文抽取"))
+def extractor(page,result_name):
+    """Extract tables from input.pdf into an Excel file and offer it for download.
+
+    page: page specification passed to camelot's read_pdf as `pages`.
+    result_name: path of the workbook to write; also used as the download name.
+    """
+    tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter(result_name, engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    with open(result_name,'rb') as f:
+        st.download_button('抽取完成, 点击下载！', f,file_name=result_name,mime="application/vnd.ms-excel")
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
+    extractor(page_number,"result.xlsx")

.history/test_20220621135222.py ADDED Viewed

	@@ -0,0 +1,37 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+def extractor(page,result_name):
+    """Extract tables from input.pdf into an Excel file and offer it for download.
+
+    page: page specification passed to camelot's read_pdf as `pages`.
+    result_name: path of the workbook to write; also used as the download name.
+    """
+    tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter(result_name, engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    with open(result_name,'rb') as f:
+        st.download_button('抽取完成, 点击下载！', f,file_name=result_name,mime="application/vnd.ms-excel")
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
+    extractor(page_number,"result.xlsx")

.history/test_20220621135234.py ADDED Viewed

	@@ -0,0 +1,37 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+def extractor(page,result_name):
+    """Extract tables from input.pdf into an Excel file and offer it for download.
+
+    page: page specification passed to camelot's read_pdf as `pages`.
+    result_name: path of the workbook to write; also used as the download name.
+    """
+    tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter(result_name, engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    with open(result_name,'rb') as f:
+        st.download_button('抽取完成, 点击下载！', f,file_name=result_name,mime="application/vnd.ms-excel")
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
+    extractor(page_number,"result.xlsx")

.history/test_20220621135258.py ADDED Viewed

	@@ -0,0 +1,31 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # The workbook path is a string literal; no extractor() helper exists in
+    # this script, so the extraction is done inline.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/test_20220621135452.py ADDED Viewed

	@@ -0,0 +1,28 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/test_20220621135519.py ADDED Viewed

	@@ -0,0 +1,28 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/test_20220621135601.py ADDED Viewed

	@@ -0,0 +1,28 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    # Page specification handed to camelot unchanged (e.g. "3", "1-3", "2-end", "all").
+    page_number = st.text_input("请填写表格所在PDF页码，eg: 3,1-3,2-end,all", value = 1)
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/test_20220621135627.py ADDED Viewed

	@@ -0,0 +1,29 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+# Page specification handed to camelot unchanged (e.g. "3", "1-3", "2-end", "all").
+page_number = st.text_input("请填写表格所在PDF页码，eg: 3,1-3,2-end,all", value = 1)
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/test_20220621135757.py ADDED Viewed

	@@ -0,0 +1,28 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+# Page specification handed to camelot unchanged (e.g. "3", "1-3", "2-end", "all").
+page_number = st.text_input("请填写表格所在PDF页码，eg: 3, 1-3, 2-end, all", value = 1)
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

.history/test_20220621135808.py ADDED Viewed

	@@ -0,0 +1,27 @@

+#-*- coding : utf-8-*-
+import base64
+from subprocess import STDOUT
+import streamlit as st
+import pandas as pd
+import camelot as cam # extracting tables from PDFs
+st.title("PDF Table Extractor")
+input_pdf = st.file_uploader(label = "", type = 'pdf')
+# Forwarded to camelot as process_background.
+background = st.selectbox("表格线条是否透明",(False,True))
+# Page specification handed to camelot unchanged (e.g. "3", "1-3", "2-end", "all").
+page_number = st.text_input("请填写表格所在PDF页码，eg: 3, 1-3, 2-end, all", value = 1)
+if input_pdf is not None:
+    # Persist the upload to disk because camelot is given a file path.
+    # getvalue() returns the whole buffer regardless of the stream position.
+    with open("input.pdf", "wb") as f:
+        f.write(input_pdf.getvalue())
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

app.py CHANGED Viewed

@@ -6,23 +6,9 @@ import pandas as pd
 import camelot as cam # extracting tables from PDFs
 st.title("PDF Table Extractor")
 input_pdf = st.file_uploader(label = "", type = 'pdf')
 background = st.selectbox("表格线条是否透明",(False,True))
-extractor_mode = st.selectbox("单页抽取  OR   全文抽取",("单页抽取","全文抽取"))
-def extractor(page,result_name):
-        tables_all= cam.read_pdf("input.pdf", pages=page, process_background=background)
-        result_all = pd.ExcelWriter(result_name, engine='xlsxwriter')
-        for i in range(0,len(tables_all)):
-            table = tables_all[i].df
-            sheetname = str(i)
-            table.to_excel(result_all, sheetname,index=False)
-        result_all.save()
-        with open(result_name,'rb') as f:
-           st.download_button('抽取完成, 点击下载！', f,file_name=result_name,mime="application/vnd.ms-excel")
 if input_pdf is not None:
     # byte object into a PDF file
@@ -30,8 +16,12 @@ if input_pdf is not None:
         base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
         f.write(base64.b64decode(base64_pdf))
     f.close()
-    if extractor_mode == "单页抽取":
-        page_number = st.text_input("请填写表格所在PDF页码，eg: 3", value = 1)
-        extractor(page_number,"result.xlsx")
-    if extractor_mode == "全文抽取":
-        extractor("all","result_all.xlsx")

 import camelot as cam # extracting tables from PDFs
 st.title("PDF Table Extractor")
 input_pdf = st.file_uploader(label = "", type = 'pdf')
 background = st.selectbox("表格线条是否透明",(False,True))
+page_number = st.text_input("请填写表格所在PDF页码，eg: 3, 1-3, 2-end, all", value = 1)
 if input_pdf is not None:
     # byte object into a PDF file
         base64_pdf = base64.b64encode(input_pdf.read()).decode('utf-8')
         f.write(base64.b64decode(base64_pdf))
     f.close()
+    tables_all= cam.read_pdf("input.pdf", pages=page_number, process_background=background)
+    # One worksheet per extracted table, named by its index; leaving the
+    # context manager saves and closes the workbook.
+    with pd.ExcelWriter("result.xlsx", engine='xlsxwriter') as writer:
+        for i, extracted in enumerate(tables_all):
+            extracted.df.to_excel(writer, sheet_name=str(i), index=False)
+    # open() needs the workbook's path, not the ExcelWriter object.
+    with open("result.xlsx",'rb') as f:
+       st.download_button('抽取完成, 点击下载！', f,file_name="result.xlsx",mime="application/vnd.ms-excel")

input.pdf DELETED Viewed

Binary file (85.2 kB)

result.xlsx DELETED Viewed

Binary file (5.85 kB)

result_all.xlsx DELETED Viewed

Binary file (5.85 kB)