Spaces:
Build error
Build error
Create new file
Browse files
app.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import math
|
| 3 |
+
import re
|
| 4 |
+
import os
|
| 5 |
+
from PyPDF2 import PdfFileReader, PdfFileWriter
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import pdfplumber
|
| 8 |
+
from docx2pdf import convert
|
| 9 |
+
import fitz
|
| 10 |
+
import base64
|
| 11 |
+
|
| 12 |
+
# Render the app heading through Streamlit; this is the first UI element the script emits.
st.header('PDF文件处理工具测试')
|
| 13 |
+
|
| 14 |
+
def fx(x):
    """Flatten one level of nesting and return the items as a new list.

    Args:
        x: An iterable of iterables (e.g. a list of per-page table lists).

    Returns:
        A new list holding every item of every sub-iterable, in order.
        An empty input yields an empty list.
    """
    # A single comprehension is linear in the total item count, whereas
    # ``sum(x, [])`` re-copies the accumulated list for every sub-list and
    # only works when each element is exactly a ``list``.
    return [item for sub in x for item in sub]
|
| 16 |
+
# File-name suffixes accepted by the batch converters below.
_IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')
_WORD_SUFFIXES = ('.doc', '.docx')


def _parse_page_spec(spec, page_count):
    """Translate a 1-based page spec into a list of 0-based page indices.

    Accepts single pages and inclusive ranges separated by commas or
    whitespace, e.g. ``"1-5"``, ``"1,3,5"``, ``"2"`` or ``"1-3,7"``.
    The order given by the user is preserved.

    Args:
        spec: The text typed by the user.
        page_count: Number of pages in the document, used for validation.

    Returns:
        A non-empty list of 0-based page indices.

    Raises:
        ValueError: If the text is not numeric, is empty, or refers to a
            page outside ``1..page_count``.
    """
    # Allow "1 - 5" by gluing the dash to its numbers before tokenising.
    normalized = re.sub(r'\s*-\s*', '-', spec.strip())
    indices = []
    for token in re.split(r'[,,\s]+', normalized):
        if not token:
            continue
        first, dash, last = token.partition('-')
        start = int(first)
        stop = int(last) if dash else start
        if not 1 <= start <= stop <= page_count:
            raise ValueError(f'page range {token!r} is outside 1-{page_count}')
        # User input is 1-based and inclusive; PDF page indices are 0-based.
        indices.extend(range(start - 1, stop))
    if not indices:
        raise ValueError('no page numbers given')
    return indices


def _write_pages(reader, indices, target):
    """Write the 0-based pages *indices* of *reader* into a new PDF at *target*."""
    writer = PdfFileWriter()
    for index in indices:
        writer.addPage(reader.getPage(index))
    with open(target, 'wb') as out:
        writer.write(out)


def _has_output_dir(directory):
    """Return True when an output directory was entered; otherwise warn the user."""
    if directory == '':
        # Without this guard an empty directory would silently resolve to
        # the filesystem root (or the working directory).
        st.warning('请先输入输出目录。')
        return False
    return True


_PAGE_SPEC_ERROR = '页码格式无效或超出范围,请按“1-5”或“1,3,5”格式输入。'

fns = st.radio('请选择PDF处理类型:', ['拆分', '合并', '读取', '在线预览', '转换'])

if fns == '拆分':
    # --- Split: fixed-size groups, extract pages, or delete pages -----------
    uploaded_file = st.text_input("请输入要处理的pdf文件地址:")
    if uploaded_file != '':
        pdf_reader = PdfFileReader(uploaded_file)
        n = pdf_reader.getNumPages()
        che = st.radio('选择拆分类型', ['按固定页数拆分', '截取某几页', '删除指定页面'])
        if che == '按固定页数拆分':
            fn = st.number_input('请输入每组拆分的文档页数:', 1, n, 1)
            stre = st.text_input("请输入拆分后文件存放根目录:")
            if st.button('开始拆分>>') and _has_output_dir(stre):
                # One writer per group, so every output file contains all
                # pages of its group rather than only the last one.
                for group, first in enumerate(range(0, n, fn), start=1):
                    _write_pages(
                        pdf_reader,
                        range(first, min(first + fn, n)),
                        os.path.join(stre, f'test-{group}.pdf'),
                    )
        elif che == '截取某几页':
            st_en = st.text_input("请输入截取的起止页码,格式为“1-5”或“1,3,5”:")
            stre2 = st.text_input("请输入截取后pdf文件存放根目录:")
            if st_en != '':
                if st.button('开始截取>>') and _has_output_dir(stre2):
                    try:
                        wanted = _parse_page_spec(st_en, n)
                    except ValueError:
                        st.error(_PAGE_SPEC_ERROR)
                    else:
                        _write_pages(pdf_reader, wanted, os.path.join(stre2, '666.pdf'))
        else:
            st_en2 = st.text_input("请输入需要删除的页码,格式为“1-5”或“1,3,5”:")
            stre3 = st.text_input("请输入删除指定页面后的pdf文件存放根目录:")
            if st_en2 != '':
                if st.button('开始删除>>') and _has_output_dir(stre3):
                    try:
                        removed = set(_parse_page_spec(st_en2, n))
                    except ValueError:
                        st.error(_PAGE_SPEC_ERROR)
                    else:
                        kept = [index for index in range(n) if index not in removed]
                        _write_pages(pdf_reader, kept, os.path.join(stre3, '666.pdf'))

elif fns == '合并':
    # --- Merge every PDF found in a directory into one output file ----------
    path = st.text_input("请输入要处理的pdf文件根目录:")
    scn = st.text_input("请填写输出文件地址及文件名")
    if path != '' and scn != '':
        if st.button('开始合并>>'):
            file_out = PdfFileWriter()
            target = os.path.abspath(scn)
            # Sorted for a deterministic page order; os.listdir order is arbitrary.
            for name in sorted(os.listdir(path)):
                docdir = os.path.join(path, name)
                if not name.lower().endswith('.pdf'):
                    continue
                if os.path.abspath(docdir) == target:
                    # Never read the file that is about to be overwritten.
                    continue
                file_read = PdfFileReader(docdir)
                for pageNum in range(file_read.getNumPages()):
                    file_out.addPage(file_read.getPage(pageNum))
            with open(scn, 'wb') as output:
                file_out.write(output)

elif fns == '读取':
    # --- Read text or tables from one page or from the whole document -------
    path3 = st.text_input("请输入要读取的pdf文件地址:")
    if path3 != '':
        ms = st.radio('请选择读取模式:', ['指定页码', '全部'])
        with pdfplumber.open(path3) as p:
            if ms == '指定页码':
                # Bound the selector by the real page count instead of a fixed 66.
                ymq = st.number_input("请选择要读取的pdf页码:", 1, len(p.pages), 1)
                dqlx = st.radio('请选择读取类型', ['文本内容', '表格内容'])
                page = p.pages[ymq - 1]
                if dqlx == '文本内容':
                    textdata = page.extract_text()
                    st.write(textdata)
                else:
                    tables = page.extract_tables()
                    if not tables:
                        st.warning('未在所选页面中找到表格。')
                    else:
                        # Bound the selector by the number of tables actually found.
                        n_table = st.number_input('请选择读取页面中第几个表格:', 1, len(tables), 1)
                        datan = tables[n_table - 1]
                        # First row is treated as the header, the rest as data.
                        st.dataframe(pd.DataFrame(datan[1:], columns=datan[0]))
            else:
                dqlx2 = st.radio('请选择读取类型', ['文本内容', '表格内容'])
                if dqlx2 == '文本内容':
                    # Guard against pages for which no text is returned.
                    sz = '\n'.join(page.extract_text() or '' for page in p.pages)
                    st.write(sz)
                else:
                    all_tables = [y for y in fx([page.extract_tables() for page in p.pages]) if y]
                    if all_tables:
                        st.dataframe(pd.concat([pd.DataFrame(data=y[1:], columns=y[0]) for y in all_tables]))
                    else:
                        st.warning('未在文档中找到表格。')

elif fns == '在线预览':
    # --- Inline preview: embed the uploaded PDF as a base64 data URI --------
    file = st.file_uploader("请上传PDF")
    if file is not None:
        base64_pdf = base64.b64encode(file.read()).decode('utf-8')
        pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="100%" height="1000" type="application/pdf">'
        st.markdown(pdf_display, unsafe_allow_html=True)

else:
    # --- Conversions ---------------------------------------------------------
    ms1 = st.radio('请选择转换模式:', ['word->pdf', 'ppt->pdf', 'pdf->jpg/png', 'jpg/png->pdf'])
    if ms1 == 'word->pdf':
        path4 = st.text_input("请输入要批量转换的word文件根目录:")
        if path4 != '':
            for name in sorted(os.listdir(path4)):
                # Skip non-Word files and Word's "~$" lock files.
                if name.startswith('~$') or not name.lower().endswith(_WORD_SUFFIXES):
                    continue
                source = os.path.join(path4, name)
                # splitext only strips the final extension, so dots elsewhere
                # in the path no longer truncate the output name.
                convert(source, os.path.splitext(source)[0] + '.pdf')
            st.success('转换成功!')
    elif ms1 == 'pdf->jpg/png':
        path5 = st.text_input("请输入要转换的pdf文件地址:")
        dir_1 = st.text_input("请输入要输出的图片保存根目录:")
        if path5 != '' and dir_1 != '':
            with fitz.open(path5) as doc:
                for page in doc:
                    pix = page.get_pixmap()
                    pix.save(os.path.join(dir_1, f"page-{page.number}.png"))
    elif ms1 == 'jpg/png->pdf':
        dir_2 = st.text_input("请输入要转换为pdf的图片根目录:")
        path6 = st.text_input("请输入合成的pdf文件存放地址:")
        if path6 != '' and dir_2 != '':
            with fitz.open() as doc:
                # Sorted so that the page order follows the file names.
                for name in sorted(os.listdir(dir_2)):
                    if not name.lower().endswith(_IMAGE_SUFFIXES):
                        continue
                    with fitz.open(os.path.join(dir_2, name)) as img:
                        rect = img[0].rect
                        pdfbytes = img.convert_to_pdf()
                    with fitz.open("pdf", pdfbytes) as imgPDF:
                        page = doc.new_page(width=rect.width, height=rect.height)
                        page.show_pdf_page(rect, imgPDF, 0)
                if doc.page_count:
                    doc.save(path6)
                else:
                    st.warning('未找到可转换的图片。')
    elif ms1 == 'ppt->pdf':
        dir_3 = st.text_input("请输入要转换为pdf的PPT文件地址:")
        path7 = st.text_input("请输入生成的pdf文件存放地址:")
        if path7 != '' and dir_3 != '':
            # NOTE(review): this relies on PyMuPDF being able to open the
            # PowerPoint file; confirm against PyMuPDF's supported input
            # formats, as this call may raise for .ppt/.pptx.
            with fitz.open(dir_3) as ppt:
                pdfbytes = ppt.convert_to_pdf()
            with fitz.open("pdf", pdfbytes) as pdf:
                pdf.save(path7)
|