Spaces:

kuroiikimono
/

Splitpdf_streamlit2

Paused

App Files Files Community

Splitpdf_streamlit2 / app.py

kuroiikimono

Update app.py

49fc200 verified 11 months ago

raw

history blame contribute delete

3.9 kB

	import streamlit as st
	import fitz # PyMuPDF
	import io

	#
	# Page introduction, rendered at module level.
	# A single-element placeholder is created first and the usage notes are then
	# written into it as Markdown.
	description = st.empty()
	# NOTE(review): unsafe_allow_html=True is passed although the text below
	# contains only Markdown (no raw HTML tags) - confirm whether it is needed.
	description.markdown("""
	If the PDF is more than 100 pages, it cannot be uploaded to Google Translate as is, so split the PDF into an appropriate number of pages.
	The guideline is to keep it within 100 pages and 10MB, and if it exceeds this limit, Google Translate will not translate it.

	It's easy to use. Just upload📤 your PDF.

	To simply split into 10 pages each:
	Group size (pages per group) == 10

	PDF merger App is [HERE](https://huggingface.co/spaces/kuroiikimono/PDF_merger_Streamlit2)

	""", unsafe_allow_html=True)

	# Embed the YouTube video at this URL below the usage notes.
	st.video("https://youtu.be/ohwHEdeZE1Q?si=dQ7CQf3LmRkDJ-OM")

	#
	def split_pdf_by_pages(doc):
	    """Split an open PyMuPDF document into one in-memory PDF per page.

	    Args:
	        doc: Source document. It must expose ``page_count`` and be usable
	            as the source argument of ``insert_pdf``.

	    Returns:
	        A ``(split_files, total_pages)`` tuple. ``split_files`` maps each
	        zero-based page index to an ``io.BytesIO`` holding that single page,
	        rewound to offset 0; ``total_pages`` is ``doc.page_count``.

	    Raises:
	        Exception: Any error raised while copying or saving a page. It is
	            reported with ``st.error`` first and then re-raised unchanged.
	    """
	    total_pages = doc.page_count
	    split_files = {}

	    for page_num in range(total_pages):
	        try:
	            # The scratch document is only needed until it has been
	            # serialised, so close it deterministically instead of leaving
	            # one open native document behind per page.
	            with fitz.open() as new_doc:
	                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
	                new_doc.subset_fonts()

	                output_pdf = io.BytesIO()
	                new_doc.save(output_pdf)

	            # Rewind so consumers can read the buffer from the start.
	            output_pdf.seek(0)
	            split_files[page_num] = output_pdf
	        except Exception as e:
	            # Show which page failed in the UI, then let the caller decide.
	            st.error(f"Error processing page {page_num + 1}: {e}")
	            raise

	    return split_files, total_pages

	#
	def merge_pdfs_in_groups(split_files, group_size=50):
	    """Merge single-page PDF buffers into PDFs of ``group_size`` pages each.

	    Args:
	        split_files: Mapping from consecutive zero-based page indices
	            (``0 .. len(split_files) - 1``) to seekable binary file objects,
	            each containing one PDF.
	        group_size: Maximum number of pages per merged PDF; must be >= 1.

	    Returns:
	        A dict mapping the zero-based group index to an ``io.BytesIO`` with
	        the merged, compressed PDF, rewound to offset 0. The last group may
	        hold fewer than ``group_size`` pages. An empty ``split_files`` gives
	        an empty dict.

	    Raises:
	        ValueError: If ``group_size`` is smaller than 1.
	        Exception: Any error raised while merging or saving a group. It is
	            reported with ``st.error`` first and then re-raised unchanged.
	    """
	    # A negative step would make range() empty and silently drop every page,
	    # so reject it up front (0 already made range() raise ValueError).
	    if group_size < 1:
	        raise ValueError(f"group_size must be at least 1, got {group_size}")

	    total = len(split_files)
	    merged_files = {}

	    for start in range(0, total, group_size):
	        group_index = start // group_size
	        try:
	            # Both the output document and every per-page source document
	            # are closed as soon as they are no longer needed.
	            with fitz.open() as new_doc:
	                for page_num in range(start, min(start + group_size, total)):
	                    pdf_file = split_files[page_num]
	                    pdf_file.seek(0)  # read the buffer from its beginning
	                    with fitz.open("pdf", pdf_file.read()) as temp_doc:
	                        new_doc.insert_pdf(temp_doc)

	                new_doc.subset_fonts()
	                output_pdf = io.BytesIO()

	                # Save with the compression / garbage-collection options
	                # chosen by the original implementation.
	                new_doc.save(
	                    output_pdf,
	                    deflate=True,
	                    garbage=4,
	                    deflate_fonts=True,
	                    use_objstms=1,
	                )

	            output_pdf.seek(0)
	            merged_files[group_index] = output_pdf
	        except Exception as e:
	            # Show which group failed in the UI, then let the caller decide.
	            st.error(f"Error merging group {group_index + 1}: {e}")
	            raise

	    return merged_files

	#
	def create_download_link(file_data, display_name):
	    """Render a Streamlit download button for one PDF.

	    Args:
	        file_data: Content handed to the button as its ``data`` argument.
	        display_name: Used both in the button label and as the name of the
	            downloaded file.
	    """
	    button_label = f"Download {display_name}"
	    st.download_button(
	        label=button_label,
	        data=file_data,
	        file_name=display_name,
	        mime="application/pdf",
	    )

	#
	def main():
	    """Render the splitter UI and offer the grouped PDFs for download.

	    Shows the title, a PDF uploader and a group-size slider. Once a file is
	    uploaded it is split page by page with ``split_pdf_by_pages``, regrouped
	    with ``merge_pdfs_in_groups`` and one download button is rendered per
	    group via ``create_download_link``. Any failure is reported with
	    ``st.error`` instead of propagating.
	    """
	    st.title("PDF Splitter with PyMuPDF 𓁨")

	    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

	    # Pages per output file. The upper bound of 99 presumably keeps each
	    # group within the 100-page limit described in the usage notes.
	    group_size = st.slider(
	        "Group size (pages per group)",
	        min_value=1,
	        max_value=99,
	        value=10,
	        step=1,
	    )

	    # Nothing to do until the user has provided a file.
	    if uploaded_file is None:
	        return

	    try:
	        # getvalue() returns the whole upload regardless of the current
	        # stream position, unlike read(), which yields nothing once the
	        # buffer has already been consumed.
	        file_bytes = uploaded_file.getvalue()

	        # The source document is only needed while splitting; close it
	        # right afterwards instead of leaking it.
	        with fitz.open("pdf", file_bytes) as doc:
	            split_files, total_pages = split_pdf_by_pages(doc)

	        merged_files = merge_pdfs_in_groups(split_files, group_size)

	        st.subheader("Merged Groups")
	        for i, file_obj in merged_files.items():
	            # 1-based inclusive page range covered by this group; the last
	            # group is clamped to the real page count.
	            first = i * group_size + 1
	            last = min((i + 1) * group_size, total_pages)
	            create_download_link(
	                file_obj.getvalue(),
	                f"group_{i+1}_{first}-{last}.pdf"
	            )
	    except Exception as e:
	        # Top-level UI boundary: surface the problem instead of crashing.
	        st.error(f"Error processing PDF: {e}")

	# Script entry point: build the page only when this file is run directly.
	if __name__ == "__main__":
	    main()