"""Streamlit app that splits an uploaded PDF into fixed-size page groups.

Each group is offered as a separate download so that the individual files
stay small enough to be fed to services with page/size limits.
"""

import io
from typing import Dict, Tuple

import fitz  # PyMuPDF
import streamlit as st

# Introductory text rendered at the top of the page.
description = st.empty()
description.markdown("""
If the PDF is more than 100 pages, it cannot be uploaded to Google Translate as is,
so split the PDF into an appropriate number of pages.

The guideline is to keep it within 100 pages and 10MB,
and if it exceeds this limit, Google Translate will not translate it.

It's easy to use. Just upload📤 your PDF.

To simply split into 10 pages each:
Group size (pages per group) == 10

PDF merger App is [HERE](https://huggingface.co/spaces/kuroiikimono/PDF_merger_Streamlit2)
""", unsafe_allow_html=True)

st.video("https://youtu.be/ohwHEdeZE1Q?si=dQ7CQf3LmRkDJ-OM")


def split_pdf_by_pages(doc) -> Tuple[Dict[int, io.BytesIO], int]:
    """Split *doc* into one in-memory PDF per page.

    Args:
        doc: An open PyMuPDF document.

    Returns:
        A ``(split_files, total_pages)`` tuple. ``split_files`` maps the
        0-based page number to a ``BytesIO`` holding that page as a
        standalone PDF, rewound to position 0.

    Raises:
        Exception: Any error raised while processing a page is shown with
            ``st.error`` and then re-raised unchanged.
    """
    total_pages = doc.page_count
    split_files = {}

    for page_num in range(total_pages):
        try:
            # The temporary single-page document is closed as soon as it has
            # been serialized; only the BytesIO buffer is kept.
            with fitz.open() as single_page_doc:
                single_page_doc.insert_pdf(
                    doc, from_page=page_num, to_page=page_num
                )
                single_page_doc.subset_fonts()
                output_pdf = io.BytesIO()
                single_page_doc.save(output_pdf)

            # Rewind so consumers can read the buffer from the start.
            output_pdf.seek(0)
            split_files[page_num] = output_pdf

            st.write(f"Page {page_num + 1} processed.")
        except Exception as e:
            st.error(f"Error processing page {page_num + 1}: {e}")
            raise

    return split_files, total_pages


def merge_pdfs_in_groups(split_files, group_size=50) -> Dict[int, io.BytesIO]:
    """Merge single-page PDFs into consecutive groups of ``group_size`` pages.

    Args:
        split_files: Mapping of 0-based page number to a file-like object
            containing that page as a PDF (keys ``0 .. len - 1``).
        group_size: Maximum number of pages per merged PDF; must be >= 1.

    Returns:
        Mapping of 0-based group index to a ``BytesIO`` holding the merged
        PDF, rewound to position 0. The last group may be shorter.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
        Exception: Any error raised while merging a group is shown with
            ``st.error`` and then re-raised unchanged.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be at least 1, got {group_size}")

    page_count = len(split_files)
    merged_files = {}

    for start in range(0, page_count, group_size):
        group_index = start // group_size
        try:
            with fitz.open() as group_doc:
                stop = min(start + group_size, page_count)
                for page_num in range(start, stop):
                    pdf_file = split_files[page_num]
                    pdf_file.seek(0)

                    # Close each source page right after it has been copied
                    # into the group document.
                    with fitz.open("pdf", pdf_file.read()) as page_doc:
                        group_doc.insert_pdf(page_doc)

                group_doc.subset_fonts()
                output_pdf = io.BytesIO()
                # Compress streams/fonts and garbage-collect unused objects
                # to keep each group file small.
                group_doc.save(
                    output_pdf,
                    deflate=True,
                    garbage=4,
                    deflate_fonts=True,
                    use_objstms=1,
                )

            # Rewind so consumers can read the buffer from the start.
            output_pdf.seek(0)
            merged_files[group_index] = output_pdf

            # Progress message shown in the Streamlit page.
            st.write(f"Merged group {group_index + 1} processed.")
        except Exception as e:
            # Error message shown in the Streamlit page.
            st.error(f"Error merging group {group_index + 1}: {e}")
            raise

    return merged_files


def create_download_link(file_data, display_name):
    """Render a Streamlit download button for one PDF.

    Args:
        file_data: The PDF content passed to ``st.download_button`` as data.
        display_name: Used both in the button label and as the file name.
    """
    st.download_button(
        label=f"Download {display_name}",
        data=file_data,
        file_name=display_name,
        mime="application/pdf",
    )


def main():
    """Run the app: upload a PDF, split it, and offer each group to download."""
    st.title("PDF Splitter with PyMuPDF 𓁨")

    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

    group_size = st.slider(
        "Group size (pages per group)",
        min_value=1,
        max_value=99,
        value=10,
        step=1,
    )

    if uploaded_file is None:
        return

    try:
        # getvalue() returns the whole buffer regardless of the current
        # stream position, unlike read().
        file_bytes = uploaded_file.getvalue()

        # The source document is only needed while splitting.
        with fitz.open("pdf", file_bytes) as doc:
            split_files, total_pages = split_pdf_by_pages(doc)

        merged_files = merge_pdfs_in_groups(split_files, group_size)

        st.subheader("Merged Groups")

        for i, file_obj in merged_files.items():
            # 1-based page range covered by this group; the final group is
            # clamped to the real page count.
            first = i * group_size + 1
            last = min((i + 1) * group_size, total_pages)
            create_download_link(
                file_obj.getvalue(),
                f"group_{i+1}_{first}-{last}.pdf",
            )
    except Exception as e:
        st.error(f"Error processing PDF: {e}")


if __name__ == "__main__":
    main()