|
|
import streamlit as st |
|
|
import fitz |
|
|
import io |
|
|
|
|
|
|
|
|
# Introductory help text rendered (as Markdown) at the top of the page.
_INTRO_MARKDOWN = """
If the PDF is more than 100 pages, it cannot be uploaded to Google Translate as is, so split the PDF into an appropriate number of pages.
The guideline is to keep it within 100 pages and 10MB, and if it exceeds this limit, Google Translate will not translate it.

It's easy to use. Just upload📤 your PDF.

To simply split into 10 pages each:
Group size (pages per group) == 10

PDF merger App is [HERE](https://huggingface.co/spaces/kuroiikimono/PDF_merger_Streamlit2)

"""

# Video embedded directly below the introduction.
_INTRO_VIDEO_URL = "https://youtu.be/ohwHEdeZE1Q?si=dQ7CQf3LmRkDJ-OM"

# Keep the placeholder under the module-level name `description` and fill it
# with the introduction text.
description = st.empty()
description.markdown(_INTRO_MARKDOWN, unsafe_allow_html=True)

st.video(_INTRO_VIDEO_URL)
|
|
|
|
|
|
|
|
def split_pdf_by_pages(doc):
    """Split an open PDF into standalone single-page PDFs held in memory.

    Args:
        doc: An open PyMuPDF document. Only ``page_count`` is read directly;
            the document itself is passed to ``insert_pdf`` as the source.

    Returns:
        A ``(split_files, total_pages)`` tuple. ``split_files`` maps each
        zero-based page index to an ``io.BytesIO`` rewound to offset 0 that
        holds that page saved as its own PDF; ``total_pages`` is
        ``doc.page_count``.

    Raises:
        Exception: Any error raised while copying or saving a page. It is
            reported through ``st.error`` (with the 1-based page number) and
            then re-raised unchanged.
    """
    total_pages = doc.page_count
    split_files = {}

    for page_num in range(total_pages):
        try:
            # The context manager closes the temporary document even if the
            # copy or save fails, so one document is not leaked per page.
            with fitz.open() as new_doc:
                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                new_doc.subset_fonts()

                output_pdf = io.BytesIO()
                new_doc.save(output_pdf)

            # Rewind so consumers can read the buffer from the start.
            output_pdf.seek(0)
            split_files[page_num] = output_pdf

        except Exception as e:
            st.error(f"Error processing page {page_num + 1}: {e}")
            raise

    return split_files, total_pages
|
|
|
|
|
|
|
|
def merge_pdfs_in_groups(split_files, group_size=50):
    """Merge single-page PDFs into consecutive groups of ``group_size`` pages.

    Args:
        split_files: Mapping from zero-based page index (``0`` up to
            ``len(split_files) - 1``) to a seekable binary stream containing
            that page as a PDF.
        group_size: Maximum number of pages per merged PDF. The final group
            holds whatever pages remain.

    Returns:
        A dict mapping the zero-based group index to an ``io.BytesIO``
        rewound to offset 0 that holds the merged PDF for that group.

    Raises:
        Exception: Any error raised while building or saving a group. It is
            reported through ``st.error`` (with the 1-based group number) and
            then re-raised unchanged.
    """
    merged_files = {}
    page_total = len(split_files)

    for i in range(0, page_total, group_size):
        group_index = i // group_size
        try:
            # Both the output document and every per-page source document are
            # opened as context managers so they are closed on success and on
            # failure alike.
            with fitz.open() as new_doc:
                for page_num in range(i, min(i + group_size, page_total)):
                    pdf_file = split_files[page_num]
                    pdf_file.seek(0)
                    with fitz.open("pdf", pdf_file.read()) as temp_doc:
                        new_doc.insert_pdf(temp_doc)

                new_doc.subset_fonts()
                output_pdf = io.BytesIO()

                new_doc.save(
                    output_pdf,
                    deflate=True,
                    garbage=4,
                    deflate_fonts=True,
                    use_objstms=1,
                )

            # Rewind so consumers can read the buffer from the start.
            output_pdf.seek(0)
            merged_files[group_index] = output_pdf

        except Exception as e:
            st.error(f"Error merging group {group_index + 1}: {e}")
            raise

    return merged_files
|
|
|
|
|
|
|
|
def create_download_link(file_data, display_name):
    """Render a Streamlit download button for one PDF.

    Args:
        file_data: The PDF content handed to ``st.download_button`` as the
            downloadable data.
        display_name: Used both in the button label and as the file name
            offered for the download.
    """
    button_label = f"Download {display_name}"
    st.download_button(
        button_label,
        file_data,
        file_name=display_name,
        mime="application/pdf",
    )
|
|
|
|
|
|
|
|
def main():
    """Render the app: upload a PDF, choose a group size, offer the pieces.

    The uploaded PDF is split into single pages, the pages are re-merged into
    groups of ``group_size`` pages, and one download button is rendered per
    group. Any failure during processing is shown with ``st.error`` instead
    of propagating.
    """
    st.title("PDF Splitter with PyMuPDF 𓁨")

    uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

    group_size = st.slider(
        "Group size (pages per group)",
        min_value=1,
        max_value=99,
        value=10,
        step=1,
    )

    if uploaded_file is None:
        return

    try:
        # getvalue() returns the complete upload regardless of the stream's
        # current read position, unlike read().
        file_bytes = uploaded_file.getvalue()

        # Close the source document as soon as the pages have been copied
        # out; the per-page buffers are independent of it.
        with fitz.open("pdf", file_bytes) as doc:
            split_files, total_pages = split_pdf_by_pages(doc)

        merged_files = merge_pdfs_in_groups(split_files, group_size)

        st.subheader("Merged Groups")
        for i, file_obj in merged_files.items():
            # 1-based, inclusive page range covered by this group; the last
            # group is clamped to the real page count.
            first = i * group_size + 1
            last = min((i + 1) * group_size, total_pages)
            create_download_link(
                file_obj.getvalue(),
                f"group_{i+1}_{first}-{last}.pdf",
            )

    except Exception as e:
        # Top-level UI boundary: report the failure to the user.
        st.error(f"Error processing PDF: {e}")
|
|
|
|
|
# Build the UI only when this file is executed as the main script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()