pdf.tocgen.split / utils /split_pdf.py
adelevett's picture
Upload 76 files
046e3b8 verified
import fitz
import sys
import os
import re
def _chapter_ranges(chapters, total_pages):
    """Compute the page range covered by each top-level chapter.

    A chapter runs from its own start page up to the page before the next
    chapter's start page; the last chapter runs to the end of the document.

    Args:
        chapters: Level-1 TOC entries in ``[lvl, title, page, ...]`` form,
            where ``page`` is 1-based.
        total_pages: Number of pages in the source document.

    Returns:
        A ``(ranges, skipped)`` tuple. ``ranges`` is a list of
        ``(title, start_idx, end_idx)`` with 0-based inclusive indices.
        ``skipped`` is a list of ``(title, page)`` for entries whose start
        page lies outside ``1..total_pages``.
    """
    usable = []
    skipped = []
    for _lvl, title, start_page, *_ in chapters:
        # Bookmarks without a target inside this document carry a page
        # number below 1; passing that on would yield a negative page index.
        if 1 <= start_page <= total_pages:
            usable.append((title, start_page))
        else:
            skipped.append((title, start_page))

    ranges = []
    for pos, (title, start_page) in enumerate(usable):
        start_idx = start_page - 1
        if pos + 1 < len(usable):
            # Next chapter's 1-based start page, converted to a 0-based
            # index (-1), then stepped back to the page before it (-1).
            end_idx = usable[pos + 1][1] - 2
        else:
            end_idx = total_pages - 1
        # Chapters sharing a start page (or listed out of order) would give
        # an empty or negative range; always keep at least the start page.
        ranges.append((title, start_idx, max(end_idx, start_idx)))
    return ranges, skipped


def _safe_filename(title, index, used):
    """Build a unique, filesystem-safe ``<title>.pdf`` name for a chapter.

    Args:
        title: Bookmark title taken from the TOC.
        index: 1-based chapter number, used for the fallback name when the
            title is empty after cleaning.
        used: Set of lower-cased names already handed out in this run; the
            returned name is added to it.

    Returns:
        A file name ending in ``.pdf`` that was not returned before for the
        same ``used`` set.
    """
    # Clean the title before appending the extension so that surrounding
    # whitespace is really removed; control characters are replaced as well.
    stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', title).strip()
    if not stem:
        stem = f"chapter_{index}"

    candidate = f"{stem}.pdf"
    counter = 2
    # Compare case-insensitively so names cannot collide on file systems
    # that ignore case.
    while candidate.lower() in used:
        candidate = f"{stem} ({counter}).pdf"
        counter += 1
    used.add(candidate.lower())
    return candidate


def main():
    """Split a bookmarked PDF into one file per top-level chapter.

    Command line: ``split_pdf.py <input_with_bookmarks.pdf> [output_dir]``.
    The output directory defaults to ``split_output`` and is created when
    missing. Exits with status 1 when the input argument is missing or the
    PDF has no bookmarks.
    """
    if len(sys.argv) < 2:
        print("Usage: python split_pdf.py <input_with_bookmarks.pdf> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr so non-ASCII titles can be printed.
    # Replaced streams may lack reconfigure(), so only call it when present.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "split_output"
    os.makedirs(output_dir, exist_ok=True)

    print(f"Reading bookmarks from '{pdf_path}'...")
    # The context manager closes the source document on every exit path,
    # including the sys.exit() below.
    with fitz.open(pdf_path) as doc:
        toc = doc.get_toc()  # [[lvl, title, page_num, ...], ...]
        if not toc:
            print("Error: No bookmarks found in this PDF.")
            sys.exit(1)

        # Only level-1 bookmarks (top-level chapters) define split points.
        chapters = [entry for entry in toc if entry[0] == 1]
        print(f"Found {len(chapters)} top-level chapters.")

        ranges, skipped = _chapter_ranges(chapters, doc.page_count)
        for title, page in skipped:
            print(f"Skipping '{title}': bookmark has no valid target page ({page}).")

        used_names = set()
        for number, (title, start_idx, end_idx) in enumerate(ranges, start=1):
            filename = _safe_filename(title, number, used_names)
            out_path = os.path.join(output_dir, filename)
            print(f"Extracting: {filename} (Pages {start_idx + 1}-{end_idx + 1})")

            with fitz.open() as new_doc:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(out_path)

    print(f"Done. Files saved to {output_dir}/")
# Run the splitter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()