"""Split a PDF into one file per top-level bookmark (chapter).

Usage: python split_pdf.py <input_pdf> [output_dir]
"""

import os
import re
import sys

import fitz

# Characters that are illegal in Windows file names, plus ASCII control
# characters (bookmark titles occasionally contain newlines or tabs).
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _force_utf8_output():
    """Switch stdout/stderr to UTF-8 so non-ASCII bookmark titles can be printed."""
    for stream in (sys.stdout, sys.stderr):
        # Streams replaced by a capture object or a pipe wrapper may not
        # offer reconfigure(); leave those untouched instead of crashing.
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")


def _safe_stem(title, fallback):
    """Return *title* as a file-name stem, or *fallback* if nothing usable is left."""
    stem = _UNSAFE_FILENAME_CHARS.sub("_", title).strip()
    return stem or fallback


def _unique_filename(stem, used):
    """Return ``<stem>.pdf``, adding `` (2)``, `` (3)``, ... if the name is taken.

    *used* is a set of case-folded names already handed out; it is updated in
    place. The comparison is case-insensitive because common file systems are.
    """
    candidate = f"{stem}.pdf"
    counter = 2
    while candidate.casefold() in used:
        candidate = f"{stem} ({counter}).pdf"
        counter += 1
    used.add(candidate.casefold())
    return candidate


def _chapter_page_ranges(toc, total_pages):
    """Return ``(title, first_page, last_page)`` for each usable level-1 bookmark.

    Page numbers are 1-based and inclusive. A chapter runs up to the page
    before the next level-1 bookmark; the last chapter runs to the end of the
    document. Bookmarks whose target page lies outside ``1..total_pages``
    (PyMuPDF reports -1 for bookmarks without an in-document target) are
    dropped, because they cannot be mapped to a page range.
    """
    starts = [
        (title, page)
        for level, title, page, *_ in toc
        if level == 1 and 1 <= page <= total_pages
    ]
    # Pair every chapter with the start of its successor; the sentinel makes
    # the final chapter end on the document's last page.
    next_starts = [page for _, page in starts[1:]] + [total_pages + 1]
    return [
        # max() keeps at least one page when two chapters share a start page
        # or the bookmarks are not in page order.
        (title, start_page, max(next_start - 1, start_page))
        for (title, start_page), next_start in zip(starts, next_starts)
    ]


def main():
    """Split the PDF named on the command line into one PDF per chapter.

    ``sys.argv[1]`` is the input PDF and the optional ``sys.argv[2]`` is the
    output directory (default ``split_output``). Progress is printed to
    stdout and errors to stderr. Exits with status 1 on bad usage, a missing
    input file, or a PDF without usable bookmarks.
    """
    if len(sys.argv) < 2:
        print("Usage: python split_pdf.py <input_pdf> [output_dir]", file=sys.stderr)
        sys.exit(1)

    _force_utf8_output()

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "split_output"

    if not os.path.isfile(pdf_path):
        print(f"Error: '{pdf_path}' is not a file.", file=sys.stderr)
        sys.exit(1)

    print(f"Reading bookmarks from '{pdf_path}'...")
    doc = fitz.open(pdf_path)
    try:
        toc = doc.get_toc()  # [[lvl, title, page_num, ...], ...]
        if not toc:
            print("Error: No bookmarks found in this PDF.", file=sys.stderr)
            sys.exit(1)

        top_level_count = sum(1 for entry in toc if entry[0] == 1)
        print(f"Found {top_level_count} top-level chapters.")

        ranges = _chapter_page_ranges(toc, doc.page_count)
        skipped = top_level_count - len(ranges)
        if skipped:
            print(
                f"Warning: skipping {skipped} bookmark(s) without a valid target page.",
                file=sys.stderr,
            )
        if not ranges:
            print("Error: No top-level bookmark points to a page.", file=sys.stderr)
            sys.exit(1)

        # Create the directory only once we know there is something to write.
        os.makedirs(output_dir, exist_ok=True)

        used_names = set()
        for number, (title, start_page, end_page) in enumerate(ranges, start=1):
            stem = _safe_stem(title, f"chapter_{number}")
            filename = _unique_filename(stem, used_names)
            out_path = os.path.join(output_dir, filename)

            print(f"Extracting: {filename} (Pages {start_page}-{end_page})")

            new_doc = fitz.open()
            try:
                # insert_pdf takes 0-based, inclusive page indices.
                new_doc.insert_pdf(doc, from_page=start_page - 1, to_page=end_page - 1)
                new_doc.save(out_path)
            finally:
                new_doc.close()
    finally:
        doc.close()

    print(f"Done. Files saved to {output_dir}/")


if __name__ == "__main__":
    main()