"""Split a PDF into one file per table-of-contents entry.

Usage:
    python split_by_toc.py <input.pdf> <toc_file> [output_dir]

The ToC file is plain text with one entry per line in the form
``"Title" <page>`` (optionally indented and followed by extra text).
Each entry becomes its own PDF that runs from the entry's start page up to
the page before the next entry's start page; the last entry runs to the end
of the document. Output goes to ``output_dir`` (default: ``split_output``).
"""
import os
import re
import sys
from contextlib import closing

import fitz  # PyMuPDF

# Matches ToC lines of the form:  [indent]"Title" <page> [trailing text]
# (modify_toc-style output; also matches standard pdftocgen output).
_TOC_LINE = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

# Characters that must not appear in a file name (slash, colon, ...).
_FORBIDDEN_CHARS = re.compile(r'[<>:"/\\|?*]')


def _parse_toc(toc_path):
    """Return the list of ``(title, start_page)`` pairs found in *toc_path*.

    Lines that do not match ``"Title" <page> ...`` (including blank lines)
    are ignored. Page numbers are returned exactly as written in the file,
    i.e. 1-based.
    """
    entries = []
    with open(toc_path, 'r', encoding='utf-8') as f:
        for line in f:
            match = _TOC_LINE.match(line)
            if match:
                entries.append((match.group(2), int(match.group(3))))
    return entries


def _page_span(entries, index, total_pages):
    """Return the 0-based inclusive ``(start, end)`` page span of an entry.

    Returns ``None`` when the entry's start page does not exist in the
    document, because any pages copied for it would be the wrong ones.
    """
    start_page = entries[index][1]
    if not 1 <= start_page <= total_pages:
        return None

    # ToC pages are 1-based, PyMuPDF page indices are 0-based.
    start_idx = start_page - 1

    if index + 1 < len(entries):
        # One page before the next chapter's (1-based) start page.
        end_idx = entries[index + 1][1] - 2
    else:
        end_idx = total_pages - 1

    # Never run past the last page, and never produce an empty range:
    # bookmarks that are out of order or share a page yield a single page.
    end_idx = max(start_idx, min(end_idx, total_pages - 1))
    return start_idx, end_idx


def _output_filename(title, used_names):
    """Return a sanitized ``.pdf`` file name for *title* not used before.

    *used_names* is a set of case-folded names already handed out; the
    returned name is added to it. Colliding names get a `` (2)``, `` (3)``,
    ... suffix so that chapters with the same title do not overwrite each
    other (also on case-insensitive filesystems).
    """
    stem = _FORBIDDEN_CHARS.sub('_', title)
    if not stem.strip():
        # An empty title would otherwise become the hidden file ".pdf".
        stem = "untitled"

    candidate = f"{stem}.pdf"
    counter = 1
    while candidate.casefold() in used_names:
        counter += 1
        candidate = f"{stem} ({counter}).pdf"
    used_names.add(candidate.casefold())
    return candidate


def main():
    """Command-line entry point: split ``argv[1]`` using the ToC in ``argv[2]``.

    Exits with status 1 when too few arguments are given or when the ToC
    file contains no recognizable entries. Entries whose start page lies
    outside the document are reported on stderr and skipped.
    """
    if len(sys.argv) < 3:
        print("Usage: python split_by_toc.py <input.pdf> <toc_file> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr so non-ASCII titles can be printed.
    # Streams replaced by objects without reconfigure() are left alone.
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    toc_path = sys.argv[2]
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "split_output"

    os.makedirs(output_dir, exist_ok=True)

    print(f"Splitting '{pdf_path}' based on '{toc_path}'...")

    # 1. Parse ToC
    entries = _parse_toc(toc_path)
    if not entries:
        print("Error: No ToC entries found.")
        sys.exit(1)

    # 2. Open PDF (closed again even if a chapter fails to save)
    with closing(fitz.open(pdf_path)) as doc:
        total_pages = doc.page_count
        print(f"Total Pages: {total_pages}")
        print(f"Found {len(entries)} chapters.")
        print("-" * 40)

        # 3. Iterate and split
        used_names = set()
        for i, (title, start_page) in enumerate(entries):
            print(f"[{i+1}/{len(entries)}] {title}")

            span = _page_span(entries, i, total_pages)
            if span is None:
                print(
                    f" Skipped: start page {start_page} is outside "
                    f"1-{total_pages}",
                    file=sys.stderr,
                )
                continue
            start_idx, end_idx = span

            out_path = os.path.join(
                output_dir, _output_filename(title, used_names)
            )
            print(f" Pages {start_page} to {end_idx + 1} (Count: {end_idx - start_idx + 1})")

            # Create a new PDF holding just this chapter's pages.
            with closing(fitz.open()) as new_doc:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(out_path)

    print("-" * 40)
    print(f"Done! Files saved to '{output_dir}/'")


if __name__ == "__main__":
    main()