File size: 2,313 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import fitz
import sys
import os
import re

# Characters that are illegal in file names on at least one major platform,
# plus ASCII control characters (bookmark titles may contain stray newlines).
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _chapter_ranges(chapters, total_pages):
    """Return ``(title, first_idx, last_idx)`` for each chapter.

    ``chapters`` is a list of TOC entries ``[lvl, title, page, ...]`` whose
    1-based ``page`` values are already known to be valid. The returned
    indices are 0-based and inclusive; a chapter ends on the page before the
    next chapter starts, and the last chapter runs to the end of the document.
    """
    ranges = []
    for pos, (_lvl, title, start_page, *_) in enumerate(chapters):
        first_idx = start_page - 1
        if pos < len(chapters) - 1:
            # Next chapter's 1-based start page, minus one to make it 0-based,
            # minus one more to stop on the page before it.
            last_idx = chapters[pos + 1][2] - 2
        else:
            last_idx = total_pages - 1
        # Two chapters starting on the same page (or bookmarks listed out of
        # page order) would give an empty range; emit at least the start page.
        last_idx = max(last_idx, first_idx)
        ranges.append((title, first_idx, last_idx))
    return ranges


def _unique_filename(title, fallback, taken):
    """Build a safe, not-yet-used ``.pdf`` file name from a bookmark title.

    ``fallback`` is used as the stem when the title is empty after
    sanitising. ``taken`` is a set of lower-cased names already handed out;
    it is updated in place. Comparison is case-insensitive so that names do
    not collide on case-insensitive file systems.
    """
    stem = _UNSAFE_FILENAME_CHARS.sub('_', title).strip() or fallback
    candidate = f"{stem}.pdf"
    counter = 2
    while candidate.lower() in taken:
        candidate = f"{stem} ({counter}).pdf"
        counter += 1
    taken.add(candidate.lower())
    return candidate


def main():
    """Split a bookmarked PDF into one file per top-level bookmark.

    Command line: ``split_pdf.py <input_with_bookmarks.pdf> [output_dir]``.
    The output directory defaults to ``split_output`` and is created if
    missing. Exits with status 1 when no input path is given, when the PDF
    has no bookmarks, or when no top-level bookmark points at a valid page.
    """
    if len(sys.argv) < 2:
        print("Usage: python split_pdf.py <input_with_bookmarks.pdf> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr so non-ASCII bookmark titles can be
    # printed. Streams replaced by objects without reconfigure() are skipped.
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "split_output"

    os.makedirs(output_dir, exist_ok=True)

    print(f"Reading bookmarks from '{pdf_path}'...")

    doc = fitz.open(pdf_path)
    try:
        toc = doc.get_toc()  # [[lvl, title, page_num, ...], ...]

        if not toc:
            print("Error: No bookmarks found in this PDF.")
            sys.exit(1)

        total_pages = doc.page_count

        # Level 1 bookmarks are the top-level chapters.
        top_level = [entry for entry in toc if entry[0] == 1]
        print(f"Found {len(top_level)} top-level chapters.")

        # Bookmarks without a target page are reported with page -1; they
        # cannot delimit a page range, so leave them out.
        chapters = [entry for entry in top_level if 1 <= entry[2] <= total_pages]
        skipped = len(top_level) - len(chapters)
        if skipped:
            print(f"Warning: skipping {skipped} bookmark(s) without a valid target page.")
        if not chapters:
            print("Error: No top-level bookmark points to a page in this PDF.")
            sys.exit(1)

        taken_names = set()
        ranges = _chapter_ranges(chapters, total_pages)
        for number, (title, first_idx, last_idx) in enumerate(ranges, start=1):
            filename = _unique_filename(title, f"chapter_{number}", taken_names)
            out_path = os.path.join(output_dir, filename)

            print(f"Extracting: {filename} (Pages {first_idx + 1}-{last_idx + 1})")

            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=first_idx, to_page=last_idx)
                new_doc.save(out_path)
            finally:
                new_doc.close()
    finally:
        # Also runs when sys.exit() is called above, so the input file handle
        # is always released.
        doc.close()

    print(f"Done. Files saved to {output_dir}/")

# Run the splitter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()