# Page-header text captured together with this file (not part of the script):
# Spaces: adelevett / pdf.tocgen.split -- Sleeping
# pdf.tocgen.split -- File size: 3,002 Bytes -- 046e3b8

import fitz  # PyMuPDF
import sys
import os
import re

def _parse_toc(toc_path):
    """Read a ToC file and return its entries as ``(title, page)`` tuples.

    A line is an entry when it looks like ``"Title" 12 ...`` (optionally
    indented); every other line is ignored.  Entries whose page number is
    below 1 are reported on stderr and dropped, because the splitter treats
    page numbers as 1-based and such a value cannot address a page.

    Args:
        toc_path: Path of the UTF-8 encoded ToC text file.

    Returns:
        A list of ``(title, page)`` tuples in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Matches modify_toc style output: "Title" Page ...
    # Also matches standard pdftocgen style.
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    entries = []
    with open(toc_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            match = pattern.match(line)
            if match is None:
                # Blank lines and anything that is not a quoted entry.
                continue
            title = match.group(2)
            page = int(match.group(3))
            if page < 1:
                print(
                    f"Warning: line {line_no}: ignoring '{title}' "
                    f"(invalid page number {page})",
                    file=sys.stderr,
                )
                continue
            entries.append((title, page))
    return entries


def _unique_filename(title, used_names):
    """Return a filesystem-safe ``.pdf`` filename for *title*, unique per run.

    Args:
        title: Raw ToC title.
        used_names: Set of case-folded filenames already handed out; the
            returned name is added to it.

    Returns:
        The filename (no directory part), e.g. ``"Intro.pdf"`` or
        ``"Intro (2).pdf"`` when ``"Intro.pdf"`` was already taken.
    """
    # Replace characters that are forbidden in filenames (slash, colon, ...).
    stem = re.sub(r'[<>:"/\\|?*]', '_', title)
    if not stem.strip():
        # An empty title would otherwise produce a bare ".pdf" file.
        stem = "untitled"

    candidate = f"{stem}.pdf"
    counter = 2
    # Compare case-insensitively so the names stay distinct on filesystems
    # that ignore case as well.
    while candidate.casefold() in used_names:
        candidate = f"{stem} ({counter}).pdf"
        counter += 1
    used_names.add(candidate.casefold())
    return candidate


def main():
    """Split a PDF into one file per ToC entry.

    Command line: ``split_by_toc.py <input.pdf> <input.toc> [output_dir]``
    (``output_dir`` defaults to ``split_output``).

    Every ToC entry becomes its own PDF that starts on the entry's page and
    ends on the page before the next entry (the last entry runs to the end of
    the document).  Entries are taken in file order and their indentation is
    ignored, so nested entries are split exactly like top-level ones.

    Exits with status 1 when too few arguments are given or when the ToC
    file contains no usable entries.
    """
    if len(sys.argv) < 3:
        print("Usage: python split_by_toc.py <input.pdf> <input.toc> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr.  The streams may have been replaced by
    # objects without reconfigure(), so only call it when it exists.
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    toc_path = sys.argv[2]
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "split_output"

    print(f"Splitting '{pdf_path}' based on '{toc_path}'...")

    # 1. Parse ToC into a list of (Title, StartPage)
    entries = _parse_toc(toc_path)
    if not entries:
        print("Error: No ToC entries found.")
        sys.exit(1)

    # 2. Open PDF
    doc = fitz.open(pdf_path)
    try:
        total_pages = doc.page_count

        print(f"Total Pages: {total_pages}")
        print(f"Found {len(entries)} chapters.")
        print("-" * 40)

        # Create the output directory only once there is something to write;
        # exist_ok avoids the check-then-create race.
        os.makedirs(output_dir, exist_ok=True)

        # 3. Iterate and Split
        used_names = set()
        last_idx = total_pages - 1
        for i, (title, start_page) in enumerate(entries):
            print(f"[{i+1}/{len(entries)}] {title}")

            # ToC page numbers are 1-based, PyMuPDF page indices are 0-based.
            start_idx = start_page - 1
            if start_idx > last_idx:
                # Do not hand an out-of-range start page to insert_pdf; skip
                # the entry instead of writing a file with the wrong content.
                print(
                    f"  Skipped: start page {start_page} is beyond "
                    f"the last page ({total_pages})"
                )
                continue

            # Determine End Page
            if i < len(entries) - 1:
                # One page before the next entry starts.
                end_idx = entries[i + 1][1] - 2
            else:
                end_idx = last_idx
            # The next entry may point past the end of the document.
            end_idx = min(end_idx, last_idx)

            # Bookmarks out of order or on the same page would give an empty
            # or reversed range: just grab the single start page.
            if start_idx > end_idx:
                end_idx = start_idx

            out_path = os.path.join(output_dir, _unique_filename(title, used_names))

            print(f"  Pages {start_page} to {end_idx + 1} (Count: {end_idx - start_idx + 1})")

            # Create new PDF for this chapter; close it even if saving fails.
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(out_path)
            finally:
                new_doc.close()
    finally:
        doc.close()

    print("-" * 40)
    print(f"Done! Files saved to '{output_dir}/'")

# Script entry point: run the splitter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    sys.exit(main())