"""
TOC Processor
-------------
Handles operations related to the Table of Contents (TOC) for the PDF pipeline.
Includes functionality for:
- Cleaning and sanitizing text (encoding issues, soft hyphens)
- Merging usage-heuristic headers (e.g. multi-line headers on same page)
- Generating split PDF chapters
"""
import re
import io
import zipfile
import fitz # PyMuPDF
from typing import List, Tuple, Generator, Optional
import tempfile
import os
# Type alias for a PyMuPDF-style TOC entry: [level, title, page, ...].
# Kept as a plain `list` (not a Tuple) because fitz returns mutable lists
# and this module mutates titles in place.
FitZTOCEntry = list
def clean_text(text: str) -> str:
    """
    Sanitize text to remove common PDF artifacts.

    Removes soft hyphens, normalizes non-breaking spaces, standardizes
    dashes, strips non-printable control characters, and collapses all
    runs of whitespace to single spaces.

    Args:
        text: Raw text extracted from a PDF (may be empty).

    Returns:
        The cleaned, single-spaced string ("" for falsy input).
    """
    if not text:
        return ""
    # Replace non-breaking spaces (\xa0) and strip soft hyphens (\xad)
    text = text.replace('\xa0', ' ').replace('\xad', '')
    # Replace en-dash and em-dash with standard hyphen
    text = text.replace('\u2013', '-').replace('\u2014', '-')
    # Remove control characters, but KEEP whitespace (\n, \t are not
    # "printable") so words separated only by a line break don't get
    # fused together; the final split()/join collapses whitespace anyway.
    text = "".join(ch for ch in text if ch.isprintable() or ch.isspace())
    return ' '.join(text.split())
def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
    """
    Parse raw `pdftocgen`/`pdftocio` text output into structured TOC entries.

    Each matching input line looks like: '    "Chapter Title" 123', where
    leading indentation (4 spaces per step) encodes the heading level.

    Args:
        raw_output: The raw multi-line tool output.

    Returns:
        A list of [level, title, page] entries; non-matching lines are skipped.
    """
    # Groups: 1=leading whitespace, 2=quoted title, 3=page number, 4=trailing junk
    line_re = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')
    entries: List[FitZTOCEntry] = []
    for raw_line in raw_output.splitlines():
        m = line_re.match(raw_line)
        if m is None:
            continue
        leading_ws, heading, page_text = m.group(1), m.group(2), m.group(3)
        # 0 spaces -> level 1, 4 spaces -> level 2, etc. (pdftocgen default)
        entries.append([len(leading_ws) // 4 + 1, heading, int(page_text)])
    return entries
def merge_same_page_headers(toc: List[FitZTOCEntry]) -> List[FitZTOCEntry]:
    """
    Merge consecutive Level 1 headers that point at the same page.

    This fixes the "double split" issue where a multi-line header is
    detected as several separate TOC entries.

    Example:
        Input:  [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
        Output: [[1, "Title Part 1 Title Part 2", 10]]

    Args:
        toc: List of [level, title, page] entries.

    Returns:
        A new list with same-page Level 1 neighbors merged. The input list
        and its entries are NOT mutated (the original implementation
        mutated the caller's entries in place when merging).
    """
    if not toc:
        return []
    merged: List[FitZTOCEntry] = []
    for entry in toc:
        level, title, page = entry[0], entry[1], entry[2]
        # Merge criteria: current AND previous kept entry are both Level 1
        # and share the same page number.
        if level == 1 and merged and merged[-1][0] == 1 and merged[-1][2] == page:
            merged[-1][1] = f"{merged[-1][1]} {title}"
            continue
        # Copy so later merges never write back into the caller's entries.
        merged.append(list(entry))
    return merged
def process_toc(raw_toc_content: str) -> str:
    """
    Run the full TOC cleanup pipeline on raw tool output.

    Steps: parse the raw text, sanitize every title, merge same-page
    Level 1 headers (the double-split fix), then re-serialize in the
    indented format `pdftocio` expects ('    "Title" Page').

    Bookmarks are kept clean (no numeric prefixes); file-name numbering
    is handled by generate_chapter_splits instead.

    Args:
        raw_toc_content: Raw text output from `pdftocgen`/`pdftocio`.

    Returns:
        The cleaned TOC as newline-joined, indentation-leveled lines.
    """
    entries = parse_raw_toc_output(raw_toc_content)
    for item in entries:
        item[1] = clean_text(item[1])
    deduped = merge_same_page_headers(entries)
    # 4 spaces of indent per level beyond the first.
    serialized = [
        f'{" " * (4 * (item[0] - 1))}"{item[1]}" {item[2]}'
        for item in deduped
    ]
    return "\n".join(serialized)
def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
    """
    Split the PDF at its Level 1 TOC entries and write the pieces to a ZIP.

    Archive contents:
      - ``000_Front_matter.pdf``: pages before the first Level 1 chapter
        (only when the first chapter starts after page 1).
      - ``NNN_<Title>_pgX.pdf``: one file per Level 1 chapter, numbered
        sequentially; X is the chapter's 1-based start page.
      - ``999_Back_matter.pdf``: pages from ``back_matter_start_page`` to
        the end, when that argument is given.

    Args:
        input_pdf_path: Path to the source PDF.
        output_zip_path: Path where the ZIP archive is written.
        back_matter_start_page: 1-based page number where back matter
            starts. Chapters are clamped to end before this page, and
            chapters starting inside the back matter are skipped.

    Raises:
        ValueError: If the PDF contains no table of contents.
    """
    doc = fitz.open(input_pdf_path)
    try:
        toc = doc.get_toc()
        if not toc:
            raise ValueError("No Table of Contents found in the PDF.")
        with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            total_pages = doc.page_count
            # --- Front Matter Extraction ---
            # Locate the first Level 1 chapter; everything before it is front matter.
            first_l1_page = None
            for entry in toc:
                if entry[0] == 1:
                    first_l1_page = entry[2]
                    break
            if first_l1_page and first_l1_page > 1:
                # Front matter spans page index 0 .. (first_l1_page - 2).
                fm_end_idx = first_l1_page - 2
                if fm_end_idx >= 0:
                    fm_doc = fitz.open()
                    fm_doc.insert_pdf(doc, from_page=0, to_page=fm_end_idx)
                    zf.writestr("000_Front_matter.pdf", fm_doc.tobytes())
                    fm_doc.close()
            # --- Chapter Extraction ---
            chapter_idx = 1
            for i, entry in enumerate(toc):
                level, title, start_page = entry[0], entry[1], entry[2]
                # Only Level 1 entries define split points.
                if level != 1:
                    continue
                # Skip chapters that start at or inside the back matter.
                if back_matter_start_page and start_page >= back_matter_start_page:
                    continue
                start_idx = start_page - 1
                # Default: chapter runs to the end of the document unless a
                # later Level 1 entry starts the next chapter.
                end_page = total_pages
                for next_entry in toc[i + 1:]:
                    if next_entry[0] == 1:
                        end_page = next_entry[2] - 1
                        break
                # Clamp so no chapter bleeds into the back matter.
                # e.g. back matter starts p.100, natural end p.105 -> clamp to p.99.
                if back_matter_start_page and end_page >= back_matter_start_page:
                    end_page = back_matter_start_page - 1
                end_idx = end_page - 1
                # Safety clamp: a chapter is never shorter than one page.
                if end_idx < start_idx:
                    end_idx = start_idx
                new_doc = fitz.open()
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                # Keep only filesystem-safe characters in the filename.
                safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
                if not safe_title:
                    safe_title = f"chapter_{chapter_idx}"
                # Format: 001_Title_pgX.pdf
                pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
                chapter_idx += 1
                zf.writestr(pdf_name, new_doc.tobytes())
                new_doc.close()
            # --- Back Matter Generation ---
            if back_matter_start_page and back_matter_start_page <= total_pages:
                bm_doc = fitz.open()
                bm_doc.insert_pdf(doc, from_page=back_matter_start_page - 1, to_page=total_pages - 1)
                zf.writestr("999_Back_matter.pdf", bm_doc.tobytes())
                bm_doc.close()
    finally:
        # Release the source document even if splitting fails midway —
        # the original only closed it on the happy path and the no-TOC path.
        doc.close()