File size: 8,985 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""

TOC Processor

-------------

Handles operations related to the Table of Contents (TOC) for the PDF pipeline.

Includes functionality for:

- Cleaning and sanitizing text (encoding issues, soft hyphens)

- Merging heuristically split headers (e.g. multi-line headers on the same page)

- Generating per-chapter split PDFs packaged as a ZIP archive

"""

import re
import io
import zipfile
import fitz  # PyMuPDF
from typing import List, Tuple, Generator, Optional
import tempfile
import os

# Type alias for a PyMuPDF TOC entry: [level, title, page, ...].
# NOTE(review): kept as a bare `list` (not a parameterized generic) because
# fitz returns plain lists of mixed types; entries are indexed positionally.
FitZTOCEntry = list

def clean_text(text: str) -> str:
    """Normalize a string extracted from a PDF.

    Converts non-breaking spaces to regular spaces, strips soft hyphens,
    normalizes en/em dashes to '-', drops non-printable characters, and
    collapses whitespace runs into single spaces.
    """
    if not text:
        return ""

    # Single-pass character normalization: NBSP -> space, soft hyphen -> drop,
    # en-dash / em-dash -> plain hyphen.
    char_map = str.maketrans({'\xa0': ' ', '\xad': None, '\u2013': '-', '\u2014': '-'})
    normalized = text.translate(char_map)

    # Remove control and other non-printable characters.
    visible = "".join(ch for ch in normalized if ch.isprintable())

    # Collapse internal whitespace and trim the ends.
    return ' '.join(visible.split())

def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
    """Parse `pdftocgen`/`pdftocio` text output into [level, title, page] entries.

    Each recognized line looks like: '    "Chapter Title" 123'.
    Indentation depth encodes the outline level (4 spaces per step; no
    indent means level 1). Lines that do not match are silently skipped.
    """
    # Groups: 1=leading whitespace, 2=quoted title, 3=page number, 4=trailer (ignored)
    line_re = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    entries: List[FitZTOCEntry] = []
    for raw_line in raw_output.splitlines():
        parsed = line_re.match(raw_line)
        if parsed is None:
            continue
        indent = parsed.group(1)
        # pdftocgen indents 4 spaces per nesting step, so depth = indent // 4.
        entries.append([len(indent) // 4 + 1, parsed.group(2), int(parsed.group(3))])
    return entries

def merge_same_page_headers(toc: "List[FitZTOCEntry]") -> "List[FitZTOCEntry]":
    """Merge consecutive Level 1 headers that point at the same page.

    Fixes the "double split" issue where a multi-line chapter heading is
    detected as several separate TOC entries: consecutive Level 1 entries
    on the same page are joined into a single entry whose titles are
    concatenated with a space.

    Unlike the previous implementation, the input list and its entry
    objects are left unmodified; a new list of new entry lists is returned.

    Args:
        toc: TOC entries shaped [level, title, page, ...].

    Returns:
        A new list of merged entries (same shape as the input entries).

    Example:
        Input:  [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
        Output: [[1, "Title Part 1 Title Part 2", 10]]
    """
    merged_toc = []

    for entry in toc:
        level, title, page = entry[0], entry[1], entry[2]

        # Only Level 1 headers are merge candidates.
        if level == 1 and merged_toc:
            prev_entry = merged_toc[-1]
            # CRITERIA: previous entry is also Level 1 and on the same page.
            if prev_entry[0] == 1 and prev_entry[2] == page:
                # Safe to mutate: prev_entry is our own copy, not the caller's.
                prev_entry[1] = f"{prev_entry[1]} {title}"
                continue

        # Append a shallow copy so later merges never touch the caller's data.
        merged_toc.append(list(entry))

    return merged_toc

def process_toc(raw_toc_content: str) -> str:
    """Run the full TOC clean-up pipeline on raw `pdftocgen` output.

    Steps: parse -> sanitize titles -> merge same-page Level 1 headers ->
    re-serialize into the indented '"Title" page' format accepted by
    `pdftocio`. Bookmarks are deliberately kept free of numeric prefixes;
    any file-name numbering happens later, in generate_chapter_splits.
    """
    entries = parse_raw_toc_output(raw_toc_content)

    # Sanitize every title before merging so merged titles are clean too.
    for item in entries:
        item[1] = clean_text(item[1])

    deduped = merge_same_page_headers(entries)

    # Re-serialize: 4 spaces of indent per level beyond the first.
    rendered = [
        f'{" " * (4 * (item[0] - 1))}"{item[1]}" {item[2]}'
        for item in deduped
    ]
    return "\n".join(rendered)

def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
    """Split a PDF into per-chapter PDFs (by Level 1 TOC entries) inside a ZIP.

    Output ZIP layout:
        000_Front_matter.pdf   pages before the first Level 1 chapter (if any)
        NNN_<Title>_pgX.pdf    one file per Level 1 chapter
        999_Back_matter.pdf    pages from back_matter_start_page onward (if given)

    Args:
        input_pdf_path: Path to the source PDF.
        output_zip_path: Path where the ZIP archive is written.
        back_matter_start_page: Optional 1-based page number where back matter
            starts. Chapters are clamped to end before this page, and pages
            from here to the end are emitted as 999_Back_matter.pdf.

    Raises:
        ValueError: If the PDF has no table of contents.
    """
    doc = fitz.open(input_pdf_path)
    try:
        toc = doc.get_toc()
        if not toc:
            raise ValueError("No Table of Contents found in the PDF.")

        with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            total_pages = doc.page_count

            # --- Front Matter: pages before the first Level 1 chapter ---
            first_l1_page = next((e[2] for e in toc if e[0] == 1), None)
            if first_l1_page and first_l1_page > 1:
                # 0-based index of the last front-matter page.
                fm_end_idx = first_l1_page - 2
                if fm_end_idx >= 0:
                    _write_page_range(zf, doc, "000_Front_matter.pdf", 0, fm_end_idx)

            # --- Chapter Extraction ---
            chapter_idx = 1
            for i, entry in enumerate(toc):
                level, title, start_page = entry[0], entry[1], entry[2]

                # Only Level 1 entries define split points.
                if level != 1:
                    continue

                # Chapters starting inside the back matter belong to it, not here.
                if back_matter_start_page and start_page >= back_matter_start_page:
                    continue

                start_idx = start_page - 1

                # End page = page before the next Level 1 entry (or the last page).
                end_page = total_pages
                for next_entry in toc[i + 1:]:
                    if next_entry[0] == 1:
                        end_page = next_entry[2] - 1
                        break

                # Clamp: a chapter must end before the back matter starts.
                # e.g. back matter at pg 100, natural end pg 105 -> clamp to pg 99.
                if back_matter_start_page and end_page >= back_matter_start_page:
                    end_page = back_matter_start_page - 1

                # Safety: never let the page span go negative.
                end_idx = max(end_page - 1, start_idx)

                # Keep only filesystem-safe characters in the file name.
                safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
                if not safe_title:
                    safe_title = f"chapter_{chapter_idx}"

                # Naming scheme: 001_Title_pgX.pdf
                pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
                chapter_idx += 1

                _write_page_range(zf, doc, pdf_name, start_idx, end_idx)

            # --- Back Matter Generation ---
            if back_matter_start_page and back_matter_start_page <= total_pages:
                _write_page_range(zf, doc, "999_Back_matter.pdf",
                                  back_matter_start_page - 1, total_pages - 1)
    finally:
        # Always release the source document, even if zip writing fails.
        doc.close()


def _write_page_range(zf, doc, name: str, from_idx: int, to_idx: int) -> None:
    """Copy pages [from_idx, to_idx] (0-based, inclusive) of *doc* into *zf* as *name*."""
    part = fitz.open()
    try:
        part.insert_pdf(doc, from_page=from_idx, to_page=to_idx)
        zf.writestr(name, part.tobytes())
    finally:
        part.close()