""" TOC Processor ------------- Handles operations related to the Table of Contents (TOC) for the PDF pipeline. Includes functionality for: - Cleaning and sanitizing text (encoding issues, soft hyphens) - Merging usage-heuristic headers (e.g. multi-line headers on same page) - generating split PDF chapters """ import re import io import zipfile import fitz # PyMuPDF from typing import List, Tuple, Generator, Optional import tempfile import os # Type alias for TOC entry: [level, title, page, ...] FitZTOCEntry = list def clean_text(text: str) -> str: """ Sanitize text to remove common PDF artifacts. Removes soft hyphens, fixes non-breaking spaces, and standardizes dashes. """ if not text: return "" # Replace non-breaking spaces (\xa0) and soft hyphens (\xad) text = text.replace('\xa0', ' ').replace('\xad', '') # Replace en-dash and em-dash with standard hyphen text = text.replace('\u2013', '-').replace('\u2014', '-') # Remove control characters (except allowed ones) text = "".join(ch for ch in text if ch.isprintable()) return ' '.join(text.split()) def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]: """ Parses the raw text output from `pdftocgen` or `pdftocio` into a structured list. Expected format lines: ' "Chapter Title" 123' """ toc = [] # Regex captures: 1=Indent, 2=Title, 3=PageNum pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$') for line in raw_output.splitlines(): match = pattern.match(line) if match: indent, title, page_str, _ = match.groups() # Calculate level based on indentation (4 spaces = 1 indent step) # 0 spaces = Lvl 1, 4 spaces = Lvl 2, etc. # pdftocgen defaults to standard indentation level = (len(indent) // 4) + 1 page = int(page_str) toc.append([level, title, page]) return toc def merge_same_page_headers(toc: List[FitZTOCEntry]) -> List[FitZTOCEntry]: """ Detects consecutive Level 1 headers derived from the same page and merges them. This fixes the "double split" issue where multi-line headers are detected as separate entries. Example: Input: [[1, "Title Part 1", 10], [1, "Title Part 2", 10]] Output: [[1, "Title Part 1 Title Part 2", 10]] """ if not toc: return [] merged_toc = [] for entry in toc: level, title, page = entry[0], entry[1], entry[2] # We only care about merging Level 1 headers if level != 1: merged_toc.append(entry) continue # Check if we can merge with the previous entry if merged_toc: prev_entry = merged_toc[-1] prev_level, prev_title, prev_page = prev_entry[0], prev_entry[1], prev_entry[2] # CRITERIA: Both Level 1, Same Page if prev_level == 1 and prev_page == page: # Merge! Update the previous entry's title new_title = f"{prev_title} {title}" merged_toc[-1][1] = new_title continue # If no merge, append as new merged_toc.append(entry) return merged_toc def process_toc(raw_toc_content: str) -> str: """ Full pipeline to clean and format raw TOC content. Returns the string content formatted for `pdftocio` input (with indices). """ # 1. Parse parsed_toc = parse_raw_toc_output(raw_toc_content) # 2. Clean Titles for entry in parsed_toc: entry[1] = clean_text(entry[1]) # 3. Merge Same-Page Headers (The Double Split Fix) merged_toc = merge_same_page_headers(parsed_toc) # 4. Format for Output (re-serialize) # pdftocio expects: "Title" PageNum # DECOUPLED: We keep the PDF bookmarks clean (no number prefix). # File naming handling is moved to generate_chapter_splits. output_lines = [] for entry in merged_toc: level, title, page = entry[0], entry[1], entry[2] # Indent: 4 spaces per level minus 1 indent = " " * (4 * (level - 1)) output_lines.append(f'{indent}"{title}" {page}') return "\n".join(output_lines) def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None): """ Splits the PDF based on Level 1 TOC entries and writes a ZIP file to the output path. Uses tempfile logic to handle large files safely. Args: input_pdf_path: Path to source PDF output_zip_path: Path to write the ZIP back_matter_start_page: 1-based page number where Back Matter starts. Chapters will be clamped to end before this page. Content from this page to end will be saved as 999_Back_Matter.pdf. """ doc = fitz.open(input_pdf_path) toc = doc.get_toc() if not toc: doc.close() raise ValueError("No Table of Contents found in the PDF.") # Create the zip file with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: total_pages = doc.page_count # --- Front Matter Extraction --- # Find the first Level 1 chapter first_l1_page = None for entry in toc: if entry[0] == 1: first_l1_page = entry[2] break # If the first chapter starts after Page 1, extract Front Matter if first_l1_page and first_l1_page > 1: # Front matter is from page 0 to (first_l1_page - 1) - 1 (index) fm_end_idx = first_l1_page - 2 if fm_end_idx >= 0: fm_doc = fitz.open() fm_doc.insert_pdf(doc, from_page=0, to_page=fm_end_idx) zf.writestr("000_Front_matter.pdf", fm_doc.tobytes()) fm_doc.close() # --- Chapter Extraction --- chapter_idx = 1 for i, entry in enumerate(toc): level, title, start_page = entry[0], entry[1], entry[2] # We skip non-L1 for splitting functionality if level != 1: continue # If this chapter starts AT or AFTER the back matter, skip it (it's inside back matter) if back_matter_start_page and start_page >= back_matter_start_page: continue start_idx = start_page - 1 # Determine end page lookahead end_page = total_pages for next_entry in toc[i+1:]: if next_entry[0] == 1: # The start of the next chapter is the end of this one end_page = next_entry[2] - 1 break # --- CLAMPING: Check against Back Matter --- if back_matter_start_page: # If the *natural* end of this chapter goes into back matter, cut it short. # The cut point is back_matter_start_page - 1. # Example: Back Matter starts Pg 100. Chapter ends naturally Pg 105. Clamp to Pg 99. if end_page >= back_matter_start_page: end_page = back_matter_start_page - 1 end_idx = end_page - 1 # Safety clamp if end_idx < start_idx: end_idx = start_idx # Create sub-document new_doc = fitz.open() new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx) # Sanitize filename safe_title = "".join([c for c in title if c.isalnum() or c in (' ', '-', '_')]).strip() if not safe_title: safe_title = f"chapter_{chapter_idx}" # Formatting: 001_Title_pgX.pdf pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf" chapter_idx += 1 # Write to zip zf.writestr(pdf_name, new_doc.tobytes()) new_doc.close() # --- Back Matter Generation --- if back_matter_start_page and back_matter_start_page <= total_pages: bm_start_idx = back_matter_start_page - 1 bm_end_idx = total_pages - 1 bm_doc = fitz.open() bm_doc.insert_pdf(doc, from_page=bm_start_idx, to_page=bm_end_idx) zf.writestr("999_Back_matter.pdf", bm_doc.tobytes()) bm_doc.close() doc.close()