pdf.tocgen.split / utils /toc_processor.py
adelevett's picture
Upload 76 files
046e3b8 verified
"""
TOC Processor
-------------
Handles operations related to the Table of Contents (TOC) for the PDF pipeline.
Includes functionality for:
- Cleaning and sanitizing text (encoding issues, soft hyphens)
- Merging heuristically detected headers (e.g. multi-line headers on the same page)
- Generating split PDF chapters
"""
import re
import io
import zipfile
import fitz # PyMuPDF
from typing import List, Tuple, Generator, Optional
import tempfile
import os
# Type alias for a single TOC entry, laid out as [level, title, page, ...].
# The functions in this module read index 0 as the level, index 1 as the title
# and index 2 as the 1-based page number; any further items are left untouched.
FitZTOCEntry = list
def clean_text(text: str) -> str:
    """
    Sanitize text to remove common PDF artifacts.

    Steps, in order:
    1. Non-breaking spaces become regular spaces; soft hyphens are dropped.
    2. En-dashes and em-dashes become a plain hyphen.
    3. Non-printable characters are removed, but whitespace (newlines, tabs,
       Unicode spaces) is kept so that step 4 can turn it into a single space.
    4. Runs of whitespace are collapsed and the result is stripped.

    Args:
        text: Raw text, typically a TOC title extracted from a PDF.

    Returns:
        The cleaned text, or "" when `text` is empty or None.
    """
    if not text:
        return ""
    # Replace non-breaking spaces (\xa0) and soft hyphens (\xad)
    text = text.replace('\xa0', ' ').replace('\xad', '')
    # Replace en-dash and em-dash with standard hyphen
    text = text.replace('\u2013', '-').replace('\u2014', '-')
    # Remove control characters, but keep whitespace: str.isprintable() is
    # False for '\n', '\t' and Unicode spaces, and dropping them here would
    # glue the neighbouring words together before the split() below.
    text = "".join(ch for ch in text if ch.isprintable() or ch.isspace())
    # Collapse every whitespace run into one space and trim the ends
    return ' '.join(text.split())
def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
    """
    Turn the raw text emitted by `pdftocgen` / `pdftocio` into TOC entries.

    A line is recognised when it looks like: optional indentation, a
    double-quoted title, whitespace, a page number, then anything else.
    Lines that do not fit this shape are ignored.

    Returns:
        A list of [level, title, page] lists, in input order. The level is
        derived from the indentation: every full group of 4 leading
        whitespace characters adds one level, starting at level 1.
    """
    # Groups: 1=indent, 2=title, 3=page number, 4=ignored remainder
    line_re = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')
    # Match objects are always truthy, so filter(None, ...) drops only the misses
    hits = filter(None, map(line_re.match, raw_output.splitlines()))
    return [
        [len(hit.group(1)) // 4 + 1, hit.group(2), int(hit.group(3))]
        for hit in hits
    ]
def merge_same_page_headers(toc: List[FitZTOCEntry]) -> List[FitZTOCEntry]:
    """
    Merge consecutive Level 1 headers that point at the same page.

    This fixes the "double split" issue where a multi-line header is detected
    as several separate entries. Only directly adjacent Level 1 entries are
    merged; entries of any other level are passed through unchanged and also
    break a run of mergeable headers.

    Example:
        Input:  [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
        Output: [[1, "Title Part 1 Title Part 2", 10]]

    Args:
        toc: TOC entries shaped [level, title, page, ...].

    Returns:
        A new list of new entry lists. Neither `toc` nor the entries inside
        it are modified.
    """
    if not toc:
        return []
    merged_toc = []
    for entry in toc:
        level, title, page = entry[0], entry[1], entry[2]
        if level == 1 and merged_toc:
            prev_entry = merged_toc[-1]
            # CRITERIA: Both Level 1, Same Page
            if prev_entry[0] == 1 and prev_entry[2] == page:
                # prev_entry is our own copy, so extending its title in place
                # cannot leak into the caller's data.
                prev_entry[1] = f"{prev_entry[1]} {title}"
                continue
        # Store a copy: a later merge rewrites the title of the stored entry,
        # and that must not write through to the caller's list.
        merged_toc.append(list(entry))
    return merged_toc
def process_toc(raw_toc_content: str) -> str:
    """
    Run the whole TOC clean-up pipeline on raw TOC text.

    The raw text is parsed into entries, every title is cleaned, consecutive
    same-page Level 1 headers are merged, and the result is serialized back
    to one `"Title" PageNum` line per entry, indented 4 spaces per level
    below the first. Titles carry no numeric prefix here; file naming is
    handled by generate_chapter_splits.

    Returns:
        The re-serialized TOC, lines joined with "\\n".
    """
    # Parse, then scrub each title in place
    entries = parse_raw_toc_output(raw_toc_content)
    for item in entries:
        item[1] = clean_text(item[1])

    # Merge same-page headers (the double split fix) and re-serialize
    one_step = " " * 4
    rendered = (
        f'{one_step * (item[0] - 1)}"{item[1]}" {item[2]}'
        for item in merge_same_page_headers(entries)
    )
    return "\n".join(rendered)
def _write_page_range(zf: zipfile.ZipFile, src_doc, arcname: str, from_idx: int, to_idx: int) -> None:
    """Copy pages from_idx..to_idx (0-based) of src_doc into a new PDF stored in zf as arcname."""
    part = fitz.open()
    try:
        part.insert_pdf(src_doc, from_page=from_idx, to_page=to_idx)
        zf.writestr(arcname, part.tobytes())
    finally:
        # Release the sub-document even when copying or zipping fails
        part.close()


def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
    """
    Splits the PDF based on Level 1 TOC entries and writes a ZIP file to the output path.

    Each part is built as an in-memory PDF and added to the archive:
    - 000_Front_matter.pdf: pages before the first Level 1 entry, if any.
    - NNN_<title>_pg<start>.pdf: one file per Level 1 entry, running up to
      the page before the next Level 1 entry (or the end of the document).
    - 999_Back_matter.pdf: pages from `back_matter_start_page` to the end.

    Args:
        input_pdf_path: Path to source PDF
        output_zip_path: Path to write the ZIP
        back_matter_start_page: 1-based page number where Back Matter starts.
            Chapters are clamped to end before this page, and Level 1 entries
            starting at or after it are skipped. None (or 0) disables back
            matter handling.

    Raises:
        ValueError: If the PDF has no Table of Contents. The ZIP file is not
            created in that case.
    """
    doc = fitz.open(input_pdf_path)
    try:
        toc = doc.get_toc()
        if not toc:
            raise ValueError("No Table of Contents found in the PDF.")

        # Create the zip file
        with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            total_pages = doc.page_count

            # --- Front Matter Extraction ---
            # Page of the first Level 1 chapter, or None when there is none
            first_l1_page = next((entry[2] for entry in toc if entry[0] == 1), None)
            # If the first chapter starts after Page 1, extract Front Matter:
            # 0-based pages 0 .. (first_l1_page - 1) - 1
            if first_l1_page and first_l1_page > 1:
                _write_page_range(zf, doc, "000_Front_matter.pdf", 0, first_l1_page - 2)

            # --- Chapter Extraction ---
            chapter_idx = 1
            for i, entry in enumerate(toc):
                level, title, start_page = entry[0], entry[1], entry[2]
                # Only Level 1 entries define split points
                if level != 1:
                    continue
                # A chapter starting AT or AFTER the back matter is part of it
                if back_matter_start_page and start_page >= back_matter_start_page:
                    continue

                start_idx = start_page - 1

                # The chapter ends on the page before the next Level 1 entry,
                # or on the last page when no further Level 1 entry exists.
                end_page = next(
                    (nxt[2] - 1 for nxt in toc[i + 1:] if nxt[0] == 1),
                    total_pages,
                )

                # --- CLAMPING: Check against Back Matter ---
                # Example: Back Matter starts Pg 100. Chapter ends naturally Pg 105. Clamp to Pg 99.
                if back_matter_start_page and end_page >= back_matter_start_page:
                    end_page = back_matter_start_page - 1

                # Safety clamp: never let the range end before it starts
                # (e.g. two Level 1 entries on the same page).
                end_idx = max(end_page - 1, start_idx)

                # Sanitize filename: keep alphanumerics, space, '-' and '_'
                safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
                if not safe_title:
                    safe_title = f"chapter_{chapter_idx}"

                # Formatting: 001_Title_pgX.pdf
                pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
                chapter_idx += 1

                _write_page_range(zf, doc, pdf_name, start_idx, end_idx)

            # --- Back Matter Generation ---
            if back_matter_start_page and back_matter_start_page <= total_pages:
                _write_page_range(
                    zf, doc, "999_Back_matter.pdf",
                    back_matter_start_page - 1, total_pages - 1,
                )
    finally:
        # Always release the source document, including on errors mid-split
        doc.close()