"""
TOC Processor
-------------
Handles operations related to the Table of Contents (TOC) for the PDF pipeline.
Includes functionality for:
- Cleaning and sanitizing text (encoding issues, soft hyphens)
- Merging usage-heuristic headers (e.g. multi-line headers on same page)
- Generating split PDF chapters
"""
import re
import io
import zipfile
import fitz # PyMuPDF
from typing import List, Tuple, Generator, Optional
import tempfile
import os
# Type alias for a PyMuPDF-style TOC entry: [level, title, page, ...].
# Kept as a plain `list` (not a Tuple) because fitz returns mutable lists
# and this module mutates titles in place.
FitZTOCEntry = list
def clean_text(text: str) -> str:
    """
    Sanitize text to remove common PDF artifacts.

    Removes soft hyphens, normalizes non-breaking spaces, standardizes
    dashes, strips non-printable control characters, and collapses all
    runs of whitespace to single spaces.

    Args:
        text: Raw text extracted from a PDF (may be empty).

    Returns:
        The cleaned, single-spaced string ("" for falsy input).
    """
    if not text:
        return ""
    # Replace non-breaking spaces (\xa0) and strip soft hyphens (\xad)
    text = text.replace('\xa0', ' ').replace('\xad', '')
    # Replace en-dash and em-dash with standard hyphen
    text = text.replace('\u2013', '-').replace('\u2014', '-')
    # Remove control characters, but KEEP whitespace (\n, \t are not
    # "printable") so words separated only by a line break don't get
    # fused together; the final split()/join collapses whitespace anyway.
    text = "".join(ch for ch in text if ch.isprintable() or ch.isspace())
    return ' '.join(text.split())
def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
    """
    Parse raw `pdftocgen`/`pdftocio` text output into structured TOC entries.

    Each matching input line looks like: '    "Chapter Title" 123', where
    leading indentation (4 spaces per step) encodes the heading level.

    Args:
        raw_output: The raw multi-line tool output.

    Returns:
        A list of [level, title, page] entries; non-matching lines are skipped.
    """
    # Groups: 1=leading whitespace, 2=quoted title, 3=page number, 4=trailing junk
    line_re = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')
    entries: List[FitZTOCEntry] = []
    for raw_line in raw_output.splitlines():
        m = line_re.match(raw_line)
        if m is None:
            continue
        leading_ws, heading, page_text = m.group(1), m.group(2), m.group(3)
        # 0 spaces -> level 1, 4 spaces -> level 2, etc. (pdftocgen default)
        entries.append([len(leading_ws) // 4 + 1, heading, int(page_text)])
    return entries
def merge_same_page_headers(toc: List[FitZTOCEntry]) -> List[FitZTOCEntry]:
    """
    Merge consecutive Level 1 headers that point at the same page.

    This fixes the "double split" issue where a multi-line header is
    detected as several separate TOC entries.

    Example:
        Input:  [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
        Output: [[1, "Title Part 1 Title Part 2", 10]]

    Args:
        toc: List of [level, title, page] entries.

    Returns:
        A new list with same-page Level 1 neighbors merged. The input list
        and its entries are NOT mutated (the original implementation
        mutated the caller's entries in place when merging).
    """
    if not toc:
        return []
    merged: List[FitZTOCEntry] = []
    for entry in toc:
        level, title, page = entry[0], entry[1], entry[2]
        # Merge criteria: current AND previous kept entry are both Level 1
        # and share the same page number.
        if level == 1 and merged and merged[-1][0] == 1 and merged[-1][2] == page:
            merged[-1][1] = f"{merged[-1][1]} {title}"
            continue
        # Copy so later merges never write back into the caller's entries.
        merged.append(list(entry))
    return merged
def process_toc(raw_toc_content: str) -> str:
    """
    Run the full TOC cleanup pipeline on raw tool output.

    Steps: parse the raw text, sanitize every title, merge same-page
    Level 1 headers (the double-split fix), then re-serialize in the
    indented format `pdftocio` expects ('    "Title" Page').

    Bookmarks are kept clean (no numeric prefixes); file-name numbering
    is handled by generate_chapter_splits instead.

    Args:
        raw_toc_content: Raw text output from `pdftocgen`/`pdftocio`.

    Returns:
        The cleaned TOC as newline-joined, indentation-leveled lines.
    """
    entries = parse_raw_toc_output(raw_toc_content)
    for item in entries:
        item[1] = clean_text(item[1])
    deduped = merge_same_page_headers(entries)
    # 4 spaces of indent per level beyond the first.
    serialized = [
        f'{" " * (4 * (item[0] - 1))}"{item[1]}" {item[2]}'
        for item in deduped
    ]
    return "\n".join(serialized)
def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
    """
    Split the PDF at its Level 1 TOC entries and write the pieces to a ZIP.

    Archive contents:
      - ``000_Front_matter.pdf``: pages before the first Level 1 chapter
        (only when the first chapter starts after page 1).
      - ``NNN_<Title>_pgX.pdf``: one file per Level 1 chapter, numbered
        sequentially; X is the chapter's 1-based start page.
      - ``999_Back_matter.pdf``: pages from ``back_matter_start_page`` to
        the end, when that argument is given.

    Args:
        input_pdf_path: Path to the source PDF.
        output_zip_path: Path where the ZIP archive is written.
        back_matter_start_page: 1-based page number where back matter
            starts. Chapters are clamped to end before this page, and
            chapters starting inside the back matter are skipped.

    Raises:
        ValueError: If the PDF contains no table of contents.
    """
    doc = fitz.open(input_pdf_path)
    try:
        toc = doc.get_toc()
        if not toc:
            raise ValueError("No Table of Contents found in the PDF.")
        with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            total_pages = doc.page_count
            # --- Front Matter Extraction ---
            # Locate the first Level 1 chapter; everything before it is front matter.
            first_l1_page = None
            for entry in toc:
                if entry[0] == 1:
                    first_l1_page = entry[2]
                    break
            if first_l1_page and first_l1_page > 1:
                # Front matter spans page index 0 .. (first_l1_page - 2).
                fm_end_idx = first_l1_page - 2
                if fm_end_idx >= 0:
                    fm_doc = fitz.open()
                    fm_doc.insert_pdf(doc, from_page=0, to_page=fm_end_idx)
                    zf.writestr("000_Front_matter.pdf", fm_doc.tobytes())
                    fm_doc.close()
            # --- Chapter Extraction ---
            chapter_idx = 1
            for i, entry in enumerate(toc):
                level, title, start_page = entry[0], entry[1], entry[2]
                # Only Level 1 entries define split points.
                if level != 1:
                    continue
                # Skip chapters that start at or inside the back matter.
                if back_matter_start_page and start_page >= back_matter_start_page:
                    continue
                start_idx = start_page - 1
                # Default: chapter runs to the end of the document unless a
                # later Level 1 entry starts the next chapter.
                end_page = total_pages
                for next_entry in toc[i + 1:]:
                    if next_entry[0] == 1:
                        end_page = next_entry[2] - 1
                        break
                # Clamp so no chapter bleeds into the back matter.
                # e.g. back matter starts p.100, natural end p.105 -> clamp to p.99.
                if back_matter_start_page and end_page >= back_matter_start_page:
                    end_page = back_matter_start_page - 1
                end_idx = end_page - 1
                # Safety clamp: a chapter is never shorter than one page.
                if end_idx < start_idx:
                    end_idx = start_idx
                new_doc = fitz.open()
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                # Keep only filesystem-safe characters in the filename.
                safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
                if not safe_title:
                    safe_title = f"chapter_{chapter_idx}"
                # Format: 001_Title_pgX.pdf
                pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
                chapter_idx += 1
                zf.writestr(pdf_name, new_doc.tobytes())
                new_doc.close()
            # --- Back Matter Generation ---
            if back_matter_start_page and back_matter_start_page <= total_pages:
                bm_doc = fitz.open()
                bm_doc.insert_pdf(doc, from_page=back_matter_start_page - 1, to_page=total_pages - 1)
                zf.writestr("999_Back_matter.pdf", bm_doc.tobytes())
                bm_doc.close()
    finally:
        # Release the source document even if splitting fails midway —
        # the original only closed it on the happy path and the no-TOC path.
        doc.close()