Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

App Files Files Community

pdf.tocgen.split / utils /toc_processor.py

adelevett

Upload 76 files

046e3b8 verified about 1 month ago

raw

history blame contribute delete

8.99 kB

	"""
	TOC Processor
	-------------
	Handles operations related to the Table of Contents (TOC) for the PDF pipeline.
	Includes functionality for:
	- Cleaning and sanitizing text (encoding issues, soft hyphens)
	- Merging same-page headers heuristically (e.g. multi-line headers on the same page)
	- Generating split PDF chapters
	"""

	import re
	import io
	import zipfile
	import fitz # PyMuPDF
	from typing import List, Tuple, Generator, Optional
	import tempfile
	import os

	# Type alias for a TOC entry: [level, title, page, ...]
	# Within this module, level and page are treated as 1-based; any items after
	# the first three are carried along but never inspected.
	FitZTOCEntry = list

	def clean_text(text: str) -> str:
	    """
	    Sanitize text to remove common PDF artifacts.

	    The following steps are applied in order:
	    1. Non-breaking spaces (U+00A0) become regular spaces and soft hyphens
	       (U+00AD) are removed.
	    2. En-dashes (U+2013) and em-dashes (U+2014) become a plain hyphen.
	    3. Non-printable characters are dropped, except whitespace, which is kept
	       so that it can act as a word separator in the next step.
	    4. Runs of whitespace are collapsed to a single space and the result is
	       stripped at both ends.

	    Args:
	        text: The raw string to sanitize. Falsy values (empty string, None)
	            are accepted.

	    Returns:
	        The sanitized string, or "" when `text` is falsy.
	    """
	    if not text:
	        return ""

	    # Replace non-breaking spaces (\xa0) and soft hyphens (\xad)
	    text = text.replace('\xa0', ' ').replace('\xad', '')

	    # Replace en-dash and em-dash with standard hyphen
	    text = text.replace('\u2013', '-').replace('\u2014', '-')

	    # Remove control/format characters. Whitespace (tab, newline, Unicode
	    # space separators) is not "printable" either, but dropping it would glue
	    # neighbouring words together ("Part\tOne" -> "PartOne"), so it is kept
	    # here and normalized by the split/join below.
	    text = "".join(ch for ch in text if ch.isprintable() or ch.isspace())

	    return ' '.join(text.split())

	def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
	    """
	    Parse the raw text output from `pdftocgen` or `pdftocio` into a structured list.

	    Each recognised line has the form `<indent>"<title>" <page>[<extra>]`,
	    for example `    "Chapter Title" 123`. Lines that do not match this
	    shape are skipped. Any text following the page number is ignored.

	    Args:
	        raw_output: The complete TOC text, one entry per line.

	    Returns:
	        A list of `[level, title, page]` entries in input order. `level` is
	        derived from the indentation: 0-3 leading whitespace characters give
	        level 1, 4-7 give level 2, and so on. `page` is the integer parsed
	        from the line.
	    """
	    # Groups: 1=indent (any amount of leading whitespace), 2=title,
	    # 3=page number, 4=trailing text (unused). The `*` quantifiers are
	    # required: without them only one-character titles with exactly one
	    # leading whitespace character would ever match.
	    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

	    toc = []
	    for line in raw_output.splitlines():
	        match = pattern.match(line)
	        if not match:
	            continue

	        indent, title, page_str = match.group(1, 2, 3)

	        # 4 characters of indentation = 1 nesting step; no indent = level 1.
	        level = (len(indent) // 4) + 1
	        toc.append([level, title, int(page_str)])

	    return toc

	def merge_same_page_headers(toc: List[FitZTOCEntry]) -> List[FitZTOCEntry]:
	    """
	    Merge consecutive Level 1 headers that point at the same page.

	    This fixes the "double split" issue where a multi-line header is detected
	    as several separate entries. Only directly adjacent Level 1 entries are
	    merged; an entry of any other level in between prevents the merge.
	    The merged entry keeps the first entry's data and gets both titles joined
	    with a single space.

	    Example:
	        Input:  [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
	        Output: [[1, "Title Part 1 Title Part 2", 10]]

	    Args:
	        toc: Entries whose first three items are level, title and page.

	    Returns:
	        A new list of entries. Every entry is a shallow copy, so neither the
	        input list nor the entries inside it are modified.
	    """
	    if not toc:
	        return []

	    merged_toc = []

	    for entry in toc:
	        level, title, page = entry[0], entry[1], entry[2]
	        previous = merged_toc[-1] if merged_toc else None

	        # CRITERIA: both Level 1, same page, directly adjacent.
	        if (
	            level == 1
	            and previous is not None
	            and previous[0] == 1
	            and previous[2] == page
	        ):
	            previous[1] = f"{previous[1]} {title}"
	            continue

	        # Store a copy: a later merge rewrites the title of the stored entry,
	        # and that must not leak back into the caller's data.
	        merged_toc.append(list(entry))

	    return merged_toc

	def process_toc(raw_toc_content: str) -> str:
	    """
	    Run the complete TOC clean-up pipeline on raw TOC text.

	    The text is parsed into entries, every title is sanitized, consecutive
	    same-page Level 1 headers are merged (the "double split" fix), and the
	    result is serialized back into the line format `pdftocio` expects:
	    one `"Title" PageNum` line per entry, indented by 4 spaces for each
	    level below the first.

	    Bookmark titles are kept clean (no number prefix); file naming is
	    handled by generate_chapter_splits.

	    Returns:
	        The re-serialized TOC as a single newline-joined string.
	    """
	    entries = parse_raw_toc_output(raw_toc_content)

	    # Titles are sanitized before merging so merged titles are built from
	    # already-clean parts.
	    for item in entries:
	        item[1] = clean_text(item[1])

	    def _render(row) -> str:
	        # One output line: 4 spaces of padding per level beyond the first.
	        depth, heading, page_no = row[0], row[1], row[2]
	        padding = " " * (4 * (depth - 1))
	        return f'{padding}"{heading}" {page_no}'

	    return "\n".join(_render(row) for row in merge_same_page_headers(entries))

	def _write_page_range(zf, doc, arcname, from_idx, to_idx):
	    """Copy pages from_idx..to_idx (0-based, inclusive) of `doc` into a new PDF stored in `zf` as `arcname`."""
	    part = fitz.open()
	    try:
	        part.insert_pdf(doc, from_page=from_idx, to_page=to_idx)
	        zf.writestr(arcname, part.tobytes())
	    finally:
	        # Release the sub-document even when copying or writing fails.
	        part.close()


	def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
	    """
	    Split the PDF on its Level 1 TOC entries and write the parts to a ZIP file.

	    Each part is built as an in-memory PDF and stored in the archive under
	    one of these names:
	    - `000_Front_matter.pdf`: pages before the first Level 1 entry, if that
	      entry does not start on page 1.
	    - `NNN_<title>_pg<start>.pdf`: one file per Level 1 entry, running up to
	      the page before the next Level 1 entry (or the last page).
	    - `999_Back_matter.pdf`: pages from `back_matter_start_page` to the end,
	      if that argument is given and lies within the document.

	    Args:
	        input_pdf_path: Path to source PDF
	        output_zip_path: Path to write the ZIP
	        back_matter_start_page: 1-based page number where Back Matter starts.
	            Chapters are clamped to end before this page, and Level 1
	            entries starting at or after it are not emitted as chapters.

	    Raises:
	        ValueError: If the PDF has no Table of Contents. The ZIP file is not
	            created in that case.
	    """
	    doc = fitz.open(input_pdf_path)
	    try:
	        toc = doc.get_toc()
	        if not toc:
	            raise ValueError("No Table of Contents found in the PDF.")

	        total_pages = doc.page_count

	        with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
	            # --- Front Matter Extraction ---
	            # Page of the first Level 1 chapter, if there is one.
	            first_l1_page = next((entry[2] for entry in toc if entry[0] == 1), None)

	            # If the first chapter starts after Page 1, everything before it
	            # is front matter: indices 0 .. (first_l1_page - 1) - 1.
	            if first_l1_page and first_l1_page > 1:
	                _write_page_range(zf, doc, "000_Front_matter.pdf", 0, first_l1_page - 2)

	            # --- Chapter Extraction ---
	            chapter_idx = 1

	            for i, entry in enumerate(toc):
	                level, title, start_page = entry[0], entry[1], entry[2]

	                # Only Level 1 entries define split points.
	                if level != 1:
	                    continue

	                # A chapter starting AT or AFTER the back matter belongs to it.
	                if back_matter_start_page and start_page >= back_matter_start_page:
	                    continue

	                start_idx = start_page - 1

	                # The chapter ends on the page before the next Level 1 entry,
	                # or on the last page when there is none.
	                end_page = total_pages
	                for next_entry in toc[i + 1:]:
	                    if next_entry[0] == 1:
	                        end_page = next_entry[2] - 1
	                        break

	                # --- CLAMPING: Check against Back Matter ---
	                # Example: Back Matter starts Pg 100. Chapter ends naturally
	                # Pg 105. Clamp to Pg 99.
	                if back_matter_start_page and end_page >= back_matter_start_page:
	                    end_page = back_matter_start_page - 1

	                # Safety clamp: two chapters on the same page would otherwise
	                # yield an end before the start; emit that single page instead.
	                end_idx = max(end_page - 1, start_idx)

	                # Sanitize filename
	                safe_title = "".join(
	                    c for c in title if c.isalnum() or c in (' ', '-', '_')
	                ).strip()
	                if not safe_title:
	                    safe_title = f"chapter_{chapter_idx}"

	                # Formatting: 001_Title_pgX.pdf
	                pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
	                chapter_idx += 1

	                _write_page_range(zf, doc, pdf_name, start_idx, end_idx)

	            # --- Back Matter Generation ---
	            if back_matter_start_page and back_matter_start_page <= total_pages:
	                _write_page_range(
	                    zf, doc, "999_Back_matter.pdf",
	                    back_matter_start_page - 1, total_pages - 1,
	                )
	    finally:
	        # Always release the source document, including on errors.
	        doc.close()