Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

App Files Files Community

pdf.tocgen.split / utils /split_by_toc.py

adelevett

Upload 76 files

046e3b8 verified about 1 month ago

raw

history blame contribute delete

3 kB

	import fitz # PyMuPDF
	import sys
	import os
	import re

	def main():
	    """Split a PDF into one file per table-of-contents entry.

	    Command line: ``split_by_toc.py <input.pdf> <input.toc> [output_dir]``.
	    ``output_dir`` defaults to ``split_output`` and is created if missing.

	    Each ToC line of the form ``"Title" <page>`` (optionally indented,
	    optionally followed by extra text) starts a chapter at the 1-based
	    ``<page>``. A chapter runs up to the page before the next entry; the
	    last chapter runs to the end of the document. Every chapter is written
	    to ``<output_dir>/<sanitized title>.pdf``.

	    Exits with status 1 on bad usage, when no ToC entry could be parsed,
	    or when the input PDF does not exist.
	    """
	    if len(sys.argv) < 3:
	        print("Usage: python split_by_toc.py <input.pdf> <input.toc> [output_dir]")
	        sys.exit(1)

	    # Force UTF-8 for stdout/stderr. The streams may have been replaced by
	    # objects without reconfigure(), so only call it when it is available.
	    for stream in (sys.stdout, sys.stderr):
	        reconfigure = getattr(stream, "reconfigure", None)
	        if reconfigure is not None:
	            reconfigure(encoding='utf-8')

	    pdf_path = sys.argv[1]
	    toc_path = sys.argv[2]
	    output_dir = sys.argv[3] if len(sys.argv) > 3 else "split_output"

	    os.makedirs(output_dir, exist_ok=True)

	    print(f"Splitting '{pdf_path}' based on '{toc_path}'...")

	    # 1. Parse ToC into a list of (title, start_page)
	    with open(toc_path, 'r', encoding='utf-8') as f:
	        entries = _parse_toc_entries(f)

	    if not entries:
	        print("Error: No ToC entries found.")
	        sys.exit(1)

	    # 2. Open PDF
	    if not os.path.isfile(pdf_path):
	        print(f"Error: PDF file not found: '{pdf_path}'")
	        sys.exit(1)

	    source = fitz.open(pdf_path)
	    try:
	        total_pages = source.page_count

	        print(f"Total Pages: {total_pages}")
	        print(f"Found {len(entries)} chapters.")
	        print("-" * 40)

	        # 3. Iterate and Split
	        taken_names = set()
	        for i, (title, start_page) in enumerate(entries):
	            print(f"[{i+1}/{len(entries)}] {title}")

	            next_start = entries[i + 1][1] if i < len(entries) - 1 else None
	            span = _page_span(start_page, next_start, total_pages)
	            if span is None:
	                print(f" Skipped: start page {start_page} is outside 1-{total_pages}")
	                continue
	            start_idx, end_idx = span

	            out_path = os.path.join(output_dir, _unique_filename(title, taken_names))
	            print(f" Pages {start_page} to {end_idx + 1} (Count: {end_idx - start_idx + 1})")

	            # Create new PDF for this chapter; always release it, even if
	            # copying or saving fails.
	            chapter = fitz.open()
	            try:
	                chapter.insert_pdf(source, from_page=start_idx, to_page=end_idx)
	                chapter.save(out_path)
	            finally:
	                chapter.close()
	    finally:
	        source.close()

	    print("-" * 40)
	    print(f"Done! Files saved to '{output_dir}/'")


	# One ToC line: optional indent, quoted title, page number, optional rest.
	_TOC_LINE = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

	# Characters replaced with "_" when a title is turned into a file name.
	_FORBIDDEN_CHARS = re.compile(r'[<>:"/\\\|?*]')


	def _parse_toc_entries(lines):
	    """Return ``[(title, page), ...]`` for every line matching the ToC pattern."""
	    entries = []
	    for line in lines:
	        match = _TOC_LINE.match(line)
	        if match:
	            entries.append((match.group(2), int(match.group(3))))
	    return entries


	def _page_span(start_page, next_start_page, total_pages):
	    """Return the 0-based inclusive ``(first, last)`` page span of a chapter.

	    ``next_start_page`` is the 1-based start of the following chapter, or
	    ``None`` for the last chapter. Returns ``None`` when ``start_page`` is
	    not a page of the document; such an index must not reach insert_pdf,
	    where a negative value means "use the default" instead of an error.
	    """
	    first = start_page - 1
	    if not 0 <= first < total_pages:
	        return None
	    if next_start_page is None:
	        last = total_pages - 1
	    else:
	        # One page before the next chapter, never past the end of the file.
	        last = min(next_start_page - 2, total_pages - 1)
	    # Entries out of order or sharing a page: just grab the single page.
	    return first, max(first, last)


	def _unique_filename(title, taken):
	    """Return a sanitized ``<title>.pdf`` name not yet present in ``taken``.

	    The first use of a title keeps the plain name; later duplicates get a
	    `` (2)``, `` (3)``, ... suffix so earlier chapters are not overwritten.
	    Names are compared case-insensitively and recorded in ``taken``.
	    """
	    stem = _FORBIDDEN_CHARS.sub('_', title)
	    candidate = f"{stem}.pdf"
	    counter = 2
	    while candidate.casefold() in taken:
	        candidate = f"{stem} ({counter}).pdf"
	        counter += 1
	    taken.add(candidate.casefold())
	    return candidate

	# Run the splitter only when this file is executed as a script, so that
	# importing the module does not read sys.argv or touch any files.
	if __name__ == "__main__":
	    main()