Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

App Files Files Community

pdf.tocgen.split / utils /split_pdf.py

adelevett

Upload 76 files

046e3b8 verified about 1 month ago

raw

history blame contribute delete

2.31 kB

	import fitz
	import sys
	import os
	import re

	# Characters that are reserved in file names on common platforms, plus ASCII
	# control characters (bookmark titles may contain stray newlines or tabs).
	_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


	def _unique_pdf_name(title, used_names):
	    """Return a sanitized ``<title>.pdf`` name that has not been used yet.

	    Args:
	        title: Bookmark title to derive the file name from.
	        used_names: Set of case-folded names already handed out; it is
	            updated in place with the returned name.

	    Returns:
	        The file name (no directory part). Unsafe characters are replaced
	        with ``_``, an empty title becomes ``untitled``, and a repeated name
	        gets a `` (2)``, `` (3)``, ... suffix so that an earlier chapter is
	        never overwritten.
	    """
	    # Strip the stem *before* adding the extension; stripping afterwards can
	    # never remove trailing whitespace from the title.
	    stem = _UNSAFE_FILENAME_CHARS.sub("_", str(title)).strip() or "untitled"
	    candidate = f"{stem}.pdf"
	    counter = 2
	    # Compare case-folded so names also stay distinct on case-insensitive
	    # file systems.
	    while candidate.casefold() in used_names:
	        candidate = f"{stem} ({counter}).pdf"
	        counter += 1
	    used_names.add(candidate.casefold())
	    return candidate


	def _chapter_page_ranges(start_pages, total_pages):
	    """Return inclusive 0-based ``(first, last)`` page indexes per chapter.

	    Args:
	        start_pages: 1-based start page of each chapter, in TOC order.
	        total_pages: Number of pages in the source document.

	    Returns:
	        One ``(first, last)`` tuple per entry of ``start_pages``. A chapter
	        ends on the page before the next chapter starts; the final chapter
	        runs to the last page of the document.
	    """
	    ranges = []
	    for i, start_page in enumerate(start_pages):
	        first = start_page - 1
	        if i + 1 < len(start_pages):
	            # Next start is 1-based; the page before it, 0-based, is "- 2".
	            last = start_pages[i + 1] - 2
	        else:
	            last = total_pages - 1
	        # Two chapters on the same page (or out-of-order bookmarks) would
	        # give an empty/negative range; fall back to a single page.
	        ranges.append((first, max(first, last)))
	    return ranges


	def main():
	    """Split a bookmarked PDF into one PDF per top-level (level 1) bookmark.

	    Reads its arguments from ``sys.argv``:
	    ``split_pdf.py <input_with_bookmarks.pdf> [output_dir]``; the output
	    directory defaults to ``split_output`` and is created if missing.

	    Each output file is named after the bookmark title and contains the
	    pages from that bookmark up to the page before the next level 1
	    bookmark (or the end of the document for the last one).

	    Raises:
	        SystemExit: With status 1 when the input path is missing from the
	            command line or the PDF has no bookmarks.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: python split_pdf.py <input_with_bookmarks.pdf> [output_dir]")
	        sys.exit(1)

	    # Force UTF-8 for stdout/stderr so non-ASCII titles can be printed.
	    # Streams that were replaced by objects without reconfigure() are left
	    # alone instead of crashing the script.
	    for stream in (sys.stdout, sys.stderr):
	        reconfigure = getattr(stream, "reconfigure", None)
	        if reconfigure is not None:
	            reconfigure(encoding='utf-8')

	    pdf_path = sys.argv[1]
	    output_dir = sys.argv[2] if len(sys.argv) > 2 else "split_output"

	    os.makedirs(output_dir, exist_ok=True)

	    print(f"Reading bookmarks from '{pdf_path}'...")

	    # The context manager closes the source document on every exit path,
	    # including the sys.exit() below.
	    with fitz.open(pdf_path) as doc:
	        toc = doc.get_toc()  # [[lvl, title, page_num, ...], ...]

	        if not toc:
	            print("Error: No bookmarks found in this PDF.")
	            sys.exit(1)

	        total_pages = doc.page_count

	        # Filter for Level 1 bookmarks (top-level chapters).
	        top_level = [entry for entry in toc if entry[0] == 1]
	        print(f"Found {len(top_level)} top-level chapters.")

	        # Drop bookmarks whose page number is not a page of this document
	        # (e.g. a bookmark without an in-document destination); they would
	        # otherwise corrupt their own range and the previous chapter's end.
	        chapters = []
	        for entry in top_level:
	            title, start_page = entry[1], entry[2]
	            if 1 <= start_page <= total_pages:
	                chapters.append((title, start_page))
	            else:
	                print(f"Skipping '{title}': no valid target page ({start_page}).")

	        start_pages = [start_page for _, start_page in chapters]
	        page_ranges = _chapter_page_ranges(start_pages, total_pages)
	        used_names = set()

	        for (title, start_page), (first, last) in zip(chapters, page_ranges):
	            filename = _unique_pdf_name(title, used_names)
	            out_path = os.path.join(output_dir, filename)

	            print(f"Extracting: {filename} (Pages {start_page}-{last + 1})")

	            # Close the chapter document even if insert/save fails.
	            with fitz.open() as new_doc:
	                new_doc.insert_pdf(doc, from_page=first, to_page=last)
	                new_doc.save(out_path)

	    print(f"Done. Files saved to {output_dir}/")

	# Run the splitter only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()