# Page residue from the scraped listing: "Spaces: Sleeping", file size 3,002 bytes, id 046e3b8.
import fitz  # PyMuPDF
import sys
import os
import re
# Matches one ToC line of the form:  [indent]"Title" <page> [anything else]
_TOC_LINE = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

# Characters replaced by "_" in output filenames: the original forbidden set
# plus ASCII control characters.
_FORBIDDEN_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _reconfigure_streams_utf8():
    """Switch stdout/stderr to UTF-8 when the stream objects support it."""
    for stream in (sys.stdout, sys.stderr):
        # Replaced streams (e.g. a plain StringIO) have no reconfigure().
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")


def _parse_toc(toc_path):
    """Return the (title, page) pairs found in the ToC file, in file order."""
    entries = []
    # "utf-8-sig" also reads plain UTF-8, but swallows a leading BOM that
    # would otherwise stop the first line from matching the pattern.
    with open(toc_path, "r", encoding="utf-8-sig") as toc_file:
        for line in toc_file:
            match = _TOC_LINE.match(line)
            if match:
                entries.append((match.group(2), int(match.group(3))))
    return entries


def _page_range(entries, index, total_pages):
    """Return the 0-based inclusive (first, last) pages for one entry.

    Returns None when the entry's start page lies outside the document.
    """
    # ToC pages are 1-based; PyMuPDF page indices are 0-based.
    first = entries[index][1] - 1
    if not 0 <= first < total_pages:
        return None
    if index + 1 < len(entries):
        # One page before the next entry's start page.
        last = entries[index + 1][1] - 2
    else:
        last = total_pages - 1
    last = min(last, total_pages - 1)
    # Entries out of order or sharing a page: fall back to the single page.
    if last < first:
        last = first
    return first, last


def _unique_filename(title, index, used_names):
    """Return a sanitized '<title>.pdf' that does not collide with earlier ones."""
    stem = _FORBIDDEN_FILENAME_CHARS.sub("_", title)
    if not stem.strip():
        stem = f"chapter_{index + 1}"
    candidate = stem
    suffix = 2
    # Compare case-insensitively so names also stay distinct on
    # case-insensitive filesystems.
    while candidate.casefold() in used_names:
        candidate = f"{stem} ({suffix})"
        suffix += 1
    used_names.add(candidate.casefold())
    return f"{candidate}.pdf"


def main():
    """Split a PDF into one file per ToC entry.

    Command line: ``split_by_toc.py <input.pdf> <input.toc> [output_dir]``
    (``output_dir`` defaults to ``split_output``).

    Each ToC line of the form ``"Title" <page> ...`` becomes one output PDF
    that runs from its page up to the page before the next entry (or to the
    end of the document for the last entry). Entries whose start page is
    outside the document are reported and skipped.

    Exits with status 1 on a usage error, an unreadable ToC file, or a ToC
    without any recognizable entries.
    """
    if len(sys.argv) < 3:
        print("Usage: python split_by_toc.py <input.pdf> <input.toc> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr so non-ASCII titles can be printed.
    _reconfigure_streams_utf8()

    pdf_path = sys.argv[1]
    toc_path = sys.argv[2]
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "split_output"

    print(f"Splitting '{pdf_path}' based on '{toc_path}'...")

    # 1. Parse ToC into a list of (title, start_page).
    try:
        entries = _parse_toc(toc_path)
    except (OSError, UnicodeDecodeError) as exc:
        print(f"Error: cannot read ToC file '{toc_path}': {exc}")
        sys.exit(1)
    if not entries:
        print("Error: No ToC entries found.")
        sys.exit(1)

    # 2. Open PDF (closed in the finally below, even if a chapter fails).
    doc = fitz.open(pdf_path)
    try:
        total_pages = doc.page_count
        print(f"Total Pages: {total_pages}")
        print(f"Found {len(entries)} chapters.")
        print("-" * 40)

        # Create the output directory only once the inputs are known to be
        # usable, so a failed run does not leave an empty directory behind.
        os.makedirs(output_dir, exist_ok=True)

        # 3. Iterate and split.
        used_names = set()
        for i, (title, start_page) in enumerate(entries):
            print(f"[{i+1}/{len(entries)}] {title}")

            span = _page_range(entries, i, total_pages)
            if span is None:
                # NOTE(review): insert_pdf appears to treat negative page
                # arguments as "use the default", so an out-of-range start
                # must never be passed through.
                print(f" Skipped: start page {start_page} is outside 1..{total_pages}")
                continue
            start_idx, end_idx = span

            out_path = os.path.join(output_dir, _unique_filename(title, i, used_names))
            print(f" Pages {start_page} to {end_idx + 1} (Count: {end_idx - start_idx + 1})")

            # Create a new PDF holding just this chapter's pages.
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(out_path)
            finally:
                new_doc.close()
    finally:
        doc.close()

    print("-" * 40)
    print(f"Done! Files saved to '{output_dir}/'")
# Script entry point: run the splitter only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|