Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| import sys | |
| import os | |
| import re | |
def _force_utf8_output():
    """Switch stdout/stderr to UTF-8 where the stream supports it."""
    for stream in (sys.stdout, sys.stderr):
        # Replaced streams (e.g. io.StringIO) or a missing stream (None) have
        # no reconfigure(); skip them instead of crashing before any work.
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")


def _read_toc_entries(toc_path):
    """Return the (title, start_page) pairs found in the ToC file, in order.

    A line counts as an entry when it looks like ``"Title" <page> ...``;
    blank and non-matching lines are ignored.  Leading indentation (the
    nesting level) is matched but not used, so every entry is treated as a
    flat chapter boundary.
    """
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')
    entries = []
    with open(toc_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            match = pattern.match(line)
            if match:
                entries.append((match.group(2), int(match.group(3))))
    return entries


def _page_span(start_page, next_start_page, total_pages):
    """Map a 1-based ToC start page to 0-based inclusive (first, last) indices.

    ``next_start_page`` is the 1-based start of the following entry, or None
    for the last entry.  Returns None when ``start_page`` does not exist in a
    document of ``total_pages`` pages.
    """
    if not 1 <= start_page <= total_pages:
        return None
    first = start_page - 1
    if next_start_page is None:
        last = total_pages - 1
    else:
        # One page before the next chapter, but never past the document end.
        last = min(next_start_page - 2, total_pages - 1)
    # Entries that are out of order or share a page would give an empty
    # range; fall back to the single start page.
    if last < first:
        last = first
    return first, last


def _unique_filename(title, index, used):
    """Build a filesystem-safe, not-yet-used ``<title>.pdf`` name.

    ``used`` is a set of case-folded names already handed out; the returned
    name is added to it.  ``index`` (1-based entry number) names entries
    whose title is empty after sanitizing.
    """
    # Replace characters forbidden in file names (plus control characters).
    stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title)
    if not stem.strip():
        stem = f"chapter_{index}"
    candidate = f"{stem}.pdf"
    counter = 2
    # Compare case-insensitively so names cannot collide on filesystems that
    # ignore case; without this, repeated titles overwrite each other.
    while candidate.casefold() in used:
        candidate = f"{stem} ({counter}).pdf"
        counter += 1
    used.add(candidate.casefold())
    return candidate


def main(argv=None):
    """Split a PDF into one file per table-of-contents entry.

    Args:
        argv: Command-line arguments without the program name:
            ``<input.pdf> <input.toc> [output_dir]``.  Defaults to
            ``sys.argv[1:]``.  ``output_dir`` defaults to "split_output".

    Each ToC entry starts a chapter that runs up to the page before the next
    entry (the last chapter runs to the end of the document).  Entries whose
    start page is not in the document are reported on stderr and skipped.

    Exits with status 1 when too few arguments are given or when the ToC
    file contains no entries.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("Usage: python split_by_toc.py <input.pdf> <input.toc> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr (titles may contain non-ASCII text).
    _force_utf8_output()

    pdf_path = args[0]
    toc_path = args[1]
    output_dir = args[2] if len(args) > 2 else "split_output"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Splitting '{pdf_path}' based on '{toc_path}'...")

    # 1. Parse ToC into a list of (title, start_page).
    entries = _read_toc_entries(toc_path)
    if not entries:
        print("Error: No ToC entries found.")
        sys.exit(1)

    # 2. Open PDF (closed again even if a chapter fails to save).
    doc = fitz.open(pdf_path)
    try:
        total_pages = doc.page_count
        print(f"Total Pages: {total_pages}")
        print(f"Found {len(entries)} chapters.")
        print("-" * 40)

        # 3. Iterate and split.
        used_names = set()
        for i, (title, start_page) in enumerate(entries):
            next_start_page = entries[i + 1][1] if i + 1 < len(entries) else None
            span = _page_span(start_page, next_start_page, total_pages)
            print(f"[{i+1}/{len(entries)}] {title}")
            if span is None:
                # Passing such a page on would not fail loudly; report it
                # rather than write a chapter with the wrong pages.
                print(
                    f" Warning: start page {start_page} is outside "
                    f"1..{total_pages}; skipped",
                    file=sys.stderr,
                )
                continue
            start_idx, end_idx = span

            filename = _unique_filename(title, i + 1, used_names)
            out_path = os.path.join(output_dir, filename)
            print(f" Pages {start_page} to {end_idx + 1} (Count: {end_idx - start_idx + 1})")

            # Create new PDF for this chapter.
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(out_path)
            finally:
                new_doc.close()
    finally:
        doc.close()

    print("-" * 40)
    print(f"Done! Files saved to '{output_dir}/'")
# Run the splitter only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()