Spaces:
Sleeping
Sleeping
| import fitz | |
| import sys | |
| import os | |
| import re | |
def _chapter_ranges(toc, total_pages):
    """Compute the page span of every top-level bookmark.

    Args:
        toc: Outline entries shaped like ``[level, title, page, ...]`` with
            1-based page numbers.
        total_pages: Number of pages in the source document.

    Returns:
        A list of ``(title, first_idx, last_idx)`` tuples using 0-based,
        inclusive page indices. Level-1 entries whose page number lies outside
        ``1..total_pages`` are left out, so they neither produce a file nor
        cut the preceding chapter short.
    """
    valid = [
        (title, page)
        for level, title, page, *_ in toc
        if level == 1 and 1 <= page <= total_pages
    ]
    ranges = []
    for i, (title, page) in enumerate(valid):
        first_idx = page - 1
        if i + 1 < len(valid):
            # A chapter ends on the page before the next chapter starts:
            # next 1-based page -> 0-based index (-1) -> previous page (-1).
            last_idx = valid[i + 1][1] - 2
        else:
            last_idx = total_pages - 1
        # Chapters sharing a start page (or listed out of order) would give a
        # negative span; fall back to a single page.
        ranges.append((title, first_idx, max(first_idx, last_idx)))
    return ranges


def _safe_filename(title, index, used):
    """Build a unique, filesystem-safe ``<title>.pdf`` name for a chapter.

    Args:
        title: Bookmark title from the PDF outline.
        index: 1-based chapter number, used as a fallback name when nothing
            usable is left of the title after sanitizing.
        used: Set of case-folded names already handed out; updated in place.

    Returns:
        The file name (without directory) to write the chapter to.
    """
    # Replace characters that are illegal in Windows file names as well as
    # control characters (bookmark titles may contain line breaks). Trim the
    # stem *before* appending the extension so stray whitespace and trailing
    # dots do not end up in front of ".pdf".
    stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', title).strip().rstrip('. ')
    if not stem:
        stem = f"chapter_{index}"

    candidate = f"{stem}.pdf"
    counter = 2
    # Compare case-insensitively: Windows and default macOS volumes treat
    # "Intro.pdf" and "intro.pdf" as the same file.
    while candidate.casefold() in used:
        candidate = f"{stem} ({counter}).pdf"
        counter += 1
    used.add(candidate.casefold())
    return candidate


def main():
    """Split a bookmarked PDF into one PDF per top-level bookmark.

    Command line: ``python split_pdf.py <input_with_bookmarks.pdf> [output_dir]``.
    The output directory defaults to ``split_output`` and is created if it
    does not exist. Each level-1 bookmark becomes ``<title>.pdf`` covering the
    pages from that bookmark up to the page before the next level-1 bookmark
    (or the end of the document for the last one).

    Exits with status 1 when the input path is missing from the command line
    or when the PDF has no bookmarks.
    """
    if len(sys.argv) < 2:
        print("Usage: python split_pdf.py <input_with_bookmarks.pdf> [output_dir]")
        sys.exit(1)

    # Force UTF-8 so bookmark titles print on consoles with a narrower default
    # encoding. Streams replaced by objects without reconfigure() are skipped.
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "split_output"
    os.makedirs(output_dir, exist_ok=True)

    print(f"Reading bookmarks from '{pdf_path}'...")
    doc = fitz.open(pdf_path)
    try:
        toc = doc.get_toc()  # [[lvl, title, page_num, ...], ...]
        if not toc:
            print("Error: No bookmarks found in this PDF.")
            sys.exit(1)

        top_level = sum(1 for entry in toc if entry[0] == 1)
        print(f"Found {top_level} top-level chapters.")

        ranges = _chapter_ranges(toc, doc.page_count)
        skipped = top_level - len(ranges)
        if skipped:
            print(f"Warning: skipped {skipped} bookmark(s) without a valid target page.")

        used_names = set()
        for number, (title, start_idx, end_idx) in enumerate(ranges, start=1):
            filename = _safe_filename(title, number, used_names)
            out_path = os.path.join(output_dir, filename)
            print(f"Extracting: {filename} (Pages {start_idx + 1}-{end_idx + 1})")

            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
                new_doc.save(out_path)
            finally:
                new_doc.close()
    finally:
        # Release the source file even when exiting early or on failure.
        doc.close()

    print(f"Done. Files saved to {output_dir}/")
# Script entry point: run the splitter only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()