Spaces:
Sleeping
Sleeping
| """ | |
| TOC Processor | |
| ------------- | |
| Handles operations related to the Table of Contents (TOC) for the PDF pipeline. | |
| Includes functionality for: | |
| - Cleaning and sanitizing text (encoding issues, soft hyphens) | |
| - Merging usage-heuristic headers (e.g. multi-line headers on same page) | |
| - generating split PDF chapters | |
| """ | |
| import re | |
| import io | |
| import zipfile | |
| import fitz # PyMuPDF | |
| from typing import List, Tuple, Generator, Optional | |
| import tempfile | |
| import os | |
# Type alias for a PyMuPDF-style TOC entry: [level, title, page, ...]
# (kept as plain `list` because entries are mutated in place downstream)
FitZTOCEntry = list
def clean_text(text: str) -> str:
    """
    Sanitize text to remove common PDF artifacts.

    Removes soft hyphens, fixes non-breaking spaces, standardizes dashes,
    strips non-printable control characters, and collapses runs of
    whitespace into single spaces.

    Args:
        text: Raw text extracted from a PDF (may be empty or None-ish).

    Returns:
        The cleaned, whitespace-normalized string ("" for falsy input).
    """
    if not text:
        return ""
    # Replace non-breaking spaces (\xa0) and soft hyphens (\xad)
    text = text.replace('\xa0', ' ').replace('\xad', '')
    # Replace en-dash and em-dash with standard hyphen
    text = text.replace('\u2013', '-').replace('\u2014', '-')
    # Remove control characters, but KEEP whitespace: isprintable() is False
    # for '\n'/'\t', and dropping them here would fuse adjacent words
    # ("a\nb" -> "ab"). The split/join below normalizes whitespace anyway.
    text = "".join(ch for ch in text if ch.isprintable() or ch.isspace())
    return ' '.join(text.split())
def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
    """
    Parse raw `pdftocgen`/`pdftocio` text output into structured TOC entries.

    Each recognized line has the shape: '    "Chapter Title" 123' where the
    leading indentation encodes the nesting level (4 spaces per level, the
    pdftocgen default). Unrecognized lines are silently skipped.

    Args:
        raw_output: Full text emitted by the TOC tool.

    Returns:
        A list of [level, title, page] entries (level is 1-based).
    """
    # Groups: 1=leading whitespace, 2=quoted title, 3=page number, 4=trailer
    entry_re = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')
    entries: List[FitZTOCEntry] = []
    for raw_line in raw_output.splitlines():
        m = entry_re.match(raw_line)
        if m is None:
            continue
        indent_ws, heading, page_text = m.group(1), m.group(2), m.group(3)
        # 0 spaces -> level 1, 4 spaces -> level 2, and so on.
        entries.append([len(indent_ws) // 4 + 1, heading, int(page_text)])
    return entries
def merge_same_page_headers(toc: List[FitZTOCEntry]) -> List[FitZTOCEntry]:
    """
    Collapse runs of consecutive Level-1 headers that point at the same page.

    Fixes the "double split" issue where a multi-line chapter header is
    detected as several separate TOC entries. Non-Level-1 entries are
    passed through untouched and also break up a merge run.

    Example:
        Input:  [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
        Output: [[1, "Title Part 1 Title Part 2", 10]]

    Args:
        toc: Parsed [level, title, page, ...] entries.

    Returns:
        A new list; merged entries have their titles joined with a space.
        Note: un-merged entries are the same list objects as the input,
        and a merge mutates the surviving entry's title in place.
    """
    if not toc:
        return []
    result: List[FitZTOCEntry] = []
    for entry in toc:
        level, title, page = entry[0], entry[1], entry[2]
        mergeable = (
            level == 1
            and bool(result)
            and result[-1][0] == 1
            and result[-1][2] == page
        )
        if mergeable:
            # Same-page continuation: fold this title into the prior header.
            result[-1][1] = f"{result[-1][1]} {title}"
        else:
            result.append(entry)
    return result
def process_toc(raw_toc_content: str) -> str:
    """
    Run the full cleanup pipeline on raw TOC content.

    Pipeline: parse -> clean titles -> merge same-page Level-1 headers ->
    re-serialize into the indented format `pdftocio` expects
    ('"Title" PageNum', 4 spaces of indent per nesting level).

    DECOUPLED: the PDF bookmarks stay clean (no numeric prefix); file
    naming is handled by generate_chapter_splits.

    Args:
        raw_toc_content: Raw text output from the TOC-generation tool.

    Returns:
        The cleaned TOC as newline-joined lines, ready for `pdftocio`.
    """
    entries = parse_raw_toc_output(raw_toc_content)
    # Sanitize every title in place before any merging happens.
    for entry in entries:
        entry[1] = clean_text(entry[1])
    merged = merge_same_page_headers(entries)
    # Re-serialize: "    " per level beyond the first, then the quoted title.
    return "\n".join(
        f'{"    " * (e[0] - 1)}"{e[1]}" {e[2]}' for e in merged
    )
def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
    """
    Splits the PDF based on Level 1 TOC entries and writes a ZIP file to the output path.

    Args:
        input_pdf_path: Path to source PDF
        output_zip_path: Path to write the ZIP
        back_matter_start_page: 1-based page number where Back Matter starts.
            Chapters will be clamped to end before this page. Content from
            this page to the end is saved as 999_Back_matter.pdf.

    Raises:
        ValueError: If the PDF has no Table of Contents.
    """
    doc = fitz.open(input_pdf_path)
    try:
        toc = doc.get_toc()
        if not toc:
            raise ValueError("No Table of Contents found in the PDF.")
        with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            total_pages = doc.page_count
            _write_front_matter(doc, toc, zf)
            _write_chapters(doc, toc, zf, total_pages, back_matter_start_page)
            _write_back_matter(doc, zf, total_pages, back_matter_start_page)
    finally:
        # Always release the source document, even if zipping fails midway
        # (the original leaked `doc` on any exception inside the loop).
        doc.close()


def _page_range_pdf(doc, from_idx: int, to_idx: int) -> bytes:
    """Return the bytes of a new PDF containing doc pages [from_idx, to_idx] (0-based, inclusive)."""
    sub_doc = fitz.open()
    try:
        sub_doc.insert_pdf(doc, from_page=from_idx, to_page=to_idx)
        return sub_doc.tobytes()
    finally:
        sub_doc.close()


def _write_front_matter(doc, toc, zf) -> None:
    """Extract pages before the first Level 1 chapter into 000_Front_matter.pdf."""
    first_l1_page = next((entry[2] for entry in toc if entry[0] == 1), None)
    # Only if the first chapter starts after page 1 is there front matter.
    if first_l1_page and first_l1_page > 1:
        # Front matter spans 0-based pages 0 .. (first_l1_page - 2).
        fm_end_idx = first_l1_page - 2
        if fm_end_idx >= 0:
            zf.writestr("000_Front_matter.pdf", _page_range_pdf(doc, 0, fm_end_idx))


def _write_chapters(doc, toc, zf, total_pages: int, back_matter_start_page: Optional[int]) -> None:
    """Extract each Level 1 chapter (clamped to end before back matter) into the ZIP."""
    chapter_idx = 1
    for i, entry in enumerate(toc):
        level, title, start_page = entry[0], entry[1], entry[2]
        # Only Level 1 entries define split boundaries.
        if level != 1:
            continue
        # Chapters starting inside the back matter belong to 999_Back_matter.pdf.
        if back_matter_start_page and start_page >= back_matter_start_page:
            continue
        start_idx = start_page - 1
        # The chapter runs until the next Level 1 entry (or the end of the doc).
        end_page = total_pages
        for next_entry in toc[i + 1:]:
            if next_entry[0] == 1:
                end_page = next_entry[2] - 1
                break
        # Clamp: if the natural end reaches into back matter, cut it short.
        # Example: back matter starts p.100, chapter would end p.105 -> end p.99.
        if back_matter_start_page and end_page >= back_matter_start_page:
            end_page = back_matter_start_page - 1
        # Safety clamp: never produce an empty/negative range.
        end_idx = max(end_page - 1, start_idx)
        # Sanitize the title for use in a filename.
        safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
        if not safe_title:
            safe_title = f"chapter_{chapter_idx}"
        # Naming scheme: 001_Title_pgX.pdf
        pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
        chapter_idx += 1
        zf.writestr(pdf_name, _page_range_pdf(doc, start_idx, end_idx))


def _write_back_matter(doc, zf, total_pages: int, back_matter_start_page: Optional[int]) -> None:
    """Extract pages from back_matter_start_page to the end into 999_Back_matter.pdf."""
    if back_matter_start_page and back_matter_start_page <= total_pages:
        zf.writestr(
            "999_Back_matter.pdf",
            _page_range_pdf(doc, back_matter_start_page - 1, total_pages - 1),
        )