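# Spaces: Sleeping
# (The line above appears to be status text captured from the page that
# displayed this file; it is not part of the script.)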
| import sys | |
| import re | |
| import io | |
def clean_text(text):
    """Normalize a ToC title to plain, single-spaced text.

    Steps, in order:

    * en dashes (U+2013) and em dashes (U+2014) become ASCII hyphens;
    * every whitespace character (tab, non-breaking space, thin space, ...)
      becomes a regular space, so the words it separated stay separated;
    * remaining non-printable characters (soft hyphens, control and format
      characters) are dropped;
    * runs of spaces collapse to a single space and both ends are trimmed.

    Args:
        text: The raw title string.

    Returns:
        The cleaned title string.
    """
    text = text.replace('\u2013', '-').replace('\u2014', '-')

    # str.isprintable() is False for every whitespace character except the
    # ASCII space, so whitespace must be mapped to ' ' *before* filtering;
    # otherwise "Part\tOne" would be fused into "PartOne".
    kept = []
    for ch in text:
        if ch.isspace():
            kept.append(' ')
        elif ch.isprintable():
            kept.append(ch)

    # split()/join collapses repeated spaces and strips the ends.
    return ' '.join(''.join(kept).split())
def main():
    """Renumber table-of-contents entries read from stdin, writing to stdout.

    A line shaped like ``<indent>"<title>" <page><trailing>`` is rewritten so
    that its title becomes ``NNN_<cleaned title>_pg<page>``, where ``NNN`` is a
    zero-padded count of the matching lines seen so far (starting at 000) and
    the cleaned title comes from ``clean_text``. Indentation, page number and
    trailing text are preserved; the gap between the closing quote and the
    page number is written as a single space.

    Lines that do not match (including empty lines) are written back
    unchanged. Every output line is terminated with a newline.
    """
    # Wrap the binary streams so text is always UTF-8, whatever the console
    # code page is (cp1252/cp437 on Windows would mangle Unicode titles).
    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace')
    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

    # Groups: 1 = leading whitespace, 2 = title (greedy, so it runs to the
    # last quote that is followed by whitespace and a number), 3 = page
    # number, 4 = anything after the page number (e.g. a vpos value).
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    idx = 0
    try:
        for line in stdin:
            line_content = line.rstrip('\n')
            match = pattern.match(line_content)
            if match is None:
                # Covers empty lines too: they cannot match the pattern.
                stdout.write(line_content + "\n")
                continue

            indent, old_title, page_num, trailing = match.groups()
            cleaned_title = clean_text(old_title)
            # Format: 000_Title_pgX
            new_title = f"{idx:03d}_{cleaned_title}_pg{page_num}"
            stdout.write(f'{indent}"{new_title}" {page_num}{trailing}\n')
            idx += 1
    finally:
        # Push buffered output out explicitly instead of relying on the
        # wrapper's finalizer, then detach both wrappers so that discarding
        # them does not close sys.stdout.buffer / sys.stdin.buffer.
        stdout.flush()
        stdout.detach()
        stdin.detach()
# Run the stdin-to-stdout filter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()