import io
import re
import sys

# Matches a ToC line of the form:  <indent>"<title>" <page><trailing>
# Groups:
#   1. Indentation (leading whitespace)
#   2. Title (text between the outermost double quotes)
#   3. Page number
#   4. Trailing content (anything after the page number)
_TOC_LINE = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

# Soft hyphens are dropped; en/em dashes become a plain ASCII hyphen.
_CHAR_FIXES = {0x00AD: None, 0x2013: '-', 0x2014: '-'}


def clean_text(text):
    """Return *text* normalised for use as a ToC title.

    - Soft hyphens (U+00AD) are removed.
    - En dashes (U+2013) and em dashes (U+2014) become ``-``.
    - Every whitespace character (non-breaking space, tab, em space, ...)
      is treated as a plain space.
    - Remaining non-printable characters (control and format characters)
      are removed.
    - Runs of spaces are collapsed to one and the ends are stripped.
    """
    text = text.translate(_CHAR_FIXES)
    # Whitespace must be mapped to ' ' BEFORE the printable filter:
    # str.isprintable() is False for every separator except the ASCII
    # space, so filtering first would delete tabs and Unicode spaces and
    # glue the neighbouring words together.
    text = "".join(
        ' ' if ch.isspace() else ch
        for ch in text
        if ch.isspace() or ch.isprintable()
    )
    return ' '.join(text.split())


def _rewrite_toc_line(line_content, idx):
    """Return the renamed ToC line, or None if *line_content* is not one."""
    match = _TOC_LINE.match(line_content)
    if match is None:
        return None
    indent, old_title, page_num, trailing = match.groups()
    # Format: 000_Title_pgX
    new_title = f"{idx:03d}_{clean_text(old_title)}_pg{page_num}"
    return f'{indent}"{new_title}" {page_num}{trailing}'


def main(stdin=None, stdout=None):
    """Filter ToC lines from *stdin* to *stdout*, renaming each title.

    Every line matching ``<indent>"<title>" <page><trailing>`` is rewritten
    so that its title becomes ``NNN_<cleaned title>_pg<page>``, where NNN is
    a zero-padded counter of the matching lines seen so far (starting at 0).
    Empty lines are written as empty lines and non-matching lines are
    written unchanged. Every output line is terminated with a newline.

    Args:
        stdin: Text stream to read lines from. Defaults to ``sys.stdin``
            re-wrapped as UTF-8.
        stdout: Text stream to write to. Defaults to ``sys.stdout``
            re-wrapped as UTF-8.
    """
    owns_stdin = stdin is None
    owns_stdout = stdout is None
    # Force UTF-8 for the standard streams; on Windows they otherwise
    # default to a legacy code page (cp1252/cp437) which mangles unicode.
    if owns_stdin:
        stdin = io.TextIOWrapper(
            sys.stdin.buffer, encoding='utf-8', errors='replace'
        )
    if owns_stdout:
        stdout = io.TextIOWrapper(
            sys.stdout.buffer, encoding='utf-8', errors='replace'
        )

    try:
        idx = 0
        for line in stdin:
            # Strip the newline for processing
            line_content = line.rstrip('\n')
            if not line_content:
                stdout.write("\n")
                continue

            new_line = _rewrite_toc_line(line_content, idx)
            if new_line is None:
                # Line doesn't match the expected format: emit it as is
                stdout.write(line_content + "\n")
            else:
                stdout.write(new_line + "\n")
                idx += 1
    finally:
        # Flush explicitly rather than relying on garbage collection, and
        # detach so that finalising our wrappers does not close the
        # interpreter's own standard stream buffers.
        if owns_stdout:
            stdout.flush()
            stdout.detach()
        if owns_stdin:
            stdin.detach()


if __name__ == "__main__":
    main()