File size: 2,155 Bytes
046e3b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import sys
import re
import io

def clean_text(text):
    """Return *text* normalised for use as a single-line ToC title.

    The following clean-ups are applied:

    * soft hyphens (U+00AD) are removed;
    * en dashes (U+2013) and em dashes (U+2014) become a plain ``-``;
    * every whitespace character (non-breaking space, tab, thin space,
      line separators, ...) is treated as a word separator;
    * all remaining non-printable characters (control codes, format
      characters, ...) are dropped;
    * runs of separators are collapsed to one space and leading/trailing
      whitespace is stripped.

    Args:
        text: The raw title string.

    Returns:
        The cleaned title; an empty string if nothing printable remains.
    """
    # Soft hyphens are invisible hyphenation hints, not real characters.
    text = text.replace('\xad', '')
    # Replace en-dash and em-dash with standard hyphen.
    text = text.replace('\u2013', '-').replace('\u2014', '-')

    kept = []
    for ch in text:
        if ch.isspace():
            # str.isprintable() is False for every whitespace character
            # except the ASCII space, so whitespace must be converted
            # *before* the printable filter; otherwise a tab or thin space
            # between two words would be deleted and the words fused.
            kept.append(' ')
        elif ch.isprintable():
            kept.append(ch)
        # Anything else (control/format characters) is dropped.

    # split()/join collapses runs of spaces and trims both ends.
    return ' '.join(''.join(kept).split())

def main():
    """Filter ToC lines from stdin to stdout, renaming each title.

    Every input line of the form ``<indent>"<title>" <page><trailing>`` is
    rewritten as ``<indent>"NNN_<cleaned title>_pg<page>" <page><trailing>``,
    where ``NNN`` is a zero-padded running index (starting at 0) counting
    only the lines that matched. All other lines, including empty ones, are
    passed through unchanged. Every output line is terminated with a
    newline.

    Both streams are processed as UTF-8 with undecodable/unencodable
    characters replaced rather than raising.
    """
    # Force UTF-8 for stdin/stdout: the platform default encoding (e.g.
    # cp1252/cp437 on Windows) can mangle non-ASCII titles.
    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace')
    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

    # Regex to match ToC lines. Captures:
    # 1. Indentation (leading whitespace)
    # 2. Title (inside quotes; greedy, so embedded quotes stay in the title)
    # 3. Page number
    # 4. Trailing content (like vpos)
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    try:
        idx = 0
        for line in stdin:
            # Strip newline for processing.
            line_content = line.rstrip('\n')
            match = pattern.match(line_content) if line_content else None

            if match is None:
                # Empty or unrecognised line: emit as is.
                stdout.write(line_content + "\n")
                continue

            indent, old_title, page_num, trailing = match.groups()

            # Sanitize the title (fix weird spaces/hyphens).
            cleaned_title = clean_text(old_title)

            # Format: 000_Title_pgX
            new_title = f"{idx:03d}_{cleaned_title}_pg{page_num}"

            # Reconstruct the line around the new title.
            stdout.write(f'{indent}"{new_title}" {page_num}{trailing}' + "\n")
            idx += 1
    finally:
        # A TextIOWrapper owns its underlying buffer and closes it when it
        # is finalised. Flush explicitly and detach so the process's real
        # sys.stdin/sys.stdout stay open and usable after main() returns.
        stdin.detach()
        stdout.flush()
        stdout.detach()

# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()