# Source: pdf.tocgen.split/utils/modify_toc.py
# Uploaded by adelevett ("Upload 76 files"), commit 046e3b8 (verified).
import sys
import re
import io
def clean_text(text):
    """Return *text* normalized for use as a ToC title.

    The following clean-ups are applied:

    * non-breaking spaces (``\\xa0``) become regular spaces;
    * soft hyphens (``\\xad``) are removed;
    * en dashes and em dashes become a plain hyphen ``-``;
    * every run of whitespace (tabs and Unicode spaces included) collapses
      to a single space, and leading/trailing whitespace is dropped;
    * any remaining non-printable character is removed.

    Args:
        text: The raw title string.

    Returns:
        The cleaned title as a single-spaced string (possibly empty).
    """
    # One pass over the string for the typographic substitutions.
    substitutions = str.maketrans({
        '\xa0': ' ',     # non-breaking space
        '\xad': None,    # soft hyphen: drop
        '\u2013': '-',   # en dash
        '\u2014': '-',   # em dash
    })
    text = text.translate(substitutions)

    # Split on whitespace BEFORE filtering non-printable characters.
    # Tabs and Unicode spaces (thin space, en space, ...) are whitespace but
    # not "printable"; filtering first would delete them and glue the
    # neighbouring words together instead of leaving one space between them.
    words = (
        "".join(ch for ch in word if ch.isprintable())
        for word in text.split()
    )
    # A word made only of control characters filters down to "" and is
    # skipped, so it cannot leave a doubled space behind.
    return ' '.join(word for word in words if word)
def main():
    """Rewrite ToC entries read from stdin and write the result to stdout.

    Every input line of the form ``<indent>"<title>" <page><trailing>`` is
    re-emitted with its title replaced by ``NNN_<cleaned title>_pg<page>``,
    where ``NNN`` is a zero-padded running index (starting at 000) counting
    only the lines that matched, and the title has been passed through
    ``clean_text``. Indentation, page number and trailing content are kept
    unchanged. Lines that do not match (blank lines included) are passed
    through as they are.
    """
    # Force UTF-8 for stdin/stdout to handle special characters on Windows,
    # otherwise it defaults to cp1252/cp437 which mangles unicode.
    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace')
    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

    # Regex to match ToC lines. Captures:
    #   1. Indentation (leading whitespace)
    #   2. Title (inside quotes; greedy, so it may itself contain quotes)
    #   3. Page number
    #   4. Trailing content (like vpos)
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    idx = 0
    try:
        for line in stdin:
            # Strip the newline for processing; it is re-added on output.
            line_content = line.rstrip('\n')

            match = pattern.match(line_content)
            if match is None:
                # Blank or unrecognized line: print as is.
                stdout.write(line_content + "\n")
                continue

            indent, old_title, page_num, trailing = match.groups()

            # Sanitize the title (fix weird spaces/hyphens).
            cleaned_title = clean_text(old_title)

            # Format: 000_Title_pgX
            new_title = f"{idx:03d}_{cleaned_title}_pg{page_num}"

            # Reconstruct the line around the new title.
            stdout.write(f'{indent}"{new_title}" {page_num}{trailing}' + "\n")
            idx += 1
    finally:
        # Flush explicitly rather than relying on the wrapper's finalizer,
        # then detach so that discarding the wrappers does not close the
        # process-wide sys.stdin/sys.stdout buffers behind the caller's back.
        stdout.flush()
        stdout.detach()
        stdin.detach()
# Script entry point: run the stdin -> stdout filter only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()