Spaces:

adelevett
/

pdf.tocgen.split

Sleeping

App Files Files Community

pdf.tocgen.split / utils /modify_toc.py

adelevett

Upload 76 files

046e3b8 verified about 1 month ago

raw

history blame contribute delete

2.16 kB

	import sys
	import re
	import io

	def clean_text(text):
	    """Return *text* normalised for use as a ToC title.

	    Steps, in order:

	    1. Soft hyphens (U+00AD) are removed.
	    2. En dashes (U+2013) and em dashes (U+2014) become a plain hyphen.
	    3. Every whitespace character (non-breaking space, tab, thin space,
	       ideographic space, ...) becomes a plain space.
	    4. Remaining non-printable characters (control/format characters) are
	       dropped.
	    5. Runs of spaces are collapsed and leading/trailing spaces stripped.
	    """
	    # Soft hyphens are invisible line-break hints; they carry no content.
	    text = text.replace('\xad', '')
	    # Replace en-dash and em-dash with standard hyphen.
	    text = text.replace('\u2013', '-').replace('\u2014', '-')
	    # Whitespace must be converted to a plain space BEFORE the printable
	    # filter: str.isprintable() is False for every whitespace character
	    # except ASCII space, so filtering first would delete e.g. a tab or a
	    # thin space and glue the neighbouring words together.
	    text = "".join(
	        ' ' if ch.isspace() else ch
	        for ch in text
	        if ch.isspace() or ch.isprintable()
	    )
	    # split()/join collapses runs of spaces and trims both ends.
	    return ' '.join(text.split())

	def main():
	    """Rewrite ToC entry titles read from stdin and write the result to stdout.

	    A line of the form ``<indent>"<title>" <page><trailing>`` is re-emitted
	    with its title replaced by ``NNN_<cleaned title>_pg<page>``, where ``NNN``
	    is a zero-padded counter over matched entries starting at 000 and the
	    cleaned title comes from ``clean_text``. Empty lines are written back as
	    empty lines; any other non-matching line is written back as-is.
	    """
	    # Force UTF-8 for stdin/stdout to handle special characters on Windows;
	    # otherwise it defaults to cp1252/cp437 which mangles unicode.
	    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace')
	    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

	    # Regex to match ToC lines.
	    # Captures:
	    # 1. Indentation (leading whitespace, possibly empty)
	    # 2. Title (inside quotes; greedy, so quotes inside the title are kept)
	    # 3. Page Number
	    # 4. Trailing content (like vpos)
	    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

	    idx = 0

	    try:
	        for line in stdin:
	            # Strip newline for processing
	            line_content = line.rstrip('\n')
	            if not line_content:
	                stdout.write("\n")
	                continue

	            match = pattern.match(line_content)
	            if match is None:
	                # If line doesn't match expected format, print as is
	                stdout.write(line_content + "\n")
	                continue

	            indent, old_title, page_num, trailing = match.groups()

	            # Sanitize the title (fix weird spaces/hyphens)
	            cleaned_title = clean_text(old_title)

	            # Format: 000_Title_pgX
	            new_title = f"{idx:03d}_{cleaned_title}_pg{page_num}"

	            # Reconstruct the line
	            new_line = f'{indent}"{new_title}" {page_num}{trailing}'

	            stdout.write(new_line + "\n")
	            idx += 1
	    finally:
	        # Push buffered text out, then detach the wrappers so that their
	        # garbage collection does not close the process's real
	        # sys.stdin/sys.stdout buffers.
	        stdout.flush()
	        stdout.detach()
	        stdin.detach()

	# Run the filter only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()