#!/usr/bin/env python3
"""Normalize spaced-letter PDF text extraction to readable plain text.

Reads text from standard input, glues together runs of single characters
and short numeric fragments that were split apart by spaces, and writes the
result to standard output.
"""

import re
import sys

# Page separator lines such as "-- 3 of 12 --" are passed through verbatim.
_PAGE_MARKER_RE = re.compile(r"^-- \d+ of \d+ --$")

# Non-digit characters allowed inside a short numeric fragment.
_NUMERIC_PUNCTUATION = "./:-,º°"

# Longest multi-character token still treated as a numeric fragment.
_MAX_NUMERIC_FRAGMENT_LEN = 4


def _is_fragment(tok: str) -> bool:
    """Return True if *tok* should be glued to its neighbouring fragments."""
    if len(tok) == 1:
        return True
    return (
        len(tok) <= _MAX_NUMERIC_FRAGMENT_LEN
        and all(c.isdigit() or c in _NUMERIC_PUNCTUATION for c in tok)
        # A token made only of alphabetic characters is a word, not a
        # number, even if every character is in the allowed set.
        and not tok.isalpha()
    )


def normalize_line(line: str) -> str:
    """Join consecutive fragments of *line* into whole tokens.

    The line is split on single spaces. Consecutive tokens that are either
    one character long or short digit/punctuation fragments are concatenated
    without separators; every other token is kept as is. The resulting
    tokens are joined with single spaces.

    Args:
        line: One line of extracted text, without its line terminator.

    Returns:
        The line with fragment runs merged and spacing collapsed.
    """
    out: list[str] = []
    buf: list[str] = []

    def flush() -> None:
        # Emit the pending run of fragments as a single token.
        if buf:
            out.append("".join(buf))
            buf.clear()

    for tok in line.split(" "):
        if not tok:
            # NOTE(review): empty tokens come from runs of several spaces and
            # are skipped without flushing, so fragments on both sides of a
            # wide gap are merged into one token - confirm this is intended.
            continue
        if _is_fragment(tok):
            buf.append(tok)
        else:
            flush()
            out.append(tok)
    flush()
    return " ".join(out)


def normalize_text(text: str) -> str:
    """Normalize every line of *text* and return it with one final newline.

    Trailing whitespace is removed from each line, blank lines are kept as
    empty lines, page marker lines ("-- N of M --") are copied unchanged and
    all other lines go through normalize_line(). Leading and trailing
    whitespace of the whole result is stripped before the final newline is
    appended.

    Args:
        text: The raw extracted text, possibly spanning many lines.

    Returns:
        The normalized text, always terminated by exactly one newline.
    """
    lines: list[str] = []
    for line in text.splitlines():
        stripped = line.rstrip()
        if not stripped or _PAGE_MARKER_RE.match(stripped):
            # Blank lines become "" and page markers stay untouched.
            lines.append(stripped)
        else:
            lines.append(normalize_line(stripped))
    return "\n".join(lines).strip() + "\n"


def main() -> None:
    """Filter standard input to standard output through normalize_text()."""
    sys.stdout.write(normalize_text(sys.stdin.read()))


if __name__ == "__main__":
    main()