Spaces:
Running on Zero
Running on Zero
File size: 1,170 Bytes
#!/usr/bin/env python3
"""Normalize spaced-letter PDF text extraction to readable plain text."""
import re
import sys
def normalize_line(line: str) -> str:
    """Collapse runs of space-separated fragments in *line* into words.

    The line is split on single spaces and empty pieces are discarded. A
    piece counts as a fragment when it is exactly one character long, or
    when it is at most four characters long, consists only of digits and
    the characters ``./:-,º°``, and is not purely alphabetic. Consecutive
    fragments are concatenated with nothing between them; any other piece
    ends the current run and is kept as a word of its own. The resulting
    words are joined with single spaces.

    Because empty pieces are dropped before runs are formed, repeated
    spaces in the input neither appear in the output nor end a run of
    fragments.
    """

    def is_fragment(piece: str) -> bool:
        # Single characters are always glued to their neighbours.
        if len(piece) == 1:
            return True
        # ``isalpha`` matters here because "º" is both in the allowed set
        # and alphabetic; a piece made only of it is treated as a word.
        if len(piece) > 4 or piece.isalpha():
            return False
        return all(ch.isdigit() or ch in "./:-,º°" for ch in piece)

    words: list[str] = []
    run: list[str] = []
    for piece in (p for p in line.split(" ") if p):
        if is_fragment(piece):
            run.append(piece)
            continue
        # A regular word closes whatever run was being accumulated.
        if run:
            words.append("".join(run))
            run = []
        words.append(piece)
    if run:
        words.append("".join(run))
    return " ".join(words)
def normalize_text(text: str) -> str:
    """Normalize every line of *text* and return the result.

    Each line has trailing whitespace removed. Lines that are then empty
    stay empty, lines of the form ``-- <digits> of <digits> --`` are kept
    as they are (after trimming), and all other lines are passed through
    ``normalize_line``. The lines are rejoined with newlines, leading and
    trailing whitespace is stripped from the whole text, and exactly one
    trailing newline is appended.
    """

    def convert(raw_line: str) -> str:
        trimmed = raw_line.rstrip()
        # Blank lines and "-- N of M --" marker lines bypass normalization.
        if not trimmed or re.match(r"^-- \d+ of \d+ --$", trimmed):
            return trimmed
        return normalize_line(trimmed)

    body = "\n".join(convert(raw_line) for raw_line in text.splitlines())
    return body.strip() + "\n"
if __name__ == "__main__":
    # Run as a filter: read all of stdin, write the normalized text to stdout.
    sys.stdout.write(normalize_text(sys.stdin.read()))
|