File size: 1,170 Bytes
3c147ac
 
19048ed
3c147ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19048ed
3c147ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19048ed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python3
"""Normalize spaced-letter PDF text extraction to readable plain text."""

import re
import sys


def normalize_line(line: str) -> str:
    """Collapse runs of letter-spaced fragments in *line* into whole words.

    The line is split on single spaces and empty pieces (produced by
    leading, trailing, or repeated spaces) are discarded.  Consecutive
    "fragment" pieces are concatenated without a separator; every other
    piece is kept as its own word.  The resulting words are joined with
    single spaces.

    A piece counts as a fragment when it is exactly one character long, or
    when it has at most four characters, each a digit or one of
    ``./:-,º°``, and the piece as a whole is not alphabetic.
    """
    joinable_punct = "./:-,º°"

    def is_fragment(piece: str) -> bool:
        # Any single character is always glued to its neighbours.
        if len(piece) == 1:
            return True
        # Longer pieces must be short and not purely alphabetic
        # (str.isalpha can be true for ordinal-indicator-only pieces).
        if len(piece) > 4 or piece.isalpha():
            return False
        return all(ch.isdigit() or ch in joinable_punct for ch in piece)

    words: list[str] = []
    pending: list[str] = []
    for piece in filter(None, line.split(" ")):
        if is_fragment(piece):
            pending.append(piece)
            continue
        # A regular word ends the current run of fragments.
        if pending:
            words.append("".join(pending))
            pending = []
        words.append(piece)
    if pending:
        words.append("".join(pending))
    return " ".join(words)


def normalize_text(text: str) -> str:
    """Normalize every line of *text* and return the cleaned document.

    Each line has its trailing whitespace removed.  Blank lines are kept
    as empty lines, page-marker lines of the form ``-- N of M --`` are
    kept verbatim, and every other line is passed through
    ``normalize_line``.  Leading and trailing whitespace of the whole
    result is stripped, and exactly one newline is appended.
    """

    def convert(raw_line: str) -> str:
        trimmed = raw_line.rstrip()
        # Blank lines and page markers bypass the letter-joining pass.
        if not trimmed or re.match(r"^-- \d+ of \d+ --$", trimmed):
            return trimmed
        return normalize_line(trimmed)

    body = "\n".join(convert(raw_line) for raw_line in text.splitlines())
    return body.strip() + "\n"


if __name__ == "__main__":
    # Filter mode: read all of stdin, normalize it, and write the result
    # to stdout in a single call.
    sys.stdout.write(normalize_text(sys.stdin.read()))