Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| """Normalize spaced-letter PDF text extraction to readable plain text.""" | |
| import re | |
| import sys | |
def normalize_line(line: str) -> str:
    """Collapse letter-spaced fragments of one line into readable words.

    The line is split on single spaces. Consecutive "fragment" tokens are
    concatenated into one word; every other token is kept as a word of its
    own. A fragment is either a single character, or a token of at most
    four characters made up only of digits and the punctuation ``./:-,º°``.

    A run of two or more spaces is treated as a word boundary, so
    ``"H e l l o  W o r l d"`` becomes ``"Hello World"`` rather than being
    fused into ``"HelloWorld"``.

    Args:
        line: A single line of extracted text, without its line terminator.

    Returns:
        The rebuilt words of the line joined by single spaces.
    """
    words: list[str] = []
    pending: list[str] = []

    def flush() -> None:
        # Emit the fragments collected so far as one word.
        if pending:
            words.append("".join(pending))
            pending.clear()

    for tok in line.split(" "):
        if not tok:
            # split(" ") yields an empty token for every extra space. In
            # letter-spaced text that wider gap is the only sign of a word
            # boundary, so close the word being assembled.
            flush()
            continue
        is_fragment = len(tok) == 1 or (
            len(tok) <= 4
            and all(c.isdigit() or c in "./:-,º°" for c in tok)
            # "º" is alphabetic for str.isalpha(), so this rejects tokens
            # made only of that character (e.g. "ºº").
            and not tok.isalpha()
        )
        if is_fragment:
            pending.append(tok)
        else:
            # A regular multi-character word ends any run of fragments.
            flush()
            words.append(tok)
    flush()
    return " ".join(words)
def normalize_text(text: str) -> str:
    """Normalize every line of *text* and return the cleaned document.

    Trailing whitespace is removed from each line. Blank lines and page
    markers of the form ``-- <n> of <m> --`` are passed through as they
    are; every other line goes through ``normalize_line``. Leading and
    trailing whitespace of the whole result is stripped and the result
    ends with exactly one newline.
    """
    result: list[str] = []
    for raw_line in text.splitlines():
        content = raw_line.rstrip()
        passthrough = not content or (
            re.match(r"^-- \d+ of \d+ --$", content) is not None
        )
        if passthrough:
            # Blank lines (content == "") and page markers are kept verbatim.
            result.append(content)
        else:
            result.append(normalize_line(content))
    joined = "\n".join(result)
    return f"{joined.strip()}\n"
if __name__ == "__main__":
    # Filter mode: read all of stdin and write the normalized text to stdout.
    sys.stdout.write(normalize_text(sys.stdin.read()))