vivamais / scripts /normalize_pdf_text.py
marinarosa's picture
Make Q&A usable during processing; stream OCR per image
19048ed
#!/usr/bin/env python3
"""Normalize spaced-letter PDF text extraction to readable plain text."""
import re
import sys
def normalize_line(line: str) -> str:
    """Collapse space-separated fragments of one line back into words.

    The line is split on single spaces and empty pieces (produced by runs
    of spaces) are ignored. Consecutive "fragments" are glued together
    without a separator; every other token is kept as its own word. A
    fragment is either a single character, or a token of at most four
    characters made up only of digits and the characters ``./:-,º°`` that
    is not purely alphabetic.

    Args:
        line: A single line of extracted text.

    Returns:
        The rebuilt line, with words separated by exactly one space.
    """
    joinable_marks = "./:-,º°"

    def is_fragment(piece: str) -> bool:
        # Any lone character is treated as part of a spaced-out word.
        if len(piece) == 1:
            return True
        # Longer pieces qualify only when short and numeric/punctuation-like;
        # the isalpha() check rejects pieces made solely of "º", which
        # counts as a letter.
        if len(piece) > 4 or piece.isalpha():
            return False
        return all(ch.isdigit() or ch in joinable_marks for ch in piece)

    words: list[str] = []
    run: list[str] = []
    for piece in filter(None, line.split(" ")):
        if is_fragment(piece):
            run.append(piece)
            continue
        # A regular token ends the current run of fragments, if any.
        if run:
            words.append("".join(run))
            run = []
        words.append(piece)
    if run:
        words.append("".join(run))
    return " ".join(words)
def normalize_text(text: str) -> str:
    """Normalize every line of an extracted document.

    Trailing whitespace is removed from each line. Blank lines are kept as
    empty lines, page-marker lines of the form ``-- N of M --`` pass
    through unchanged, and all remaining lines are rebuilt by
    ``normalize_line``.

    Args:
        text: The full extracted text, possibly spanning many lines.

    Returns:
        The normalized text with surrounding whitespace stripped and
        exactly one trailing newline.
    """

    def convert(raw_line: str) -> str:
        trimmed = raw_line.rstrip()
        # Blank lines and page markers are emitted as they are.
        if not trimmed or re.match(r"^-- \d+ of \d+ --$", trimmed):
            return trimmed
        return normalize_line(trimmed)

    body = "\n".join(convert(raw_line) for raw_line in text.splitlines())
    return body.strip() + "\n"
if __name__ == "__main__":
    # Act as a filter: read all of stdin, write the normalized text to stdout.
    sys.stdout.write(normalize_text(sys.stdin.read()))