Spaces:

build-small-hackathon
/

vivamais

Running on Zero

vivamais / scripts /normalize_pdf_text.py

Make Q&A usable during processing; stream OCR per image

19048ed 2 days ago

1.17 kB

	#!/usr/bin/env python3
	"""Normalize spaced-letter PDF text extraction to readable plain text."""

	import re
	import sys


	def normalize_line(line: str) -> str:
	    """Collapse letter-spaced fragments of one line into whole words.

	    The line is split on single spaces and empty pieces (produced by runs
	    of consecutive spaces) are ignored. A piece counts as a *fragment* when
	    it is a single character, or when it is at most four characters long,
	    made up only of digits and the characters ``./:-,º°``, and is not
	    purely alphabetic. Neighbouring fragments are concatenated with no
	    separator; every other piece is kept as its own word.

	    Note that because empty pieces are ignored, fragments separated by
	    several spaces are still merged together.

	    Args:
	        line: A single line of extracted text (no newline handling is done).

	    Returns:
	        The rebuilt line, with words separated by exactly one space.
	    """
	    glue_chars = "./:-,º°"

	    def is_fragment(piece: str) -> bool:
	        # Any single character is always treated as part of a spaced-out word.
	        if len(piece) == 1:
	            return True
	        # Longer pieces only qualify when short and numeric/punctuation-like.
	        if len(piece) > 4 or piece.isalpha():
	            return False
	        return all(ch.isdigit() or ch in glue_chars for ch in piece)

	    words: list[str] = []
	    joining = False  # True while the last entry of `words` is an open fragment run
	    for piece in line.split(" "):
	        if piece == "":
	            # Extra spaces neither add a word nor close the current run.
	            continue
	        fragment = is_fragment(piece)
	        if fragment and joining:
	            words[-1] += piece
	        else:
	            words.append(piece)
	        joining = fragment
	    return " ".join(words)


	def normalize_text(text: str) -> str:
	    """Normalize every line of an extracted text and return the result.

	    Each line has trailing whitespace removed. Lines that are then empty,
	    and page-marker lines of the form ``-- <digits> of <digits> --``, are
	    kept as they are; all other lines are passed through
	    ``normalize_line``.

	    Args:
	        text: The full extracted text, possibly spanning many lines.

	    Returns:
	        The processed lines joined with newlines, with leading and trailing
	        whitespace of the whole text stripped and exactly one newline
	        appended (so empty input yields ``"\\n"``).
	    """
	    page_marker = re.compile(r"^-- \d+ of \d+ --$")

	    def convert(raw_line: str) -> str:
	        trimmed = raw_line.rstrip()
	        # Blank lines and page markers pass through untouched.
	        if not trimmed or page_marker.match(trimmed):
	            return trimmed
	        return normalize_line(trimmed)

	    body = "\n".join(convert(raw_line) for raw_line in text.splitlines())
	    return body.strip() + "\n"


	if __name__ == "__main__":
	    # Filter mode: read all of standard input, normalize it, and write the
	    # result to standard output.
	    sys.stdout.write(normalize_text(sys.stdin.read()))