Spaces:

plseng
/

khmer-space-injector

Sleeping

Create utils.py

d9b2509 verified about 2 months ago

1.04 kB

	# ======================================================
	# Khmer decoding (space insertion)
	# ======================================================

# U+17D2 KHMER SIGN COENG. It turns the consonant that FOLLOWS it into a
# subscript, so it is glued to both of its neighbours.
_KHMER_COENG = "\u17d2"

# Code points that must stay attached to the character before them; a space is
# never inserted directly in front of any of these.
_KHMER_COMBINING = {
    "្",  # coeng (subscript marker)
    "\u17c9",  # muusikatoan (consonant shifter)
    "\u17ca",  # triisap (consonant shifter)
    "\u17dd",  # atthacan
    # NOTE(review): "៕", "។" and "៘" (U+17D5, U+17D4, U+17D8) are punctuation,
    # not combining marks. They are kept so that no space is inserted in front
    # of them -- confirm this is intended.
    "់", "ៈ", "៎", "៏", "័", "៌", "៍", "៑", "៓", "៕", "។", "៘",
    # Dependent vowels.
    "ា", "ិ", "ី", "ឹ", "ឺ", "ុ", "ូ", "ួ", "ើ", "ឿ", "ៀ", "េ", "ែ", "ៃ", "ោ", "ៅ",
    "ំ", "ះ",
}


def decode_spaces(text: str, pred_labels: list[int]) -> str:
    """Insert spaces into ``text`` at the positions marked by ``pred_labels``.

    ``pred_labels[i] == 1`` requests a space AFTER ``text[i]``; any other
    value requests nothing. A request is ignored when honouring it would
    split a Khmer grapheme cluster, i.e. when

    * the next code point is in ``_KHMER_COMBINING``, or
    * the current code point is the coeng (U+17D2), whose following
      consonant is rendered as its subscript.

    Only the first ``min(len(text), len(pred_labels))`` characters are
    processed; characters of ``text`` beyond the last label are not part of
    the result. Leading and trailing whitespace is stripped from the result.

    Args:
        text: The unsegmented input string.
        pred_labels: One label per character of ``text``.

    Returns:
        ``text`` (truncated to the labelled length) with the requested
        spaces inserted.
    """
    out: list[str] = []
    n = min(len(text), len(pred_labels))

    # zip() stops at the shorter input, so exactly ``n`` characters are visited.
    for i, (ch, label) in enumerate(zip(text, pred_labels)):
        out.append(ch)

        if label != 1:
            continue

        # A space after the coeng would detach it from the consonant it
        # subscripts.
        if ch == _KHMER_COENG:
            continue

        # If the next code point is combining, don't insert a space.
        if i + 1 < n and text[i + 1] in _KHMER_COMBINING:
            continue

        out.append(" ")

    return "".join(out).strip()