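# NOTE: the following three lines are repository-page residue, kept as comments
# so that this module remains valid Python.
# plseng's picture
# Create utils.py
# d9b2509 verified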
# ======================================================
# Khmer decoding (space insertion)
# ======================================================
# Coeng: the invisible sign that turns the FOLLOWING consonant into a subscript.
_KHMER_COENG = "\u17d2"

# Code points that must stay attached to the character before them, so a space
# is never inserted directly in front of one.
#
# Every entry is a single code point written as a \u escape.  Raw Khmer
# literals are fragile here: if the file is ever re-saved with the wrong
# encoding each one turns into a multi-character string that can never equal
# a single character of the input, silently disabling the check.
_KHMER_COMBINING = {
    _KHMER_COENG,  # coeng (subscript marker)
    # Signs: bantoc, yuukaleapintu, kakabat, ahsda, samyok sannya, robat,
    # toandakhiat, viriam, bathamasat.
    "\u17cb", "\u17c8", "\u17ce", "\u17cf", "\u17d0", "\u17cc", "\u17cd",
    "\u17d1", "\u17d3",
    # Register shifters (muusikatoan, triisap) and atthacan: combining marks
    # as well, so they get the same protection.
    "\u17c9", "\u17ca", "\u17dd",
    # Trailing punctuation that is written tight against the preceding word:
    # bariyoosan, khan, beyyal.
    "\u17d5", "\u17d4", "\u17d8",
    # Dependent vowels AA .. AU (U+17B6 - U+17C5).
    "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb", "\u17bc",
    "\u17bd", "\u17be", "\u17bf", "\u17c0", "\u17c1", "\u17c2", "\u17c3",
    "\u17c4", "\u17c5",
    # Nikahit and reahmuk.
    "\u17c6", "\u17c7",
}


def decode_spaces(text: str, pred_labels: list[int]) -> str:
    """Insert spaces into *text* according to per-character labels.

    ``pred_labels[i] == 1`` requests a space AFTER ``text[i]``; any other
    value requests nothing.  A requested space is dropped when it would split
    a Khmer grapheme cluster, i.e. when the next character is listed in
    ``_KHMER_COMBINING`` or when the current character is the coeng (whose
    subscript consonant must follow immediately).

    Args:
        text: The unsegmented input string.
        pred_labels: One label per character of ``text``.

    Returns:
        The text with spaces inserted, stripped of leading and trailing
        whitespace.  Only the first ``min(len(text), len(pred_labels))``
        characters are emitted; characters without a label are dropped.
    """
    out: list[str] = []
    n = min(len(text), len(pred_labels))
    # zip() stops at the shorter input, so exactly n characters are visited.
    for i, (ch, label) in enumerate(zip(text, pred_labels)):
        out.append(ch)
        if label != 1:
            continue
        # A space right after the coeng would detach the subscript consonant.
        if ch == _KHMER_COENG:
            continue
        # A space right before a combining mark would orphan that mark.
        if i + 1 < n and text[i + 1] in _KHMER_COMBINING:
            continue
        out.append(" ")
    return "".join(out).strip()