Spaces:
Sleeping
Sleeping
File size: 1,043 Bytes
# ======================================================
# Khmer decoding (space insertion)
# ======================================================
# Khmer code points that attach to the preceding base character; a space must
# never be inserted directly in front of any of them. They are spelled as \u
# escapes so the table cannot be corrupted by a wrong source-file encoding.
# NOTE(review): U+17B6..U+17BF were recoverable from the mis-decoded original;
# the remaining entries were reconstructed from their position and count --
# confirm against the Unicode Khmer block chart.
_KHMER_COMBINING = {
    "\u17d2",  # coeng (subscript marker)
    # Signs U+17C6..U+17D1 (nikahit .. viriam)
    "\u17c6", "\u17c7", "\u17c8", "\u17c9", "\u17ca", "\u17cb",
    "\u17cc", "\u17cd", "\u17ce", "\u17cf", "\u17d0", "\u17d1",
    # Dependent vowels U+17B6..U+17C5
    "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb",
    "\u17bc", "\u17bd", "\u17be", "\u17bf", "\u17c0", "\u17c1",
    "\u17c2", "\u17c3", "\u17c4", "\u17c5",
    # Bathamasat and atthacan
    "\u17d3", "\u17dd",
}


def decode_spaces(text: str, pred_labels: list[int]) -> str:
    """Insert spaces into *text* according to per-character labels.

    A label of ``1`` means "insert a space AFTER this character"; any other
    label leaves the character as is. A space is never inserted directly
    before a Khmer combining mark (a member of ``_KHMER_COMBINING``), because
    that would split a grapheme cluster.

    Args:
        text: The unsegmented input string.
        pred_labels: Labels aligned with the characters of ``text``. Surplus
            labels are ignored. If there are fewer labels than characters,
            the unlabelled tail of ``text`` is copied through unchanged
            rather than dropped.

    Returns:
        ``text`` with the predicted spaces inserted and leading/trailing
        whitespace stripped.
    """
    pieces: list[str] = []
    last_index = len(text) - 1
    for i, (ch, label) in enumerate(zip(text, pred_labels)):
        pieces.append(ch)
        if label != 1:
            continue
        # Look at the real next code point of the text (which may lie in the
        # unlabelled tail): a combining mark must stay glued to its base.
        if i < last_index and text[i + 1] in _KHMER_COMBINING:
            continue
        pieces.append(" ")
    # Keep whatever the labels did not cover; this is "" when the labels are
    # at least as long as the text.
    pieces.append(text[len(pred_labels):])
    return "".join(pieces).strip()
|