File size: 1,043 Bytes
d9b2509
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# ======================================================
# Khmer decoding (space insertion)
# ======================================================
# Khmer code points that must stay attached to the preceding character, so no
# space may be inserted directly in front of them.
#
# Every entry is written as a ``\u`` escape and is exactly ONE code point:
# membership is tested against single characters of the input, so an entry that
# has been mis-encoded into a multi-character string silently never matches.
_KHMER_COMBINING = {
    "\u17d2",  # coeng (subscript marker)
    # Signs / diacritics: bantoc, yuukaleapintu, kakabat, ahsda, samyok sannya,
    # robat, toandakhiat, viriam, bathamasat.
    "\u17cb", "\u17c8", "\u17ce", "\u17cf", "\u17d0", "\u17cc", "\u17cd", "\u17d1", "\u17d3",
    # Consonant shifters (muusikatoan, triisap) and atthacan: combining marks
    # that attach to the preceding consonant just like the signs above.
    "\u17c9", "\u17ca", "\u17dd",
    # Punctuation that is written without a preceding space: bariyoosan, khan,
    # beyyal.
    "\u17d5", "\u17d4", "\u17d8",
    # Dependent vowels U+17B6 (aa) .. U+17C5 (au).
    "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb", "\u17bc", "\u17bd",
    "\u17be", "\u17bf", "\u17c0", "\u17c1", "\u17c2", "\u17c3", "\u17c4", "\u17c5",
    # nikahit, reahmuk
    "\u17c6", "\u17c7",
}


# Coeng (U+17D2) turns the consonant that FOLLOWS it into a subscript of the
# current cluster, so a space must never be placed directly after it.
_KHMER_COENG = "\u17d2"


def decode_spaces(text: str, pred_labels: list[int]) -> str:
    """Insert spaces into *text* according to per-character labels.

    ``pred_labels[i] == 1`` requests a space AFTER ``text[i]``; any other
    value requests nothing. A requested space is suppressed when it would
    split a Khmer grapheme cluster:

    * the current character is the coeng, whose subscript consonant follows;
    * the next character is listed in ``_KHMER_COMBINING``.

    Args:
        text: The unsegmented input string.
        pred_labels: One label per character of ``text``.

    Returns:
        The text with spaces inserted, stripped of leading and trailing
        whitespace. Only the first ``min(len(text), len(pred_labels))``
        characters are processed; anything beyond that is not emitted.
    """
    limit = min(len(text), len(pred_labels))
    pieces: list[str] = []

    # zip() stops at the shorter input, i.e. after exactly ``limit`` items.
    for i, (ch, label) in enumerate(zip(text, pred_labels)):
        pieces.append(ch)

        if label != 1:
            continue

        # A break here would detach the subscript consonant from its coeng.
        if ch == _KHMER_COENG:
            continue

        # A break here would detach a dependent mark from its base character.
        if i + 1 < limit and text[i + 1] in _KHMER_COMBINING:
            continue

        pieces.append(" ")

    return "".join(pieces).strip()