KoreAI-API

Running

File size: 7,247 Bytes

b01dc2a

"""
korean_rules.py
Pure-Python, deterministic Korean grammar rule engine.
No ML. Uses Unicode Hangul decomposition for batchim detection.
"""

# Code point of '가', the first precomposed Hangul syllable.
HANGUL_BASE = 0xAC00
# Code points spanned by one medial vowel (27 finals plus "no final").
MEDIAL_COUNT = 28
# Code points spanned by one initial consonant: 21 medials x 28 finals = 588.
INITIAL_COUNT = 21 * MEDIAL_COUNT

# Final consonants (batchim) indexed by ``(code point - HANGUL_BASE) % 28``.
# Index 0 means the syllable has no final consonant.
FINALS = [None] + list(
    'ㄱㄲㄳㄴㄵㄶㄷ'
    'ㄹㄺㄻㄼㄽㄾㄿㅀ'
    'ㅁㅂㅄㅅㅆㅇㅈㅊ'
    'ㅋㅌㅍㅎ'
)
RIEUL = 'ㄹ'


def _last_hangul(word):
    for ch in reversed(word):
        if '\uAC00' <= ch <= '\uD7A3':
            return ch
    return ''

def _batchim(syl):
    """Return the final consonant of the syllable *syl*, or None.

    None is returned for an empty value, for anything outside the
    precomposed Hangul range, and for syllables without a final consonant.
    """
    if syl and '\uAC00' <= syl <= '\uD7A3':
        offset = ord(syl) - HANGUL_BASE
        # The final-consonant index is the offset modulo 28.
        return FINALS[offset % 28]
    return None

def has_batchim(word):
    """Return True when the last Hangul syllable of *word* ends in a consonant."""
    final = _batchim(_last_hangul(word))
    return final is not None

def has_batchim_no_rieul(word):
    """Return True when *word* ends in a final consonant other than ㄹ."""
    final = _batchim(_last_hangul(word))
    if final is None:
        return False
    return final != RIEUL


class KoreanRuleEngine:
    """Rule-based helper for Korean particles and quotation endings.

    Every decision is derived from the last Hangul syllable of the word or
    stem passed in. No dictionary is consulted, so irregular stems
    (ㅂ/ㄷ/ㅅ/르/ㅡ irregulars, 푸다, ...) are conjugated as if regular.
    """

    # Medial-vowel indices (Unicode order) that absorb a following 아/어:
    # ㅏ, ㅐ, ㅓ, ㅔ, ㅕ  (가+아 → 가, 서+어 → 서, 켜+어 → 켜).
    _ABSORBED_MEDIALS = frozenset({0, 1, 4, 5, 6})
    # Medial-vowel indices that fuse with 아/어 into a new vowel:
    # ㅗ → ㅘ, ㅜ → ㅝ, ㅣ → ㅕ  (오+아 → 와, 주+어 → 줘, 마시+어 → 마셔).
    _CONTRACTED_MEDIALS = {8: 9, 13: 14, 20: 6}

    # ── Particles ─────────────────────────────────────────────────────────────

    def get_topic_marker(self, noun):
        """Return '은' after any final consonant (ㄹ included), else '는'."""
        return '은' if has_batchim(noun) else '는'

    def get_subject_marker(self, noun):
        """Return '이' after any final consonant (ㄹ included), else '가'."""
        return '이' if has_batchim(noun) else '가'

    def get_object_marker(self, noun):
        """Return '을' after a final consonant, else '를'."""
        return '을' if has_batchim(noun) else '를'

    def get_copula(self, noun):
        """Return '이에요' after a final consonant, else '예요'."""
        return '이에요' if has_batchim(noun) else '예요'

    def get_negative_marker(self, noun):
        """Full negative copula: '이 아니에요' or '가 아니에요'."""
        return f'{self.get_subject_marker(noun)} 아니에요'

    def attach_topic_marker(self, noun):
        """Return *noun* followed by its topic marker."""
        return noun + self.get_topic_marker(noun)

    def attach_subject_marker(self, noun):
        """Return *noun* followed by its subject marker."""
        return noun + self.get_subject_marker(noun)

    def attach_object_marker(self, noun):
        """Return *noun* followed by its object marker."""
        return noun + self.get_object_marker(noun)

    def attach_copula(self, noun):
        """Return *noun* followed by the polite copula."""
        return noun + self.get_copula(noun)

    def attach_negative_copula(self, noun):
        """Return *noun* followed by the negative copula phrase."""
        return noun + self.get_negative_marker(noun)

    # ── Indirect quotation ────────────────────────────────────────────────────

    def conjugate_indirect_quote(self, verb_stem, form, tense='present',
                                  is_adjective=False):
        """
        Build the indirect-quotation form of *verb_stem*.

        form: statement | command | neg_command |
              request_me | request_other | question | suggestion
              (any other value falls back to stem + '다고')
        tense: past | present | future (only used for 'statement';
               any other value is treated as present)
        is_adjective: True for adjectives/있다/없다 — uses plain +다고 in present

        ㄹ-final stems follow the ㄹ-dropping pattern (만들 → 만든다고,
        만들 거라고, 만들라고) and open stems are contracted with 아/어
        (가 → 갔다고, 하 → 했다고, 오 → 왔다고).
        """
        s = verb_stem
        if form == 'statement':
            if tense == 'past':
                # stem + 아/어 (contracted) always ends in an open syllable,
                # so the past marker is just a ㅆ final on that syllable.
                return self._attach_batchim(self._join_vowel(s), 'ㅆ') + '다고'
            if tense == 'future':
                return self._prospective(s) + ' 거라고'
            # present
            if is_adjective:
                return s + '다고'   # adjective/있다/없다: stem+다고
            final = _batchim(_last_hangul(s))
            if final is None:
                return self._attach_batchim(s, 'ㄴ') + '다고'
            if final == RIEUL:
                # ㄹ drops before ㄴ: 만들 → 만드 → 만든다고
                return self._attach_batchim(self._drop_rieul(s), 'ㄴ') + '다고'
            return s + '는다고'
        if form == 'command':
            # Linking 으 is only inserted after a consonant other than ㄹ.
            return s + ('으라고' if has_batchim_no_rieul(s) else '라고')
        if form == 'neg_command':
            return s + '지 말라고'
        if form == 'request_me':
            return self._join_vowel(s) + ' 달라고'
        if form == 'request_other':
            return self._join_vowel(s) + ' 주라고'
        if form == 'question':
            return self._drop_rieul(s) + '냐고'
        if form == 'suggestion':
            return s + '자고'
        return s + '다고'

    def conjugate_regret(self, verb_stem, negative=False):
        """Return the '-(으)ㄹ 걸 그랬다' regret form (or '-지 말 걸 그랬다')."""
        if negative:
            return verb_stem + '지 말 걸 그랬다'
        return self._prospective(verb_stem) + ' 걸 그랬다'

    # ── Validation ────────────────────────────────────────────────────────────

    def validate_token_order(self, submitted, correct):
        """Return True when both token sequences are equal element by element."""
        return list(submitted) == list(correct)

    def validate_particle(self, word, chosen, ptype):
        """Return True when *chosen* is the form this engine yields for *word*.

        ptype: topic | subject | object | copula | negative.
        An unknown ptype is never valid.
        """
        fn = self._particle_rule(ptype)
        return fn is not None and chosen == fn(word)

    def get_hint(self, word, ptype):
        """Return an English hint naming the word's ending and the right form.

        An unknown ptype yields '?' as the suggested form.
        """
        b = _batchim(_last_hangul(word))
        desc = f"ends with consonant '{b}'" if b else "ends with a vowel sound"
        # Reuse the same rules as validate_particle so hint and validation
        # can never disagree.
        fn = self._particle_rule(ptype)
        ans = fn(word) if fn is not None else '?'
        return f"'{word}' {desc} → use {ans}"

    # ── Internal ──────────────────────────────────────────────────────────────

    def _particle_rule(self, ptype):
        """Map a particle type name to its getter, or None if unknown."""
        return {
            'topic':    self.get_topic_marker,
            'subject':  self.get_subject_marker,
            'object':   self.get_object_marker,
            'copula':   self.get_copula,
            'negative': self.get_negative_marker,
        }.get(ptype)

    def _needs_a(self, stem):
        """Return True when the last syllable's vowel is ㅏ or ㅗ (takes 아)."""
        syl = _last_hangul(stem)
        if not syl:
            return False
        medial = ((ord(syl) - HANGUL_BASE) // 28) % 21
        return medial in (0, 8)

    def _vowel(self, stem):
        """Return the linking vowel '아' or '어' selected by the stem."""
        return '아' if self._needs_a(stem) else '어'

    def _join_vowel(self, stem):
        """Return stem + 아/어 with standard contraction on open syllables.

        e.g. '먹' → '먹어', '가' → '가', '하' → '해', '오' → '와',
             '주' → '줘', '마시' → '마셔', '쉬' → '쉬어'
        Contraction is only attempted when the stem's final character is an
        open Hangul syllable; otherwise the vowel is simply appended.
        """
        last = stem[-1:]
        if (not last or _last_hangul(last) != last
                or _batchim(last) is not None):
            return stem + self._vowel(stem)
        if last == '하':
            return stem[:-1] + '해'
        medial = ((ord(last) - HANGUL_BASE) // 28) % 21
        if medial in self._ABSORBED_MEDIALS:
            return stem
        fused = self._CONTRACTED_MEDIALS.get(medial)
        if fused is None:
            return stem + self._vowel(stem)
        # One medial step is 28 code points within the same initial.
        return stem[:-1] + chr(ord(last) + (fused - medial) * 28)

    def _prospective(self, stem):
        """Return stem + -(으)ㄹ: '가' → '갈', '만들' → '만들', '먹' → '먹을'."""
        final = _batchim(_last_hangul(stem))
        if final is None:
            return self._attach_batchim(stem, RIEUL)
        if final == RIEUL:
            return stem   # the stem's own ㄹ already fills the slot
        return stem + '을'

    def _swap_last_hangul(self, stem, old, new):
        """Replace the last occurrence of syllable *old* in *stem* with *new*."""
        pos = stem.rfind(old)
        return stem[:pos] + new + stem[pos + 1:]

    def _drop_rieul(self, stem):
        """Remove a final ㄹ from the last Hangul syllable ('만들' → '만드')."""
        syl = _last_hangul(stem)
        if syl and _batchim(syl) == RIEUL:
            code = ord(syl) - HANGUL_BASE
            bare = chr(HANGUL_BASE + code - code % 28)
            return self._swap_last_hangul(stem, syl, bare)
        return stem

    def _attach_batchim(self, stem, jamo):
        """
        Attach a jamo final consonant to the last syllable of stem.
        e.g. '가' + 'ㄴ' → '간',  '가' + 'ㄹ' → '갈'
        Only works when last syllable has no existing batchim; otherwise
        (or when jamo is not a valid final, or stem has no Hangul) the jamo
        is appended as a separate character.
        """
        final_idx = {f: i for i, f in enumerate(FINALS) if f}
        idx = final_idx.get(jamo)
        if idx is None:
            return stem + jamo   # fallback: just concatenate
        syl = _last_hangul(stem)
        if not syl or _batchim(syl) is not None:
            return stem + jamo   # already has batchim
        # An open syllable has final index 0, so adding idx selects the final.
        return self._swap_last_hangul(stem, syl, chr(ord(syl) + idx))


# Module-level KoreanRuleEngine instance, created once at import time.
rule_engine = KoreanRuleEngine()