# KoreAI-API / korean_rules.py
# rairo's picture
# Create korean_rules.py
# b01dc2a verified
"""
korean_rules.py
Pure-Python, deterministic Korean grammar rule engine.
No ML. Uses Unicode Hangul decomposition for batchim detection.
"""
# Code point of the first precomposed Hangul syllable block entry (U+AC00).
HANGUL_BASE = 0xAC00

# NOTE: despite the name, this is the number of syllables sharing one initial
# consonant (21 medials x 28 final slots), i.e. the stride between initials.
INITIAL_COUNT = 588

# NOTE: despite the name, this is the number of final slots per medial vowel;
# it equals len(FINALS).
MEDIAL_COUNT = 28

# Final consonants indexed by their slot within a syllable; slot 0 means
# "no final consonant".
FINALS = [None] + list('ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')

RIEUL = 'ㄹ'
def _last_hangul(word):
for ch in reversed(word):
if '\uAC00' <= ch <= '\uD7A3':
return ch
return ''
def _batchim(syl):
    """Return the final-consonant jamo of a Hangul syllable, or None.

    None is returned when *syl* is empty, lies outside U+AC00..U+D7A3, or
    has no final consonant (slot 0 of FINALS).
    """
    is_syllable = bool(syl) and '\uAC00' <= syl <= '\uD7A3'
    if not is_syllable:
        return None
    # The final slot is the remainder of the syllable offset modulo 28.
    offset = ord(syl) - HANGUL_BASE
    return FINALS[offset % 28]
def has_batchim(word):
    """Return True when the last Hangul syllable of *word* has a final consonant."""
    final = _batchim(_last_hangul(word))
    return final is not None
def has_batchim_no_rieul(word):
    """Return True when the last Hangul syllable of *word* ends in a consonant other than ㄹ."""
    final = _batchim(_last_hangul(word))
    if final is None:
        return False
    return final != RIEUL
class KoreanRuleEngine:
    """Deterministic rules for Korean particles and a few verb endings.

    Every decision is derived from the final consonant (batchim) and medial
    vowel of the last Hangul syllable of the input. No dictionary is
    consulted, so irregular stems (ㅂ/ㄷ/ㅅ/ㅎ/르 irregulars, 푸다) are
    conjugated as if they were regular.
    """

    # Medial-vowel indices follow Unicode order (ㅏ=0 ... ㅣ=20).
    # ㅏ, ㅓ, ㅕ absorb a following 아/어 (가+아 → 가, 켜+어 → 켜).
    _ABSORBING_MEDIALS = (0, 4, 6)
    # ㅗ+아 → ㅘ, ㅜ+어 → ㅝ (오+아 → 와, 배우+어 → 배워).
    _CONTRACTED_MEDIALS = {8: 9, 13: 14}

    # ── Particles ─────────────────────────────────────────────────────────────
    def get_topic_marker(self, noun):
        """Return '은' after a final consonant (ㄹ included), else '는'."""
        # ㄹ behaves like any other consonant here: 물은, 서울은.
        return '은' if has_batchim(noun) else '는'

    def get_subject_marker(self, noun):
        """Return '이' after a final consonant (ㄹ included), else '가'."""
        return '이' if has_batchim(noun) else '가'

    def get_object_marker(self, noun):
        """Return '을' after a final consonant, else '를'."""
        return '을' if has_batchim(noun) else '를'

    def get_copula(self, noun):
        """Return '이에요' after a final consonant, else '예요'."""
        return '이에요' if has_batchim(noun) else '예요'

    def get_negative_marker(self, noun):
        """Full negative copula: '이 아니에요' or '가 아니에요'."""
        return f'{self.get_subject_marker(noun)} 아니에요'

    def attach_topic_marker(self, noun):
        """Return *noun* followed by its topic marker."""
        return noun + self.get_topic_marker(noun)

    def attach_subject_marker(self, noun):
        """Return *noun* followed by its subject marker."""
        return noun + self.get_subject_marker(noun)

    def attach_object_marker(self, noun):
        """Return *noun* followed by its object marker."""
        return noun + self.get_object_marker(noun)

    def attach_copula(self, noun):
        """Return *noun* followed by its polite copula."""
        return noun + self.get_copula(noun)

    def attach_negative_copula(self, noun):
        """Return *noun* followed by the full negative copula."""
        return noun + self.get_negative_marker(noun)

    # ── Indirect quotation ────────────────────────────────────────────────────
    def conjugate_indirect_quote(self, verb_stem, form, tense='present',
                                 is_adjective=False):
        """
        Build the indirect-quotation form of *verb_stem*.

        form: statement | command | neg_command |
              request_me | request_other | question | suggestion
        tense: past | present | future (only consulted for 'statement')
        is_adjective: True for adjectives/있다/없다 — uses plain +다고 in present

        An unknown form falls back to stem + '다고'.
        """
        s = verb_stem
        if form == 'statement':
            if tense == 'past':
                return self._join_a_eo(s, past=True) + '다고'
            if tense == 'future':
                return self._attach_prospective(s) + ' 거라고'
            # present
            if is_adjective:
                return s + '다고'  # adjective/있다/없다: stem+다고
            if has_batchim_no_rieul(s):
                return s + '는다고'
            # Vowel stems take ㄴ directly; ㄹ stems drop ㄹ first (만들 → 만든다고).
            return self._attach_batchim(self._drop_rieul(s), 'ㄴ') + '다고'
        if form == 'command':
            # ㄹ stems take no epenthetic 으: 만들라고.
            return s + ('으라고' if has_batchim_no_rieul(s) else '라고')
        if form == 'neg_command':
            return s + '지 말라고'
        if form == 'request_me':
            return self._join_a_eo(s) + ' 달라고'
        if form == 'request_other':
            return self._join_a_eo(s) + ' 주라고'
        if form == 'question':
            return self._drop_rieul(s) + '냐고'
        if form == 'suggestion':
            return s + '자고'
        return s + '다고'

    def conjugate_regret(self, verb_stem, negative=False):
        """Return the '-(으)ㄹ 걸 그랬다' regret form, or '-지 말 걸 그랬다' when negative."""
        if negative:
            return verb_stem + '지 말 걸 그랬다'
        return self._attach_prospective(verb_stem) + ' 걸 그랬다'

    # ── Validation ────────────────────────────────────────────────────────────
    def validate_token_order(self, submitted, correct):
        """Return True when both token sequences hold equal items in the same order."""
        return list(submitted) == list(correct)

    def validate_particle(self, word, chosen, ptype):
        """Return True when *chosen* is the form this engine produces for *ptype*.

        An unknown *ptype* yields False.
        """
        fn = self._particle_getters().get(ptype)
        return fn is not None and chosen == fn(word)

    def get_hint(self, word, ptype):
        """Describe how *word* ends and which form of *ptype* it takes.

        An unknown *ptype* is reported as '?'.
        """
        b = _batchim(_last_hangul(word))
        desc = f"ends with consonant '{b}'" if b else "ends with a vowel sound"
        fn = self._particle_getters().get(ptype)
        # Reuse the getters so the hint can never disagree with validation.
        ans = fn(word) if fn is not None else '?'
        return f"'{word}' {desc} → use {ans}"

    # ── Internal ──────────────────────────────────────────────────────────────
    def _particle_getters(self):
        """Map each particle type name to the method that selects its form."""
        return {
            'topic': self.get_topic_marker,
            'subject': self.get_subject_marker,
            'object': self.get_object_marker,
            'copula': self.get_copula,
            'negative': self.get_negative_marker,
        }

    def _needs_a(self, stem):
        """Return True when the last Hangul syllable's vowel is ㅏ or ㅗ."""
        syl = _last_hangul(stem)
        if not syl:
            return False
        medial = ((ord(syl) - HANGUL_BASE) // 28) % 21
        return medial in (0, 8)

    def _vowel(self, stem):
        """Return the harmonising connective vowel, '아' or '어'."""
        return '아' if self._needs_a(stem) else '어'

    def _merged_medial(self, stem, syl, medial):
        """Return the medial index after fusing 아/어 into *syl*, or None.

        None means no contraction is applied and the vowel is appended as a
        separate syllable. Stems in ㅣ/ㅐ/ㅔ are left uncontracted on purpose:
        the long form is valid, and contracting ㅣ would corrupt noun+이.
        """
        if syl == '하':
            return 1  # 하 + 여 → 해
        if medial in self._ABSORBING_MEDIALS:
            return medial
        if medial in self._CONTRACTED_MEDIALS:
            return self._CONTRACTED_MEDIALS[medial]
        if medial == 18 and syl != '르':
            # ㅡ drops; harmony follows the preceding syllable
            # (바쁘 → 바빠, 쓰 → 써). 르 stems are mostly irregular, so skip.
            return 0 if self._needs_a(stem[-2:-1]) else 4
        return None

    def _join_a_eo(self, stem, past=False):
        """Attach 아/어 (or past 았/었) to *stem*.

        Vowel-final stems are contracted where the rules above allow
        (가 → 가/갔, 하 → 해/했, 오 → 와/왔); everything else gets the vowel
        appended as its own syllable (먹 → 먹어/먹었).
        """
        syl = _last_hangul(stem)
        if syl and _batchim(syl) is None and stem.endswith(syl):
            initial, medial = divmod((ord(syl) - HANGUL_BASE) // 28, 21)
            merged = self._merged_medial(stem, syl, medial)
            if merged is not None:
                final = FINALS.index('ㅆ') if past else 0
                code = (initial * 21 + merged) * 28 + final
                return stem[:-1] + chr(HANGUL_BASE + code)
        if past:
            return stem + ('았' if self._needs_a(stem) else '었')
        return stem + self._vowel(stem)

    def _attach_prospective(self, stem):
        """Return *stem* with the modifier -(으)ㄹ attached.

        먹 → 먹을, 가 → 갈, 만들 → 만들 (an existing ㄹ is kept as is).
        """
        if has_batchim_no_rieul(stem):
            return stem + '을'
        if has_batchim(stem):
            return stem
        return self._attach_batchim(stem, 'ㄹ')

    def _swap_last_hangul(self, stem, old, new):
        """Replace the last occurrence of syllable *old* in *stem* with *new*."""
        # rfind targets the syllable itself, so trailing non-Hangul
        # characters are preserved rather than overwritten.
        pos = stem.rfind(old)
        return stem[:pos] + new + stem[pos + 1:]

    def _drop_rieul(self, stem):
        """Remove a final ㄹ from the last Hangul syllable (만들 → 만드)."""
        syl = _last_hangul(stem)
        if not syl or _batchim(syl) != RIEUL:
            return stem
        bare = chr(ord(syl) - FINALS.index(RIEUL))
        return self._swap_last_hangul(stem, syl, bare)

    def _attach_batchim(self, stem, jamo):
        """
        Attach a jamo final consonant to the last syllable of stem.
        e.g. '가' + 'ㄴ' → '간', '가' + 'ㄹ' → '갈'
        Falls back to plain concatenation when jamo is not a valid final,
        when stem has no Hangul syllable, or when the last syllable already
        has a batchim.
        """
        try:
            idx = FINALS.index(jamo, 1)  # skip slot 0 (no final)
        except ValueError:
            return stem + jamo  # fallback: just concatenate
        syl = _last_hangul(stem)
        if not syl or _batchim(syl) is not None:
            return stem + jamo  # already has batchim
        # syl has an empty final slot, so adding idx selects the new final.
        return self._swap_last_hangul(stem, syl, chr(ord(syl) + idx))
# Module-level engine instance, created once at import time.
rule_engine = KoreanRuleEngine()