Spaces:
Running
Running
File size: 7,247 Bytes
"""
korean_rules.py
Pure-Python, deterministic Korean grammar rule engine.
No ML: uses Unicode Hangul syllable decomposition to detect batchim
(the final consonant of a syllable).
"""
# Code point of the first precomposed Hangul syllable, '가'.
HANGUL_BASE = 0xAC00
# Code points spanned by one initial consonant: 21 medials x 28 finals = 588.
INITIAL_COUNT = 21 * 28
# Code points spanned by one medial vowel (one per entry in FINALS).
MEDIAL_COUNT = 28

# Final-consonant (batchim) table indexed by `(code point - HANGUL_BASE) % 28`.
# Index 0 is None: the syllable has no final consonant.
FINALS = [None] + list(
    "ㄱㄲㄳㄴㄵㄶㄷ"
    "ㄹㄺㄻㄼㄽㄾㄿㅀ"
    "ㅁㅂㅄㅅㅆㅇㅈㅊ"
    "ㅋㅌㅍㅎ"
)

# The final consonant that several rules treat specially.
RIEUL = FINALS[8]
def _last_hangul(word):
    """Return the last precomposed Hangul syllable in *word*, or '' if none.

    Characters outside U+AC00..U+D7A3 (Latin letters, digits, punctuation,
    standalone jamo) are skipped while scanning from the end.
    """
    hangul_from_end = (
        ch for ch in reversed(word) if '\uAC00' <= ch <= '\uD7A3'
    )
    return next(hangul_from_end, '')
def _batchim(syl):
    """Return the final consonant of a Hangul syllable, or None.

    None is returned when *syl* is empty, is not a precomposed Hangul
    syllable (U+AC00..U+D7A3), or has no final consonant.
    """
    is_syllable = bool(syl) and '\uAC00' <= syl <= '\uD7A3'
    if not is_syllable:
        return None
    # Each medial vowel spans 28 consecutive code points, one per FINALS entry.
    offset = ord(syl) - HANGUL_BASE
    return FINALS[offset % 28]
def has_batchim(word):
    """Return True if the last Hangul syllable of *word* has a final consonant.

    Words containing no Hangul syllable yield False.
    """
    final = _batchim(_last_hangul(word))
    if final is None:
        return False
    return True
def has_batchim_no_rieul(word):
    """Return True if *word* ends in a final consonant other than ㄹ.

    Both vowel-final words and ㄹ-final words yield False.
    """
    final = _batchim(_last_hangul(word))
    return final not in (None, RIEUL)
class KoreanRuleEngine:
    """Deterministic rule engine for Korean particles and a few verb endings.

    Every decision is made from the final consonant (batchim) or the vowel
    of the last Hangul syllable of the input, obtained by arithmetic on the
    precomposed-syllable code point.

    Limitations: endings are concatenated without vowel contraction (the
    past statement of '가' is produced as '가았다고'), and apart from
    ㄹ-final stems no irregular conjugation class is handled.
    """

    # Medial-vowel indices (ㅏ, ㅑ, ㅗ) whose stems take 아/았 rather than 어/었.
    _A_HARMONY_MEDIALS = (0, 2, 8)

    # ── Particles ─────────────────────────────────────────────────────────────
    # 은/는, 이/가, 을/를 and the copula depend only on whether the noun ends in
    # a consonant. ㄹ is NOT an exception here ('물은', '물이', '물을').

    def get_topic_marker(self, noun):
        """Return '은' after a final consonant, otherwise '는'."""
        return '은' if has_batchim(noun) else '는'

    def get_subject_marker(self, noun):
        """Return '이' after a final consonant, otherwise '가'."""
        return '이' if has_batchim(noun) else '가'

    def get_object_marker(self, noun):
        """Return '을' after a final consonant, otherwise '를'."""
        return '을' if has_batchim(noun) else '를'

    def get_copula(self, noun):
        """Return '이에요' after a final consonant, otherwise '예요'."""
        return '이에요' if has_batchim(noun) else '예요'

    def get_negative_marker(self, noun):
        """Full negative copula: '이 아니에요' or '가 아니에요'."""
        return f'{self.get_subject_marker(noun)} 아니에요'

    def attach_topic_marker(self, noun):
        """Return *noun* with its topic marker appended."""
        return noun + self.get_topic_marker(noun)

    def attach_subject_marker(self, noun):
        """Return *noun* with its subject marker appended."""
        return noun + self.get_subject_marker(noun)

    def attach_object_marker(self, noun):
        """Return *noun* with its object marker appended."""
        return noun + self.get_object_marker(noun)

    def attach_copula(self, noun):
        """Return *noun* with the polite copula appended."""
        return noun + self.get_copula(noun)

    def attach_negative_copula(self, noun):
        """Return *noun* with the full negative copula appended."""
        return noun + self.get_negative_marker(noun)

    # ── Indirect quotation ────────────────────────────────────────────────────

    def conjugate_indirect_quote(self, verb_stem, form, tense='present',
                                 is_adjective=False):
        """Build the indirect-quotation form of *verb_stem*.

        form: statement | command | neg_command |
              request_me | request_other | question | suggestion
        tense: past | present | future (only consulted for 'statement')
        is_adjective: True for adjectives/있다/없다 — uses plain +다고 in present

        An unrecognised *form* falls back to stem + '다고'.
        ㄹ-final stems behave like vowel stems before these endings:
        '만들' → '만든다고', '만들 거라고', '만들라고'.
        """
        s = verb_stem
        if form == 'statement':
            if tense == 'past':
                return s + ('았' if self._needs_a(s) else '었') + '다고'
            if tense == 'future':
                return self._prospective_modifier(s) + ' 거라고'
            # present
            if is_adjective:
                return s + '다고'  # adjective/있다/없다: stem+다고
            if has_batchim_no_rieul(s):
                return s + '는다고'
            # Vowel stem, or ㄹ stem whose ㄹ drops before the ㄴ final.
            return self._attach_batchim(self._drop_rieul(s), 'ㄴ') + '다고'
        if form == 'command':
            return s + ('으라고' if has_batchim_no_rieul(s) else '라고')
        if form == 'neg_command':
            return s + '지 말라고'
        if form == 'request_me':
            return s + self._vowel(s) + ' 달라고'
        if form == 'request_other':
            return s + self._vowel(s) + ' 주라고'
        if form == 'question':
            return self._drop_rieul(s) + '냐고'
        if form == 'suggestion':
            return s + '자고'
        return s + '다고'

    def conjugate_regret(self, verb_stem, negative=False):
        """Return the '-(으)ㄹ 걸 그랬다' form, or '-지 말 걸 그랬다' if negative."""
        if negative:
            return verb_stem + '지 말 걸 그랬다'
        return self._prospective_modifier(verb_stem) + ' 걸 그랬다'

    # ── Validation ────────────────────────────────────────────────────────────

    def validate_token_order(self, submitted, correct):
        """Return True if both token sequences are equal element by element."""
        return list(submitted) == list(correct)

    def validate_particle(self, word, chosen, ptype):
        """Return True if *chosen* is the correct *ptype* particle for *word*.

        ptype: topic | subject | object | copula | negative.
        An unknown *ptype* always yields False.
        """
        fn = self._particle_getter(ptype)
        return fn is not None and chosen == fn(word)

    def get_hint(self, word, ptype):
        """Return a one-line explanation of which particle *word* takes.

        An unknown *ptype* is reported with '?' as the answer.
        """
        b = _batchim(_last_hangul(word))
        desc = f"ends with consonant '{b}'" if b else "ends with a vowel sound"
        fn = self._particle_getter(ptype)
        ans = fn(word) if fn is not None else '?'
        return f"'{word}' {desc} → use {ans}"

    # ── Internal ──────────────────────────────────────────────────────────────

    def _particle_getter(self, ptype):
        """Map a particle-type name to its getter method, or None if unknown."""
        return {
            'topic': self.get_topic_marker,
            'subject': self.get_subject_marker,
            'object': self.get_object_marker,
            'copula': self.get_copula,
            'negative': self.get_negative_marker,
        }.get(ptype)

    def _needs_a(self, stem):
        """Return True if the stem's last vowel selects 아/았 (ㅏ, ㅑ or ㅗ)."""
        syl = _last_hangul(stem)
        if not syl:
            return False
        medial = ((ord(syl) - HANGUL_BASE) // 28) % 21
        return medial in self._A_HARMONY_MEDIALS

    def _vowel(self, stem):
        """Return the connective vowel '아' or '어' for *stem*."""
        return '아' if self._needs_a(stem) else '어'

    def _prospective_modifier(self, stem):
        """Return *stem* in its -(으)ㄹ modifier form.

        Vowel stem: ㄹ is merged into the last syllable ('가' → '갈').
        ㄹ stem: unchanged ('만들' → '만들').
        Other consonant stem: '을' is appended ('먹' → '먹을').
        """
        final = _batchim(_last_hangul(stem))
        if final is None:
            return self._attach_batchim(stem, RIEUL)
        if final == RIEUL:
            return stem
        return stem + '을'

    @staticmethod
    def _replace_last_hangul(text, new_syl):
        """Replace the last Hangul syllable of *text* with *new_syl*.

        The syllable is located by position, so trailing non-Hangul
        characters are preserved. *text* is returned unchanged if it
        contains no Hangul syllable.
        """
        for i in range(len(text) - 1, -1, -1):
            if '\uAC00' <= text[i] <= '\uD7A3':
                return text[:i] + new_syl + text[i + 1:]
        return text

    def _drop_rieul(self, stem):
        """Remove a ㄹ final from the last syllable of *stem* ('만들' → '만드').

        Stems whose last Hangul syllable does not end in ㄹ are returned as-is.
        """
        syl = _last_hangul(stem)
        if not syl or _batchim(syl) != RIEUL:
            return stem
        code = ord(syl) - HANGUL_BASE
        # Rounding down to a multiple of 28 yields the same syllable
        # without a final consonant.
        bare = chr(HANGUL_BASE + code - code % 28)
        return self._replace_last_hangul(stem, bare)

    def _attach_batchim(self, stem, jamo):
        """
        Attach a jamo final consonant to the last syllable of stem.
        e.g. '가' + 'ㄴ' → '간', '가' + 'ㄹ' → '갈'
        Only works when last syllable has no existing batchim.

        If *jamo* is not a valid final consonant, or the stem has no Hangul
        syllable or already ends in a batchim, *jamo* is simply appended.
        """
        syl = _last_hangul(stem)
        mergeable = (
            jamo in FINALS[1:]
            and bool(syl)
            and _batchim(syl) is None
        )
        if not mergeable:
            return stem + jamo  # fallback: just concatenate
        # The syllable has final index 0, so adding the final's index to
        # its code point selects the same syllable with that batchim.
        new_syl = chr(ord(syl) + FINALS.index(jamo))
        return self._replace_last_hangul(stem, new_syl)
# Shared module-level engine instance.
rule_engine = KoreanRuleEngine()