Spaces:
Running
Running
Create korean_rules.py
Browse files- korean_rules.py +188 -0
korean_rules.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
korean_rules.py
|
| 3 |
+
Pure-Python, deterministic Korean grammar rule engine.
|
| 4 |
+
No ML. Uses Unicode Hangul decomposition for batchim detection.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
# Code point of the first precomposed Hangul syllable ('가').
HANGUL_BASE = 0xAC00

# Number of code points covered by one initial consonant:
# 21 medial vowels x 28 final slots.
INITIAL_COUNT = 21 * 28  # 588 per initial

# Number of code points covered by one medial vowel, i.e. the 28 final
# slots (slot 0 = no final consonant).
MEDIAL_COUNT = 28

# Final consonants (batchim) indexed by final slot; slot 0 means "no batchim".
FINALS = [None] + list('ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ')

# The final consonant ㄹ, which several rules treat specially.
RIEUL = 'ㄹ'
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _last_hangul(word):
|
| 21 |
+
for ch in reversed(word):
|
| 22 |
+
if '\uAC00' <= ch <= '\uD7A3':
|
| 23 |
+
return ch
|
| 24 |
+
return ''
|
| 25 |
+
|
| 26 |
+
def _batchim(syl):
    """Return the final consonant (batchim) of the Hangul syllable *syl*.

    Returns None when *syl* is empty, lies outside U+AC00..U+D7A3, or is a
    syllable with no final consonant.
    """
    is_syllable = bool(syl) and '\uAC00' <= syl <= '\uD7A3'
    if not is_syllable:
        return None
    # The final slot is the offset inside the syllable's group of 28.
    final_slot = (ord(syl) - HANGUL_BASE) % 28
    return FINALS[final_slot]
|
| 30 |
+
|
| 31 |
+
def has_batchim(word):
    """Return True if the last Hangul syllable of *word* ends in a consonant.

    Words that contain no Hangul syllable yield False.
    """
    final = _batchim(_last_hangul(word))
    return final is not None
|
| 33 |
+
|
| 34 |
+
def has_batchim_no_rieul(word):
    """Return True if *word* ends in a final consonant other than ㄹ.

    False is returned both for vowel-final words and for words whose last
    Hangul syllable ends in ㄹ.
    """
    final = _batchim(_last_hangul(word))
    if final is None:
        return False
    return final != RIEUL
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class KoreanRuleEngine:
    """Rule-based helper for Korean particles and a few verb endings.

    Every decision is derived from the last Hangul syllable of the input:
    whether it has a final consonant (batchim) and which vowel it carries.
    Input without any Hangul syllable is treated as vowel-final.

    Limitations: irregular stems (ㅂ/ㄷ/ㅅ/ㅎ irregulars, 푸다) are conjugated
    as if they were regular, and stems ending in 르 are never contracted
    because 르-irregular and regular 르 stems cannot be told apart by shape.
    """

    # Medial-vowel slot of a vowel-final stem -> medial slot once the
    # connective 아/어 has been fused into the syllable:
    # ㅏ→ㅏ (가), ㅐ→ㅐ (보내), ㅓ→ㅓ (서), ㅔ→ㅔ (세), ㅕ→ㅕ (켜),
    # ㅗ→ㅘ (와), ㅜ→ㅝ (줘), ㅣ→ㅕ (마셔).
    _FUSED_MEDIAL = {0: 0, 1: 1, 4: 4, 5: 5, 6: 6, 8: 9, 13: 14, 20: 6}
    _MEDIAL_A = 0    # ㅏ
    _MEDIAL_AE = 1   # ㅐ
    _MEDIAL_EO = 4   # ㅓ
    _MEDIAL_EU = 18  # ㅡ

    # ── Particles ─────────────────────────────────────────────────────────────

    def get_topic_marker(self, noun):
        """Return '은' after any final consonant (ㄹ included), else '는'."""
        return '은' if has_batchim(noun) else '는'

    def get_subject_marker(self, noun):
        """Return '이' after any final consonant (ㄹ included), else '가'."""
        return '이' if has_batchim(noun) else '가'

    def get_object_marker(self, noun):
        """Return '을' after a final consonant, else '를'."""
        return '을' if has_batchim(noun) else '를'

    def get_copula(self, noun):
        """Return the polite copula: '이에요' after a consonant, else '예요'."""
        return '이에요' if has_batchim(noun) else '예요'

    def get_negative_marker(self, noun):
        """Full negative copula: '이 아니에요' or '가 아니에요'."""
        # The negative copula takes the ordinary subject marker.
        return f'{self.get_subject_marker(noun)} 아니에요'

    def attach_topic_marker(self, noun):
        """Return *noun* followed by its topic marker."""
        return noun + self.get_topic_marker(noun)

    def attach_subject_marker(self, noun):
        """Return *noun* followed by its subject marker."""
        return noun + self.get_subject_marker(noun)

    def attach_object_marker(self, noun):
        """Return *noun* followed by its object marker."""
        return noun + self.get_object_marker(noun)

    def attach_copula(self, noun):
        """Return *noun* followed by the polite copula."""
        return noun + self.get_copula(noun)

    def attach_negative_copula(self, noun):
        """Return *noun* followed by the negative copula phrase."""
        return noun + self.get_negative_marker(noun)

    # ── Indirect quotation ────────────────────────────────────────────────────

    def conjugate_indirect_quote(self, verb_stem, form, tense='present',
                                 is_adjective=False):
        """Attach an indirect-quotation ending to *verb_stem*.

        form:  statement | command | neg_command |
               request_me | request_other | question | suggestion
               (any other value falls back to stem + '다고')
        tense: past | present | future (only used for form='statement';
               any other value is treated as present)
        is_adjective: True for adjectives/있다/없다 — uses plain +다고 in
               the present tense

        ㄹ-final stems take the endings without 으 (만들 거라고, 만들라고) and
        lose ㄹ before ㄴ (만든다고, 만드냐고).
        """
        s = verb_stem
        if form == 'statement':
            if tense == 'past':
                # 가 → 갔다고, 먹 → 먹었다고, 하 → 했다고
                return self._past_stem(s) + '다고'
            if tense == 'future':
                return self._prospective(s) + ' 거라고'
            # present
            if is_adjective:
                return s + '다고'  # adjective/있다/없다: stem+다고
            if has_batchim_no_rieul(s):
                return s + '는다고'
            # Vowel-final stems take ㄴ directly; ㄹ-stems drop ㄹ first.
            return self._attach_batchim(self._drop_rieul(s), 'ㄴ') + '다고'
        if form == 'command':
            return s + ('으라고' if has_batchim_no_rieul(s) else '라고')
        if form == 'neg_command':
            return s + '지 말라고'
        if form == 'request_me':
            return self._infinitive(s) + ' 달라고'
        if form == 'request_other':
            return self._infinitive(s) + ' 주라고'
        if form == 'question':
            return self._drop_rieul(s) + '냐고'
        if form == 'suggestion':
            return s + '자고'
        return s + '다고'

    def conjugate_regret(self, verb_stem, negative=False):
        """Return the regret pattern '-(으)ㄹ 걸 그랬다' for *verb_stem*.

        With negative=True the pattern '-지 말 걸 그랬다' is used instead.
        """
        if negative:
            return verb_stem + '지 말 걸 그랬다'
        return self._prospective(verb_stem) + ' 걸 그랬다'

    # ── Validation ────────────────────────────────────────────────────────────

    def validate_token_order(self, submitted, correct):
        """Return True if both iterables hold equal items in the same order."""
        return list(submitted) == list(correct)

    def validate_particle(self, word, chosen, ptype):
        """Return True if *chosen* is the right particle of kind *ptype*.

        ptype: topic | subject | object | copula | negative.  An unknown
        ptype always yields False.
        """
        fn = self._particle_getter(ptype)
        return fn is not None and chosen == fn(word)

    def get_hint(self, word, ptype):
        """Return a one-line English hint naming the correct particle.

        An unknown *ptype* yields '?' in place of the particle.
        """
        syl = _last_hangul(word)
        b = _batchim(syl) if syl else None
        desc = f"ends with consonant '{b}'" if b else "ends with a vowel sound"
        # Reuse the getters so the hint can never disagree with validation.
        fn = self._particle_getter(ptype)
        ans = fn(word) if fn is not None else '?'
        return f"'{word}' {desc} → use {ans}"

    # ── Internal ──────────────────────────────────────────────────────────────

    def _particle_getter(self, ptype):
        """Map a particle-type name to its getter method, or None."""
        return {
            'topic': self.get_topic_marker,
            'subject': self.get_subject_marker,
            'object': self.get_object_marker,
            'copula': self.get_copula,
            'negative': self.get_negative_marker,
        }.get(ptype)

    @staticmethod
    def _last_hangul_index(text):
        """Return the index of the last Hangul syllable in *text*, or -1."""
        for i in range(len(text) - 1, -1, -1):
            if '\uAC00' <= text[i] <= '\uD7A3':
                return i
        return -1

    @staticmethod
    def _split(syl):
        """Decompose a Hangul syllable into (initial, medial, final) slots."""
        code = ord(syl) - HANGUL_BASE
        return code // INITIAL_COUNT, (code % INITIAL_COUNT) // 28, code % 28

    @staticmethod
    def _join(initial, medial, final=0):
        """Compose a Hangul syllable from its slot numbers."""
        return chr(HANGUL_BASE + initial * INITIAL_COUNT + medial * 28 + final)

    def _needs_a(self, stem):
        """Return True if the last syllable's vowel is ㅏ or ㅗ (takes 아)."""
        syl = _last_hangul(stem)
        if not syl:
            return False
        return self._split(syl)[1] in (0, 8)

    def _vowel(self, stem):
        """Return the connective vowel for *stem*: '아' or '어'."""
        return '아' if self._needs_a(stem) else '어'

    def _infinitive(self, stem):
        """Return *stem* joined with the connective 아/어.

        Consonant-final stems simply get 아/어 appended (먹 → 먹어).
        Vowel-final stems are contracted where the spelling rules allow it:
        가 → 가, 서 → 서, 오 → 와, 주 → 줘, 마시 → 마셔, 하 → 해, and
        ㅡ is dropped (쓰 → 써, 바쁘 → 바빠).  Everything else, including
        stems ending in 르, falls back to plain appending.
        """
        idx = self._last_hangul_index(stem)
        if idx < 0:
            return stem + self._vowel(stem)
        syl = stem[idx]
        initial, medial, final = self._split(syl)
        fused = None
        if final == 0:
            if syl == '하':
                fused = self._MEDIAL_AE  # 하 + 여 → 해
            elif medial == self._MEDIAL_EU:
                if syl != '르':
                    # ㅡ drops; the vowel follows the preceding syllable.
                    prev = _last_hangul(stem[:idx])
                    bright = bool(prev) and self._needs_a(prev)
                    fused = self._MEDIAL_A if bright else self._MEDIAL_EO
            else:
                fused = self._FUSED_MEDIAL.get(medial)
        if fused is None:
            return stem + self._vowel(stem)
        return stem[:idx] + self._join(initial, fused) + stem[idx + 1:]

    def _past_stem(self, stem):
        """Return *stem* with the past marker 았/었 (가 → 갔, 먹 → 먹었)."""
        # The connective form always ends in an open syllable, so the past
        # marker reduces to adding ㅆ underneath it.
        return self._attach_batchim(self._infinitive(stem), 'ㅆ')

    def _prospective(self, stem):
        """Return *stem* with the modifier ending -(으)ㄹ attached.

        가 → 갈, 먹 → 먹을, 만들 → 만들 (ㄹ-stems already end in ㄹ).
        """
        if not has_batchim(stem):
            return self._attach_batchim(stem, 'ㄹ')
        if has_batchim_no_rieul(stem):
            return stem + '을'
        return stem

    def _drop_rieul(self, stem):
        """Remove a final ㄹ from the last Hangul syllable of *stem*.

        The stem is returned unchanged when that syllable does not end in ㄹ
        or when there is no Hangul syllable at all.
        """
        idx = self._last_hangul_index(stem)
        if idx < 0:
            return stem
        initial, medial, final = self._split(stem[idx])
        if FINALS[final] != RIEUL:
            return stem
        # Replace the syllable where it stands rather than assuming it is
        # the last character of the string.
        return stem[:idx] + self._join(initial, medial) + stem[idx + 1:]

    def _attach_batchim(self, stem, jamo):
        """
        Attach a jamo final consonant to the last syllable of stem.
        e.g. '가' + 'ㄴ' → '간', '가' + 'ㄹ' → '갈'
        Only works when last syllable has no existing batchim; otherwise
        (or when jamo is not a known final, or stem has no Hangul syllable)
        the jamo is simply appended to the stem.
        """
        try:
            final = FINALS.index(jamo, 1)  # slot 0 is "no final"; skip it
        except ValueError:
            return stem + jamo  # fallback: just concatenate
        idx = self._last_hangul_index(stem)
        if idx < 0:
            return stem + jamo
        initial, medial, existing = self._split(stem[idx])
        if existing != 0:
            return stem + jamo  # already has batchim
        # Replace the syllable where it stands rather than assuming it is
        # the last character of the string.
        return stem[:idx] + self._join(initial, medial, final) + stem[idx + 1:]
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# Module-level KoreanRuleEngine instance, created when the module is imported.
rule_engine = KoreanRuleEngine()
|