rairo committed on
Commit
b01dc2a
·
verified ·
1 Parent(s): a77dd77

Create korean_rules.py

Browse files
Files changed (1) hide show
  1. korean_rules.py +188 -0
korean_rules.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ korean_rules.py
3
+ Pure-Python, deterministic Korean grammar rule engine.
4
+ No ML. Uses Unicode Hangul decomposition for batchim detection.
5
+ """
6
+
7
# Precomposed Hangul syllables are laid out arithmetically:
#   code point = HANGUL_BASE + initial * INITIAL_COUNT + medial * 28 + final
HANGUL_BASE = 0xAC00
MEDIAL_COUNT = 28                  # code points spanned by one medial (28 final slots)
INITIAL_COUNT = 21 * MEDIAL_COUNT  # 588 code points per initial consonant

# Final consonants (batchim) indexed by final slot; slot 0 means "no batchim".
FINALS = [None] + list(
    'ㄱㄲㄳㄴㄵㄶㄷ'
    'ㄹㄺㄻㄼㄽㄾㄿㅀ'
    'ㅁㅂㅄㅅㅆㅇㅈㅊ'
    'ㅋㅌㅍㅎ'
)
RIEUL = 'ㄹ'
18
+
19
+
20
+ def _last_hangul(word):
21
+ for ch in reversed(word):
22
+ if '\uAC00' <= ch <= '\uD7A3':
23
+ return ch
24
+ return ''
25
+
26
def _batchim(syl):
    """Return the final-consonant jamo of a Hangul syllable, or None.

    None is returned both for an empty/non-Hangul argument and for a
    syllable that has no final consonant (final slot 0 in ``FINALS``).
    """
    is_syllable = bool(syl) and '\uAC00' <= syl <= '\uD7A3'
    if is_syllable:
        # The final slot is the offset from HANGUL_BASE modulo 28.
        final_slot = (ord(syl) - HANGUL_BASE) % 28
        return FINALS[final_slot]
    return None
30
+
31
def has_batchim(word):
    """Return True when the last Hangul syllable of ``word`` ends in a consonant."""
    final = _batchim(_last_hangul(word))
    if final is None:
        return False
    return True
33
+
34
def has_batchim_no_rieul(word):
    """Return True when ``word`` ends in a final consonant other than ㄹ."""
    final = _batchim(_last_hangul(word))
    if final is None:
        return False
    return final != RIEUL
37
+
38
+
39
class KoreanRuleEngine:
    """Rule-based picker for Korean particles, copulas and quotation endings.

    Every decision is driven by the final consonant (batchim) of the last
    Hangul syllable of the input, obtained through the module helpers
    ``_last_hangul`` / ``_batchim`` / ``has_batchim``.  The engine keeps no
    instance state.

    Limitations: endings are attached by plain concatenation, so vowel
    contraction (e.g. 가 + 았 → 갔) is not performed, and no irregular stem
    class other than ㄹ-final stems is handled.
    """

    # ── Particles ─────────────────────────────────────────────────────────────

    def get_topic_marker(self, noun):
        """Return '은' after a final consonant (ㄹ included), otherwise '는'."""
        # ㄹ-final nouns take the consonant form: 물 → 물은, not 물는.
        return '은' if has_batchim(noun) else '는'

    def get_subject_marker(self, noun):
        """Return '이' after a final consonant (ㄹ included), otherwise '가'."""
        # ㄹ-final nouns take the consonant form: 물 → 물이, not 물가.
        return '이' if has_batchim(noun) else '가'

    def get_object_marker(self, noun):
        """Return '을' after a final consonant, otherwise '를'."""
        return '을' if has_batchim(noun) else '를'

    def get_copula(self, noun):
        """Return the polite copula: '이에요' after a consonant, else '예요'."""
        return '이에요' if has_batchim(noun) else '예요'

    def get_negative_marker(self, noun):
        """Full negative copula: '이 아니에요' or '가 아니에요'."""
        # Reuse the subject-marker rule so the two can never disagree.
        return f'{self.get_subject_marker(noun)} 아니에요'

    def attach_topic_marker(self, noun):
        """Return ``noun`` with its topic marker appended."""
        return noun + self.get_topic_marker(noun)

    def attach_subject_marker(self, noun):
        """Return ``noun`` with its subject marker appended."""
        return noun + self.get_subject_marker(noun)

    def attach_object_marker(self, noun):
        """Return ``noun`` with its object marker appended."""
        return noun + self.get_object_marker(noun)

    def attach_copula(self, noun):
        """Return ``noun`` with the polite copula appended."""
        return noun + self.get_copula(noun)

    def attach_negative_copula(self, noun):
        """Return ``noun`` followed by the negative copula phrase."""
        return noun + self.get_negative_marker(noun)

    # ── Indirect quotation ────────────────────────────────────────────────────

    def conjugate_indirect_quote(self, verb_stem, form, tense='present',
                                 is_adjective=False):
        """
        Build an indirect-quotation form from a bare verb/adjective stem.

        form:  statement | command | neg_command |
               request_me | request_other | question | suggestion
               (any other value falls back to stem + '다고')
        tense: past | present | future — only consulted for 'statement'
        is_adjective: True for adjectives/있다/없다 — uses plain +다고 in present

        ㄹ-final stems are treated like vowel-final stems for endings that
        would otherwise insert 으/는 (만들 → 만든다고, 만들 거라고, 만들라고).
        Past and request forms concatenate 았/었/아/어 without contraction.
        """
        s = verb_stem
        open_stem = not has_batchim(s)
        rieul_stem = self._ends_in_rieul(s)

        if form == 'statement':
            if tense == 'past':
                suffix = '았' if self._needs_a(s) else '었'
                return s + suffix + '다고'
            if tense == 'future':
                if open_stem:
                    return self._attach_batchim(s, 'ㄹ') + ' 거라고'
                if rieul_stem:
                    # The stem already ends in ㄹ; no extra 을 is inserted.
                    return s + ' 거라고'
                return s + '을 거라고'
            # present
            if is_adjective:
                return s + '다고'  # adjective/있다/없다: stem+다고
            if open_stem:
                return self._attach_batchim(s, 'ㄴ') + '다고'
            if rieul_stem:
                # ㄹ drops before ㄴ: 만들 → 만드 → 만든다고.
                return self._attach_batchim(self._drop_rieul(s), 'ㄴ') + '다고'
            return s + '는다고'
        if form == 'command':
            # No linking 으 after a vowel or after ㄹ.
            return s + ('라고' if open_stem or rieul_stem else '으라고')
        if form == 'neg_command':
            return s + '지 말라고'
        if form == 'request_me':
            return s + self._vowel(s) + ' 달라고'
        if form == 'request_other':
            return s + self._vowel(s) + ' 주라고'
        if form == 'question':
            return self._drop_rieul(s) + '냐고'
        if form == 'suggestion':
            return s + '자고'
        return s + '다고'

    def conjugate_regret(self, verb_stem, negative=False):
        """Return the '-(으)ㄹ 걸 그랬다' regret form, or '-지 말 걸 그랬다' if negative."""
        if negative:
            return verb_stem + '지 말 걸 그랬다'
        if not has_batchim(verb_stem):
            return self._attach_batchim(verb_stem, 'ㄹ') + ' 걸 그랬다'
        if self._ends_in_rieul(verb_stem):
            # Stem already carries the ㄹ: 만들 → 만들 걸 그랬다.
            return verb_stem + ' 걸 그랬다'
        return verb_stem + '을 걸 그랬다'

    # ── Validation ────────────────────────────────────────────────────────────

    def validate_token_order(self, submitted, correct):
        """Return True when both iterables yield equal items in the same order."""
        return list(submitted) == list(correct)

    def validate_particle(self, word, chosen, ptype):
        """Return True when ``chosen`` is the marker this engine picks for ``word``.

        ptype: topic | subject | object | copula | negative.
        An unknown ``ptype`` always yields False.
        """
        fn = self._marker_getter(ptype)
        return fn is not None and chosen == fn(word)

    def get_hint(self, word, ptype):
        """Return a one-line English explanation of which marker ``word`` takes.

        An unknown ``ptype`` produces '?' as the suggested marker.
        """
        b = _batchim(_last_hangul(word))
        desc = f"ends with consonant '{b}'" if b else "ends with a vowel sound"
        # Delegate to the getters so hint and validation cannot drift apart.
        fn = self._marker_getter(ptype)
        ans = fn(word) if fn is not None else '?'
        return f"'{word}' {desc} → use {ans}"

    # ── Internal ──────────────────────────────────────────────────────────────

    def _marker_getter(self, ptype):
        """Map a particle-type key to the matching ``get_*`` method, or None."""
        return {
            'topic': self.get_topic_marker,
            'subject': self.get_subject_marker,
            'object': self.get_object_marker,
            'copula': self.get_copula,
            'negative': self.get_negative_marker,
        }.get(ptype)

    @staticmethod
    def _last_hangul_pos(text):
        """Return the index of the last precomposed Hangul syllable, or -1."""
        for pos in range(len(text) - 1, -1, -1):
            if '\uAC00' <= text[pos] <= '\uD7A3':
                return pos
        return -1

    def _ends_in_rieul(self, stem):
        """Return True when the last Hangul syllable of ``stem`` ends in ㄹ."""
        return _batchim(_last_hangul(stem)) == RIEUL

    def _needs_a(self, stem):
        """Return True when the last syllable's vowel is ㅏ or ㅗ (takes 아/았)."""
        syl = _last_hangul(stem)
        if not syl:
            return False
        medial = ((ord(syl) - HANGUL_BASE) // 28) % 21
        return medial in (0, 8)  # medial slots 0 and 8 are ㅏ and ㅗ

    def _vowel(self, stem):
        """Return the linking vowel '아' or '어' for ``stem``."""
        return '아' if self._needs_a(stem) else '어'

    def _drop_rieul(self, stem):
        """Return ``stem`` with a final ㄹ removed from its last Hangul syllable.

        The syllable is rewritten where it stands, so non-Hangul characters
        after it are preserved.  Stems not ending in ㄹ are returned as-is.
        """
        pos = self._last_hangul_pos(stem)
        if pos < 0 or _batchim(stem[pos]) != RIEUL:
            return stem
        code = ord(stem[pos]) - HANGUL_BASE
        # Clearing the final slot (code % 28) leaves initial + medial only.
        bare = chr(HANGUL_BASE + code - code % 28)
        return stem[:pos] + bare + stem[pos + 1:]

    def _attach_batchim(self, stem, jamo):
        """
        Attach a jamo final consonant to the last syllable of stem.
        e.g. '가' + 'ㄴ' → '간', '가' + 'ㄹ' → '갈'

        Falls back to plain concatenation (``stem + jamo``) when ``jamo`` is
        not a known final consonant, when ``stem`` contains no Hangul
        syllable, or when the last syllable already has a batchim.  The
        syllable is rewritten where it stands, so non-Hangul characters
        after it are preserved.
        """
        finals = FINALS[1:]  # skip the leading "no batchim" slot
        if jamo not in finals:
            return stem + jamo  # fallback: just concatenate
        pos = self._last_hangul_pos(stem)
        if pos < 0 or _batchim(stem[pos]) is not None:
            return stem + jamo  # no syllable to modify, or already has batchim
        # The syllable has final slot 0, so adding the slot index sets the final.
        merged = chr(ord(stem[pos]) + finals.index(jamo) + 1)
        return stem[:pos] + merged + stem[pos + 1:]
186
+
187
+
188
# Shared engine instance, created once when this module is imported.
rule_engine = KoreanRuleEngine()