daeunn committed on
Commit
dc7c022
·
verified ·
1 Parent(s): 55ef0a7

Create enhanced_g2pk.py

Browse files
Files changed (1) hide show
  1. enhanced_g2pk.py +231 -0
enhanced_g2pk.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ from g2pk2 import G2p as OriginalG2p
4
+
5
+
6
class EnhancedG2p(OriginalG2p):
    """G2p wrapper that adds a few pre- and post-processing rules.

    The base converter is called unchanged. Around it this class:

    * rewrites the exception words listed in ``word_patterns`` before
      conversion,
    * turns the visible-space symbol back into a normal space,
    * undoes liquidization (initial ㄴ pronounced as ㄹ) across a word
      boundary, and
    * tensifies the intention/promise ending "-ㄹ게" to "-ㄹ께".

    NOTE(review): every Hangul literal in this class was reconstructed from
    mis-encoded text; verify them against the upstream file.
    """

    # Precomposed Hangul syllables are laid out as
    #   0xAC00 + (initial * 21 + medial) * 28 + final
    _SYLLABLE_FIRST = 0xAC00
    _SYLLABLE_LAST = 0xD7A3
    _PER_INITIAL = 588  # 21 medials * 28 finals
    _PER_MEDIAL = 28
    _INITIAL_N = 2  # ㄴ
    _INITIAL_R = 5  # ㄹ
    _FINAL_R = 8  # ㄹ

    # Every syllable whose final consonant is ㄹ (갈, 걀, ..., 힐). A plain
    # character range such as [갈-힐] would match almost every syllable, so
    # the members are enumerated explicitly.
    _FINAL_R_SYLLABLES = ''.join(
        map(chr, range(_SYLLABLE_FIRST + _FINAL_R, _SYLLABLE_LAST + 1, _PER_MEDIAL))
    )

    # Stems whose "<stem>게" form is adverbial and must never become "께".
    # Only the stems ending in ㄹ can affect the result now that tensification
    # is limited to ㄹ-final syllables; the rest are kept for reference.
    _PROTECTED_STEMS = (
        '가냘프', '가늘', '가파르', '거세', '거칠', '건조하', '검', '게으르', '고르', '고달프',
        '고맙', '곱', '고프', '곧', '굳', '굵', '귀엽', '기쁘', '길', '깊', '깨끗하',
        '나쁘', '낮', '너그럽', '너르', '노랗', '높', '눅', '느리', '늦', '더럽',
        # NOTE(review): '더웁' is a best-effort reconstruction — confirm.
        '더웁', '둥글', '드물', '딱하', '뛰어나', '뜨겁', '많', '멀', '멋지', '메마르',
        '메스껍', '못나', '못되', '못생기', '무겁', '무디', '무르', '무섭', '미끈하',
        '미워하', '미치', '반갑', '보드랍', '보람차', '보잘것없', '부드럽', '부르',
        '붉', '비싸', '빠르', '뼈저리', '새롭', '서툴', '섣부르', '성가시', '세',
        '수다스럽', '수줍', '쉽', '슬프', '싫', '싸', '쌀쌀맞', '쏜살같', '쓰디쓰',
        '쓰리', '쓰', '아름답', '아쉽', '아프', '안쓰럽', '안타깝', '약삭빠르',
        '약', '얇', '얕', '어둡', '어렵', '어리', '언짢', '없', '열띠', '예쁘',
        '올바르', '외롭', '우습', '의심쩍', '이르', '익', '있', '작', '잘나', '잘빠지',
        '재미있', '적', '젊', '점잖', '조그맣', '좁', '좋', '주제넘', '줄기차', '즐겁',
        '지나치', '지혜롭', '질기', '짓궂', '짙', '케케묵', '크', '탐스럽', '턱없',
        '푸르', '흐리', '희망차', '희', '힘겹', '힘차', '만들',
    )

    # Particles "에게" / "내게" plus every "<stem>게" listed above.
    _PROTECTED_GE = re.compile(
        '(?:에|내|' + '|'.join(map(re.escape, _PROTECTED_STEMS)) + ')게'
    )
    # Any run of Hangul ending in "하게".
    _HAGE = re.compile(r'[가-힣]+하게')
    # ㄹ-final syllable, optional whitespace, "게", optional "요".
    _INTENT_GE = re.compile('([' + _FINAL_R_SYLLABLES + r'])\s*게(요)?')
    # Capturing split so the whitespace runs are kept and can be re-joined.
    _SPACING = re.compile(r'(\s+)')

    def __init__(self):
        super().__init__()
        # Individual words that need a direct replacement before conversion
        # (exception cases).
        self.word_patterns = {
            "밝기": "발끼",
        }

    @classmethod
    def _split_syllable(cls, char):
        """Return (initial, medial, final) indexes of a Hangul syllable.

        Returns None when ``char`` is not a precomposed Hangul syllable.
        """
        offset = ord(char) - cls._SYLLABLE_FIRST
        if not 0 <= offset <= cls._SYLLABLE_LAST - cls._SYLLABLE_FIRST:
            return None
        initial, rest = divmod(offset, cls._PER_INITIAL)
        medial, final = divmod(rest, cls._PER_MEDIAL)
        return initial, medial, final

    def restore_spacing(self, text):
        """Replace the special spacing symbol with a normal space."""
        # NOTE(review): symbol reconstructed as U+2423 (open box) from a
        # garbled literal — confirm against the upstream file.
        return text.replace('\u2423', ' ')

    def process_patterns(self, text):
        """Apply every replacement in ``self.word_patterns`` to ``text``."""
        for pattern, replacement in self.word_patterns.items():
            # str.replace is a no-op when the pattern is absent.
            text = text.replace(pattern, replacement)
        return text

    def process_verb_endings(self, text):
        """Turn the intention/promise ending "-ㄹ게" into "-ㄹ께".

        Only a "게" (optionally followed by "요") that comes right after a
        syllable with final ㄹ is rewritten. The particles "에게"/"내게",
        the listed "<stem>게" adverbs and any "...하게" are shielded first
        and restored verbatim afterwards.

        Args:
            text: Text to post-process (the converter output in ``__call__``).

        Returns:
            The text with the matching endings tensified.
        """
        # A bare "하게" / "하게요" is returned as is.
        if text in ('하게', '하게요'):
            return text

        shielded = {}

        def shield(match):
            # Swap the matched span for an ASCII marker that none of the
            # Hangul patterns below can match.
            marker = f"__PROTECTED_{len(shielded)}__"
            shielded[marker] = match.group(0)
            return marker

        # re.sub computes all offsets itself, so several matches in one
        # string are handled correctly.
        result = self._PROTECTED_GE.sub(shield, text)
        result = self._HAGE.sub(shield, result)

        # NOTE(review): whitespace between the stem and "게" is dropped here,
        # exactly as the previous implementation did ("할 게" -> "할께");
        # confirm that joining the words is intended.
        result = self._INTENT_GE.sub(r'\1께\2', result)

        for marker, original in shielded.items():
            result = result.replace(marker, original)
        return result

    def fix_rhotacization(self, original_text, g2p_text):
        """Undo liquidization across a word boundary.

        Wherever a word ending in a ㄹ-final syllable is followed, after
        whitespace, by a word starting with a ㄹ-initial syllable, that
        initial is changed to ㄴ.

        When ``original_text`` splits into the same number of
        whitespace-separated pieces as ``g2p_text``, a word that already
        starts with ㄹ in the original is left alone. If the pieces do not
        line up, every candidate is rewritten.

        Args:
            original_text: Text as it was passed to the base converter.
            g2p_text: Converter output to correct.

        Returns:
            ``g2p_text`` with the affected initials restored to ㄴ.
        """
        # Words sit at even indexes, whitespace runs at odd indexes.
        pieces = self._SPACING.split(g2p_text)
        source = self._SPACING.split(original_text) if original_text else []
        aligned = len(source) == len(pieces)

        for idx in range(2, len(pieces), 2):
            previous, word = pieces[idx - 2], pieces[idx]
            if not previous or not word:
                continue
            if previous[-1] not in self._FINAL_R_SYLLABLES:
                continue

            parts = self._split_syllable(word[0])
            if parts is None or parts[0] != self._INITIAL_R:
                continue

            if aligned and source[idx]:
                source_parts = self._split_syllable(source[idx][0])
                if source_parts is not None and source_parts[0] == self._INITIAL_R:
                    # The input itself starts with ㄹ: nothing to undo.
                    continue

            _, medial, final = parts
            restored = chr(
                self._SYLLABLE_FIRST
                + self._INITIAL_N * self._PER_INITIAL
                + medial * self._PER_MEDIAL
                + final
            )
            pieces[idx] = restored + word[1:]

        return ''.join(pieces)

    def __call__(self, string, descriptive=False, verbose=False, group_vowels=False, to_syl=True):
        """Run the base converter and apply the extra rules around it.

        All arguments are forwarded positionally to the base ``__call__``.
        """
        # 1. Exception-word replacement (before the base conversion).
        result = self.process_patterns(string)

        # 2. Keep the pre-conversion text for the liquidization fix.
        original_string = result

        # 3. Base conversion.
        result = super().__call__(result, descriptive, verbose, group_vowels, to_syl)

        # 4. Restore the special spacing symbol.
        result = self.restore_spacing(result)

        # 5. Undo liquidization across word boundaries.
        result = self.fix_rhotacization(original_string, result)

        # 6. Tensify the "-ㄹ게" ending.
        result = self.process_verb_endings(result)

        return result
163
+
164
+
165
_SHARED_G2P = None


def convert_text(text, **kwargs):
    """Convert ``text`` with a shared ``EnhancedG2p`` instance.

    The converter is created on first use and reused afterwards, so the
    initializer does not run again on every call (presumably it loads the
    base converter's resources — confirm).

    Args:
        text: Text to convert.
        **kwargs: Forwarded unchanged to ``EnhancedG2p.__call__``.

    Returns:
        Whatever ``EnhancedG2p.__call__`` returns for ``text``.
    """
    global _SHARED_G2P
    if _SHARED_G2P is None:
        _SHARED_G2P = EnhancedG2p()
    return _SHARED_G2P(text, **kwargs)
169
+
170
+ '''
171
+ if __name__ == "__main__":
172
+ # Create an instance of EnhancedG2p
173
+ g2p = EnhancedG2p()
174
+
175
+ # Test examples to show the enhancement
176
+ test_examples = [
177
+ # ใ„บ + ใ„ฑ ๊ทœ์น™ ํ…Œ์ŠคํŠธ
178
+ "๋ฐ๊ธฐ", # -> ๋ฐœ๋ผ
179
+ "์งง๊ฒŒ",
180
+ "์ค„๊ฒŒ ์žˆ์–ด", # -> ์ค„๊ป˜
181
+ "ํ• ๊ฒŒ", # -> ํ• ๊ป˜
182
+ "๋ณผ๊ฒŒ์š”", # -> ๋ณผ๊ป˜์š”
183
+ "๋งŒ๋“ค๊ฒŒ", # -> ๋งŒ๋“ค๊ป˜
184
+ "๋†’๊ฒŒ",
185
+ "๊ธธ๊ฒŒ",
186
+ "์„ ๋ช…ํ•˜๊ฒŒ",
187
+ "ํ•  ๊ฒŒ ์—†์–ด",
188
+ "๊ณฑ๊ฒŒ",
189
+ "๊ฐ€๋ƒ˜ํ”„๊ฒŒ",
190
+ "๊ณ ๋‹ฌํ”„๊ฒŒ",
191
+ "์„œํˆด๊ฒŒ",
192
+ "๊ตณ๊ฒŒ",
193
+ "๊ณง๊ฒŒ",
194
+ "์ฃผ์—ด์ด์—๊ฒŒ ์•Œ๋ ค์ค„๊ฒŒ",
195
+
196
+ # ๋‹ค์–‘ํ•œ ๋ฌธ์žฅ ํ…Œ์ŠคํŠธ
197
+ "๋‚ด์ผ ํ•™๊ต์— ๊ฐˆ๊ฒŒ", # -> ๋‚ด์ผ ํ•™๊ต์— ๊ฐˆ๊ป˜
198
+ "์ด๊ฑฐ ํ•œ๋ฒˆ ๋จน์„๊ฒŒ", # -> ์ด๊ฑฐ ํ•œ๋ฒˆ ๋จน์„๊ป˜
199
+ "๋‚ด๊ฐ€ ์•Œ๋ ค์ค„๊ฒŒ", # -> ๋‚ด๊ฐ€ ์•Œ๋ ค์ค„๊ป˜
200
+ "์ด ์ฑ…์„ ์ฝ์„๊ฒŒ์š”", # -> ์ด ์ฑ…์„ ์ฝ์„๊ป˜์š”
201
+ "๊ทธ๊ฑด ๋‚ด๊ฐ€ ํ• ๊ฒŒ", # -> ๊ทธ๊ฑด ๋‚ด๊ฐ€ ํ• ๊ป˜
202
+ "๊ฐ€์ง€๋ฅผ ์ฝ์„๊ฒŒ์š”",
203
+ "์ฑ… ์•ž์— ์žˆ์–ด์š”",
204
+ "๊ทธ ๊ณณ์œผ๋กœ ๊ฐˆ๊ฒŒ์š”",
205
+ "๊ธธ๊ฒŒ ์ž๋ฅด์„ธ์š”",
206
+ "ํฌ์ƒ์€ ์—ด์‹ฌํžˆ ํ•œ ์•„์ด์—๊ฒŒ๋งŒ ์ฃผ์–ด์ง€๊ธฐ ๋•Œ๋ฌธ์— ํฌ์ƒ์ธ ๊ฒƒ์ž…๋‹ˆ๋‹ค.",
207
+ "๋น„๋ก ์š”์ฆ˜์€ ์ „์—ผ๋ณ‘ ๋•Œ๋ฌธ์— ์ถœ์ž…๊ตญ์ด ์‰ฝ์ง€ ์•Š์ง€๋งŒ",
208
+ "์ธ๊ฐ„ ๋“ค์„ ๋‚ด๊ฒŒ ๋ฐ”์ณ๋ผ.",
209
+ "์—ด์‹ฌํžˆ ํ•  ๋‚˜์ด์—",
210
+ "๋ฌผ ๋„ฃ๊ธฐ",
211
+ "๋‹ฌ ๋„ˆ๋จธ",
212
+ "์ˆ  ๋‚จ๊ธฐ์ง€ ๋งˆ์„ธ์š”",
213
+ "๊ธ€ ๋‚ด์šฉ์ด ์ข‹์•„์š”",
214
+ "๊ฐํƒ„์„ ๋‚ด๊ฒŒ, ์ง€์›์„ ๋‚˜๋น„์—๊ฒŒ"
215
+
216
+ ]
217
+
218
+ print("Original vs Enhanced G2p comparison:")
219
+ print("-" * 50)
220
+
221
+ original_g2p = OriginalG2p()
222
+
223
+ for example in test_examples:
224
+ original_result = original_g2p(example)
225
+ enhanced_result = g2p(example)
226
+
227
+ print(f"Input: {example}")
228
+ print(f"Original G2p: {original_result}")
229
+ print(f"Enhanced G2p: {enhanced_result}")
230
+ print("-" * 50)
231
+ '''