jonathanjordan21 committed on
Commit
82a2cd4
·
verified ·
1 Parent(s): 085f4e9

Update idn_phonemes.py

Browse files
Files changed (1) hide show
  1. idn_phonemes.py +219 -130
idn_phonemes.py CHANGED
@@ -1,155 +1,244 @@
1
  import re
2
 
 
3
  ipa_map = {
4
- "ng": "ŋ",
5
- "ny": "ɲ",
6
- "sy": "ʃ",
7
- "kh": "x",
8
- "c": "tʃ",
9
- "j": "dʒ",
10
- "y": "j",
11
- "r": "r",
12
- "x": "ks",
13
- "a": "a",
14
- "i": "i",
15
- "u": "u",
16
- "e": "ə",
17
- "o": "o",
18
- "b": "b",
19
- "d": "d̪",
20
- "t": "t̪",
21
- "g": "ɡ",
22
- "k": "k",
23
- "p": "p",
24
- "m": "m",
25
- "n": "n",
26
- "l": "l",
27
- "s": "s",
28
- "h": "h",
29
- "w": "w",
30
  }
31
 
32
- num_words = {
33
- 0: "nol",
34
- 1: "satu",
35
- 2: "dua",
36
- 3: "tiga",
37
- 4: "empat",
38
- 5: "lima",
39
- 6: "enam",
40
- 7: "tujuh",
41
- 8: "delapan",
42
- 9: "sembilan",
43
- 10: "sepuluh",
44
- 11: "sebelas"
45
  }
46
 
 
47
  def number_to_words(n: int) -> str:
48
- """Konversi angka 0–9999 ke kata dalam bahasa Indonesia"""
49
- if n < 12:
50
- return num_words[n]
51
- elif n < 20:
52
- return number_to_words(n-10) + " belas"
53
- elif n < 100:
54
- puluhan, sisa = divmod(n, 10)
55
- result = number_to_words(puluhan) + " puluh"
56
- if sisa:
57
- result += " " + number_to_words(sisa)
58
- return result
59
- elif n < 200:
60
- return "seratus" + (" " + number_to_words(n-100) if n > 100 else "")
61
- elif n < 1000:
62
- ratusan, sisa = divmod(n, 100)
63
- result = number_to_words(ratusan) + " ratus"
64
- if sisa:
65
- result += " " + number_to_words(sisa)
66
- return result
67
- elif n < 2000:
68
- return "seribu" + (" " + number_to_words(n-1000) if n > 1000 else "")
69
- elif n < 10000:
70
- ribuan, sisa = divmod(n, 1000)
71
- result = number_to_words(ribuan) + " ribu"
72
- if sisa:
73
- result += " " + number_to_words(sisa)
74
- return result
75
- else:
76
- return str(n) # fallback
77
-
78
-
79
-
80
- def expand_abbreviation(word: str) -> str:
81
- """Ubah singkatan (huruf kapital) jadi ucapan Indonesia"""
82
- if word.isupper() and len(word) > 1: # contoh: KTP, DPR, RI
83
- return " ".join(letter_words.get(ch.lower(), ch) for ch in word)
84
- return word
85
 
 
 
 
 
 
 
 
86
 
87
- letter_words = {
88
- "a": "a",
89
- "b": "be",
90
- "c": "ce",
91
- "d": "de",
92
- "e": "e",
93
- "f": "ef",
94
- "g": "ge",
95
- "h": "ha",
96
- "i": "i",
97
- "j": "je",
98
- "k": "ka",
99
- "l": "el",
100
- "m": "em",
101
- "n": "en",
102
- "o": "o",
103
- "p": "pe",
104
- "q": "ki",
105
- "r": "er",
106
- "s": "es",
107
- "t": "te",
108
- "u": "u",
109
- "v": "fe",
110
- "w": "we",
111
- "x": "eks",
112
- "y": "ye",
113
- "z": "zet",
114
- }
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- def indo_to_ipa(text: str) -> str:
118
- text = text.lower()
119
-
120
- # Tangani singkatan (huruf kapital semua)
121
- words = []
122
- for w in text.split():
123
- if w.isupper() and len(w) > 1:
124
- words.append(expand_abbreviation(w))
125
- else:
126
- words.append(w)
127
- text = " ".join(words)
128
-
129
- # Tangani angka → kata
130
- def replace_number(match):
131
- num = int(match.group())
132
- return number_to_words(num)
133
- text = re.sub(r"\d+", replace_number, text)
134
-
135
- # Konversi huruf → IPA
136
- for k in sorted(ipa_map.keys(), key=lambda x: -len(x)):
137
- text = re.sub(k, ipa_map[k], text)
138
-
139
- return text
140
 
141
  # def indo_to_ipa(text: str) -> str:
142
  # text = text.lower()
143
 
144
- # # Cari semua angka dalam teks dan ubah ke kata
 
 
 
 
 
 
 
 
 
145
  # def replace_number(match):
146
  # num = int(match.group())
147
  # return number_to_words(num)
148
-
149
  # text = re.sub(r"\d+", replace_number, text)
150
 
151
  # # Konversi huruf → IPA
152
  # for k in sorted(ipa_map.keys(), key=lambda x: -len(x)):
153
  # text = re.sub(k, ipa_map[k], text)
154
 
155
- # return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
 
3
# --- IPA map ---
# Grapheme -> IPA symbol. Insertion order is significant for any consumer that
# sorts the keys with a stable sort, so the original order is kept.
ipa_map = {
    # digraphs
    "ng": "ŋ",
    "ny": "ɲ",
    "sy": "ʃ",
    "kh": "x",
    # affricates
    "c": "tʃ",
    "j": "dʒ",
    # stops
    "b": "b",
    "d": "d̪",
    "t": "t̪",
    "g": "ɡ",
    "k": "k",
    "p": "p",
    # remaining consonants
    "m": "m",
    "n": "n",
    "l": "l",
    "s": "s",
    "h": "h",
    "r": "r",
    "w": "w",
    "y": "j",
    # vowels
    "a": "a",
    "i": "i",
    "u": "u",
    "o": "o",
    "e": "ə",
}
10
 
11
# Spoken names of the letters, used when an abbreviation is read out
# letter by letter.
letter_words = dict(
    a="a", b="be", c="ce", d="de", e="e", f="ef", g="ge",
    h="ha", i="i", j="je", k="ka", l="el", m="em", n="en",
    o="o", p="pe", q="ki", r="er", s="es", t="te", u="u",
    v="fe", w="we", x="eks", y="ye", z="zet",
)
18
+
19
# Single ASCII digit character -> its Indonesian word.
digit_words = {
    str(value): word
    for value, word in enumerate(
        (
            "nol", "satu", "dua", "tiga", "empat",
            "lima", "enam", "tujuh", "delapan", "sembilan",
        )
    )
}
23
 
24
# --- Number to words (thousands, millions, billions, trillions) ---
def number_to_words(n: int) -> str:
    """Spell out an integer in Indonesian words.

    Args:
        n: The value to spell. It is coerced with ``int()``, so numeric
            strings and floats (truncated) are accepted as well.

    Returns:
        The Indonesian reading, e.g. ``1001 -> "seribu satu"``. Negative
        values are prefixed with ``"minus"``.
    """
    n = int(n)
    if n == 0:
        return "nol"
    if n < 0:
        # Without this guard a negative value reaches the word table with a
        # negative index and silently yields an unrelated word.
        return "minus " + number_to_words(-n)

    units = (
        "nol", "satu", "dua", "tiga", "empat", "lima", "enam",
        "tujuh", "delapan", "sembilan", "sepuluh", "sebelas",
    )

    def _below_thousand(x: int) -> str:
        """Spell 0..999; larger groups are handled by the scale loop below."""
        if x < 12:
            return units[x]
        if x < 20:
            return units[x - 10] + " belas"
        if x < 100:
            tens, rest = divmod(x, 10)
            head = units[tens] + " puluh"
        elif x < 200:
            # 100 uses the fused form "seratus", not "satu ratus".
            head, rest = "seratus", x - 100
        else:
            hundreds, rest = divmod(x, 100)
            head = units[hundreds] + " ratus"
        return head + (" " + _below_thousand(rest) if rest else "")

    scales = (
        (1_000_000_000_000, "triliun"),
        (1_000_000_000, "miliar"),
        (1_000_000, "juta"),
        (1000, "ribu"),
    )
    parts = []
    remaining = n
    for value, name in scales:
        if remaining < value:
            continue
        count, remaining = divmod(remaining, value)
        if value == 1000 and count == 1:
            # Exactly one thousand is the fused form "seribu".
            parts.append("seribu")
        else:
            # Recurse so a multiplier of 1000 or more is still spelled out.
            parts.append(number_to_words(count) + " " + name)
    if remaining:
        parts.append(_below_thousand(remaining))
    return " ".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
# --- Phone numbers ---
# An optional "+62" prefix or a leading digit, followed by at least seven more
# digits, not glued to a word character on either side.
phone_pattern = re.compile(r'(?<!\w)(?:\+62|\d)\d{7,}(?!\w)')


def expand_phones(text: str) -> str:
    """Read phone-like digit runs out digit by digit.

    Args:
        text: Input text.

    Returns:
        ``text`` with every match of ``phone_pattern`` replaced by the
        space-separated words of its digits; a leading ``+`` is dropped.
    """
    def _spell(match):
        # ``\d`` also matches non-ASCII decimal digits, while ``digit_words``
        # is keyed by ASCII digits only. ``str(int(d))`` folds every decimal
        # digit onto its ASCII key so such input no longer raises KeyError.
        return " ".join(
            digit_words[str(int(d))] for d in re.findall(r'\d', match.group(0))
        )

    return phone_pattern.sub(_spell, text)
55
 
56
# --- General numbers ---
def expand_numbers(text: str) -> str:
    """Replace every run of digits in ``text`` with its spelled-out form.

    Each maximal digit run is converted with ``int()`` and passed to
    ``number_to_words``; everything else is left untouched.
    """
    return re.sub(
        r'\d+',
        lambda found: number_to_words(int(found.group())),
        text,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
# --- Abbreviations ---
# Two or more ASCII capitals that are not attached to another word character.
abbr_pattern = re.compile(r'(?<!\w)([A-Z]{2,})(?!\w)')


def expand_abbreviations(text: str) -> str:
    """Spell all-caps abbreviations letter by letter.

    Args:
        text: Input text; case must still be intact for the pattern to match.

    Returns:
        ``text`` with each abbreviation replaced by the space-separated
        spoken names of its letters.
    """
    def _spell_out(found):
        letters = found.group(1)
        # "HP" is pinned to a fixed reading rather than looked up per letter.
        if letters == "HP":
            return "ha pe"
        return " ".join(letter_words[letter] for letter in letters.lower())

    return abbr_pattern.sub(_spell_out, text)
71
+
72
# --- IPA ---
def apply_ipa_map(text: str) -> str:
    """Transliterate ``text`` to IPA using ``ipa_map``.

    The text is lower-cased, every grapheme found in ``ipa_map`` is replaced
    by its IPA symbol, whitespace runs are collapsed to single spaces, and the
    result is wrapped in square brackets.

    Args:
        text: Plain text, with numbers and abbreviations already spelled out.

    Returns:
        The bracketed IPA string, e.g. ``"[...]"``.
    """
    lowered = text.lower()
    # Longest keys first so digraphs such as "ng" win over their single letters.
    graphemes = sorted(ipa_map, key=len, reverse=True)
    if graphemes:
        # Substitute in ONE pass. Running one re.sub per key re-processes
        # earlier output: "c" -> "tʃ" would then have its "t" rewritten by the
        # "t" rule, and "j" -> "dʒ" its "d" by the "d" rule.
        matcher = re.compile("|".join(re.escape(g) for g in graphemes))
        lowered = matcher.sub(lambda m: ipa_map[m.group(0)], lowered)
    return "[" + re.sub(r'\s+', ' ', lowered).strip() + "]"
78
+
79
# --- Pipeline ---
def indo_to_ipa(text: str) -> str:
    """Convert Indonesian text to a bracketed IPA string.

    Stages run in a fixed order:

    1. phone numbers, read digit by digit;
    2. all remaining digit runs, read as numbers;
    3. all-caps abbreviations, spelled letter by letter;
    4. grapheme-to-IPA mapping.
    """
    # Phones go first so the general number stage does not consume their
    # digits. Abbreviations go before the IPA stage, which lower-cases the
    # text and would erase the capitals the abbreviation pattern needs.
    for stage in (expand_phones, expand_numbers, expand_abbreviations):
        text = stage(text)
    return apply_ipa_map(text)
89
+
90
+ # import re
91
+
92
+ # ipa_map = {
93
+ # "ng": "ŋ",
94
+ # "ny": "ɲ",
95
+ # "sy": "ʃ",
96
+ # "kh": "x",
97
+ # "c": "tʃ",
98
+ # "j": "dʒ",
99
+ # "y": "j",
100
+ # "r": "r",
101
+ # "x": "ks",
102
+ # "a": "a",
103
+ # "i": "i",
104
+ # "u": "u",
105
+ # "e": "ə",
106
+ # "o": "o",
107
+ # "b": "b",
108
+ # "d": "d̪",
109
+ # "t": "t̪",
110
+ # "g": "ɡ",
111
+ # "k": "k",
112
+ # "p": "p",
113
+ # "m": "m",
114
+ # "n": "n",
115
+ # "l": "l",
116
+ # "s": "s",
117
+ # "h": "h",
118
+ # "w": "w",
119
+ # }
120
+
121
+ # num_words = {
122
+ # 0: "nol",
123
+ # 1: "satu",
124
+ # 2: "dua",
125
+ # 3: "tiga",
126
+ # 4: "empat",
127
+ # 5: "lima",
128
+ # 6: "enam",
129
+ # 7: "tujuh",
130
+ # 8: "delapan",
131
+ # 9: "sembilan",
132
+ # 10: "sepuluh",
133
+ # 11: "sebelas"
134
+ # }
135
+
136
+ # def number_to_words(n: int) -> str:
137
+ # """Konversi angka 0–9999 ke kata dalam bahasa Indonesia"""
138
+ # if n < 12:
139
+ # return num_words[n]
140
+ # elif n < 20:
141
+ # return number_to_words(n-10) + " belas"
142
+ # elif n < 100:
143
+ # puluhan, sisa = divmod(n, 10)
144
+ # result = number_to_words(puluhan) + " puluh"
145
+ # if sisa:
146
+ # result += " " + number_to_words(sisa)
147
+ # return result
148
+ # elif n < 200:
149
+ # return "seratus" + (" " + number_to_words(n-100) if n > 100 else "")
150
+ # elif n < 1000:
151
+ # ratusan, sisa = divmod(n, 100)
152
+ # result = number_to_words(ratusan) + " ratus"
153
+ # if sisa:
154
+ # result += " " + number_to_words(sisa)
155
+ # return result
156
+ # elif n < 2000:
157
+ # return "seribu" + (" " + number_to_words(n-1000) if n > 1000 else "")
158
+ # elif n < 10000:
159
+ # ribuan, sisa = divmod(n, 1000)
160
+ # result = number_to_words(ribuan) + " ribu"
161
+ # if sisa:
162
+ # result += " " + number_to_words(sisa)
163
+ # return result
164
+ # else:
165
+ # return str(n) # fallback
166
+
167
+
168
+
169
+ # def expand_abbreviation(word: str) -> str:
170
+ # """Ubah singkatan (huruf kapital) jadi ucapan Indonesia"""
171
+ # if word.isupper() and len(word) > 1: # contoh: KTP, DPR, RI
172
+ # return " ".join(letter_words.get(ch.lower(), ch) for ch in word)
173
+ # return word
174
+
175
+
176
+ # letter_words = {
177
+ # "a": "a",
178
+ # "b": "be",
179
+ # "c": "ce",
180
+ # "d": "de",
181
+ # "e": "e",
182
+ # "f": "ef",
183
+ # "g": "ge",
184
+ # "h": "ha",
185
+ # "i": "i",
186
+ # "j": "je",
187
+ # "k": "ka",
188
+ # "l": "el",
189
+ # "m": "em",
190
+ # "n": "en",
191
+ # "o": "o",
192
+ # "p": "pe",
193
+ # "q": "ki",
194
+ # "r": "er",
195
+ # "s": "es",
196
+ # "t": "te",
197
+ # "u": "u",
198
+ # "v": "fe",
199
+ # "w": "we",
200
+ # "x": "eks",
201
+ # "y": "ye",
202
+ # "z": "zet",
203
+ # }
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
  # def indo_to_ipa(text: str) -> str:
207
  # text = text.lower()
208
 
209
+ # # Tangani singkatan (huruf kapital semua)
210
+ # words = []
211
+ # for w in text.split():
212
+ # if w.isupper() and len(w) > 1:
213
+ # words.append(expand_abbreviation(w))
214
+ # else:
215
+ # words.append(w)
216
+ # text = " ".join(words)
217
+
218
+ # # Tangani angka → kata
219
  # def replace_number(match):
220
  # num = int(match.group())
221
  # return number_to_words(num)
 
222
  # text = re.sub(r"\d+", replace_number, text)
223
 
224
  # # Konversi huruf → IPA
225
  # for k in sorted(ipa_map.keys(), key=lambda x: -len(x)):
226
  # text = re.sub(k, ipa_map[k], text)
227
 
228
+ # return text
229
+
230
+ # # def indo_to_ipa(text: str) -> str:
231
+ # # text = text.lower()
232
+
233
+ # # # Cari semua angka dalam teks dan ubah ke kata
234
+ # # def replace_number(match):
235
+ # # num = int(match.group())
236
+ # # return number_to_words(num)
237
+
238
+ # # text = re.sub(r"\d+", replace_number, text)
239
+
240
+ # # # Konversi huruf → IPA
241
+ # # for k in sorted(ipa_map.keys(), key=lambda x: -len(x)):
242
+ # # text = re.sub(k, ipa_map[k], text)
243
+
244
+ # # return text