HK0712 committed on
Commit
0cdd321
·
1 Parent(s): 9418793

added cantonese

Browse files
Files changed (2) hide show
  1. analyzer/ASR_zh_hk.py +330 -0
  2. requirements.txt +16 -15
analyzer/ASR_zh_hk.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ import librosa
4
+ from transformers import AutoProcessor, AutoModelForCTC
5
+ import os
6
+ import pycantonese
7
+ import numpy as np
8
+ from datetime import datetime, timezone
9
+ import unicodedata
10
+ import re
11
+
12
# =======================================================================
# 1. Global configuration and model selection
# =======================================================================
# Run on the GPU when torch reports that CUDA is available, else on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"INFO: ASR_zh_hk.py is configured to use device: {DEVICE}")

# Identifier handed to the `from_pretrained` loaders in this module.
MODEL_NAME = "HK0712/Wav2Vec2_Cantonese"
19
+
20
+ # =======================================================================
21
+ # 2. 輔助工具函數 (Helpers)
22
+ # =======================================================================
23
+
24
+ def _tokenize_unicode_ipa(ipa_string: str) -> list:
25
+ """
26
+ 智能地切分包含 Unicode 組合字元的 IPA 字串。
27
+ (直接沿用 ASR_fr_fr.py 的邏輯)
28
+ """
29
+ phonemes = []
30
+ s = ipa_string.replace(' ', '')
31
+
32
+ i = 0
33
+ while i < len(s):
34
+ current_char = s[i]
35
+ i += 1
36
+ while i < len(s) and unicodedata.category(s[i]) == 'Mn':
37
+ current_char += s[i]
38
+ i += 1
39
+ phonemes.append(current_char)
40
+ return phonemes
41
+
42
def _get_target_phonemes_by_word(text: str) -> tuple[list[str], list[list[str]]]:
    """Convert Chinese text into parallel lists of segments and IPA tokens.

    The text is passed to ``pycantonese.characters_to_jyutping``, whose result
    is iterated as ``(segment, jyutping)`` pairs. Each Jyutping string is then
    converted with ``pycantonese.jyutping_to_ipa`` (expected to yield one IPA
    string per syllable) and every syllable is split into phoneme tokens with
    ``_tokenize_unicode_ipa``.

    Segments are skipped when they are empty or whitespace-only, when their
    Jyutping is ``None`` (e.g. punctuation), when the IPA conversion raises
    (a warning is printed), or when the conversion returns nothing.

    Args:
        text: The target sentence.

    Returns:
        ``(segments, ipa_by_segment)`` where ``ipa_by_segment[k]`` is the flat
        list of phoneme tokens for ``segments[k]``.
    """
    kept_segments = []
    ipa_tokens_per_segment = []

    for segment, jp_str in pycantonese.characters_to_jyutping(text):
        has_text = bool(segment) and bool(segment.strip())
        if jp_str is None or not has_text:
            continue

        try:
            syllables = pycantonese.jyutping_to_ipa(jp_str)
        except Exception as e:
            print(f"Warning: Failed to convert Jyutping '{jp_str}' to IPA: {e}")
            continue

        if not syllables:
            continue

        # Flatten the per-syllable IPA strings into one token list per segment.
        tokens = [
            token
            for syllable in syllables
            for token in _tokenize_unicode_ipa(syllable)
        ]

        kept_segments.append(segment)
        ipa_tokens_per_segment.append(tokens)

    return kept_segments, ipa_tokens_per_segment
77
+
78
def _chars_to_ipa_flat(text: str) -> str:
    """Convert Chinese text into one flat IPA string (used for ASR output).

    Each ``(segment, jyutping)`` pair produced by
    ``pycantonese.characters_to_jyutping`` is converted to IPA and split into
    phoneme tokens; all tokens are then concatenated without separators.

    Conversion is best-effort: a segment whose conversion fails is skipped
    with a printed warning instead of aborting the whole transcription.

    Args:
        text: Text to convert (the decoded ASR transcription).

    Returns:
        The concatenation of all phoneme tokens, with no spaces.
    """
    full_ipa_tokens = []

    for segment, jp_str in pycantonese.characters_to_jyutping(text):
        # Skip empty/whitespace segments and anything without a Jyutping
        # reading (jp_str is None).
        if not segment or not segment.strip() or jp_str is None:
            continue

        try:
            for ipa_syllable in pycantonese.jyutping_to_ipa(jp_str):
                full_ipa_tokens.extend(_tokenize_unicode_ipa(ipa_syllable))
        except Exception as e:
            # Catch Exception rather than using a bare `except`, so that
            # KeyboardInterrupt/SystemExit still propagate, and report the
            # failure instead of hiding it.
            print(f"Warning: Failed to convert Jyutping '{jp_str}' to IPA: {e}")

    # The tokens are joined into a plain string because
    # _get_phoneme_alignments_by_word takes a string and re-tokenizes it with
    # _tokenize_unicode_ipa. Passing the token list through directly would be
    # cleaner but would change that function's interface.
    return "".join(full_ipa_tokens)
106
+
107
+ # =======================================================================
108
+ # 3. 核心分析函數 (Analyze)
109
+ # =======================================================================
110
+
111
def analyze(audio_file_path: str, target_sentence: str, cache: dict = {}) -> dict:
    """Analyze the pronunciation of a recording against a target sentence.

    Steps: load (or reuse) the CTC model and processor, convert the target
    sentence to per-word IPA tokens, transcribe the audio, convert the
    transcription to IPA, align it with the target, and format the result.

    Args:
        audio_file_path: Path of the audio file, read with ``soundfile``.
        target_sentence: The sentence the speaker was expected to say.
        cache: Dict holding the loaded ``"processor"`` and ``"model"``.
            NOTE(review): the mutable default is shared between calls, so it
            acts as a process-wide model cache when the caller passes nothing;
            this looks intentional and is kept for compatibility.

    Returns:
        The result dictionary built by ``_format_to_json_structure``.

    Raises:
        RuntimeError: If the model or processor cannot be loaded.
        IOError: If reading, transcribing or converting the audio fails.
    """
    # 1. Load the model once and cache it.
    if "model" not in cache:
        print(f"快取未命中 (ASR_zh_hk)。正在載入模型 '{MODEL_NAME}'...")
        try:
            loaded_processor = AutoProcessor.from_pretrained(MODEL_NAME)
            loaded_model = AutoModelForCTC.from_pretrained(MODEL_NAME)
            loaded_model.to(DEVICE)
        except Exception as e:
            print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
            raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}") from e
        # Populate the cache only after every step succeeded, so a failed load
        # never leaves a half-initialised cache behind for the next call.
        cache["processor"] = loaded_processor
        cache["model"] = loaded_model
        print(f"模型 '{MODEL_NAME}' 已載入並快取。")

    processor = cache["processor"]
    model = cache["model"]

    # 2. Prepare the target phonemes (G2P).
    target_words_original, target_ipa_by_word = _get_target_phonemes_by_word(target_sentence)

    if not target_words_original:
        print("警告: G2P 處理後目標句子為空。")
        # Nothing to score against: return an empty result.
        return _format_to_json_structure([], target_sentence, [])

    # 3. Run speech recognition (ASR).
    try:
        speech, sample_rate = sf.read(audio_file_path)
        if len(speech) == 0:
            raise ValueError("Audio file is empty")

        # soundfile returns (frames, channels) for multi-channel files.
        # Downmix to mono first; otherwise resampling would run along the
        # channel axis and the processor would receive 2-D input.
        if speech.ndim > 1:
            speech = speech.mean(axis=1)

        if sample_rate != 16000:
            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)

        input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
        input_values = input_values.to(DEVICE)

        with torch.no_grad():
            logits = model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)

        # The decoded output is treated as Chinese characters
        # (assumes the model is character-based — TODO confirm).
        user_transcription_chars = processor.decode(predicted_ids[0])

        # 4. Convert the transcribed characters to IPA.
        user_ipa_full = _chars_to_ipa_flat(user_transcription_chars)

    except Exception as e:
        # Same exception type as before; chained so the root cause is kept.
        raise IOError(f"讀取或處理音訊時發生錯誤: {e}") from e

    # 5. Align user phonemes with the target, word by word.
    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)

    # 6. Format the final structure.
    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
169
+
170
+
171
+ # =======================================================================
172
+ # 4. 對齊與格式化函數 (Alignment & Formatting)
173
+ # =======================================================================
174
+
175
def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
    """Align user phonemes with target phonemes and split the result by word.

    The user string is tokenized with ``_tokenize_unicode_ipa`` and aligned
    against the flattened target tokens using edit distance (unit cost for
    substitution, insertion and deletion). Gaps are written as ``'-'``.

    Args:
        user_phoneme_str: Concatenated IPA string produced from the ASR output.
        target_words_ipa_tokenized: One list of phoneme tokens per target word.

    Returns:
        One ``{"target": [...], "user": [...]}`` dict per target word, with
        both lists of equal length. Extra user phonemes are attached to the
        word that follows them; extras after the final target phoneme are
        attached to the last word so that no aligned column is lost. Returns
        ``[]`` when the target contains no phonemes.
    """
    user_phonemes = _tokenize_unicode_ipa(user_phoneme_str)
    target_phonemes_flat = [
        phoneme for word_tokens in target_words_ipa_tokenized for phoneme in word_tokens
    ]

    # Nothing to align against.
    if not target_phonemes_flat:
        return []

    n_user = len(user_phonemes)
    n_target = len(target_phonemes_flat)

    # dp[i, j] = edit distance between the first i user phonemes and the
    # first j target phonemes.
    dp = np.zeros((n_user + 1, n_target + 1), dtype=int)
    dp[:, 0] = np.arange(n_user + 1)
    dp[0, :] = np.arange(n_target + 1)
    for i in range(1, n_user + 1):
        for j in range(1, n_target + 1):
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            dp[i, j] = min(dp[i - 1, j] + 1, dp[i, j - 1] + 1, dp[i - 1, j - 1] + cost)

    # Trace back from the bottom-right corner, preferring the diagonal, then
    # an extra user phoneme, then a missing target phoneme. The path is built
    # back-to-front with append and reversed once afterwards.
    i, j = n_user, n_target
    user_path, target_path = [], []
    while i > 0 or j > 0:
        if i > 0 and j > 0:
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            if dp[i, j] == dp[i - 1, j - 1] + cost:
                user_path.append(user_phonemes[i - 1])
                target_path.append(target_phonemes_flat[j - 1])
                i -= 1
                j -= 1
                continue
        if i > 0 and dp[i, j] == dp[i - 1, j] + 1:
            user_path.append(user_phonemes[i - 1])
            target_path.append('-')
            i -= 1
        else:
            user_path.append('-')
            target_path.append(target_phonemes_flat[j - 1])
            j -= 1
    user_path.reverse()
    target_path.reverse()

    # Cut the aligned path into words by consuming as many non-gap target
    # columns as each word has tokens. A word with zero tokens yields an empty
    # alignment, which keeps the result index-aligned with the word list.
    alignments_by_word = []
    path_len = len(target_path)
    pos = 0
    for word_tokens in target_words_ipa_tokenized:
        start = pos
        remaining = len(word_tokens)
        while remaining > 0 and pos < path_len:
            if target_path[pos] != '-':
                remaining -= 1
            pos += 1
        alignments_by_word.append({
            "target": target_path[start:pos],
            "user": user_path[start:pos]
        })

    # Columns left after the last target phoneme are extra user phonemes;
    # attach them to the final word instead of silently dropping them.
    if pos < path_len and alignments_by_word:
        alignments_by_word[-1]["target"].extend(target_path[pos:])
        alignments_by_word[-1]["user"].extend(user_path[pos:])

    return alignments_by_word
244
+
245
+ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
246
+ """
247
+ 將對齊結果格式化為最終的 JSON 結構。
248
+ """
249
+ total_phonemes = 0
250
+ total_errors = 0
251
+ correct_words_count = 0
252
+ words_data = []
253
+
254
+ num_words_to_process = min(len(alignments), len(original_words))
255
+
256
+ for i in range(num_words_to_process):
257
+ alignment = alignments[i]
258
+ word_is_correct = True
259
+ phonemes_data = []
260
+
261
+ # 確保 target 和 user 長度一致 (對齊算法保證)
262
+ length = len(alignment['target'])
263
+
264
+ for j in range(length):
265
+ target_phoneme = alignment['target'][j]
266
+ user_phoneme = alignment['user'][j]
267
+ is_match = (user_phoneme == target_phoneme)
268
+
269
+ phonemes_data.append({
270
+ "target": target_phoneme,
271
+ "user": user_phoneme,
272
+ "isMatch": is_match
273
+ })
274
+
275
+ if not is_match:
276
+ word_is_correct = False
277
+ if not (user_phoneme == '-' and target_phoneme == '-'):
278
+ total_errors += 1
279
+
280
+ if word_is_correct:
281
+ correct_words_count += 1
282
+
283
+ words_data.append({
284
+ "word": original_words[i],
285
+ "isCorrect": word_is_correct,
286
+ "phonemes": phonemes_data
287
+ })
288
+
289
+ total_phonemes += sum(1 for p in alignment['target'] if p != '-')
290
+
291
+ # 處理未對齊的剩餘單詞 (Missed words)
292
+ if len(alignments) < len(original_words):
293
+ for i in range(len(alignments), len(original_words)):
294
+ # 獲取遺失單詞的音標
295
+ missed_word = original_words[i]
296
+ # 這裡簡單調用 G2P 獲取目標音標
297
+ _, missed_word_ipa_list = _get_target_phonemes_by_word(missed_word)
298
+
299
+ phonemes_data = []
300
+ if missed_word_ipa_list:
301
+ for p_ipa in missed_word_ipa_list[0]:
302
+ phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
303
+ total_errors += 1
304
+ total_phonemes += 1
305
+
306
+ words_data.append({
307
+ "word": missed_word,
308
+ "isCorrect": False,
309
+ "phonemes": phonemes_data
310
+ })
311
+
312
+ total_words = len(original_words)
313
+ overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
314
+ phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
315
+
316
+ final_result = {
317
+ "sentence": sentence,
318
+ "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
319
+ "summary": {
320
+ "overallScore": round(overall_score, 1),
321
+ "totalWords": total_words,
322
+ "correctWords": correct_words_count,
323
+ "phonemeErrorRate": round(phoneme_error_rate, 2),
324
+ "total_errors": total_errors,
325
+ "total_target_phonemes": total_phonemes
326
+ },
327
+ "words": words_data
328
+ }
329
+
330
+ return final_result
requirements.txt CHANGED
@@ -1,15 +1,16 @@
1
- fastapi
2
- uvicorn[standard]
3
- pyngrok
4
- python-multipart
5
- torch
6
- soundfile
7
- librosa
8
- transformers
9
- phonemizer[espeak]
10
- numpy
11
- epitran
12
- g2p
13
- pyopenjtalk
14
- mecab-python3
15
- aiohttp
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ pyngrok
4
+ python-multipart
5
+ torch
6
+ soundfile
7
+ librosa
8
+ transformers
9
+ phonemizer[espeak]
10
+ numpy
11
+ epitran
12
+ g2p
13
+ pyopenjtalk
14
+ mecab-python3
15
+ aiohttp
16
+ pycantonese