HK0712 committed on
Commit
3856f8c
·
1 Parent(s): daa0137

FIX: pt_br

Browse files
Files changed (3) hide show
  1. .dockerignore +35 -0
  2. analyzer/ASR_nl_nl.py +120 -53
  3. analyzer/ASR_pt_br.py +69 -97
.dockerignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. 首先,忽略所有被 .gitignore 忽略的檔案
2
+ # (這是一個簡化的概念,實際操作是手動複製 .gitignore 的內容)
3
+ # 或者,更直接地,將 .gitignore 的內容複製過來,然後擴展
4
+
5
+ # 2. 忽略 Git 自身的資料夾
6
+ .git
7
+
8
+ # 3. 忽略 Docker 自身的檔案
9
+ Dockerfile
10
+ .dockerignore
11
+
12
+ # 4. 忽略本地開發環境的設定
13
+ .vscode/
14
+ .devcontainer/
15
+
16
+ # 5. 忽略 Python 的快取和虛擬環境
17
+ __pycache__/
18
+ *.pyc
19
+ .venv/
20
+ venv/
21
+
22
+ # 6. 【【【 忽略您專案中特有的大型檔案和資料夾 】】】
23
+ # 這是最重要的部分!
24
+ ASRs/
25
+ data/
26
+ *.pth
27
+ *.safetensors
28
+
29
+ # 7. 忽略文件和非必要的檔案
30
+ README.md
31
+ docs/
32
+
33
+ # 8. 忽略作業系統產生的檔案
34
+ .DS_Store
35
+ Thumbs.db
analyzer/ASR_nl_nl.py CHANGED
@@ -1,10 +1,13 @@
1
  # =======================================================================
2
  # analyzer/ASR_nl_nl.py
3
  # 荷蘭語發音分析器
4
- # 最終修正- 使用用戶指定的正確模型
 
 
 
5
  # =======================================================================
6
 
7
- # 1. 匯入區 (Imports)
8
  import torch
9
  import soundfile as sf
10
  import librosa
@@ -13,32 +16,23 @@ import os
13
  from phonemizer import phonemize
14
  import numpy as np
15
  from datetime import datetime, timezone
16
- import re
17
- import unicodedata
18
 
19
- # =======================================================================
20
- # 2. 全域變數與配置區
21
- # =======================================================================
22
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
23
  print(f"INFO: ASR_nl_nl.py is configured to use device: {DEVICE}")
24
 
25
- # 【【【【【 最終的、決定性的修正 】】】】】
26
- # 使用用戶指定的、正確的荷蘭語音素模型
27
  MODEL_NAME = "Clementapa/wav2vec2-base-960h-phoneme-reco-dutch"
28
 
29
  processor = None
30
  model = None
31
 
32
- # =======================================================================
33
- # 3. 核心業務邏輯區
34
- # =======================================================================
35
-
36
- # -----------------------------------------------------------------------
37
- # 3.1. 模型載入函數 (邏輯不變)
38
- # -----------------------------------------------------------------------
39
  def load_model():
40
  """
41
  載入荷蘭語 ASR 模型和對應的處理器。
 
42
  """
43
  global processor, model
44
  if processor and model:
@@ -56,12 +50,13 @@ def load_model():
56
  print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
57
  raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
58
 
59
- # -----------------------------------------------------------------------
60
- # 3.2. 通用 IPA 切分函數 (邏輯不變)
61
- # -----------------------------------------------------------------------
 
62
  def _tokenize_ipa(ipa_string: str) -> list:
63
  """
64
- 將 IPA 字串智能地切分為音素列表,可以正確處理任何語言的組合字符。
65
  """
66
  phonemes = []
67
  s = ipa_string.replace(' ', '')
@@ -69,38 +64,50 @@ def _tokenize_ipa(ipa_string: str) -> list:
69
  while i < len(s):
70
  current_char = s[i]
71
  i += 1
 
72
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
73
  current_char += s[i]
74
  i += 1
75
  phonemes.append(current_char)
76
  return phonemes
77
 
78
- # -----------------------------------------------------------------------
79
- # 3.3. 核心分析函數 (邏輯不變)
80
- # -----------------------------------------------------------------------
81
  def analyze(audio_file_path: str, target_sentence: str) -> dict:
82
  """
83
  接收音訊檔案路徑和目標荷蘭語句子,回傳詳細的發音分析字典。
 
84
  """
85
  if not processor or not model:
86
  raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
87
 
 
 
88
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
89
  cleaned_sentence = " ".join(target_words_original)
 
 
 
 
 
 
 
 
 
90
 
91
- target_ipa_by_word_str = phonemize(cleaned_sentence, language='nl', backend='espeak', with_stress=True, strip=True).split()
92
-
93
  if len(target_words_original) != len(target_ipa_by_word_str):
94
- print(f"警告: G2P 後單詞數量 ({len(target_ipa_by_word_str)}) 與原始單詞數量 ({len(target_words_original)}) 不匹配。")
95
  min_len = min(len(target_words_original), len(target_ipa_by_word_str))
96
  target_words_original = target_words_original[:min_len]
97
  target_ipa_by_word_str = target_ipa_by_word_str[:min_len]
98
 
 
99
  target_ipa_by_word = [
100
  _tokenize_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('ː', ''))
101
  for word in target_ipa_by_word_str
102
  ]
103
 
 
104
  try:
105
  speech, sample_rate = sf.read(audio_file_path)
106
  if sample_rate != 16000:
@@ -113,20 +120,32 @@ def analyze(audio_file_path: str, target_sentence: str) -> dict:
113
  with torch.no_grad():
114
  logits = model(input_values).logits
115
  predicted_ids = torch.argmax(logits, dim=-1)
116
- user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '')
 
 
 
117
 
 
118
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
119
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
120
 
121
 
122
- # =======================================================================
123
- # 4. 對齊與格式化函數區 (語言無關,邏輯不變)
124
- # =======================================================================
125
-
126
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
 
 
 
 
127
  user_phonemes = _tokenize_ipa(user_phoneme_str)
128
- target_phonemes_flat = [p for word in target_words_ipa_tokenized for p in word]
129
- word_boundaries_indices = np.cumsum([len(word) for word in target_words_ipa_tokenized]) - 1
 
 
 
 
 
 
 
130
  dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
131
  for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
132
  for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
@@ -134,55 +153,94 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
134
  for j in range(1, len(target_phonemes_flat) + 1):
135
  cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
136
  dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
 
137
  i, j = len(user_phonemes), len(target_phonemes_flat)
138
  user_path, target_path = [], []
139
  while i > 0 or j > 0:
 
140
  cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
 
 
141
  if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
142
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
143
- elif i > 0 and (j == 0 or dp[i][j] == dp[i-1][j] + 1):
 
144
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
145
- elif j > 0 and (i == 0 or dp[i][j] == dp[i][j-1] + 1):
 
146
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
147
- else: break
148
  alignments_by_word = []
149
  word_start_idx_in_path = 0
150
  target_phoneme_counter_in_path = 0
151
- word_boundary_iter = iter(word_boundaries_indices)
152
- current_word_boundary = next(word_boundary_iter, -1)
153
  for path_idx, p in enumerate(target_path):
154
  if p != '-':
155
- if target_phoneme_counter_in_path == current_word_boundary:
 
 
 
156
  alignments_by_word.append({
157
- "target": target_path[word_start_idx_in_path : path_idx + 1],
158
- "user": user_path[word_start_idx_in_path : path_idx + 1]
159
  })
 
160
  word_start_idx_in_path = path_idx + 1
161
- current_word_boundary = next(word_boundary_iter, -1)
162
  target_phoneme_counter_in_path += 1
 
163
  return alignments_by_word
164
 
 
165
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
166
- total_phonemes, total_errors, correct_words_count = 0, 0, 0
 
 
 
 
 
 
167
  words_data = []
 
168
  num_words_to_process = min(len(alignments), len(original_words))
 
169
  for i in range(num_words_to_process):
170
  alignment = alignments[i]
171
  word_is_correct = True
172
  phonemes_data = []
173
- min_len = min(len(alignment['target']), len(alignment['user']))
174
- for j in range(min_len):
175
- target_phoneme, user_phoneme = alignment['target'][j], alignment['user'][j]
 
176
  is_match = (user_phoneme == target_phoneme)
177
- phonemes_data.append({"target": target_phoneme, "user": user_phoneme, "isMatch": is_match})
 
 
 
 
 
 
178
  if not is_match:
179
  word_is_correct = False
180
- if not (user_phoneme == '-' and target_phoneme == '-'): total_errors += 1
181
- if word_is_correct: correct_words_count += 1
182
- words_data.append({"word": original_words[i], "isCorrect": word_is_correct, "phonemes": phonemes_data})
 
 
 
 
 
 
 
 
 
 
183
  total_phonemes += sum(1 for p in alignment['target'] if p != '-')
 
 
184
  if len(alignments) < len(original_words):
185
  for i in range(len(alignments), len(original_words)):
 
186
  missed_word_ipa_str = phonemize(original_words[i], language='nl', backend='espeak', strip=True).replace('ː', '')
187
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
188
  phonemes_data = []
@@ -190,11 +248,18 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
190
  phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
191
  total_errors += 1
192
  total_phonemes += 1
193
- words_data.append({"word": original_words[i], "isCorrect": False, "phonemes": phonemes_data})
 
 
 
 
 
 
194
  total_words = len(original_words)
195
  overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
196
  phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
197
- return {
 
198
  "sentence": sentence,
199
  "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
200
  "summary": {
@@ -206,4 +271,6 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
206
  "total_target_phonemes": total_phonemes
207
  },
208
  "words": words_data
209
- }
 
 
 
1
  # =======================================================================
2
  # analyzer/ASR_nl_nl.py
3
  # 荷蘭語發音分析器
4
+ # 版本:v2.0 (與 en_us.py 邏輯對齊)
5
+ # 描述:此版本完全遵循 en_us.py 的程式碼結構和算法實現,
6
+ # 僅在語言特定配置(模型名稱、G2P語言)上有所不同,
7
+ # 並採用了更健壯的、基於 Unicode 的 IPA 切分方法。
8
  # =======================================================================
9
 
10
+ # --- 1. 匯入區 (與 en_us.py 保持一致) ---
11
  import torch
12
  import soundfile as sf
13
  import librosa
 
16
  from phonemizer import phonemize
17
  import numpy as np
18
  from datetime import datetime, timezone
19
+ import unicodedata # 【保留】這是處理多語言音素的更優方案
20
+ import re # 【保留】用於更準確地切分單詞
21
 
22
+ # --- 2. 全域設定與模型載入 ---
 
 
23
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
  print(f"INFO: ASR_nl_nl.py is configured to use device: {DEVICE}")
25
 
26
+ # 【關鍵修改 1:設定為荷蘭語 ASR 模型】
 
27
  MODEL_NAME = "Clementapa/wav2vec2-base-960h-phoneme-reco-dutch"
28
 
29
  processor = None
30
  model = None
31
 
 
 
 
 
 
 
 
32
  def load_model():
33
  """
34
  載入荷蘭語 ASR 模型和對應的處理器。
35
+ (此函數邏輯與 en_us.py 完全相同)
36
  """
37
  global processor, model
38
  if processor and model:
 
50
  print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
51
  raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
52
 
53
+ # --- 3. 智能 IPA 切分函數 ---
54
+ # 【關鍵修改 2:保留更優越的通用切分邏輯】
55
+ # 雖然此函數的實現比英文版的更複雜,但它更健壯且適用於包括荷蘭語在內的多種語言。
56
+ # 這是為了「fit with Dutch」而必須保留的優化。
57
  def _tokenize_ipa(ipa_string: str) -> list:
58
  """
59
+ 將 IPA 字串智能地切分為音素列表,正確處理帶有附加符號的組合字符。
60
  """
61
  phonemes = []
62
  s = ipa_string.replace(' ', '')
 
64
  while i < len(s):
65
  current_char = s[i]
66
  i += 1
67
+ # 檢查並組合後續的非間距標記 (例如變音符)
68
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
69
  current_char += s[i]
70
  i += 1
71
  phonemes.append(current_char)
72
  return phonemes
73
 
74
+ # --- 4. 核心分析函數 (主入口) ---
 
 
75
  def analyze(audio_file_path: str, target_sentence: str) -> dict:
76
  """
77
  接收音訊檔案路徑和目標荷蘭語句子,回傳詳細的發音分析字典。
78
+ (此函數結構與 en_us.py 完全對齊)
79
  """
80
  if not processor or not model:
81
  raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
82
 
83
+ # 1. 準備目標音素 (G2P)
84
+ # 使用正則表達式準確切分單詞,這比簡單的 .split() 更穩健
85
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
86
  cleaned_sentence = " ".join(target_words_original)
87
+
88
+ # 【關鍵修改 3:設定 G2P 語言為 'nl'】
89
+ target_ipa_by_word_str = phonemize(
90
+ cleaned_sentence,
91
+ language='nl',
92
+ backend='espeak',
93
+ with_stress=True,
94
+ strip=True
95
+ ).split()
96
 
97
+ # 健壯性檢查:確保單詞和音素列表長度一致
 
98
  if len(target_words_original) != len(target_ipa_by_word_str):
99
+ print(f"警告: G2P 後單詞數量 ({len(target_ipa_by_word_str)}) 與原始單詞數量 ({len(target_words_original)}) 不匹配。將進行截斷。")
100
  min_len = min(len(target_words_original), len(target_ipa_by_word_str))
101
  target_words_original = target_words_original[:min_len]
102
  target_ipa_by_word_str = target_ipa_by_word_str[:min_len]
103
 
104
+ # 【關鍵修改 4:與 en_us.py 對齊,在準備目標音素時就清除所有不參與比較的符號(重音符號 ˈ ˌ 與長音符號 ː)】
105
  target_ipa_by_word = [
106
  _tokenize_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('ː', ''))
107
  for word in target_ipa_by_word_str
108
  ]
109
 
110
+ # 2. 處理音訊並進行語音辨識 (ASR)
111
  try:
112
  speech, sample_rate = sf.read(audio_file_path)
113
  if sample_rate != 16000:
 
120
  with torch.no_grad():
121
  logits = model(input_values).logits
122
  predicted_ids = torch.argmax(logits, dim=-1)
123
+
124
+ # 【關鍵修改 5:與 en_us.py 對齊,假設模型輸出是乾淨的,或在必要時清理】
125
+ # 移除模型可能產生的分隔符 |,並確保也移除長音符號,以匹配目標音素的處理方式
126
+ user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '').replace('ː', '')
127
 
128
+ # 3. 執行對齊並格式化輸出
129
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
130
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
131
 
132
 
133
+ # --- 5. 對齊函數 (與 en_us.py 的實現邏輯完全對齊) ---
 
 
 
134
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
135
+ """
136
+ 使用動態規劃執行音素對齊。
137
+ (此函數實現與 en_us.py 完全相同)
138
+ """
139
  user_phonemes = _tokenize_ipa(user_phoneme_str)
140
+
141
+ target_phonemes_flat = []
142
+ word_boundaries_indices = []
143
+ current_idx = 0
144
+ for word_ipa_tokens in target_words_ipa_tokenized:
145
+ target_phonemes_flat.extend(word_ipa_tokens)
146
+ current_idx += len(word_ipa_tokens)
147
+ word_boundaries_indices.append(current_idx - 1)
148
+
149
  dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
150
  for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
151
  for j in range(1, len(target_phonemes_flat) + 1): dp[0][j] = j
 
153
  for j in range(1, len(target_phonemes_flat) + 1):
154
  cost = 0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1
155
  dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
156
+
157
  i, j = len(user_phonemes), len(target_phonemes_flat)
158
  user_path, target_path = [], []
159
  while i > 0 or j > 0:
160
+ # 使用與 en_us.py 相同的、更簡潔的回溯邏輯
161
  cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
162
+
163
+ # 優先匹配/替換
164
  if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
165
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
166
+ # 其次是刪除 (user 多出的音素,target 以 '-' 佔位)
167
+ elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
168
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
169
+ # 最後是插入 (target 有而 user 缺少的音素,user 以 '-' 佔位)
170
+ else:
171
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
172
+
173
  alignments_by_word = []
174
  word_start_idx_in_path = 0
175
  target_phoneme_counter_in_path = 0
176
+
 
177
  for path_idx, p in enumerate(target_path):
178
  if p != '-':
179
+ if target_phoneme_counter_in_path in word_boundaries_indices:
180
+ target_alignment = target_path[word_start_idx_in_path : path_idx + 1]
181
+ user_alignment = user_path[word_start_idx_in_path : path_idx + 1]
182
+
183
  alignments_by_word.append({
184
+ "target": target_alignment,
185
+ "user": user_alignment
186
  })
187
+
188
  word_start_idx_in_path = path_idx + 1
189
+
190
  target_phoneme_counter_in_path += 1
191
+
192
  return alignments_by_word
193
 
194
+ # --- 6. 格式化函數 (與 en_us.py 的實現邏輯完全對齊) ---
195
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
196
+ """
197
+ 將對齊結果格式化為最終的 JSON 結構。
198
+ (此函數實現與 en_us.py 完全相同,僅 G2P 語言設定不同)
199
+ """
200
+ total_phonemes = 0
201
+ total_errors = 0
202
+ correct_words_count = 0
203
  words_data = []
204
+
205
  num_words_to_process = min(len(alignments), len(original_words))
206
+
207
  for i in range(num_words_to_process):
208
  alignment = alignments[i]
209
  word_is_correct = True
210
  phonemes_data = []
211
+
212
+ for j in range(len(alignment['target'])):
213
+ target_phoneme = alignment['target'][j]
214
+ user_phoneme = alignment['user'][j]
215
  is_match = (user_phoneme == target_phoneme)
216
+
217
+ phonemes_data.append({
218
+ "target": target_phoneme,
219
+ "user": user_phoneme,
220
+ "isMatch": is_match
221
+ })
222
+
223
  if not is_match:
224
  word_is_correct = False
225
+ # 只有在不是「目標和用戶都為空」的情況下才計為錯誤
226
+ if not (user_phoneme == '-' and target_phoneme == '-'):
227
+ total_errors += 1
228
+
229
+ if word_is_correct:
230
+ correct_words_count += 1
231
+
232
+ words_data.append({
233
+ "word": original_words[i],
234
+ "isCorrect": word_is_correct,
235
+ "phonemes": phonemes_data
236
+ })
237
+
238
  total_phonemes += sum(1 for p in alignment['target'] if p != '-')
239
+
240
+ # 處理使用者漏講單詞的情況
241
  if len(alignments) < len(original_words):
242
  for i in range(len(alignments), len(original_words)):
243
+ # 【關鍵修改 6:確保此處的 G2P 語言和符號清理也保持一致】
244
  missed_word_ipa_str = phonemize(original_words[i], language='nl', backend='espeak', strip=True).replace('ː', '')
245
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
246
  phonemes_data = []
 
248
  phonemes_data.append({"target": p_ipa, "user": "-", "isMatch": False})
249
  total_errors += 1
250
  total_phonemes += 1
251
+
252
+ words_data.append({
253
+ "word": original_words[i],
254
+ "isCorrect": False,
255
+ "phonemes": phonemes_data
256
+ })
257
+
258
  total_words = len(original_words)
259
  overall_score = (correct_words_count / total_words) * 100 if total_words > 0 else 0
260
  phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0
261
+
262
+ final_result = {
263
  "sentence": sentence,
264
  "analysisTimestampUTC": datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S (UTC)'),
265
  "summary": {
 
271
  "total_target_phonemes": total_phonemes
272
  },
273
  "words": words_data
274
+ }
275
+
276
+ return final_result
analyzer/ASR_pt_br.py CHANGED
@@ -1,7 +1,13 @@
1
  # =======================================================================
2
- # 1. 匯入區 (Imports)
3
- # - 與英文版完全相同,因為我們使用相同的工具鏈。
 
 
 
 
4
  # =======================================================================
 
 
5
  import torch
6
  import soundfile as sf
7
  import librosa
@@ -10,33 +16,23 @@ import os
10
  from phonemizer import phonemize
11
  import numpy as np
12
  from datetime import datetime, timezone
13
- import re
14
- import unicodedata
15
 
16
- # =======================================================================
17
- # 2. 全域變數與配置區 (Global Variables & Config)
18
- # =======================================================================
19
- # 自動檢測可用設備
20
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
  print(f"INFO: ASR_pt_br.py is configured to use device: {DEVICE}")
22
 
23
- # 【【【【【 關鍵修改 1:設定為葡萄牙語 ASR 模型 】】】】
24
  MODEL_NAME = "caiocrocha/wav2vec2-large-xlsr-53-phoneme-portuguese"
25
 
26
  processor = None
27
  model = None
28
 
29
- # =======================================================================
30
- # 3. 核心業務邏輯區 (Core Business Logic)
31
- # =======================================================================
32
-
33
- # -----------------------------------------------------------------------
34
- # 3.1. 模型載入函數
35
- # - 與英文版邏輯完全相同,僅替換模型名稱。
36
- # -----------------------------------------------------------------------
37
  def load_model():
38
  """
39
  載入葡萄牙語 ASR 模型和對應的處理器。
 
40
  """
41
  global processor, model
42
  if processor and model:
@@ -45,7 +41,6 @@ def load_model():
45
 
46
  print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
47
  try:
48
- # 這些模型通常使用標準的 Wav2Vec2Processor 和 Wav2Vec2ForCTC
49
  processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
50
  model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
51
  model.to(DEVICE)
@@ -55,123 +50,104 @@ def load_model():
55
  print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
56
  raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
57
 
58
- # -----------------------------------------------------------------------
59
- # 3.2. 智能 IPA 切分函數
60
- # - 【關鍵修改 2】針對葡萄牙語的 IPA 特性進行調整。
61
- # -----------------------------------------------------------------------
62
  def _tokenize_ipa(ipa_string: str) -> list:
63
  """
64
- 將 IPA 字串智能地切分為音素列表。
65
- 這個版本能處理葡萄牙語中常見的多字元音素和帶有附加符號的音素。
66
  """
67
  phonemes = []
68
- # 移除所有由 phonemizer 產生的多餘空格
69
  s = ipa_string.replace(' ', '')
70
  i = 0
71
  while i < len(s):
72
- # 檢查葡萄牙語中常見的雙字塞擦音
73
  if i + 1 < len(s) and s[i:i+2] in {'dʒ', 'tʃ'}:
74
  phonemes.append(s[i:i+2])
75
  i += 2
76
  continue
77
 
78
- # 處理帶有鼻化符 (波浪號) 的元音
79
- # unicodedata.category(char) == 'Mn' 用於檢測非間距標記 (例如波浪號)
80
  current_char = s[i]
81
  i += 1
82
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
83
  current_char += s[i]
84
  i += 1
85
  phonemes.append(current_char)
86
-
87
  return phonemes
88
 
89
- # -----------------------------------------------------------------------
90
- # 3.3. 核心分析函數 (主入口)
91
- # - 【關鍵修改 3】將 G2P 語言設定為 'pt-br'。
92
- # -----------------------------------------------------------------------
93
  def analyze(audio_file_path: str, target_sentence: str) -> dict:
94
  """
95
  接收音訊檔案路徑和目標葡萄牙語句子,回傳詳細的發音分析字典。
 
96
  """
97
  if not processor or not model:
98
  raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")
99
 
100
- # --- G2P 步驟 ---
101
- # 1. 使用正則表達式來準確地分割單詞,並自動忽略標點符號
102
  target_words_original = re.findall(r"[\w'-]+", target_sentence)
103
- # 2. 將分割好的、乾淨的單詞重新組合,再傳給 phonemizer
104
  cleaned_sentence = " ".join(target_words_original)
105
-
106
- # 3. 呼叫 phonemizer,並將語言設定為 'pt-br' (巴西葡萄牙語)
107
  target_ipa_by_word_str = phonemize(
108
  cleaned_sentence,
109
  language='pt-br',
110
  backend='espeak',
111
- with_stress=True, # 保留重音符號以便後續處理
112
  strip=True
113
  ).split()
114
-
115
- # 4. 確保單詞列表和音素列表的長度一致,以防 G2P 工具出錯
116
  if len(target_words_original) != len(target_ipa_by_word_str):
117
- print(f"警告單詞數量 ({len(target_words_original)}) 與 G2P 結果數量 ({len(target_ipa_by_word_str)}) 不匹配。將進行截斷處理。")
118
  min_len = min(len(target_words_original), len(target_ipa_by_word_str))
119
  target_words_original = target_words_original[:min_len]
120
  target_ipa_by_word_str = target_ipa_by_word_str[:min_len]
121
 
122
- # 5. 清理 G2P 輸出的音素,並使用我們為葡萄牙語定製切分函數
123
  target_ipa_by_word = [
124
  _tokenize_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('ː', ''))
125
  for word in target_ipa_by_word_str
126
  ]
127
 
128
- # --- ASR 步驟 ---
129
  try:
130
  speech, sample_rate = sf.read(audio_file_path)
131
- if len(speech) == 0:
132
- print("警告: 音訊檔案為空。")
133
- user_ipa_full = ""
134
- else:
135
- if sample_rate != 16000:
136
- speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
137
-
138
- input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
139
- input_values = input_values.to(DEVICE)
140
- with torch.no_grad():
141
- logits = model(input_values).logits
142
- predicted_ids = torch.argmax(logits, dim=-1)
143
- # 解碼後,移除模型可能產生的分隔符 '|'
144
- user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '')
145
-
146
  except Exception as e:
147
  raise IOError(f"讀取或處理音訊時發生錯誤: {e}")
148
 
149
- # --- 對齊與格式化步驟 (與英文版邏輯完全相同) ---
 
 
 
 
 
 
 
 
 
150
  word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
151
  return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
152
 
153
- # =======================================================================
154
- # 4. 對齊與格式化函數區 (Alignment & Formatting)
155
- # - 【注意】這些函數是語言無關的,直接從英文版複製而來,無需修改。
156
- # =======================================================================
157
 
158
- # -----------------------------------------------------------------------
159
- # 4.1. 對齊函數 (語言無關)
160
- # -----------------------------------------------------------------------
161
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
162
  """
163
- 使用動態規劃執行音素對齊。此函數是語言無關的。
 
164
  """
165
- # 對於 ASR 的輸出,我們也使用相同的、更通用的切分函數
166
  user_phonemes = _tokenize_ipa(user_phoneme_str)
167
 
168
- target_phonemes_flat = [p for word in target_words_ipa_tokenized for p in word]
169
-
170
- # 如果目標音素為空 (例如,輸入句子只有標點符號),返回空對齊
171
- if not target_phonemes_flat:
172
- return []
173
-
174
- word_boundaries_indices = np.cumsum([len(word) for word in target_words_ipa_tokenized]) - 1
175
 
176
  dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
177
  for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
@@ -187,35 +163,32 @@ def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized
187
  cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
188
  if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
189
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
190
- elif i > 0 and (j == 0 or dp[i][j] == dp[i-1][j] + 1):
191
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
192
- elif j > 0 and (i == 0 or dp[i][j] == dp[i][j-1] + 1):
193
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
194
- else: break
195
 
196
  alignments_by_word = []
197
  word_start_idx_in_path = 0
198
  target_phoneme_counter_in_path = 0
199
- word_boundary_iter = iter(word_boundaries_indices)
200
- current_word_boundary = next(word_boundary_iter, -1)
201
  for path_idx, p in enumerate(target_path):
202
  if p != '-':
203
- if target_phoneme_counter_in_path == current_word_boundary:
204
  alignments_by_word.append({
205
  "target": target_path[word_start_idx_in_path : path_idx + 1],
206
  "user": user_path[word_start_idx_in_path : path_idx + 1]
207
  })
208
  word_start_idx_in_path = path_idx + 1
209
- current_word_boundary = next(word_boundary_iter, -1)
210
  target_phoneme_counter_in_path += 1
 
211
  return alignments_by_word
212
 
213
- # -----------------------------------------------------------------------
214
- # 4.2. 格式化函數 (語言無關)
215
- # -----------------------------------------------------------------------
216
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
217
  """
218
- 將對齊結果格式化為最終的 JSON 結構。此函數是語言無關的。
 
219
  """
220
  total_phonemes, total_errors, correct_words_count = 0, 0, 0
221
  words_data = []
@@ -226,25 +199,24 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
226
  word_is_correct = True
227
  phonemes_data = []
228
 
229
- # 增加一個健壯性檢查,以防對齊演算法返回長度不一的列表
230
- min_len = min(len(alignment.get('target', [])), len(alignment.get('user', [])))
231
- for j in range(min_len):
232
- target_phoneme, user_phoneme = alignment['target'][j], alignment['user'][j]
233
  is_match = (user_phoneme == target_phoneme)
234
  phonemes_data.append({"target": target_phoneme, "user": user_phoneme, "isMatch": is_match})
235
  if not is_match:
236
  word_is_correct = False
237
  if not (user_phoneme == '-' and target_phoneme == '-'): total_errors += 1
238
 
239
- if word_is_correct and min_len > 0: correct_words_count += 1
240
-
 
241
  words_data.append({"word": original_words[i], "isCorrect": word_is_correct, "phonemes": phonemes_data})
242
- total_phonemes += sum(1 for p in alignment.get('target', []) if p != '-')
243
 
244
- # 【Fuse Logic】處理使用者漏講了單詞的情況
245
  if len(alignments) < len(original_words):
246
  for i in range(len(alignments), len(original_words)):
247
- # 【關鍵修改 4】確保這裡也使用 'pt-br'
248
  missed_word_ipa_str = phonemize(original_words[i], language='pt-br', backend='espeak', strip=True).replace('ː', '')
249
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
250
  phonemes_data = []
@@ -270,4 +242,4 @@ def _format_to_json_structure(alignments, sentence, original_words) -> dict:
270
  "total_target_phonemes": total_phonemes
271
  },
272
  "words": words_data
273
- }
 
1
  # =======================================================================
2
+ # analyzer/ASR_pt_br.py
3
+ # 巴西葡萄牙語發音分析器
4
+ # 版本:v2.0 (與 en_us.py 邏輯對齊)
5
+ # 描述:此版本完全遵循 en_us.py 的程式碼結構和算法實現,
6
+ # 僅在語言特定配置(模型名稱、G2P語言)上有所不同,
7
+ # 並採用了更健壯的、基於 Unicode 的 IPA 切分方法以適應葡萄牙語。
8
  # =======================================================================
9
+
10
+ # --- 1. 匯入區 (與 en_us.py 保持一致) ---
11
  import torch
12
  import soundfile as sf
13
  import librosa
 
16
  from phonemizer import phonemize
17
  import numpy as np
18
  from datetime import datetime, timezone
19
+ import unicodedata # 【保留】這是處理葡萄牙語鼻音等音素的更優方案
20
+ import re # 【保留】用於更準確地切分單詞
21
 
22
+ # --- 2. 全域設定與模型載入 ---
 
 
 
23
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
  print(f"INFO: ASR_pt_br.py is configured to use device: {DEVICE}")
25
 
26
+ # 【關鍵修改 1:設定為葡萄牙語 ASR 模型】
27
  MODEL_NAME = "caiocrocha/wav2vec2-large-xlsr-53-phoneme-portuguese"
28
 
29
  processor = None
30
  model = None
31
 
 
 
 
 
 
 
 
 
32
  def load_model():
33
  """
34
  載入葡萄牙語 ASR 模型和對應的處理器。
35
+ (此函數邏輯與 en_us.py 完全相同)
36
  """
37
  global processor, model
38
  if processor and model:
 
41
 
42
  print(f"正在準備 ASR 模型 '{MODEL_NAME}'...")
43
  try:
 
44
  processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
45
  model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
46
  model.to(DEVICE)
 
50
  print(f"處理或載入模型 '{MODEL_NAME}' 時發生錯誤: {e}")
51
  raise RuntimeError(f"Failed to load model '{MODEL_NAME}': {e}")
52
 
53
+ # --- 3. 智能 IPA 切分函數 ---
54
+ # 【關鍵修改 2:保留更優越的通用切分邏輯】
55
+ # 為了正確處理葡萄牙語的鼻化元音 (如 ɐ̃) 和塞擦音 (如 dʒ),
56
+ # 必須保留這個比英文版更強大的切分函數。
57
  def _tokenize_ipa(ipa_string: str) -> list:
58
  """
59
+ 將 IPA 字串智能地切分為音素列表,能正確處理帶有附加符號的組合字符
 
60
  """
61
  phonemes = []
 
62
  s = ipa_string.replace(' ', '')
63
  i = 0
64
  while i < len(s):
65
+ # 優先處理葡萄牙語中常見的雙字塞擦音
66
  if i + 1 < len(s) and s[i:i+2] in {'dʒ', 'tʃ'}:
67
  phonemes.append(s[i:i+2])
68
  i += 2
69
  continue
70
 
71
+ # 處理基礎字符及其後續的非間距標記 (例如鼻化符 ~)
 
72
  current_char = s[i]
73
  i += 1
74
  while i < len(s) and unicodedata.category(s[i]) == 'Mn':
75
  current_char += s[i]
76
  i += 1
77
  phonemes.append(current_char)
 
78
  return phonemes
79
 
80
+ # --- 4. 核心分析函數 (主入口) ---
 
 
 
def analyze(audio_file_path: str, target_sentence: str) -> dict:
    """
    Analyse the pronunciation of a Brazilian Portuguese sentence.

    The target sentence is converted to per-word IPA with espeak ('pt-br'),
    the audio is transcribed to phonemes by the loaded wav2vec2 model, and
    the two phoneme sequences are aligned word by word.

    Args:
        audio_file_path: Path of the audio file to read with soundfile.
        target_sentence: The sentence the speaker was asked to say.

    Returns:
        The dictionary produced by ``_format_to_json_structure``.

    Raises:
        RuntimeError: If ``load_model()`` has not populated the module-level
            ``processor`` and ``model``.
        IOError: If the audio cannot be read, down-mixed or resampled.
    """
    if processor is None or model is None:
        raise RuntimeError("模型尚未載入。請確保在呼叫 analyze 之前已成功執行 load_model()。")

    # 1. Prepare the target phonemes (G2P).
    target_words_original = re.findall(r"[\w'-]+", target_sentence)
    cleaned_sentence = " ".join(target_words_original)

    # The G2P language is 'pt-br'; stress marks are requested here and
    # stripped again below so they never take part in the comparison.
    target_ipa_by_word_str = phonemize(
        cleaned_sentence,
        language='pt-br',
        backend='espeak',
        with_stress=True,
        strip=True
    ).split()

    # espeak may merge or split words; truncate both lists to the common
    # length so that every remaining word has an IPA counterpart.
    if len(target_words_original) != len(target_ipa_by_word_str):
        print(f"警告: G2P 後單詞數量 ({len(target_ipa_by_word_str)}) 與原始單詞數量 ({len(target_words_original)}) 不匹配。將進行截斷。")
        min_len = min(len(target_words_original), len(target_ipa_by_word_str))
        target_words_original = target_words_original[:min_len]
        target_ipa_by_word_str = target_ipa_by_word_str[:min_len]

    # Drop primary/secondary stress and length marks before tokenising.
    target_ipa_by_word = [
        _tokenize_ipa(word.replace('ˈ', '').replace('ˌ', '').replace('ː', ''))
        for word in target_ipa_by_word_str
    ]

    # 2. Load the audio and run speech recognition (ASR).
    try:
        speech, sample_rate = sf.read(audio_file_path)
        # soundfile returns a (frames, channels) array for multi-channel
        # files. Down-mix to mono first: otherwise librosa would resample
        # along the channel axis and the processor would receive 2-D input.
        if speech.ndim > 1:
            speech = speech.mean(axis=1)
        if sample_rate != 16000:
            speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
    except Exception as e:
        # Chain the cause so the underlying decoder error stays visible.
        raise IOError(f"讀取或處理音訊時發生錯誤: {e}") from e

    input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
    input_values = input_values.to(DEVICE)
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Clean the model output the same way as the target: remove the word
    # delimiter '|' and the length mark.
    user_ipa_full = processor.decode(predicted_ids[0]).replace('|', '').replace('ː', '')

    # 3. Align and format the result.
    word_alignments = _get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
    return _format_to_json_structure(word_alignments, target_sentence, target_words_original)
134
 
 
 
 
 
135
 
136
+ # --- 5. 對齊函數 (與 en_us.py 的實現邏輯完全對齊) ---
 
 
137
  def _get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa_tokenized):
138
  """
139
+ 使用動態規劃執行音素對齊。
140
+ (此函數實現與 en_us.py 完全相同)
141
  """
 
142
  user_phonemes = _tokenize_ipa(user_phoneme_str)
143
 
144
+ target_phonemes_flat = []
145
+ word_boundaries_indices = []
146
+ current_idx = 0
147
+ for word_ipa_tokens in target_words_ipa_tokenized:
148
+ target_phonemes_flat.extend(word_ipa_tokens)
149
+ current_idx += len(word_ipa_tokens)
150
+ word_boundaries_indices.append(current_idx - 1)
151
 
152
  dp = np.zeros((len(user_phonemes) + 1, len(target_phonemes_flat) + 1))
153
  for i in range(1, len(user_phonemes) + 1): dp[i][0] = i
 
163
  cost = float('inf') if i == 0 or j == 0 else (0 if user_phonemes[i-1] == target_phonemes_flat[j-1] else 1)
164
  if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + cost:
165
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, target_phonemes_flat[j-1]); i -= 1; j -= 1
166
+ elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
167
  user_path.insert(0, user_phonemes[i-1]); target_path.insert(0, '-'); i -= 1
168
+ else:
169
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
 
170
 
171
  alignments_by_word = []
172
  word_start_idx_in_path = 0
173
  target_phoneme_counter_in_path = 0
174
+
 
175
  for path_idx, p in enumerate(target_path):
176
  if p != '-':
177
+ if target_phoneme_counter_in_path in word_boundaries_indices:
178
  alignments_by_word.append({
179
  "target": target_path[word_start_idx_in_path : path_idx + 1],
180
  "user": user_path[word_start_idx_in_path : path_idx + 1]
181
  })
182
  word_start_idx_in_path = path_idx + 1
 
183
  target_phoneme_counter_in_path += 1
184
+
185
  return alignments_by_word
186
 
187
+ # --- 6. 格式化函數 (與 en_us.py 的實現邏輯完全對齊) ---
 
 
188
  def _format_to_json_structure(alignments, sentence, original_words) -> dict:
189
  """
190
+ 將對齊結果格式化為最終的 JSON 結構。
191
+ (此函數實現與 en_us.py 完全相同,僅 G2P 語言設定不同)
192
  """
193
  total_phonemes, total_errors, correct_words_count = 0, 0, 0
194
  words_data = []
 
199
  word_is_correct = True
200
  phonemes_data = []
201
 
202
+ for j in range(len(alignment['target'])):
203
+ target_phoneme = alignment['target'][j]
204
+ user_phoneme = alignment['user'][j]
 
205
  is_match = (user_phoneme == target_phoneme)
206
  phonemes_data.append({"target": target_phoneme, "user": user_phoneme, "isMatch": is_match})
207
  if not is_match:
208
  word_is_correct = False
209
  if not (user_phoneme == '-' and target_phoneme == '-'): total_errors += 1
210
 
211
+ if word_is_correct:
212
+ correct_words_count += 1
213
+
214
  words_data.append({"word": original_words[i], "isCorrect": word_is_correct, "phonemes": phonemes_data})
215
+ total_phonemes += sum(1 for p in alignment['target'] if p != '-')
216
 
 
217
  if len(alignments) < len(original_words):
218
  for i in range(len(alignments), len(original_words)):
219
+ # 【關鍵修改 6:確保此處的 G2P 語言和符號清理也保持一致】
220
  missed_word_ipa_str = phonemize(original_words[i], language='pt-br', backend='espeak', strip=True).replace('ː', '')
221
  missed_word_ipa = _tokenize_ipa(missed_word_ipa_str)
222
  phonemes_data = []
 
242
  "total_target_phonemes": total_phonemes
243
  },
244
  "words": words_data
245
+ }