HK0712 committed on
Commit
298fa73
·
1 Parent(s): 6bd5659

updated output format

Browse files
Files changed (2) hide show
  1. ASR.py +77 -28
  2. requirements.txt +4 -1
ASR.py CHANGED
@@ -5,16 +5,19 @@ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
  import os
6
  from phonemizer import phonemize
7
  import numpy as np
 
 
 
 
 
8
 
9
- # --- 1, 2, 3, 4 部分與之前版本完全相同,此處省略以保持簡潔 ---
10
- # ...
11
  # --- 1. 全域設定 ---
12
  TARGET_SENTENCE = "how was your day"
13
  AUDIO_FILE_PATH = "./TestAudio/hello.wav"
14
  MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
15
  MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"
16
 
17
- # --- 2. 載入模型和處理器 ---
18
  print(f"正在準備模型 '{MODEL_NAME}'...")
19
  try:
20
  if not os.path.exists(MODEL_SAVE_PATH):
@@ -33,13 +36,15 @@ except Exception as e:
33
  print(f"處理或載入模型時發生錯誤: {e}")
34
  exit()
35
 
36
- # --- 3. 準備目標音標 (Target) ---
37
  print("正在準備目標音標...")
38
- target_ipa_by_word = phonemize(
39
- TARGET_SENTENCE, language='en-us', backend='espeak', with_stress=True
40
- ).split()
 
 
41
 
42
- # --- 4. 讀取音訊並進行簡單辨識 ---
43
  print(f"正在讀取音訊檔案: {AUDIO_FILE_PATH}...")
44
  try:
45
  speech, sample_rate = sf.read(AUDIO_FILE_PATH)
@@ -56,14 +61,14 @@ predicted_ids = torch.argmax(logits, dim=-1)
56
  user_ipa_full = processor.decode(predicted_ids[0])
57
 
58
 
59
- # --- 5. 核心函式:返回按單詞分割的詳細對齊路徑 (與之前版本相同) ---
60
  def get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa):
61
  user_phonemes = list(user_phoneme_str.replace(' ', ''))
62
  target_phonemes_flat = []
63
  word_boundaries = []
64
  current_idx = 0
65
  for word_ipa in target_words_ipa:
66
- phonemes = list(word_ipa.replace('ˌ', '').replace('ˈ', ''))
67
  target_phonemes_flat.extend(phonemes)
68
  current_idx += len(phonemes)
69
  word_boundaries.append(current_idx)
@@ -88,7 +93,7 @@ def get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa):
88
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
89
 
90
  alignments_by_word = []
91
- user_word_start_idx = 0
92
  target_phoneme_count = 0
93
 
94
  for i, phoneme in enumerate(target_path):
@@ -96,45 +101,89 @@ def get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa):
96
  target_phoneme_count += 1
97
 
98
  if target_phoneme_count in word_boundaries:
99
- target_alignment = target_path[user_word_start_idx:i+1]
100
- user_alignment = user_path[user_word_start_idx:i+1]
101
  alignments_by_word.append({
102
  "target": target_alignment,
103
  "user": user_alignment
104
  })
105
- user_word_start_idx = i + 1
106
 
107
  return alignments_by_word
108
 
109
- # --- 6. 最終的、格式完美的輸出函式 ---
110
- def format_and_print_final_version(alignments):
 
 
 
 
111
  target_line_parts = []
112
  user_line_parts = []
113
 
114
  for alignment in alignments:
115
- # 為每個單詞的對齊計算最大寬度
 
116
  max_lens = [max(len(t), len(u)) for t, u in zip(alignment['target'], alignment['user'])]
117
 
118
- # 格式化 Target 部分
119
- target_word_parts = [phoneme.ljust(max_lens[i]) for i, phoneme in enumerate(alignment['target'])]
120
  target_line_parts.append(f"[ {' '.join(target_word_parts)} ]")
121
 
122
- # 格式化 User 部分
123
- user_word_parts = [phoneme.ljust(max_lens[i]) for i, phoneme in enumerate(alignment['user'])]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  user_line_parts.append(f"[ {' '.join(user_word_parts)} ]")
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- # 組合並列印最終結果
 
 
 
 
 
 
127
  print(f"Target : {' '.join(target_line_parts)}")
128
  print(f"User : {' '.join(user_line_parts)}")
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  # --- 主流程 ---
132
  print("正在進行音素級對齊...")
133
  word_alignments = get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
134
 
135
- print("\n" + "="*60)
136
- print(" 發音對比分析結果")
137
- print("="*60)
138
- print(f"Sentence: {TARGET_SENTENCE}\n")
139
- format_and_print_final_version(word_alignments)
140
- print("="*60)
 
5
  import os
6
  from phonemizer import phonemize
7
  import numpy as np
8
+ from datetime import datetime
9
+ from colorama import init, Fore, Style
10
+
11
+ # 初始化 colorama
12
+ init(autoreset=True)
13
 
 
 
14
  # --- 1. 全域設定 ---
15
  TARGET_SENTENCE = "how was your day"
16
  AUDIO_FILE_PATH = "./TestAudio/hello.wav"
17
  MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
18
  MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"
19
 
20
+ # --- 2. 載入模型和處理器 (保持不變) ---
21
  print(f"正在準備模型 '{MODEL_NAME}'...")
22
  try:
23
  if not os.path.exists(MODEL_SAVE_PATH):
 
36
  print(f"處理或載入模型時發生錯誤: {e}")
37
  exit()
38
 
39
+ # --- 3. 準備目標音標 (Target) - (已修改) ---
40
  print("正在準備目標音標...")
41
+ # 在這一步就徹底移除重音符號,得到最乾淨的目標音標列表
42
+ target_ipa_by_word = [
43
+ word.replace('ˌ', '').replace('ˈ', '')
44
+ for word in phonemize(TARGET_SENTENCE, language='en-us', backend='espeak', with_stress=True).split()
45
+ ]
46
 
47
+ # --- 4. 讀取音訊並進行辨識 (保持不變) ---
48
  print(f"正在讀取音訊檔案: {AUDIO_FILE_PATH}...")
49
  try:
50
  speech, sample_rate = sf.read(AUDIO_FILE_PATH)
 
61
  user_ipa_full = processor.decode(predicted_ids[0])
62
 
63
 
64
+ # --- 5. 核心函式:現在處理的都是乾淨的音標,邏輯保持不變 ---
65
  def get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa):
66
  user_phonemes = list(user_phoneme_str.replace(' ', ''))
67
  target_phonemes_flat = []
68
  word_boundaries = []
69
  current_idx = 0
70
  for word_ipa in target_words_ipa:
71
+ phonemes = list(word_ipa) # 已經是乾淨的音標
72
  target_phonemes_flat.extend(phonemes)
73
  current_idx += len(phonemes)
74
  word_boundaries.append(current_idx)
 
93
  user_path.insert(0, '-'); target_path.insert(0, target_phonemes_flat[j-1]); j -= 1
94
 
95
  alignments_by_word = []
96
+ word_start_idx = 0
97
  target_phoneme_count = 0
98
 
99
  for i, phoneme in enumerate(target_path):
 
101
  target_phoneme_count += 1
102
 
103
  if target_phoneme_count in word_boundaries:
104
+ target_alignment = target_path[word_start_idx:i+1]
105
+ user_alignment = user_path[word_start_idx:i+1]
106
  alignments_by_word.append({
107
  "target": target_alignment,
108
  "user": user_alignment
109
  })
110
+ word_start_idx = i + 1
111
 
112
  return alignments_by_word
113
 
114
+ # --- 6. 格式化輸出函式 (已簡化) ---
115
def format_and_print_final_report(alignments):
    """Print a colorized per-word phoneme comparison followed by a summary.

    Args:
        alignments: list of dicts, one per target word, each with the keys
            ``"target"`` and ``"user"`` holding equal-length lists of aligned
            phoneme strings. ``'-'`` marks a gap on either side.

    Returns:
        None. The report is written to stdout.

    Every aligned position where the user phoneme differs from the target
    phoneme (substitution, omission or insertion) counts as exactly one
    error. A word counts as correct only if all of its positions match.
    The phoneme error rate is errors divided by the number of non-gap
    target phonemes.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    total_phonemes = 0
    total_errors = 0
    correct_words = 0

    target_line_parts = []
    user_line_parts = []

    for alignment in alignments:
        target_seq = alignment['target']
        user_seq = alignment['user']

        # Column width per aligned position so both rows line up.
        widths = [max(len(t), len(u)) for t, u in zip(target_seq, user_seq)]

        target_word_parts = [p.ljust(w) for p, w in zip(target_seq, widths)]
        target_line_parts.append(f"[ {' '.join(target_word_parts)} ]")

        word_errors = 0
        user_word_parts = []
        for user_phoneme, target_phoneme, width in zip(user_seq, target_seq, widths):
            is_match = (user_phoneme == target_phoneme)
            if not is_match:
                # Substitution, omission ('-' on the user side) and insertion
                # ('-' on the target side) all cost one error each.
                word_errors += 1

            color = Fore.GREEN if is_match else Fore.RED
            user_word_parts.append(f"{color}{user_phoneme.ljust(width)}{Style.RESET_ALL}")

        user_line_parts.append(f"[ {' '.join(user_word_parts)} ]")

        total_errors += word_errors
        if word_errors == 0:
            correct_words += 1

        # Gaps in the target row are insertions, not real target phonemes.
        total_phonemes += sum(1 for p in target_seq if p != '-')

    # --- Statistics ---
    total_words = len(alignments)
    incorrect_words = total_words - correct_words
    overall_score = (correct_words / total_words) * 100 if total_words > 0 else 0
    phoneme_error_rate = (total_errors / total_phonemes) * 100 if total_phonemes > 0 else 0

    # --- Report ---
    separator = "="*70
    print("\n" + separator)
    print("Pronunciation Analysis".center(70))
    print(separator + "\n")

    print(f"Sentence: {TARGET_SENTENCE}\n")
    print(f"Target : {' '.join(target_line_parts)}")
    print(f"User : {' '.join(user_line_parts)}")

    print("\n" + "-" * 70)
    print("[ Summary ]")
    print("-" * 70)
    print(f"- Overall Score: {overall_score:.1f}%")
    print(f"- Total Words: {total_words}")
    print(f"- Correct Words: {correct_words}")
    print(f"- Incorrect Words: {incorrect_words}")
    print(f"- Phoneme Error Rate: {phoneme_error_rate:.2f}% ({total_errors} errors in {total_phonemes} target phonemes)")
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since 3.12.
    print(f"- Analysis Timestamp: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} (UTC)")

    print("\n" + separator)
183
+
184
 
185
  # --- 主流程 ---
186
  print("正在進行音素級對齊...")
187
  word_alignments = get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)
188
 
189
+ format_and_print_final_report(word_alignments)
 
 
 
 
 
requirements.txt CHANGED
@@ -2,4 +2,7 @@ torch
2
  soundfile
3
  librosa
4
  transformers
5
- phonemizer
 
 
 
 
2
  soundfile
3
  librosa
4
  transformers
5
+ phonemizer
6
+ fastapi
7
+ uvicorn[standard]
8
+ colorama