HK0712 committed on
Commit
6bd5659
·
0 Parent(s):

initial commit

Browse files
Files changed (6) hide show
  1. .devcontainer/devcontainer.json +26 -0
  2. .gitignore +16 -0
  3. ASR.py +140 -0
  4. Dockerfile +25 -0
  5. cmudict_ipa.json +0 -0
  6. requirements.txt +5 -0
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // For format details, see https://aka.ms/devcontainer.json. For config options, see the
2
+ // README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
3
+ {
4
+ "name": "Existing Dockerfile",
5
+ "build": {
6
+ // Sets the run context to one level up instead of the .devcontainer folder.
7
+ "context": "..",
8
+ // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
9
+ "dockerfile": "../Dockerfile"
10
+ }
11
+
12
+ // Features to add to the dev container. More info: https://containers.dev/features.
13
+ // "features": {},
14
+
15
+ // Use 'forwardPorts' to make a list of ports inside the container available locally.
16
+ // "forwardPorts": [],
17
+
18
+ // Uncomment the next line to run commands after the container is created.
19
+ // "postCreateCommand": "cat /etc/os-release",
20
+
21
+ // Configure tool-specific properties.
22
+ // "customizations": {},
23
+
24
+ // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
25
+ // "remoteUser": "devcontainer"
26
+ }
.gitignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 忽略 Python 虛擬環境
2
+ venv/
3
+
4
+ # 忽略 VS Code 的設定
5
+ .vscode/
6
+
7
+ # 忽略 Python 的快取檔案
8
+ __pycache__/
9
+ *.pyc
10
+
11
+ # 忽略下載的本地模型 (非常重要,因為它太大了!)
12
+ ASRs/
13
+
14
+ # 忽略音訊檔案 (如果它們只是測試用的話)
15
+ TestAudio/
16
+ *.wav
ASR.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import soundfile as sf
3
+ import librosa
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+ import os
6
+ from phonemizer import phonemize
7
+ import numpy as np
8
+
9
+ # --- Sections 1-4 (global settings, model loading, target phonemes, recognition) are included in full below ---
10
+ #
11
# --- 1. Global settings ---
# Sentence the speaker is expected to say; it is phonemized into the target IPA.
TARGET_SENTENCE = "how was your day"
# Recording of the speaker that will be transcribed and compared to the target.
AUDIO_FILE_PATH = "./TestAudio/hello.wav"
# Hugging Face Hub identifier of the CTC model.
MODEL_NAME = "MultiBridge/wav2vec-LnNor-IPA-ft"
# Local directory where the model and processor are cached after first download.
MODEL_SAVE_PATH = "./ASRs/MultiBridge-wav2vec-LnNor-IPA-ft-local"

# --- 2. Load the model and processor ---
print(f"正在準備模型 '{MODEL_NAME}'...")
try:
    if not os.path.exists(MODEL_SAVE_PATH):
        # First run: download from the Hub and persist a local copy so later
        # runs can load from MODEL_SAVE_PATH.
        print("本地找不到模型,正在從 Hugging Face 下載並儲存...")
        processor_to_save = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
        model_to_save = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
        processor_to_save.save_pretrained(MODEL_SAVE_PATH)
        model_to_save.save_pretrained(MODEL_SAVE_PATH)
        print("模型已成功下載並儲存。")
    else:
        print(f"在 '{MODEL_SAVE_PATH}' 中找到本地模型。")
    # Always load from the local copy, whether it was just saved or already present.
    processor = Wav2Vec2Processor.from_pretrained(MODEL_SAVE_PATH)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_SAVE_PATH)
    print("模型和處理器載入成功!")
except Exception as e:
    print(f"處理或載入模型時發生錯誤: {e}")
    # Exit with a non-zero status so callers (shell, CI, Docker) can detect the
    # failure; the site-provided exit() with no argument would report success.
    raise SystemExit(1)
35
+
36
# --- 3. Prepare the target phonemes ---
print("正在準備目標音標...")
# One IPA string per word of TARGET_SENTENCE (stress marks included).
target_ipa_by_word = phonemize(
    TARGET_SENTENCE, language='en-us', backend='espeak', with_stress=True
).split()

# --- 4. Read the audio and run a simple recognition pass ---
print(f"正在讀取音訊檔案: {AUDIO_FILE_PATH}...")
try:
    speech, sample_rate = sf.read(AUDIO_FILE_PATH)
    # soundfile returns a (frames, channels) array for multi-channel files.
    # Down-mix to mono so resampling runs along the time axis and the
    # processor receives a 1-D waveform.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    # The processor is called with sampling_rate=16000 below, so bring the
    # waveform to that rate first.
    if sample_rate != 16000:
        speech = librosa.resample(y=speech, orig_sr=sample_rate, target_sr=16000)
except Exception as e:
    print(f"讀取或處理音訊時發生錯誤: {e}")
    # Non-zero exit status so the failure is visible to the calling process.
    raise SystemExit(1)

print("正在辨識用戶的實際發音...")
input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values
with torch.no_grad():
    logits = model(input_values).logits
# Greedy decoding: take the arg-max token at every frame, then let the
# processor turn the id sequence into a string.
predicted_ids = torch.argmax(logits, dim=-1)
user_ipa_full = processor.decode(predicted_ids[0])
57
+
58
+
59
+ # --- 5. Core function: return the detailed alignment path, split by word ---
60
def get_phoneme_alignments_by_word(user_phoneme_str, target_words_ipa):
    """Align the user's phonemes to the target phonemes and group them by word.

    Both sides are compared character by character: spaces are removed from
    ``user_phoneme_str`` and the stress marks ``ˈ`` / ``ˌ`` are removed from
    each target word before splitting into single characters. An edit-distance
    (insert / delete / substitute, each cost 1) alignment is computed and then
    cut at the target word boundaries.

    Args:
        user_phoneme_str: Recognized phoneme string for the whole utterance.
        target_words_ipa: Iterable of IPA strings, one per target word.

    Returns:
        A list with one dict per non-empty target word, in order. Each dict
        has two equally long lists, ``"target"`` and ``"user"``; ``'-'`` marks
        a gap (a target phoneme the user omitted, or an extra user phoneme).
        Extra user phonemes between words are attached to the following word;
        extra phonemes after the last word are attached to the last word.
        Returns ``[]`` when there are no target phonemes.
    """
    user_phonemes = list(user_phoneme_str.replace(' ', ''))

    # Flatten the target words and remember, as cumulative phoneme counts,
    # where each word ends.
    target_phonemes_flat = []
    word_boundaries = set()
    for word_ipa in target_words_ipa:
        target_phonemes_flat.extend(word_ipa.replace('ˌ', '').replace('ˈ', ''))
        word_boundaries.add(len(target_phonemes_flat))

    n_user = len(user_phonemes)
    n_target = len(target_phonemes_flat)

    # dp[i][j] = edit distance between the first i user phonemes and the
    # first j target phonemes. Integer dtype keeps the equality checks in the
    # traceback exact.
    dp = np.zeros((n_user + 1, n_target + 1), dtype=int)
    for i in range(1, n_user + 1):
        dp[i][0] = i
    for j in range(1, n_target + 1):
        dp[0][j] = j
    for i in range(1, n_user + 1):
        for j in range(1, n_target + 1):
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)

    # Trace back from the bottom-right corner. Preference order on ties:
    # match/substitution, then extra user phoneme, then missing target phoneme.
    # The paths are built back-to-front and reversed once at the end.
    i, j = n_user, n_target
    user_path, target_path = [], []
    while i > 0 or j > 0:
        if i > 0 and j > 0:
            cost = 0 if user_phonemes[i - 1] == target_phonemes_flat[j - 1] else 1
            if dp[i][j] == dp[i - 1][j - 1] + cost:
                user_path.append(user_phonemes[i - 1])
                target_path.append(target_phonemes_flat[j - 1])
                i -= 1
                j -= 1
                continue
        if i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            user_path.append(user_phonemes[i - 1])
            target_path.append('-')
            i -= 1
        else:
            user_path.append('-')
            target_path.append(target_phonemes_flat[j - 1])
            j -= 1
    user_path.reverse()
    target_path.reverse()

    # Cut the alignment into words. A cut happens only at the step that
    # consumes the last target phoneme of a word, so gap columns ('-' on the
    # target side) never open a spurious group of their own.
    alignments_by_word = []
    group_start = 0
    target_phoneme_count = 0
    for idx, phoneme in enumerate(target_path):
        if phoneme == '-':
            continue
        target_phoneme_count += 1
        if target_phoneme_count in word_boundaries:
            alignments_by_word.append({
                "target": target_path[group_start:idx + 1],
                "user": user_path[group_start:idx + 1],
            })
            group_start = idx + 1

    # Anything left after the final boundary is extra user phonemes; keep them
    # visible by attaching them to the last word.
    if alignments_by_word and group_start < len(target_path):
        alignments_by_word[-1]["target"].extend(target_path[group_start:])
        alignments_by_word[-1]["user"].extend(user_path[group_start:])

    return alignments_by_word
108
+
109
+ # --- 6. Final output function: prints the formatted target/user comparison ---
110
def format_and_print_final_version(alignments):
    """Print the per-word alignments as two lines, target above user.

    Each word is rendered as ``[ p1 p2 ... ]``. Within a word, every column
    is padded to the wider of its target and user entries so the two lines
    match column for column.

    Args:
        alignments: List of dicts with ``'target'`` and ``'user'`` lists, as
            produced by the alignment step.
    """
    def bracketed(phonemes, widths):
        # Pad each entry to its column width and wrap the word in brackets.
        padded = [entry.ljust(widths[col]) for col, entry in enumerate(phonemes)]
        return "[ " + " ".join(padded) + " ]"

    target_chunks = []
    user_chunks = []
    for word in alignments:
        target_side = word['target']
        user_side = word['user']
        # Column width = longest entry on either side of that column.
        widths = [max(len(t), len(u)) for t, u in zip(target_side, user_side)]
        target_chunks.append(bracketed(target_side, widths))
        user_chunks.append(bracketed(user_side, widths))

    # Emit the two finished lines.
    print("Target : " + " ".join(target_chunks))
    print("User : " + " ".join(user_chunks))
129
+
130
+
131
# --- Main flow ---
print("正在進行音素級對齊...")
word_alignments = get_phoneme_alignments_by_word(user_ipa_full, target_ipa_by_word)

# 60-character rule used above and below the report.
banner = "=" * 60
print(f"\n{banner}")
print(" 發音對比分析結果")
print(banner)
print(f"Sentence: {TARGET_SENTENCE}\n")
format_and_print_final_version(word_alignments)
print(banner)
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. 選擇一個包含 Python 的官方 Linux 映像
2
+ FROM python:3.10-slim
3
+
4
+ # 2. 設定容器內的工作目錄
5
+ WORKDIR /app
6
+
7
+ # 3. 安裝系統級依賴 (最關鍵的一步:安裝 espeak-ng 和其他工具)
8
+ # -y 自動回答 'yes'
9
+ # --no-install-recommends 避免安裝不必要的建議套件,保持映像檔小巧
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ espeak-ng \
12
+ libsndfile1 \
13
+ ffmpeg \
14
+ wget && \
15
+ rm -rf /var/lib/apt/lists/*
16
+
17
+ # 4. 複製 requirements.txt 檔案到容器中並安裝 Python 套件
18
+ COPY requirements.txt .
19
+ RUN pip install --no-cache-dir -r requirements.txt
20
+
21
+ # 5. 將專案中的所有其他檔案複製到容器中
22
+ COPY . .
23
+
24
+ # 這行是可選的,它設定了當容器直接執行時的預設命令
25
+ # CMD ["python", "your_script.py"]
cmudict_ipa.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ soundfile
3
+ librosa
4
+ transformers
5
+ phonemizer