Spaces:
Sleeping
Sleeping
Commit ·
ab55b54
1
Parent(s): 85a83b0
feat: 語音登入優化 - 錄音時間調整為3秒 & CNN模組整合
Browse files- 將語音登入錄音時間從4秒調整為3秒,提升用戶體驗
- 在inference.py中添加predict_files函數,整合CNN說話者辨識
- 修改VoiceAuthService使用inference.py而非獨立adapter
- 前端語音綁定錄音時間同步調整為3秒
- 更新測試檔案中的模組引用路徑
app.py
CHANGED
|
@@ -219,7 +219,7 @@ async def lifespan(app: FastAPI):
|
|
| 219 |
# 初始化語音登入服務(硬編參數)
|
| 220 |
try:
|
| 221 |
app.state.voice_auth = VoiceAuthService(config=VoiceLoginConfig(
|
| 222 |
-
window_seconds=4,
|
| 223 |
required_windows=1,
|
| 224 |
sample_rate=16000,
|
| 225 |
prob_threshold=0.40,
|
|
|
|
| 219 |
# 初始化語音登入服務(硬編參數)
|
| 220 |
try:
|
| 221 |
app.state.voice_auth = VoiceAuthService(config=VoiceLoginConfig(
|
| 222 |
+
window_seconds=3,
|
| 223 |
required_windows=1,
|
| 224 |
sample_rate=16000,
|
| 225 |
prob_threshold=0.40,
|
models/speaker_identification/scripts/inference.py
CHANGED
|
@@ -78,6 +78,76 @@ def softmax(x):
|
|
| 78 |
return e / e.sum(dim=1, keepdim=True)
|
| 79 |
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# ============== 錄音與前處理(比照 process_audio.py) ==============
|
| 82 |
REC_SR = 22050
|
| 83 |
TARGET_RMS = 0.1
|
|
|
|
| 78 |
return e / e.sum(dim=1, keepdim=True)
|
| 79 |
|
| 80 |
|
| 81 |
+
def predict_files(model_dir, file_list, threshold=0.0):
|
| 82 |
+
"""
|
| 83 |
+
預測多個音訊檔案的說話者。
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
model_dir: 模型目錄,包含 speaker_id_model.pth 和 classes.txt
|
| 87 |
+
file_list: 檔案路徑列表
|
| 88 |
+
threshold: 預測門檻(目前未使用)
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
結果列表,每個元素為字典,包含 'pred', 'score', 'top'
|
| 92 |
+
"""
|
| 93 |
+
device = get_device()
|
| 94 |
+
bundle = torchaudio.pipelines.WAV2VEC2_BASE
|
| 95 |
+
target_sr = bundle.sample_rate
|
| 96 |
+
|
| 97 |
+
model_path = os.path.join(model_dir, 'speaker_id_model.pth')
|
| 98 |
+
classes_path = os.path.join(model_dir, 'classes.txt')
|
| 99 |
+
processed_dir = os.path.join(model_dir, 'processed_audio')
|
| 100 |
+
|
| 101 |
+
if os.path.isfile(classes_path):
|
| 102 |
+
classes = load_classes(classes_path)
|
| 103 |
+
elif os.path.isdir(processed_dir):
|
| 104 |
+
classes = load_classes(processed_dir)
|
| 105 |
+
else:
|
| 106 |
+
raise FileNotFoundError(f"找不到類別定義:{classes_path} 或 {processed_dir}")
|
| 107 |
+
|
| 108 |
+
num_classes = len(classes)
|
| 109 |
+
|
| 110 |
+
model = Wav2Vec2SpeakerClassifier(bundle, num_classes)
|
| 111 |
+
state = torch.load(model_path, map_location='cpu')
|
| 112 |
+
model.load_state_dict(state)
|
| 113 |
+
model.to(device).eval()
|
| 114 |
+
|
| 115 |
+
results = []
|
| 116 |
+
for file_path in file_list:
|
| 117 |
+
try:
|
| 118 |
+
# 前處理音訊
|
| 119 |
+
y, sr = process_like_training(file_path)
|
| 120 |
+
|
| 121 |
+
# 轉成模型輸入
|
| 122 |
+
y_t = torch.tensor(y, dtype=torch.float32).unsqueeze(0)
|
| 123 |
+
if sr != target_sr:
|
| 124 |
+
resampler = torchaudio.transforms.Resample(sr, target_sr)
|
| 125 |
+
y_t = resampler(y_t)
|
| 126 |
+
length = torch.tensor([y_t.shape[1]], dtype=torch.long)
|
| 127 |
+
waveforms, lengths = y_t.to(device), length.to(device)
|
| 128 |
+
|
| 129 |
+
with torch.no_grad():
|
| 130 |
+
logits = model(waveforms, lengths)
|
| 131 |
+
probs = softmax(logits).squeeze(0).cpu()
|
| 132 |
+
top_prob, top_idx = torch.max(probs, dim=0)
|
| 133 |
+
pred = classes[top_idx.item()]
|
| 134 |
+
|
| 135 |
+
# 獲取 top 候選
|
| 136 |
+
topk = torch.topk(probs, k=min(3, num_classes))
|
| 137 |
+
top = [(classes[i], float(p)) for p, i in zip(topk.values.tolist(), topk.indices.tolist())]
|
| 138 |
+
|
| 139 |
+
result = {
|
| 140 |
+
'pred': pred,
|
| 141 |
+
'score': float(top_prob.item()),
|
| 142 |
+
'top': top
|
| 143 |
+
}
|
| 144 |
+
results.append(result)
|
| 145 |
+
except Exception as e:
|
| 146 |
+
results.append({'error': str(e)})
|
| 147 |
+
|
| 148 |
+
return results
|
| 149 |
+
|
| 150 |
+
|
| 151 |
# ============== 錄音與前處理(比照 process_audio.py) ==============
|
| 152 |
REC_SR = 22050
|
| 153 |
TARGET_RMS = 0.1
|
services/voice_login.py
CHANGED
|
@@ -105,7 +105,7 @@ class VoiceAuthService:
|
|
| 105 |
if str(self.identity_dir) not in os.sys.path:
|
| 106 |
os.sys.path.insert(0, str(self.identity_dir))
|
| 107 |
try:
|
| 108 |
-
from
|
| 109 |
except Exception as e: # pragma: no cover
|
| 110 |
raise RuntimeError(f"載入 CNN 說話者辨識模組失敗:{e}")
|
| 111 |
self._predict_files = _predict_files
|
|
|
|
| 105 |
if str(self.identity_dir) not in os.sys.path:
|
| 106 |
os.sys.path.insert(0, str(self.identity_dir))
|
| 107 |
try:
|
| 108 |
+
from scripts.inference import predict_files as _predict_files # type: ignore
|
| 109 |
except Exception as e: # pragma: no cover
|
| 110 |
raise RuntimeError(f"載入 CNN 說話者辨識模組失敗:{e}")
|
| 111 |
self._predict_files = _predict_files
|
static/frontend/js/websocket.js
CHANGED
|
@@ -783,7 +783,7 @@ async function handleVoiceBindingReady() {
|
|
| 783 |
|
| 784 |
// 更新提示文字
|
| 785 |
if (typeof transcript !== 'undefined') {
|
| 786 |
-
transcript.textContent = '請開始說話(錄音 4 秒)...';
|
| 787 |
transcript.className = 'voice-transcript provisional';
|
| 788 |
}
|
| 789 |
|
|
@@ -814,10 +814,10 @@ async function handleVoiceBindingReady() {
|
|
| 814 |
return;
|
| 815 |
}
|
| 816 |
|
| 817 |
-
console.log('⏱️ 開始倒數 4 秒錄音...');
|
| 818 |
|
| 819 |
// 倒數計時提示
|
| 820 |
-
let countdown = 4;
|
| 821 |
const countdownInterval = setInterval(() => {
|
| 822 |
countdown--;
|
| 823 |
if (countdown > 0 && typeof transcript !== 'undefined') {
|
|
@@ -826,10 +826,10 @@ async function handleVoiceBindingReady() {
|
|
| 826 |
}
|
| 827 |
}, 1000);
|
| 828 |
|
| 829 |
-
// 4 秒後自動停止錄音
|
| 830 |
setTimeout(() => {
|
| 831 |
clearInterval(countdownInterval);
|
| 832 |
-
console.log('⏹️ 4 秒錄音完成,自動停止');
|
| 833 |
|
| 834 |
// 停止音訊視覺化
|
| 835 |
if (typeof stopRealAudioAnalysis === 'function') {
|
|
@@ -852,7 +852,7 @@ async function handleVoiceBindingReady() {
|
|
| 852 |
transcript.className = 'voice-transcript provisional';
|
| 853 |
}
|
| 854 |
|
| 855 |
-
}, 4000); // 4 秒錄音時長
|
| 856 |
} else {
|
| 857 |
console.error('❌ WebSocket 管理器未初始化');
|
| 858 |
showErrorNotification('系統錯誤:WebSocket 未連接');
|
|
|
|
| 783 |
|
| 784 |
// 更新提示文字
|
| 785 |
if (typeof transcript !== 'undefined') {
|
| 786 |
+
transcript.textContent = '請開始說話(錄音 3 秒)...';
|
| 787 |
transcript.className = 'voice-transcript provisional';
|
| 788 |
}
|
| 789 |
|
|
|
|
| 814 |
return;
|
| 815 |
}
|
| 816 |
|
| 817 |
+
console.log('⏱️ 開始倒數 3 秒錄音...');
|
| 818 |
|
| 819 |
// 倒數計時提示
|
| 820 |
+
let countdown = 3;
|
| 821 |
const countdownInterval = setInterval(() => {
|
| 822 |
countdown--;
|
| 823 |
if (countdown > 0 && typeof transcript !== 'undefined') {
|
|
|
|
| 826 |
}
|
| 827 |
}, 1000);
|
| 828 |
|
| 829 |
+
// 3 秒後自動停止錄音
|
| 830 |
setTimeout(() => {
|
| 831 |
clearInterval(countdownInterval);
|
| 832 |
+
console.log('⏹️ 3 秒錄音完成,自動停止');
|
| 833 |
|
| 834 |
// 停止音訊視覺化
|
| 835 |
if (typeof stopRealAudioAnalysis === 'function') {
|
|
|
|
| 852 |
transcript.className = 'voice-transcript provisional';
|
| 853 |
}
|
| 854 |
|
| 855 |
+
}, 3000); // 3 秒錄音時長
|
| 856 |
} else {
|
| 857 |
console.error('❌ WebSocket 管理器未初始化');
|
| 858 |
showErrorNotification('系統錯誤:WebSocket 未連接');
|
tests/services/test_voice_login_cnn.py
CHANGED
|
@@ -27,7 +27,7 @@ def test_voice_login_success_with_cnn_stub(monkeypatch):
|
|
| 27 |
(tmpdir / "classes.txt").write_text("alice\nbob\n", encoding="utf-8")
|
| 28 |
monkeypatch.setenv("VOICE_CNN_MODEL_DIR", str(tmpdir))
|
| 29 |
|
| 30 |
-
# 在 VoiceAuthService 初始化前,先以假模組覆蓋
|
| 31 |
dummy = types.SimpleNamespace(
|
| 32 |
predict_files=lambda model_dir, inputs, threshold=0.0: [{
|
| 33 |
"file": str(inputs[0]),
|
|
@@ -37,7 +37,7 @@ def test_voice_login_success_with_cnn_stub(monkeypatch):
|
|
| 37 |
"is_unknown": False,
|
| 38 |
}]
|
| 39 |
)
|
| 40 |
-
monkeypatch.setitem(sys.modules, '
|
| 41 |
|
| 42 |
svc = VoiceAuthService(config=VoiceLoginConfig(
|
| 43 |
window_seconds=1,
|
|
@@ -68,11 +68,11 @@ def test_voice_login_no_audio_returns_error(monkeypatch):
|
|
| 68 |
(tmpdir / "classes.txt").write_text("alice\nbob\n", encoding="utf-8")
|
| 69 |
monkeypatch.setenv("VOICE_CNN_MODEL_DIR", str(tmpdir))
|
| 70 |
|
| 71 |
-
# 同樣先注入假
|
| 72 |
dummy = types.SimpleNamespace(
|
| 73 |
predict_files=lambda model_dir, inputs, threshold=0.0: []
|
| 74 |
)
|
| 75 |
-
monkeypatch.setitem(sys.modules, '
|
| 76 |
|
| 77 |
svc = VoiceAuthService(config=VoiceLoginConfig(
|
| 78 |
window_seconds=1,
|
|
|
|
| 27 |
(tmpdir / "classes.txt").write_text("alice\nbob\n", encoding="utf-8")
|
| 28 |
monkeypatch.setenv("VOICE_CNN_MODEL_DIR", str(tmpdir))
|
| 29 |
|
| 30 |
+
# 在 VoiceAuthService 初始化前,先以假模組覆蓋 inference,避免實際載入大型相依(如 torchaudio)
|
| 31 |
dummy = types.SimpleNamespace(
|
| 32 |
predict_files=lambda model_dir, inputs, threshold=0.0: [{
|
| 33 |
"file": str(inputs[0]),
|
|
|
|
| 37 |
"is_unknown": False,
|
| 38 |
}]
|
| 39 |
)
|
| 40 |
+
monkeypatch.setitem(sys.modules, 'scripts.inference', dummy)
|
| 41 |
|
| 42 |
svc = VoiceAuthService(config=VoiceLoginConfig(
|
| 43 |
window_seconds=1,
|
|
|
|
| 68 |
(tmpdir / "classes.txt").write_text("alice\nbob\n", encoding="utf-8")
|
| 69 |
monkeypatch.setenv("VOICE_CNN_MODEL_DIR", str(tmpdir))
|
| 70 |
|
| 71 |
+
# 同樣先注入假 inference 模組
|
| 72 |
dummy = types.SimpleNamespace(
|
| 73 |
predict_files=lambda model_dir, inputs, threshold=0.0: []
|
| 74 |
)
|
| 75 |
+
monkeypatch.setitem(sys.modules, 'scripts.inference', dummy)
|
| 76 |
|
| 77 |
svc = VoiceAuthService(config=VoiceLoginConfig(
|
| 78 |
window_seconds=1,
|