Spaces:

ayaka68
/

voice2place

Sleeping

App Files Files Community

ayaka68 committed on Sep 5, 2025

Commit

acd8897

verified ·

1 Parent(s): c36c080

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -17

app.py CHANGED Viewed

@@ -299,8 +299,9 @@ def _normalize_label(lbl: str) -> str:
 def predict_emotion_ai(audio_bytes):
     """
-    S3PRL Featurizer で必ず [B,T,H] を取得 → 各サンプルの有効長 reps_len で時間平均 → [B,H]。
-    その後、線形ヘッド（W,b）で分類。
     """
     try:
         featurizer, head, id2label, device = load_kushinada_s3prl()
@@ -321,25 +322,51 @@ def predict_emotion_ai(audio_bytes):
             y = y[:max_samples]
             st.warning("音声が30秒を超えたため、最初の30秒のみを分析します。")
-        # S3PRLは list[Tensor], list[int] を想定
         wavs = [torch.tensor(y, dtype=torch.float32)]
         wavs_len = [int(len(y))]
         with torch.no_grad():
-            reps, reps_len = featurizer(wavs, wavs_len)  # reps: [B,T,H], reps_len: list[int] or Tensor[B]
-            if isinstance(reps_len, torch.Tensor):
-                reps_len = reps_len.tolist()
-            # 有効長のみで平均（パディングを無視）
-            pooled = []
-            for i in range(reps.shape[0]):
-                Ti = int(reps_len[i]) if reps_len else reps.shape[1]
-                Ti = max(1, min(Ti, reps.shape[1]))  # 安全側
-                pooled.append(reps[i, :Ti].mean(dim=0))
-            pooled = torch.stack(pooled, dim=0)  # [B,H]
-            # 線形ヘッドで分類（head.fcに直接入れる）
-            logits = head.fc(pooled.to(device))  # [B,C]
             probs = torch.softmax(logits, dim=-1)[0].detach().cpu().numpy()
         pred_id = int(np.argmax(probs))
@@ -360,6 +387,7 @@ def predict_emotion_ai(audio_bytes):
         st.warning(f"AI予測中にエラーが発生: {e}")
         return predict_emotion_features(audio_bytes)
 # ===== 推薦 =====
 def score_places(emo_label, top_k=4, diversity=True):
     EMO_MAP_PRIORS = {

 def predict_emotion_ai(audio_bytes):
     """
+    S3PRL Featurizer → [B,T,H] と reps_len を受け取り、
+    reps_len が int / list / tuple / Tensor / None のいずれでも動くよう正規化して
+    有効長のみ平均化（[B,H]）→ 線形ヘッドで分類。
     """
     try:
         featurizer, head, id2label, device = load_kushinada_s3prl()
             y = y[:max_samples]
             st.warning("音声が30秒を超えたため、最初の30秒のみを分析します。")
+        # Featurizer想定の入力（CPU list でOK）
         wavs = [torch.tensor(y, dtype=torch.float32)]
         wavs_len = [int(len(y))]
         with torch.no_grad():
+            reps, reps_len = featurizer(wavs, wavs_len)  # reps: [B,T,H] か [T,H]、reps_len: list/int/Tensor/None
+            # --- reps を [B,T,H] に統一 ---
+            if isinstance(reps, torch.Tensor):
+                if reps.dim() == 2:        # [T,H] → [1,T,H]
+                    reps = reps.unsqueeze(0)
+                elif reps.dim() != 3:
+                    raise RuntimeError(f"Unexpected reps.dim(): {reps.dim()}")
+            else:
+                # 念のため、非テンソルなら失敗扱い（通常ここには来ない）
+                raise RuntimeError(f"Unexpected reps type: {type(reps)}")
+            B, T, H = reps.shape
+            # --- reps_len を [B] のリストに正規化 ---
+            if reps_len is None:
+                reps_len_list = [T] * B
+            elif isinstance(reps_len, int):
+                reps_len_list = [int(reps_len)] * B
+            elif isinstance(reps_len, (list, tuple)):
+                reps_len_list = [int(x) for x in reps_len]
+                if len(reps_len_list) != B:
+                    # 長さが合わなければ T で埋める
+                    reps_len_list = [T] * B
+            elif isinstance(reps_len, torch.Tensor):
+                reps_len_list = reps_len.view(-1).tolist()
+                if len(reps_len_list) != B:
+                    reps_len_list = [T] * B
+            else:
+                reps_len_list = [T] * B
+            # 安全に 1..T にクリップ
+            reps_len_list = [max(1, min(int(li), T)) for li in reps_len_list]
+            # --- 有効長のみ平均して [B,H] ---
+            pooled = torch.stack([reps[i, :reps_len_list[i]].mean(dim=0) for i in range(B)], dim=0)  # [B,H]
+            # --- 線形ヘッドで分類 ---
+            logits = head.fc(pooled.to(device))    # [B,C]
             probs = torch.softmax(logits, dim=-1)[0].detach().cpu().numpy()
         pred_id = int(np.argmax(probs))
         st.warning(f"AI予測中にエラーが発生: {e}")
         return predict_emotion_features(audio_bytes)
 # ===== 推薦 =====
 def score_places(emo_label, top_k=4, diversity=True):
     EMO_MAP_PRIORS = {