Spaces:

ayaka68
/

voice2place

Sleeping

App Files Files Community

ayaka68 committed on Sep 5, 2025

Commit

1981810

verified ·

1 Parent(s): d22abdc

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -47

app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 """
 Voice→Place Recommender (Streamlit / Hugging Face Spaces)
-- Gated model対応（HF_TOKENをSecretsに登録して使用）
-- モデルロードは@st.cache_resourceに一本化
 """
 # ===== 基本インポート =====
-import io, uuid, datetime as dt, csv, base64, json, random, os
 import numpy as np
 import soundfile as sf
 from pydub import AudioSegment
@@ -13,14 +15,18 @@ from pydub import AudioSegment
 import streamlit as st
 from audiorecorder import audiorecorder
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 from matplotlib import rcParams
 import japanize_matplotlib
 import torch
-from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
 # ===== フォント設定 =====
 rcParams["font.family"] = "DejaVu Sans"
@@ -45,14 +51,33 @@ PLACES = [
 ]
 REASON_TAGS = ["静けさ","緑","水辺","発散","創作","交流","体験","学習","屋内","屋外","没入","回復"]
-# ===== Gated model ロード（一本化）=====
-MODEL_NAME = "imprt/kushinada-hubert-base-jtes-er"
 @st.cache_resource(show_spinner=False)
-def load_model():
     """
-    日本語音声感情認識モデルをロード（gated対応）。
-    SpacesのSecretsに 'HF_TOKEN' を保存している前提。
     """
     token = os.getenv("HF_TOKEN")
     if not token:
@@ -60,17 +85,71 @@ def load_model():
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    feature_extractor = AutoFeatureExtractor.from_pretrained(
-        MODEL_NAME,
-        token=token
-    )
-    model = AutoModelForAudioClassification.from_pretrained(
-        MODEL_NAME,
-        token=token
-    ).to(device)
-    model.eval()
-    return feature_extractor, model, device
 # ===== ユーティリティ =====
 def to_wav_bytes(any_bytes: bytes, target_sr=16000, mono=True) -> bytes:
@@ -92,6 +171,7 @@ def to_wav_bytes(any_bytes: bytes, target_sr=16000, mono=True) -> bytes:
     return buf.getvalue()
 def audio_player_bytes(b: bytes, mime="audio/wav"):
     if not b:
         return
     b64 = base64.b64encode(b).decode("utf-8")
@@ -104,8 +184,9 @@ def audio_player_bytes(b: bytes, mime="audio/wav"):
         unsafe_allow_html=True,
     )
-# ===== フォールバック用簡易特徴量 =====
 def extract_features(y, sr):
     abs_y = np.abs(y)
     thr = 0.01 * (abs_y.max() + 1e-9)
     idx = np.where(abs_y > thr)[0]
@@ -125,6 +206,7 @@ def extract_features(y, sr):
     zc = (y[:-1] * y[1:] < 0).astype(np.float32)
     zcr_mean = float(zc.mean()) if zc.size else 0.0
     fmin, fmax = 80.0, 600.0
     if len(y) < int(sr / fmin) + 2:
         f0_est = 0.0
@@ -148,6 +230,7 @@ def extract_features(y, sr):
     }
 def predict_emotion_features(audio_bytes):
     wav_bytes_16k = to_wav_bytes(audio_bytes, target_sr=16000)
     y, sr = sf.read(io.BytesIO(wav_bytes_16k), dtype="float32")
     feat = extract_features(y, sr)
@@ -172,23 +255,17 @@ def predict_emotion_features(audio_bytes):
     scores["neutral"] += 0.3
     return label, scores, "Features"
-# ===== AI推定 =====
-def normalize_label(lbl: str) -> str:
-    """モデル出力ラベルをUI想定に正規化"""
-    m = {
-        "happy": "happiness",
-        "happiness": "happiness",
-        "angry": "anger",
-        "anger": "anger",
-        "sad": "sadness",
-        "sadness": "sadness",
-        "neutral": "neutral"
-    }
     return m.get(lbl.lower(), lbl)
 def predict_emotion_ai(audio_bytes):
     try:
-        feature_extractor, model, device = load_model()
     except Exception as e:
         st.error(f"モデルのロードに失敗しました: {e}")
         st.info("音声特徴量ベースの分析に切り替えます。")
@@ -204,23 +281,32 @@ def predict_emotion_ai(audio_bytes):
             y = y[:max_samples]
             st.warning("音声が30秒を超えたため、最初の30秒のみを分析します。")
-        inputs = feature_extractor(y, sampling_rate=sr, return_tensors="pt", padding=True)
-        inputs = {k: v.to(device) for k, v in inputs.items()}
         with torch.no_grad():
-            logits = model(**inputs).logits
-        probs = torch.softmax(logits, dim=-1)[0].detach().cpu().numpy()
-        pred_id = int(np.argmax(probs))
-        raw_label = model.config.id2label[pred_id]
-        label = normalize_label(raw_label)
-        scores = {normalize_label(model.config.id2label[i]): float(probs[i]) for i in range(len(probs))}
-        # 期待外ラベルが混ざっても可視化で扱えるように0〜1にクリップ
         for k in list(scores.keys()):
             scores[k] = max(0.0, min(1.0, scores[k]))
-        return label, scores, "AI"
     except Exception as e:
         st.warning(f"AI予測中にエラーが発生: {e}")
@@ -330,9 +416,8 @@ def main():
         if key not in st.session_state: st.session_state[key] = default
     st.subheader("1) 録音またはアップロード")
-    with st.warning("⚠️ ファイルアップロードで403が出る場合は、録音機能をお使いください。"):
-        st.markdown("**🎤 録音** → PC/スマホで直接話す or 端末で音声再生しながら録音")
     tab_rec, tab_upload = st.tabs(["🎤 録音する（推奨）", "📁 ファイルを使う"])

+# app.py
 """
 Voice→Place Recommender (Streamlit / Hugging Face Spaces)
+- 日本語音声感情認識：S3PRL(HuBERT base) + HFの下流(.pt) チェックポイントを用いた推論
+- Spaces の Settings → Secrets に HF_TOKEN を設定してください
+- ffmpeg が必要（apt.txtに ffmpeg を記載）
 """
 # ===== 基本インポート =====
+import io, json, base64, random, os
 import numpy as np
 import soundfile as sf
 from pydub import AudioSegment
 import streamlit as st
 from audiorecorder import audiorecorder
+# Matplotlib
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 from matplotlib import rcParams
 import japanize_matplotlib
+# Torch / Hugging Face Hub / S3PRL
 import torch
+import torch.nn as nn
+from huggingface_hub import list_repo_files, hf_hub_download
+from s3prl.nn import S3PRLUpstream
 # ===== フォント設定 =====
 rcParams["font.family"] = "DejaVu Sans"
 ]
 REASON_TAGS = ["静けさ","緑","水辺","発散","創作","交流","体験","学習","屋内","屋外","没入","回復"]
+# ===== KUSHINADA 定義（HF の gated モデルのリポ）=====
+KUSHINADA_REPO = "imprt/kushinada-hubert-base-jtes-er"
+# ===== S3PRL 下流ヘッド（線形） =====
+class SimpleLinearHead(nn.Module):
+    """
+    Minimal classification head that restores a linear classifier (W, b)
+    taken from a checkpoint.
+
+    Data flow: [B, T, H] -> mean-pool over dim 1 -> [B, H] -> Linear(H, C)
+
+    Args:
+        in_dim: input feature size H of the linear layer.
+        num_classes: number of output classes C.
+        W: weight tensor copied into the linear layer; must match the
+            layer's weight shape [C, H].
+        b: bias tensor copied into the linear layer; must match shape [C].
+    """
+    def __init__(self, in_dim: int, num_classes: int, W: torch.Tensor, b: torch.Tensor):
+        super().__init__()
+        self.pool = lambda x: x.mean(dim=1)  # average over the time axis (dim 1)
+        self.fc = nn.Linear(in_dim, num_classes)
+        # Overwrite the freshly initialised parameters in place; no_grad keeps
+        # the copies out of autograd tracking.
+        with torch.no_grad():
+            self.fc.weight.copy_(W)  # [C, H]
+            self.fc.bias.copy_(b)    # [C]
+    def forward(self, reps):  # reps: [B, T, H]
+        # Pool the sequence to one vector per item, then return logits [B, C].
+        x = self.pool(reps)
+        return self.fc(x)
+# ===== KUSHINADA (S3PRL) ローダ =====
 @st.cache_resource(show_spinner=False)
+def load_kushinada_s3prl():
     """
+    S3PRL上流(HuBERT base) + HFの下流(.pt)を自動取得して復元。
+    チェックポイント中から (weight,bias) を推定して線形ヘッドを構築。
     """
     token = os.getenv("HF_TOKEN")
     if not token:
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    # 1) S3PRL 上流：HuBERT base（kushinada はHuBERT系想定）
+    upstream = S3PRLUpstream("hubert_base").to(device).eval()
+    # 2) HFから .pt を探してダウンロード
+    files = list_repo_files(KUSHINADA_REPO, token=token)
+    pt_files = [f for f in files if f.endswith(".pt")]
+    if not pt_files:
+        raise FileNotFoundError("下流チェックポイント(.pt)が見つかりません。モデルページの Files を確認してください。")
+    # 最初の .pt を採用（必要なら固定のファイル名に変更）
+    ckpt_path = hf_hub_download(repo_id=KUSHINADA_REPO, filename=pt_files[0], token=token)
+    # 3) チェックポイント読込
+    ckpt = torch.load(ckpt_path, map_location="cpu")
+    # 4) state_dict から線形層の W, b を推定
+    state = None
+    if isinstance(ckpt, dict):
+        for key in ["state_dict", "Downstream", "model", "downstream", "net", "weights"]:
+            if key in ckpt and isinstance(ckpt[key], dict):
+                state = ckpt[key]
+                break
+        if state is None:
+            # そのままstate dictの可能性
+            # S3PRLのスクリプトにより出力形式は複数パターンありうる
+            state = ckpt
+    if not isinstance(state, dict):
+        raise RuntimeError("チェックポイント形式を解釈できませんでした。")
+    # W, b らしきテンソルを探索（[C,H], [C] っぽい組を探す）
+    linear_W, linear_b = None, None
+    for k, v in state.items():
+        if isinstance(v, torch.Tensor) and v.ndim == 2:
+            base = k.rsplit(".", 1)[0]  # 例: "classifier.fc.weight" → "classifier.fc"
+            bias_key = base + ".bias"
+            if bias_key in state and isinstance(state[bias_key], torch.Tensor) and state[bias_key].ndim == 1:
+                linear_W = v
+                linear_b = state[bias_key]
+                break
+    if linear_W is None:
+        # 次善策: "weight"と"bias"という名前のペアを総当たり
+        twos = [(k,v) for k,v in state.items() if isinstance(v, torch.Tensor) and v.ndim==2 and k.endswith("weight")]
+        for wk, w in twos:
+            bk = wk.replace("weight", "bias")
+            if bk in state and isinstance(state[bk], torch.Tensor) and state[bk].ndim == 1:
+                linear_W, linear_b = w, state[bk]
+                break
+    if linear_W is None:
+        raise RuntimeError("線形分類器の重みが見つかりません。S3PRLの公式手順に沿ったDownstream再現が必要です。")
+    num_classes, hidden_dim = linear_W.shape  # [C, H]
+    head = SimpleLinearHead(in_dim=hidden_dim, num_classes=num_classes,
+                            W=linear_W, b=linear_b).to(device).eval()
+    # JTES想定：4クラス（angry/happy/neutral/sad）※順序は環境/学習で異なる可能性あり
+    default_labels = ["angry", "happy", "neutral", "sad"]
+    if num_classes == 4:
+        id2label = {i: default_labels[i] for i in range(4)}
+    else:
+        id2label = {i: f"class_{i}" for i in range(num_classes)}
+    return upstream, head, id2label, device
 # ===== ユーティリティ =====
 def to_wav_bytes(any_bytes: bytes, target_sr=16000, mono=True) -> bytes:
     return buf.getvalue()
 def audio_player_bytes(b: bytes, mime="audio/wav"):
+    """音声プレイヤーを表示"""
     if not b:
         return
     b64 = base64.b64encode(b).decode("utf-8")
         unsafe_allow_html=True,
     )
+# ===== フォールバック用：簡易特徴量ベース =====
 def extract_features(y, sr):
+    """音声から簡易特徴量を抽出"""
     abs_y = np.abs(y)
     thr = 0.01 * (abs_y.max() + 1e-9)
     idx = np.where(abs_y > thr)[0]
     zc = (y[:-1] * y[1:] < 0).astype(np.float32)
     zcr_mean = float(zc.mean()) if zc.size else 0.0
+    # F0推定（非常に簡易）
     fmin, fmax = 80.0, 600.0
     if len(y) < int(sr / fmin) + 2:
         f0_est = 0.0
     }
 def predict_emotion_features(audio_bytes):
+    """音声特徴量から感情を推定（フォールバック）"""
     wav_bytes_16k = to_wav_bytes(audio_bytes, target_sr=16000)
     y, sr = sf.read(io.BytesIO(wav_bytes_16k), dtype="float32")
     feat = extract_features(y, sr)
     scores["neutral"] += 0.3
     return label, scores, "Features"
+# ===== AI推定（S3PRL）=====
+def _normalize_label(lbl: str) -> str:
+    """Map a short emotion label to its noun form (happy -> happiness, angry -> anger, sad -> sadness).
+
+    The lookup is case-insensitive; a label with no entry in the table is
+    returned exactly as given (original casing preserved).
+    """
+    m = {"happy": "happiness", "angry": "anger", "sad": "sadness", "neutral": "neutral"}
     return m.get(lbl.lower(), lbl)
 def predict_emotion_ai(audio_bytes):
+    """
+    S3PRL上流 + HF下流(.pt) で推論。
+    """
     try:
+        upstream, head, id2label, device = load_kushinada_s3prl()
     except Exception as e:
         st.error(f"モデルのロードに失敗しました: {e}")
         st.info("音声特徴量ベースの分析に切り替えます。")
             y = y[:max_samples]
             st.warning("音声が30秒を超えたため、最初の30秒のみを分析します。")
+        wav = torch.tensor(y, dtype=torch.float32, device=device).unsqueeze(0)  # [1, T]
         with torch.no_grad():
+            reps_dict = upstream(wav)  # S3PRL Upstream の出力
+            if isinstance(reps_dict, dict):
+                reps = reps_dict.get("last_hidden_state", None)
+                if reps is None:
+                    # 代替：最終層の hidden_states など
+                    if "hidden_states" in reps_dict and isinstance(reps_dict["hidden_states"], (list, tuple)):
+                        reps = reps_dict["hidden_states"][-1]
+                    else:
+                        # 直接テンソルが来る実装もある
+                        reps = list(reps_dict.values())[-1]
+            else:
+                reps = reps_dict  # テンソル想定 [B, T, H]
+            logits = head(reps)  # [B, C]
+            probs = torch.softmax(logits, dim=-1)[0].detach().cpu().numpy()
+        pred_id = int(np.argmax(probs))
+        raw_label = id2label[pred_id]
+        label = _normalize_label(raw_label)
+        scores = {_normalize_label(id2label[i]): float(probs[i]) for i in range(len(probs))}
         for k in list(scores.keys()):
             scores[k] = max(0.0, min(1.0, scores[k]))
+        return label, scores, "AI(S3PRL)"
     except Exception as e:
         st.warning(f"AI予測中にエラーが発生: {e}")
         if key not in st.session_state: st.session_state[key] = default
     st.subheader("1) 録音またはアップロード")
+    with st.warning("⚠️ ファイルアップロードで403が出る場合は、録音機能をご利用ください。"):
+        st.markdown("**🎤 録音** → 直接話す or 端末で音声再生しながら録音")
     tab_rec, tab_upload = st.tabs(["🎤 録音する（推奨）", "📁 ファイルを使う"])