Spaces:

Isk5434
/

sound-classify

Sleeping

App Files Files Community

Isk5434 committed on Mar 5

Commit

e85ca7f

verified ·

1 Parent(s): 8372e28

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +102 -279

app.py CHANGED Viewed

@@ -263,6 +263,89 @@ def add_sample_cb(audio, label):
     DATA.append({"audio": audio_n, "U": U, "label": label})
     return dataset_table(), gr.update(value=None)
 def undo_last_cb():
     if len(DATA) == 0:
         return dataset_table()
@@ -325,212 +408,6 @@ HEAD = """
 <link rel="preconnect" href="https://fonts.googleapis.com">
 <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
 <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@300;400;500;600&display=swap" rel="stylesheet">
-<script>
-// ── Auto-stop recording with silence detection ──
-window._autoRec = {
-  mediaStream: null,
-  audioCtx: null,
-  analyser: null,
-  scriptNode: null,
-  chunks: [],
-  isRecording: false,
-  speechDetected: false,
-  silenceStart: 0,
-  SILENCE_THRESH: 0.015,
-  SILENCE_DURATION: 1.5,
-  MIN_SPEECH_DURATION: 0.3,
-  speechStart: 0,
-  async start() {
-    if (this.isRecording) return;
-    try {
-      this.mediaStream = await navigator.mediaDevices.getUserMedia({audio: true});
-    } catch(e) {
-      const el = document.getElementById('rec_status');
-      if (el) el.textContent = 'マイク許可が必要です';
-      return;
-    }
-    this.audioCtx = new (window.AudioContext || window.webkitAudioContext)({sampleRate: 16000});
-    const source = this.audioCtx.createMediaStreamSource(this.mediaStream);
-    this.analyser = this.audioCtx.createAnalyser();
-    this.analyser.fftSize = 2048;
-    source.connect(this.analyser);
-    // ScriptProcessorNode to capture raw PCM
-    this.scriptNode = this.audioCtx.createScriptProcessor(4096, 1, 1);
-    this.chunks = [];
-    this.isRecording = true;
-    this.speechDetected = false;
-    this.silenceStart = 0;
-    this.speechStart = 0;
-    const self = this;
-    this.scriptNode.onaudioprocess = function(e) {
-      if (!self.isRecording) return;
-      const input = e.inputBuffer.getChannelData(0);
-      const buf = new Float32Array(input.length);
-      buf.set(input);
-      self.chunks.push(buf);
-      // RMS calculation
-      let sum = 0;
-      for (let i = 0; i < input.length; i++) sum += input[i] * input[i];
-      const rms = Math.sqrt(sum / input.length);
-      const now = self.audioCtx.currentTime;
-      // Update status
-      const el = document.getElementById('rec_status');
-      const bar = document.getElementById('rec_level');
-      if (bar) bar.style.width = Math.min(rms * 500, 100) + '%';
-      if (rms > self.SILENCE_THRESH) {
-        // Speech detected
-        if (!self.speechDetected) {
-          self.speechDetected = true;
-          self.speechStart = now;
-        }
-        self.silenceStart = 0;
-        if (el) el.textContent = '録音中... 🎙️';
-      } else {
-        // Silence
-        if (self.speechDetected) {
-          const speechDur = now - self.speechStart;
-          if (speechDur >= self.MIN_SPEECH_DURATION) {
-            if (self.silenceStart === 0) {
-              self.silenceStart = now;
-            } else if (now - self.silenceStart >= self.SILENCE_DURATION) {
-              // Auto stop!
-              if (el) el.textContent = '保存中...';
-              self.stop(true);
-              return;
-            }
-          }
-          if (el && self.silenceStart > 0) {
-            const remaining = self.SILENCE_DURATION - (now - self.silenceStart);
-            el.textContent = '無音検出中... ' + remaining.toFixed(1) + 's';
-          }
-        } else {
-          if (el) el.textContent = '待機中... 話してください';
-        }
-      }
-    };
-    source.connect(this.scriptNode);
-    this.scriptNode.connect(this.audioCtx.destination);
-    const el = document.getElementById('rec_status');
-    if (el) el.textContent = '待機中... 話してください';
-    const btn = document.getElementById('btn_auto_rec');
-    if (btn) {
-      btn.textContent = '録音中...';
-      btn.classList.add('recording');
-    }
-  },
-  stop(autoSave) {
-    if (!this.isRecording) return;
-    this.isRecording = false;
-    // Stop media
-    if (this.scriptNode) { this.scriptNode.disconnect(); this.scriptNode = null; }
-    if (this.mediaStream) { this.mediaStream.getTracks().forEach(t => t.stop()); this.mediaStream = null; }
-    const btn = document.getElementById('btn_auto_rec');
-    if (btn) {
-      btn.textContent = '録音開始';
-      btn.classList.remove('recording');
-    }
-    if (autoSave && this.chunks.length > 0 && this.speechDetected) {
-      // Concatenate chunks
-      let totalLen = 0;
-      for (const c of this.chunks) totalLen += c.length;
-      const fullAudio = new Float32Array(totalLen);
-      let offset = 0;
-      for (const c of this.chunks) { fullAudio.set(c, offset); offset += c.length; }
-      // Trim trailing silence (remove last SILENCE_DURATION worth of samples)
-      const sr = this.audioCtx ? this.audioCtx.sampleRate : 16000;
-      const trimSamples = Math.floor(this.SILENCE_DURATION * sr);
-      const trimmedLen = Math.max(sr, totalLen - trimSamples);
-      const trimmed = fullAudio.slice(0, trimmedLen);
-      // Convert to WAV blob
-      const wav = this._encodeWAV(trimmed, sr);
-      const blob = new Blob([wav], {type: 'audio/wav'});
-      // Set to Gradio hidden audio input
-      this._setGradioAudio(blob);
-    }
-    if (this.audioCtx) { this.audioCtx.close(); this.audioCtx = null; }
-    this.chunks = [];
-    const el = document.getElementById('rec_status');
-    if (el) el.textContent = autoSave ? '自動保存完了 ✓' : '停止';
-  },
-  _encodeWAV(samples, sampleRate) {
-    const buffer = new ArrayBuffer(44 + samples.length * 2);
-    const view = new DataView(buffer);
-    function writeStr(o, s) { for (let i = 0; i < s.length; i++) view.setUint8(o+i, s.charCodeAt(i)); }
-    writeStr(0, 'RIFF');
-    view.setUint32(4, 36 + samples.length * 2, true);
-    writeStr(8, 'WAVE');
-    writeStr(12, 'fmt ');
-    view.setUint32(16, 16, true);
-    view.setUint16(20, 1, true);
-    view.setUint16(22, 1, true);
-    view.setUint32(24, sampleRate, true);
-    view.setUint32(28, sampleRate * 2, true);
-    view.setUint16(32, 2, true);
-    view.setUint16(34, 16, true);
-    writeStr(36, 'data');
-    view.setUint32(40, samples.length * 2, true);
-    for (let i = 0; i < samples.length; i++) {
-      let s = Math.max(-1, Math.min(1, samples[i]));
-      view.setInt16(44 + i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
-    }
-    return buffer;
-  },
-  _setGradioAudio(blob) {
-    // Find the hidden audio input and set the file
-    const url = URL.createObjectURL(blob);
-    const hiddenAudio = document.querySelector('#hidden_audio_input audio, #hidden_audio_input input[type="file"]');
-    // Use DataTransfer to programmatically set file on Gradio's file input
-    const file = new File([blob], 'recording.wav', {type: 'audio/wav'});
-    const dt = new DataTransfer();
-    dt.items.add(file);
-    // Find file input inside hidden audio component
-    const container = document.getElementById('hidden_audio_input');
-    if (!container) return;
-    // Gradio 6: Upload via drag event or input change
-    const fileInput = container.querySelector('input[type="file"]');
-    if (fileInput) {
-      fileInput.files = dt.files;
-      fileInput.dispatchEvent(new Event('change', {bubbles: true}));
-      // Auto-click submit after short delay
-      setTimeout(() => {
-        const submitBtn = document.getElementById('btn_auto_submit');
-        if (submitBtn) submitBtn.click();
-      }, 500);
-      return;
-    }
-    // Fallback: dispatch drop event
-    const dropEvent = new DragEvent('drop', {bubbles: true, dataTransfer: dt});
-    container.dispatchEvent(dropEvent);
-    setTimeout(() => {
-      const submitBtn = document.getElementById('btn_auto_submit');
-      if (submitBtn) submitBtn.click();
-    }, 500);
-  }
-};
-</script>
 """
 CSS = """
@@ -1033,64 +910,20 @@ textarea {
 }
 /* ========================================
-   自動録音: ボタン & レベルメーター
    ======================================== */
-.auto-rec-area {
     text-align: center;
-    padding: 16px 0;
-}
-#btn_auto_rec {
-    display: inline-block;
-    padding: 16px 40px;
-    border: 2px solid transparent;
-    border-image: linear-gradient(135deg, #b2d8e8 0%, #b2e0d4 50%, #c8e8c0 100%) 1;
-    background: transparent;
     font-family: 'Cormorant Garamond', 'Georgia', serif;
-    font-size: 15px;
-    font-weight: 500;
-    letter-spacing: 0.15em;
-    text-transform: uppercase;
-    color: #2a3a2a;
-    cursor: pointer;
-    transition: all 0.3s ease;
-    touch-action: manipulation;
-}
-#btn_auto_rec:hover {
-    background: rgba(178,216,210,0.15);
-}
-#btn_auto_rec.recording {
-    border-image: none;
-    border-color: #d45050;
-    color: #d45050;
-    animation: recPulse 1.2s ease infinite;
 }
 @keyframes recPulse {
     0%, 100% { opacity: 1; }
     50% { opacity: 0.6; }
 }
-.rec-level-wrap {
-    margin: 12px auto 0;
-    width: 80%;
-    max-width: 280px;
-    height: 4px;
-    background: rgba(138,170,138,0.2);
-    border-radius: 0;
-    overflow: hidden;
-}
-.rec-level-bar {
-    height: 100%;
-    width: 0%;
-    background: linear-gradient(90deg, #b2d8e8, #90d4c8, #b0e0a0);
-    transition: width 0.08s ease;
-}
-#rec_status {
-    display: block;
-    margin-top: 8px;
-    font-family: 'Cormorant Garamond', 'Georgia', serif;
-    font-size: 13px;
-    color: #4a6a4a;
-    letter-spacing: 0.06em;
-}
 /* ========================================
    スクロールバー
@@ -1204,28 +1037,16 @@ with gr.Blocks() as demo:
                 label_box = gr.Textbox(label="新ラベル", placeholder="例: yes", scale=3)
                 add_btn = gr.Button("追加", size="lg", scale=1)
-            # 録音 & サンプル追加
-            gr.Markdown("### 録音")
             label_dd = gr.Radio(choices=LABELS, label="ラベル選択", interactive=True, elem_classes=["diamond-radio"])
-            # 自動録音UI（JSで制御）
-            gr.HTML("""
-            <div class="auto-rec-area">
-                <button id="btn_auto_rec" type="button"
-                    onclick="window._autoRec.isRecording ? window._autoRec.stop(false) : window._autoRec.start()">
-                    録音開始
-                </button>
-                <div class="rec-level-wrap"><div class="rec-level-bar" id="rec_level"></div></div>
-                <span id="rec_status">ラベルを選択して録音開始</span>
-            </div>
-            """)
-            # 隠し音声入力（JSからWAVを受け取る）
-            audio_rec = gr.Audio(type="numpy", label="録音データ", visible=False, elem_id="hidden_audio_input")
-            add_sample_btn = gr.Button("自動保存", variant="primary", size="lg", visible=False, elem_id="btn_auto_submit")
-            with gr.Row():
-                undo_btn = gr.Button("Undo", size="lg")
-                rerec_btn = gr.Button("Clear", size="lg")
             # データ一覧 & 編集
             gr.Markdown("### データ一覧")
@@ -1234,7 +1055,7 @@ with gr.Blocks() as demo:
                 value=dataset_table(),
                 datatype=["number", "str", "number"],
                 row_count=(6, "dynamic"),
-                col_count=(3, "fixed"),
                 interactive=False,
                 elem_id="data_table"
             )
@@ -1263,9 +1084,11 @@ with gr.Blocks() as demo:
     add_btn.click(add_label_cb, inputs=[label_box], outputs=[label_dd, table, relabel_dd])
     add_sample_btn.click(add_sample_cb, inputs=[audio_rec, label_dd], outputs=[table, audio_rec])
     undo_btn.click(undo_last_cb, inputs=[], outputs=[table])
-    rerec_btn.click(clear_rec_cb, inputs=[], outputs=[audio_rec])
     reset_btn.click(reset_all_cb, inputs=[], outputs=[table, label_dd, relabel_dd, audio_rec, selected_idx_state])
     # select row -> update state + replay + relabel dropdown value
     def _select_and_store(evt: gr.SelectData):
         if evt is None or evt.index is None:

     DATA.append({"audio": audio_n, "U": U, "label": label})
     return dataset_table(), gr.update(value=None)
+# ── Auto-recording: streaming accumulation + silence detection ──
+SILENCE_THRESH = 0.012  # a chunk whose RMS is not above this value is treated as silence
+SILENCE_CHUNKS_NEEDED = 3   # about 1.5 s (stream_every=0.5 s, so 0.5 × 3 = 1.5 s)
+MIN_SPEECH_CHUNKS = 2       # require at least 2 chunks' worth of speech
+def _new_rec_state():
+    return {"chunks": [], "speech_detected": False, "silence_count": 0, "saved": False, "status": "待機中"}
+def rec_stream_cb(chunk, label, rec_state):
+    """ストリーミング録音: 音声蓄積 + 無音検出 → 自動保存"""
+    if rec_state is None:
+        rec_state = _new_rec_state()
+    if rec_state.get("saved", False):
+        # 前回保存済み → リセット
+        rec_state = _new_rec_state()
+    if chunk is None:
+        return rec_state, dataset_table(), "待機中... マイクを開始してください"
+    sr, y = chunk
+    if y is None or len(y) < 10:
+        return rec_state, gr.update(), rec_state.get("status", "")
+    y = _mono_float32(y)
+    # RMS計算
+    rms = float(np.sqrt(np.mean(y ** 2)))
+    if rms > SILENCE_THRESH:
+        # 音声あり
+        rec_state["chunks"].append((sr, y.copy()))
+        rec_state["speech_detected"] = True
+        rec_state["silence_count"] = 0
+        n = len(rec_state["chunks"])
+        rec_state["status"] = f"録音中... 🎙️ ({n} chunks)"
+    else:
+        # 無音
+        if rec_state["speech_detected"]:
+            rec_state["silence_count"] = rec_state.get("silence_count", 0) + 1
+            remaining = max(0, SILENCE_CHUNKS_NEEDED - rec_state["silence_count"])
+            rec_state["status"] = f"無音検出中... あと{remaining}で自動保存"
+            if rec_state["silence_count"] >= SILENCE_CHUNKS_NEEDED:
+                # 十分な音声があれば保存
+                if len(rec_state["chunks"]) >= MIN_SPEECH_CHUNKS:
+                    label = (label or "").strip()
+                    if label in LABELS:
+                        # チャンクを結合
+                        all_y = []
+                        final_sr = SR
+                        for s, y_chunk in rec_state["chunks"]:
+                            if s != SR:
+                                y_chunk = librosa.resample(y_chunk, orig_sr=s, target_sr=SR)
+                            all_y.append(y_chunk)
+                        full_y = np.concatenate(all_y).astype(np.float32)
+                        full_y /= (np.max(np.abs(full_y)) + 1e-9)
+                        audio_n = (SR, full_y)
+                        U = audio_to_sequence(audio_n)
+                        if U is not None and len(U) >= 5:
+                            DATA.append({"audio": audio_n, "U": U, "label": label})
+                            rec_state["status"] = f"✓ 自動保存完了 (idx={len(DATA)-1})"
+                        else:
+                            rec_state["status"] = "音声が短すぎます"
+                    else:
+                        rec_state["status"] = "ラベルを選択してください"
+                    rec_state["saved"] = True
+                    rec_state["chunks"] = []
+                    rec_state["speech_detected"] = False
+                    rec_state["silence_count"] = 0
+                    return rec_state, dataset_table(), rec_state["status"]
+                else:
+                    rec_state["status"] = "音声が短すぎます。もう一度話してください"
+                    rec_state["chunks"] = []
+                    rec_state["speech_detected"] = False
+                    rec_state["silence_count"] = 0
+        else:
+            rec_state["status"] = "待機中... 話してください 🎤"
+    return rec_state, gr.update(), rec_state.get("status", "")
 def undo_last_cb():
     if len(DATA) == 0:
         return dataset_table()
 <link rel="preconnect" href="https://fonts.googleapis.com">
 <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
 <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@300;400;500;600&display=swap" rel="stylesheet">
 """
 CSS = """
 }
 /* ========================================
+   自動録音: ステータス表示
    ======================================== */
+.rec-status {
     text-align: center;
+    padding: 8px 12px;
     font-family: 'Cormorant Garamond', 'Georgia', serif;
+    font-size: 14px;
+    color: #4a6a4a;
+    letter-spacing: 0.06em;
 }
 @keyframes recPulse {
     0%, 100% { opacity: 1; }
     50% { opacity: 0.6; }
 }
 /* ========================================
    スクロールバー
                 label_box = gr.Textbox(label="新ラベル", placeholder="例: yes", scale=3)
                 add_btn = gr.Button("追加", size="lg", scale=1)
+            # 録音 & サンプル追加（自動停止）
+            gr.Markdown("### 録音（自動停止）")
             label_dd = gr.Radio(choices=LABELS, label="ラベル選択", interactive=True, elem_classes=["diamond-radio"])
+            rec_audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="マイク（開始で録音）")
+            rec_status_md = gr.Markdown("待機中... マイクを開始してください", elem_classes=["rec-status"])
+            rec_state = gr.State(None)
+            # 隠し（手動追加用に残す）
+            audio_rec = gr.Audio(type="numpy", visible=False)
+            add_sample_btn = gr.Button("追加", variant="primary", size="lg", visible=False)
+            undo_btn = gr.Button("Undo", size="lg")
             # データ一覧 & 編集
             gr.Markdown("### データ一覧")
                 value=dataset_table(),
                 datatype=["number", "str", "number"],
                 row_count=(6, "dynamic"),
+                column_count=(3, "fixed"),
                 interactive=False,
                 elem_id="data_table"
             )
     add_btn.click(add_label_cb, inputs=[label_box], outputs=[label_dd, table, relabel_dd])
     add_sample_btn.click(add_sample_cb, inputs=[audio_rec, label_dd], outputs=[table, audio_rec])
     undo_btn.click(undo_last_cb, inputs=[], outputs=[table])
     reset_btn.click(reset_all_cb, inputs=[], outputs=[table, label_dd, relabel_dd, audio_rec, selected_idx_state])
+    # ストリーミング録音 → 自動停止 & 保存
+    rec_audio.stream(rec_stream_cb, inputs=[rec_audio, label_dd, rec_state], outputs=[rec_state, table, rec_status_md], stream_every=0.5)
     # select row -> update state + replay + relabel dropdown value
     def _select_and_store(evt: gr.SelectData):
         if evt is None or evt.index is None: