Isk5434 committed on
Commit
2349f4b
·
verified ·
1 Parent(s): e85ca7f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +119 -89
app.py CHANGED
@@ -263,88 +263,21 @@ def add_sample_cb(audio, label):
263
  DATA.append({"audio": audio_n, "U": U, "label": label})
264
  return dataset_table(), gr.update(value=None)
265
 
266
- # ── 自動録音: ストリーミング蓄積 + 無音検出 ──
267
-
268
- SILENCE_THRESH = 0.012
269
- SILENCE_CHUNKS_NEEDED = 3 # 約1.5秒(stream_every=0.5sなので 0.5×3=1.5s)
270
- MIN_SPEECH_CHUNKS = 2 # 最低2チャンク分の音声
271
-
272
- def _new_rec_state():
273
- return {"chunks": [], "speech_detected": False, "silence_count": 0, "saved": False, "status": "待機中"}
274
-
275
- def rec_stream_cb(chunk, label, rec_state):
276
- """ストリーミング録音: 音声蓄積 + 無音検出 → 自動保存"""
277
- if rec_state is None:
278
- rec_state = _new_rec_state()
279
-
280
- if rec_state.get("saved", False):
281
- # 前回保存済み → リセット
282
- rec_state = _new_rec_state()
283
-
284
- if chunk is None:
285
- return rec_state, dataset_table(), "待機中... マイクを開始してください"
286
-
287
- sr, y = chunk
288
- if y is None or len(y) < 10:
289
- return rec_state, gr.update(), rec_state.get("status", "")
290
 
291
- y = _mono_float32(y)
 
 
 
292
 
293
- # RMS計算
294
- rms = float(np.sqrt(np.mean(y ** 2)))
295
-
296
- if rms > SILENCE_THRESH:
297
- # 音声あり
298
- rec_state["chunks"].append((sr, y.copy()))
299
- rec_state["speech_detected"] = True
300
- rec_state["silence_count"] = 0
301
- n = len(rec_state["chunks"])
302
- rec_state["status"] = f"録音中... 🎙️ ({n} chunks)"
303
- else:
304
- # 無音
305
- if rec_state["speech_detected"]:
306
- rec_state["silence_count"] = rec_state.get("silence_count", 0) + 1
307
- remaining = max(0, SILENCE_CHUNKS_NEEDED - rec_state["silence_count"])
308
- rec_state["status"] = f"無音検出中... あと{remaining}で自動保存"
309
-
310
- if rec_state["silence_count"] >= SILENCE_CHUNKS_NEEDED:
311
- # 十分な音声があれば保存
312
- if len(rec_state["chunks"]) >= MIN_SPEECH_CHUNKS:
313
- label = (label or "").strip()
314
- if label in LABELS:
315
- # チャンクを結合
316
- all_y = []
317
- final_sr = SR
318
- for s, y_chunk in rec_state["chunks"]:
319
- if s != SR:
320
- y_chunk = librosa.resample(y_chunk, orig_sr=s, target_sr=SR)
321
- all_y.append(y_chunk)
322
- full_y = np.concatenate(all_y).astype(np.float32)
323
- full_y /= (np.max(np.abs(full_y)) + 1e-9)
324
- audio_n = (SR, full_y)
325
- U = audio_to_sequence(audio_n)
326
- if U is not None and len(U) >= 5:
327
- DATA.append({"audio": audio_n, "U": U, "label": label})
328
- rec_state["status"] = f"✓ 自動保存完了 (idx={len(DATA)-1})"
329
- else:
330
- rec_state["status"] = "音声が短すぎます"
331
- else:
332
- rec_state["status"] = "ラベルを選択してください"
333
-
334
- rec_state["saved"] = True
335
- rec_state["chunks"] = []
336
- rec_state["speech_detected"] = False
337
- rec_state["silence_count"] = 0
338
- return rec_state, dataset_table(), rec_state["status"]
339
- else:
340
- rec_state["status"] = "音声が短すぎます。もう一度話してください"
341
- rec_state["chunks"] = []
342
- rec_state["speech_detected"] = False
343
- rec_state["silence_count"] = 0
344
- else:
345
- rec_state["status"] = "待機中... 話してください 🎤"
346
-
347
- return rec_state, gr.update(), rec_state.get("status", "")
348
 
349
  def undo_last_cb():
350
  if len(DATA) == 0:
@@ -408,6 +341,106 @@ HEAD = """
408
  <link rel="preconnect" href="https://fonts.googleapis.com">
409
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
410
  <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@300;400;500;600&display=swap" rel="stylesheet">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  """
412
 
413
  CSS = """
@@ -1040,12 +1073,10 @@ with gr.Blocks() as demo:
1040
  # 録音 & サンプル追加(自動停止)
1041
  gr.Markdown("### 録音(自動停止)")
1042
  label_dd = gr.Radio(choices=LABELS, label="ラベル選択", interactive=True, elem_classes=["diamond-radio"])
1043
- rec_audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="マイク(開始で録音)")
1044
- rec_status_md = gr.Markdown("待機中... マイクを開始してください", elem_classes=["rec-status"])
1045
- rec_state = gr.State(None)
1046
- # 隠し(手動追加用に残す)
1047
- audio_rec = gr.Audio(type="numpy", visible=False)
1048
- add_sample_btn = gr.Button("追加", variant="primary", size="lg", visible=False)
1049
  undo_btn = gr.Button("Undo", size="lg")
1050
 
1051
  # データ一覧 & 編集
@@ -1082,12 +1113,11 @@ with gr.Blocks() as demo:
1082
 
1083
  # wiring
1084
  add_btn.click(add_label_cb, inputs=[label_box], outputs=[label_dd, table, relabel_dd])
1085
- add_sample_btn.click(add_sample_cb, inputs=[audio_rec, label_dd], outputs=[table, audio_rec])
1086
  undo_btn.click(undo_last_cb, inputs=[], outputs=[table])
1087
  reset_btn.click(reset_all_cb, inputs=[], outputs=[table, label_dd, relabel_dd, audio_rec, selected_idx_state])
1088
 
1089
- # ストリーミング録音 → 自動停止 & 保存
1090
- rec_audio.stream(rec_stream_cb, inputs=[rec_audio, label_dd, rec_state], outputs=[rec_state, table, rec_status_md], stream_every=0.5)
1091
 
1092
  # select row -> update state + replay + relabel dropdown value
1093
  def _select_and_store(evt: gr.SelectData):
 
263
  DATA.append({"audio": audio_n, "U": U, "label": label})
264
  return dataset_table(), gr.update(value=None)
265
 
266
+ def auto_add_sample_cb(audio, label):
267
+ """録音完了時に自動でDATAに追加"""
268
+ label = (label or "").strip()
269
+ if label not in LABELS:
270
+ return dataset_table(), gr.update(value=None), "⚠ ラベルを選択してください"
271
+ if audio is None:
272
+ return dataset_table(), gr.update(value=None), "待機中..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
+ audio_n = normalize_audio_tuple(audio)
275
+ U = audio_to_sequence(audio_n)
276
+ if U is None or len(U) < 5:
277
+ return dataset_table(), gr.update(value=None), "⚠ 音声が短すぎます"
278
 
279
+ DATA.append({"audio": audio_n, "U": U, "label": label})
280
+ return dataset_table(), gr.update(value=None), f"✓ 保存完了 (idx={len(DATA)-1})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
  def undo_last_cb():
283
  if len(DATA) == 0:
 
341
  <link rel="preconnect" href="https://fonts.googleapis.com">
342
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
343
  <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@300;400;500;600&display=swap" rel="stylesheet">
344
+ <script>
345
+ /* ── 無音自動停止: Web Audio API で音量監視 → 停止ボタン自動クリック ── */
346
+ (function(){
347
+ const SILENCE_THRESHOLD = 0.01;
348
+ const SILENCE_MS = 1500;
349
+ const SPEECH_THRESHOLD = 0.015;
350
+ let audioCtx, analyser, srcNode, silenceStart, speechDetected, monitoring;
351
+
352
+ function getRecArea() {
353
+ return document.getElementById('auto_rec_area');
354
+ }
355
+ function getStopBtn() {
356
+ const area = getRecArea();
357
+ if (!area) return null;
358
+ /* Gradio 6: 録音中は stop ボタン(■)が出る。aria-label で探す */
359
+ let btn = area.querySelector('button[aria-label="Stop recording"]');
360
+ if (btn) return btn;
361
+ btn = area.querySelector('button[aria-label="停止"]');
362
+ if (btn) return btn;
363
+ /* フォールバック: 録音中に表示される赤い■ボタンを探す */
364
+ const btns = area.querySelectorAll('button');
365
+ for (const b of btns) {
366
+ const svg = b.querySelector('svg');
367
+ if (svg) {
368
+ const rect = svg.querySelector('rect');
369
+ if (rect) return b; /* ■アイコンがある = stop */
370
+ }
371
+ }
372
+ return null;
373
+ }
374
+
375
+ function startMonitoring(stream) {
376
+ if (monitoring) return;
377
+ monitoring = true;
378
+ speechDetected = false;
379
+ silenceStart = null;
380
+ audioCtx = new (window.AudioContext || window.webkitAudioContext)();
381
+ analyser = audioCtx.createAnalyser();
382
+ analyser.fftSize = 512;
383
+ srcNode = audioCtx.createMediaStreamSource(stream);
384
+ srcNode.connect(analyser);
385
+ const buf = new Float32Array(analyser.fftSize);
386
+ const statusEl = document.getElementById('rec_status_js');
387
+
388
+ function tick() {
389
+ if (!monitoring) return;
390
+ analyser.getFloatTimeDomainData(buf);
391
+ let sum = 0;
392
+ for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
393
+ const rms = Math.sqrt(sum / buf.length);
394
+
395
+ if (rms > SPEECH_THRESHOLD) {
396
+ speechDetected = true;
397
+ silenceStart = null;
398
+ if (statusEl) statusEl.textContent = '録音中... 🎙️';
399
+ } else if (speechDetected && rms < SILENCE_THRESHOLD) {
400
+ if (!silenceStart) silenceStart = Date.now();
401
+ const elapsed = Date.now() - silenceStart;
402
+ const remain = Math.max(0, Math.ceil((SILENCE_MS - elapsed) / 1000));
403
+ if (statusEl) statusEl.textContent = '無音検出中... あと' + remain + '秒';
404
+ if (elapsed >= SILENCE_MS) {
405
+ if (statusEl) statusEl.textContent = '自動停止...';
406
+ const stopBtn = getStopBtn();
407
+ if (stopBtn) stopBtn.click();
408
+ stopMonitoring();
409
+ return;
410
+ }
411
+ } else {
412
+ if (statusEl) statusEl.textContent = '待機中... 話してください 🎤';
413
+ }
414
+ requestAnimationFrame(tick);
415
+ }
416
+ tick();
417
+ }
418
+
419
+ function stopMonitoring() {
420
+ monitoring = false;
421
+ if (srcNode) { try { srcNode.disconnect(); } catch(e){} }
422
+ if (audioCtx) { try { audioCtx.close(); } catch(e){} }
423
+ srcNode = null; audioCtx = null; analyser = null;
424
+ }
425
+
426
+ /* MediaStream を横取り: getUserMedia を wrap */
427
+ const origGetUserMedia = navigator.mediaDevices.getUserMedia.bind(navigator.mediaDevices);
428
+ navigator.mediaDevices.getUserMedia = function(constraints) {
429
+ return origGetUserMedia(constraints).then(function(stream) {
430
+ if (constraints && constraints.audio) {
431
+ const area = getRecArea();
432
+ if (area) {
433
+ startMonitoring(stream);
434
+ stream.getAudioTracks().forEach(function(track) {
435
+ track.addEventListener('ended', stopMonitoring);
436
+ });
437
+ }
438
+ }
439
+ return stream;
440
+ });
441
+ };
442
+ })();
443
+ </script>
444
  """
445
 
446
  CSS = """
 
1073
  # 録音 & サンプル追加(自動停止)
1074
  gr.Markdown("### 録音(自動停止)")
1075
  label_dd = gr.Radio(choices=LABELS, label="ラベル選択", interactive=True, elem_classes=["diamond-radio"])
1076
+ with gr.Column(elem_id="auto_rec_area"):
1077
+ audio_rec = gr.Audio(sources=["microphone"], type="numpy", label="マイク(録音→自動停止→自動保存)")
1078
+ gr.HTML('<div id="rec_status_js" class="rec-status">待機中... 録音ボタンを押してください</div>')
1079
+ rec_status_md = gr.Markdown("", elem_classes=["rec-status"])
 
 
1080
  undo_btn = gr.Button("Undo", size="lg")
1081
 
1082
  # データ一覧 & 編集
 
1113
 
1114
  # wiring
1115
  add_btn.click(add_label_cb, inputs=[label_box], outputs=[label_dd, table, relabel_dd])
 
1116
  undo_btn.click(undo_last_cb, inputs=[], outputs=[table])
1117
  reset_btn.click(reset_all_cb, inputs=[], outputs=[table, label_dd, relabel_dd, audio_rec, selected_idx_state])
1118
 
1119
+ # 録音完了(停止)時に自動保存
1120
+ audio_rec.stop_recording(auto_add_sample_cb, inputs=[audio_rec, label_dd], outputs=[table, audio_rec, rec_status_md])
1121
 
1122
  # select row -> update state + replay + relabel dropdown value
1123
  def _select_and_store(evt: gr.SelectData):