Isk5434 committed on
Commit
e85ca7f
·
verified ·
1 Parent(s): 8372e28

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +102 -279
app.py CHANGED
@@ -263,6 +263,89 @@ def add_sample_cb(audio, label):
263
  DATA.append({"audio": audio_n, "U": U, "label": label})
264
  return dataset_table(), gr.update(value=None)
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  def undo_last_cb():
267
  if len(DATA) == 0:
268
  return dataset_table()
@@ -325,212 +408,6 @@ HEAD = """
325
  <link rel="preconnect" href="https://fonts.googleapis.com">
326
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
327
  <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@300;400;500;600&display=swap" rel="stylesheet">
328
- <script>
329
- // ── Auto-stop recording with silence detection ──
330
- window._autoRec = {
331
- mediaStream: null,
332
- audioCtx: null,
333
- analyser: null,
334
- scriptNode: null,
335
- chunks: [],
336
- isRecording: false,
337
- speechDetected: false,
338
- silenceStart: 0,
339
- SILENCE_THRESH: 0.015,
340
- SILENCE_DURATION: 1.5,
341
- MIN_SPEECH_DURATION: 0.3,
342
- speechStart: 0,
343
-
344
- async start() {
345
- if (this.isRecording) return;
346
- try {
347
- this.mediaStream = await navigator.mediaDevices.getUserMedia({audio: true});
348
- } catch(e) {
349
- const el = document.getElementById('rec_status');
350
- if (el) el.textContent = 'マイク許可が必要です';
351
- return;
352
- }
353
- this.audioCtx = new (window.AudioContext || window.webkitAudioContext)({sampleRate: 16000});
354
- const source = this.audioCtx.createMediaStreamSource(this.mediaStream);
355
- this.analyser = this.audioCtx.createAnalyser();
356
- this.analyser.fftSize = 2048;
357
- source.connect(this.analyser);
358
-
359
- // ScriptProcessorNode to capture raw PCM
360
- this.scriptNode = this.audioCtx.createScriptProcessor(4096, 1, 1);
361
- this.chunks = [];
362
- this.isRecording = true;
363
- this.speechDetected = false;
364
- this.silenceStart = 0;
365
- this.speechStart = 0;
366
-
367
- const self = this;
368
- this.scriptNode.onaudioprocess = function(e) {
369
- if (!self.isRecording) return;
370
- const input = e.inputBuffer.getChannelData(0);
371
- const buf = new Float32Array(input.length);
372
- buf.set(input);
373
- self.chunks.push(buf);
374
-
375
- // RMS calculation
376
- let sum = 0;
377
- for (let i = 0; i < input.length; i++) sum += input[i] * input[i];
378
- const rms = Math.sqrt(sum / input.length);
379
- const now = self.audioCtx.currentTime;
380
-
381
- // Update status
382
- const el = document.getElementById('rec_status');
383
- const bar = document.getElementById('rec_level');
384
- if (bar) bar.style.width = Math.min(rms * 500, 100) + '%';
385
-
386
- if (rms > self.SILENCE_THRESH) {
387
- // Speech detected
388
- if (!self.speechDetected) {
389
- self.speechDetected = true;
390
- self.speechStart = now;
391
- }
392
- self.silenceStart = 0;
393
- if (el) el.textContent = '録音中... 🎙️';
394
- } else {
395
- // Silence
396
- if (self.speechDetected) {
397
- const speechDur = now - self.speechStart;
398
- if (speechDur >= self.MIN_SPEECH_DURATION) {
399
- if (self.silenceStart === 0) {
400
- self.silenceStart = now;
401
- } else if (now - self.silenceStart >= self.SILENCE_DURATION) {
402
- // Auto stop!
403
- if (el) el.textContent = '保存中...';
404
- self.stop(true);
405
- return;
406
- }
407
- }
408
- if (el && self.silenceStart > 0) {
409
- const remaining = self.SILENCE_DURATION - (now - self.silenceStart);
410
- el.textContent = '無音検出中... ' + remaining.toFixed(1) + 's';
411
- }
412
- } else {
413
- if (el) el.textContent = '待機中... 話してください';
414
- }
415
- }
416
- };
417
-
418
- source.connect(this.scriptNode);
419
- this.scriptNode.connect(this.audioCtx.destination);
420
-
421
- const el = document.getElementById('rec_status');
422
- if (el) el.textContent = '待機中... 話してください';
423
- const btn = document.getElementById('btn_auto_rec');
424
- if (btn) {
425
- btn.textContent = '録音中...';
426
- btn.classList.add('recording');
427
- }
428
- },
429
-
430
- stop(autoSave) {
431
- if (!this.isRecording) return;
432
- this.isRecording = false;
433
-
434
- // Stop media
435
- if (this.scriptNode) { this.scriptNode.disconnect(); this.scriptNode = null; }
436
- if (this.mediaStream) { this.mediaStream.getTracks().forEach(t => t.stop()); this.mediaStream = null; }
437
-
438
- const btn = document.getElementById('btn_auto_rec');
439
- if (btn) {
440
- btn.textContent = '録音開始';
441
- btn.classList.remove('recording');
442
- }
443
-
444
- if (autoSave && this.chunks.length > 0 && this.speechDetected) {
445
- // Concatenate chunks
446
- let totalLen = 0;
447
- for (const c of this.chunks) totalLen += c.length;
448
- const fullAudio = new Float32Array(totalLen);
449
- let offset = 0;
450
- for (const c of this.chunks) { fullAudio.set(c, offset); offset += c.length; }
451
-
452
- // Trim trailing silence (remove last SILENCE_DURATION worth of samples)
453
- const sr = this.audioCtx ? this.audioCtx.sampleRate : 16000;
454
- const trimSamples = Math.floor(this.SILENCE_DURATION * sr);
455
- const trimmedLen = Math.max(sr, totalLen - trimSamples);
456
- const trimmed = fullAudio.slice(0, trimmedLen);
457
-
458
- // Convert to WAV blob
459
- const wav = this._encodeWAV(trimmed, sr);
460
- const blob = new Blob([wav], {type: 'audio/wav'});
461
-
462
- // Set to Gradio hidden audio input
463
- this._setGradioAudio(blob);
464
- }
465
-
466
- if (this.audioCtx) { this.audioCtx.close(); this.audioCtx = null; }
467
- this.chunks = [];
468
-
469
- const el = document.getElementById('rec_status');
470
- if (el) el.textContent = autoSave ? '自動保存完了 ✓' : '停止';
471
- },
472
-
473
- _encodeWAV(samples, sampleRate) {
474
- const buffer = new ArrayBuffer(44 + samples.length * 2);
475
- const view = new DataView(buffer);
476
- function writeStr(o, s) { for (let i = 0; i < s.length; i++) view.setUint8(o+i, s.charCodeAt(i)); }
477
- writeStr(0, 'RIFF');
478
- view.setUint32(4, 36 + samples.length * 2, true);
479
- writeStr(8, 'WAVE');
480
- writeStr(12, 'fmt ');
481
- view.setUint32(16, 16, true);
482
- view.setUint16(20, 1, true);
483
- view.setUint16(22, 1, true);
484
- view.setUint32(24, sampleRate, true);
485
- view.setUint32(28, sampleRate * 2, true);
486
- view.setUint16(32, 2, true);
487
- view.setUint16(34, 16, true);
488
- writeStr(36, 'data');
489
- view.setUint32(40, samples.length * 2, true);
490
- for (let i = 0; i < samples.length; i++) {
491
- let s = Math.max(-1, Math.min(1, samples[i]));
492
- view.setInt16(44 + i * 2, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
493
- }
494
- return buffer;
495
- },
496
-
497
- _setGradioAudio(blob) {
498
- // Find the hidden audio input and set the file
499
- const url = URL.createObjectURL(blob);
500
- const hiddenAudio = document.querySelector('#hidden_audio_input audio, #hidden_audio_input input[type="file"]');
501
-
502
- // Use DataTransfer to programmatically set file on Gradio's file input
503
- const file = new File([blob], 'recording.wav', {type: 'audio/wav'});
504
- const dt = new DataTransfer();
505
- dt.items.add(file);
506
-
507
- // Find file input inside hidden audio component
508
- const container = document.getElementById('hidden_audio_input');
509
- if (!container) return;
510
-
511
- // Gradio 6: Upload via drag event or input change
512
- const fileInput = container.querySelector('input[type="file"]');
513
- if (fileInput) {
514
- fileInput.files = dt.files;
515
- fileInput.dispatchEvent(new Event('change', {bubbles: true}));
516
- // Auto-click submit after short delay
517
- setTimeout(() => {
518
- const submitBtn = document.getElementById('btn_auto_submit');
519
- if (submitBtn) submitBtn.click();
520
- }, 500);
521
- return;
522
- }
523
-
524
- // Fallback: dispatch drop event
525
- const dropEvent = new DragEvent('drop', {bubbles: true, dataTransfer: dt});
526
- container.dispatchEvent(dropEvent);
527
- setTimeout(() => {
528
- const submitBtn = document.getElementById('btn_auto_submit');
529
- if (submitBtn) submitBtn.click();
530
- }, 500);
531
- }
532
- };
533
- </script>
534
  """
535
 
536
  CSS = """
@@ -1033,64 +910,20 @@ textarea {
1033
  }
1034
 
1035
  /* ========================================
1036
- 自動録音: ボタン & レベルメータ
1037
  ======================================== */
1038
- .auto-rec-area {
1039
  text-align: center;
1040
- padding: 16px 0;
1041
- }
1042
- #btn_auto_rec {
1043
- display: inline-block;
1044
- padding: 16px 40px;
1045
- border: 2px solid transparent;
1046
- border-image: linear-gradient(135deg, #b2d8e8 0%, #b2e0d4 50%, #c8e8c0 100%) 1;
1047
- background: transparent;
1048
  font-family: 'Cormorant Garamond', 'Georgia', serif;
1049
- font-size: 15px;
1050
- font-weight: 500;
1051
- letter-spacing: 0.15em;
1052
- text-transform: uppercase;
1053
- color: #2a3a2a;
1054
- cursor: pointer;
1055
- transition: all 0.3s ease;
1056
- touch-action: manipulation;
1057
- }
1058
- #btn_auto_rec:hover {
1059
- background: rgba(178,216,210,0.15);
1060
- }
1061
- #btn_auto_rec.recording {
1062
- border-image: none;
1063
- border-color: #d45050;
1064
- color: #d45050;
1065
- animation: recPulse 1.2s ease infinite;
1066
  }
1067
  @keyframes recPulse {
1068
  0%, 100% { opacity: 1; }
1069
  50% { opacity: 0.6; }
1070
  }
1071
- .rec-level-wrap {
1072
- margin: 12px auto 0;
1073
- width: 80%;
1074
- max-width: 280px;
1075
- height: 4px;
1076
- background: rgba(138,170,138,0.2);
1077
- border-radius: 0;
1078
- overflow: hidden;
1079
- }
1080
- .rec-level-bar {
1081
- height: 100%;
1082
- width: 0%;
1083
- background: linear-gradient(90deg, #b2d8e8, #90d4c8, #b0e0a0);
1084
- transition: width 0.08s ease;
1085
- }
1086
- #rec_status {
1087
- display: block;
1088
- margin-top: 8px;
1089
- font-family: 'Cormorant Garamond', 'Georgia', serif;
1090
- font-size: 13px;
1091
- color: #4a6a4a;
1092
- letter-spacing: 0.06em;
1093
- }
1094
 
1095
  /* ========================================
1096
  スクロールバー
@@ -1204,28 +1037,16 @@ with gr.Blocks() as demo:
1204
  label_box = gr.Textbox(label="新ラベル", placeholder="例: yes", scale=3)
1205
  add_btn = gr.Button("追加", size="lg", scale=1)
1206
 
1207
- # 録音 & サンプル追加
1208
- gr.Markdown("### 録音")
1209
  label_dd = gr.Radio(choices=LABELS, label="ラベル選択", interactive=True, elem_classes=["diamond-radio"])
1210
-
1211
- # 自動録音UI(JSで制御)
1212
- gr.HTML("""
1213
- <div class="auto-rec-area">
1214
- <button id="btn_auto_rec" type="button"
1215
- onclick="window._autoRec.isRecording ? window._autoRec.stop(false) : window._autoRec.start()">
1216
- 録音開始
1217
- </button>
1218
- <div class="rec-level-wrap"><div class="rec-level-bar" id="rec_level"></div></div>
1219
- <span id="rec_status">ラベルを選択して録音開始</span>
1220
- </div>
1221
- """)
1222
-
1223
- # 隠し音声入力(JSからWAVを受け取る)
1224
- audio_rec = gr.Audio(type="numpy", label="録音データ", visible=False, elem_id="hidden_audio_input")
1225
- add_sample_btn = gr.Button("自動保存", variant="primary", size="lg", visible=False, elem_id="btn_auto_submit")
1226
- with gr.Row():
1227
- undo_btn = gr.Button("Undo", size="lg")
1228
- rerec_btn = gr.Button("Clear", size="lg")
1229
 
1230
  # データ一覧 & 編集
1231
  gr.Markdown("### データ一覧")
@@ -1234,7 +1055,7 @@ with gr.Blocks() as demo:
1234
  value=dataset_table(),
1235
  datatype=["number", "str", "number"],
1236
  row_count=(6, "dynamic"),
1237
- col_count=(3, "fixed"),
1238
  interactive=False,
1239
  elem_id="data_table"
1240
  )
@@ -1263,9 +1084,11 @@ with gr.Blocks() as demo:
1263
  add_btn.click(add_label_cb, inputs=[label_box], outputs=[label_dd, table, relabel_dd])
1264
  add_sample_btn.click(add_sample_cb, inputs=[audio_rec, label_dd], outputs=[table, audio_rec])
1265
  undo_btn.click(undo_last_cb, inputs=[], outputs=[table])
1266
- rerec_btn.click(clear_rec_cb, inputs=[], outputs=[audio_rec])
1267
  reset_btn.click(reset_all_cb, inputs=[], outputs=[table, label_dd, relabel_dd, audio_rec, selected_idx_state])
1268
 
 
 
 
1269
  # select row -> update state + replay + relabel dropdown value
1270
  def _select_and_store(evt: gr.SelectData):
1271
  if evt is None or evt.index is None:
 
263
  DATA.append({"audio": audio_n, "U": U, "label": label})
264
  return dataset_table(), gr.update(value=None)
265
 
266
+ # ── 自動録音: ストリーミング蓄積 + 無音検出 ──
267
+
268
+ SILENCE_THRESH = 0.012  # a chunk whose RMS is at or below this value is treated as silence
269
+ SILENCE_CHUNKS_NEEDED = 3 # about 1.5 s (stream_every=0.5 s, so 0.5 x 3 = 1.5 s)
270
+ MIN_SPEECH_CHUNKS = 2 # require at least 2 chunks of speech
271
+
272
+ def _new_rec_state():
273
+ return {"chunks": [], "speech_detected": False, "silence_count": 0, "saved": False, "status": "待機中"}
274
+
275
def _clear_rec_buffers(rec_state):
    """Drop all buffered audio and counters so the next utterance starts clean."""
    rec_state["chunks"] = []
    rec_state["pending"] = []
    rec_state["speech_count"] = 0
    rec_state["speech_detected"] = False
    rec_state["silence_count"] = 0


def _save_utterance(chunks, label):
    """Join buffered chunks into one peak-normalized clip, store it, and return the status text."""
    label = (label or "").strip()
    if label not in LABELS:
        return "ラベルを選択してください"

    # Bring every chunk to the project sample rate before joining them.
    pieces = [
        y_chunk if s == SR else librosa.resample(y_chunk, orig_sr=s, target_sr=SR)
        for s, y_chunk in chunks
    ]
    full_y = np.concatenate(pieces).astype(np.float32)
    # Peak-normalize; the epsilon avoids a division by zero on an all-zero clip.
    full_y /= (np.max(np.abs(full_y)) + 1e-9)

    audio_n = (SR, full_y)
    U = audio_to_sequence(audio_n)
    if U is None or len(U) < 5:
        return "音声が短すぎます"

    DATA.append({"audio": audio_n, "U": U, "label": label})
    return f"✓ 自動保存完了 (idx={len(DATA)-1})"


def rec_stream_cb(chunk, label, rec_state):
    """Accumulate streamed microphone chunks and auto-save when the speaker goes quiet.

    A chunk whose RMS exceeds ``SILENCE_THRESH`` counts as speech. Once speech
    has started, ``SILENCE_CHUNKS_NEEDED`` consecutive quiet chunks end the
    utterance; it is saved to ``DATA`` if it contains at least
    ``MIN_SPEECH_CHUNKS`` speech chunks and ``label`` is a known label.

    Quiet chunks that arrive after speech has started are parked in
    ``rec_state["pending"]``. If speech resumes they are moved into the
    recording, so pauses inside an utterance are preserved; if the silence
    persists they are discarded, so trailing silence is not saved.

    Args:
        chunk: ``(sample_rate, samples)`` for the newest streamed chunk, or ``None``.
        label: Currently selected label text (may be ``None``).
        rec_state: State dict from the previous call, or ``None`` on the first call.
            The keys ``pending`` and ``speech_count`` are added lazily here.

    Returns:
        ``(rec_state, table_update, status_text)``. The table is refreshed only
        when the stream has no chunk or an utterance has just been finalized;
        otherwise a no-op ``gr.update()`` is returned.
    """
    # Start over on the first call and right after a finished utterance.
    if rec_state is None or rec_state.get("saved", False):
        rec_state = _new_rec_state()

    if chunk is None:
        return rec_state, dataset_table(), "待機中... マイクを開始してください"

    sr, y = chunk
    if y is None or len(y) < 10:
        return rec_state, gr.update(), rec_state.get("status", "")

    y = _mono_float32(y)
    rms = float(np.sqrt(np.mean(y ** 2)))

    if rms > SILENCE_THRESH:
        # Speech: first restore any pause that turned out to be mid-utterance.
        rec_state["chunks"].extend(rec_state.pop("pending", []))
        rec_state["chunks"].append((sr, y.copy()))
        rec_state["speech_count"] = rec_state.get("speech_count", 0) + 1
        rec_state["speech_detected"] = True
        rec_state["silence_count"] = 0
        n = rec_state["speech_count"]
        rec_state["status"] = f"録音中... 🎙️ ({n} chunks)"
        return rec_state, gr.update(), rec_state["status"]

    if not rec_state["speech_detected"]:
        # Quiet and nothing recorded yet: keep waiting.
        rec_state["status"] = "待機中... 話してください 🎤"
        return rec_state, gr.update(), rec_state["status"]

    # Quiet after speech: park the chunk until we know whether speech resumes.
    rec_state.setdefault("pending", []).append((sr, y.copy()))
    rec_state["silence_count"] = rec_state.get("silence_count", 0) + 1
    remaining = max(0, SILENCE_CHUNKS_NEEDED - rec_state["silence_count"])
    rec_state["status"] = f"無音検出中... あと{remaining}で自動保存"

    if rec_state["silence_count"] < SILENCE_CHUNKS_NEEDED:
        return rec_state, gr.update(), rec_state["status"]

    if rec_state.get("speech_count", 0) < MIN_SPEECH_CHUNKS:
        # Too little speech to be a real utterance: discard and listen again.
        rec_state["status"] = "音声が短すぎます。もう一度話してください"
        _clear_rec_buffers(rec_state)
        return rec_state, gr.update(), rec_state["status"]

    rec_state["status"] = _save_utterance(rec_state["chunks"], label)
    # Mark as finished so the next call starts from a fresh state.
    rec_state["saved"] = True
    _clear_rec_buffers(rec_state)
    return rec_state, dataset_table(), rec_state["status"]
348
+
349
  def undo_last_cb():
350
  if len(DATA) == 0:
351
  return dataset_table()
 
408
  <link rel="preconnect" href="https://fonts.googleapis.com">
409
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
410
  <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@300;400;500;600&display=swap" rel="stylesheet">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  """
412
 
413
  CSS = """
 
910
  }
911
 
912
  /* ========================================
913
+ 自動録音: ステータス表示
914
  ======================================== */
915
+ .rec-status {
916
  text-align: center;
917
+ padding: 8px 12px;
 
 
 
 
 
 
 
918
  font-family: 'Cormorant Garamond', 'Georgia', serif;
919
+ font-size: 14px;
920
+ color: #4a6a4a;
921
+ letter-spacing: 0.06em;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
  }
923
  @keyframes recPulse {
924
  0%, 100% { opacity: 1; }
925
  50% { opacity: 0.6; }
926
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
927
 
928
  /* ========================================
929
  スクロールバー
 
1037
  label_box = gr.Textbox(label="新ラベル", placeholder="例: yes", scale=3)
1038
  add_btn = gr.Button("追加", size="lg", scale=1)
1039
 
1040
+ # 録音 & サンプル追加(自動停止)
1041
+ gr.Markdown("### 録音(自動停止)")
1042
  label_dd = gr.Radio(choices=LABELS, label="ラベル選択", interactive=True, elem_classes=["diamond-radio"])
1043
+ rec_audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="マイク(開始で録音)")
1044
+ rec_status_md = gr.Markdown("待機中... マイクを開始してください", elem_classes=["rec-status"])
1045
+ rec_state = gr.State(None)
1046
+ # 隠し(手動追加用に残す)
1047
+ audio_rec = gr.Audio(type="numpy", visible=False)
1048
+ add_sample_btn = gr.Button("追加", variant="primary", size="lg", visible=False)
1049
+ undo_btn = gr.Button("Undo", size="lg")
 
 
 
 
 
 
 
 
 
 
 
 
1050
 
1051
  # データ一覧 & 編集
1052
  gr.Markdown("### データ一覧")
 
1055
  value=dataset_table(),
1056
  datatype=["number", "str", "number"],
1057
  row_count=(6, "dynamic"),
1058
+ column_count=(3, "fixed"),
1059
  interactive=False,
1060
  elem_id="data_table"
1061
  )
 
1084
  add_btn.click(add_label_cb, inputs=[label_box], outputs=[label_dd, table, relabel_dd])
1085
  add_sample_btn.click(add_sample_cb, inputs=[audio_rec, label_dd], outputs=[table, audio_rec])
1086
  undo_btn.click(undo_last_cb, inputs=[], outputs=[table])
 
1087
  reset_btn.click(reset_all_cb, inputs=[], outputs=[table, label_dd, relabel_dd, audio_rec, selected_idx_state])
1088
 
1089
+ # ストリーミング録音 → 自動停止 & 保存
1090
+ rec_audio.stream(rec_stream_cb, inputs=[rec_audio, label_dd, rec_state], outputs=[rec_state, table, rec_status_md], stream_every=0.5)
1091
+
1092
  # select row -> update state + replay + relabel dropdown value
1093
  def _select_and_store(evt: gr.SelectData):
1094
  if evt is None or evt.index is None: