wmoto-ai committed on
Commit
cb5e942
·
verified ·
1 Parent(s): 4ddfbcd

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +31 -4
  2. index.html +472 -18
README.md CHANGED
@@ -1,10 +1,37 @@
1
  ---
2
- title: Moonshine Tiny Ja Demo
3
- emoji: 🌖
4
  colorFrom: blue
5
- colorTo: red
6
  sdk: static
7
  pinned: false
 
 
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Moonshine Tiny JA
3
+ emoji: 🎤
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: static
7
  pinned: false
8
+ license: other
9
+ license_name: moonshine-ai-community-license
10
+ license_link: https://github.com/usefulsensors/moonshine/blob/main/LICENSE
11
+ short_description: ブラウザで動作する日本語リアルタイム音声認識
12
  ---
13
 
14
+ # Moonshine Tiny JA Demo
15
+
16
+ ブラウザ上で動作する日本語リアルタイム文字起こしデモです。
17
+
18
+ ## 特徴
19
+
20
+ - **完全ローカル処理**: 音声データはサーバーに送信されません
21
+ - **リアルタイム**: 指定間隔(1〜6秒)ごとに文字起こし
22
+ - **軽量**: 27Mパラメータの小型モデル
23
+
24
+ ## 使用モデル
25
+
26
+ - [wmoto-ai/moonshine-tiny-ja-ONNX](https://huggingface.co/wmoto-ai/moonshine-tiny-ja-ONNX)
27
+ - ベース: [UsefulSensors/moonshine-tiny-ja](https://huggingface.co/UsefulSensors/moonshine-tiny-ja)
28
+
29
+ ## 技術スタック
30
+
31
+ - [Transformers.js](https://huggingface.co/docs/transformers.js) - ブラウザでのML推論
32
+ - ONNX Runtime Web - モデル実行
33
+ - Web Audio API - 音声処理
34
+
35
+ ## ライセンス
36
+
37
+ [Moonshine AI Community License](https://github.com/usefulsensors/moonshine/blob/main/LICENSE)
index.html CHANGED
@@ -1,19 +1,473 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="ja">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Moonshine Tiny JA - リアルタイム日本語文字起こし</title>
8
+ <meta name="description" content="ブラウザ上で動作する日本語音声認識デモ。Moonshine Tiny JAモデルをTransformers.jsで実行。">
9
+ <style>
10
+ * {
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
16
+ max-width: 800px;
17
+ margin: 0 auto;
18
+ padding: 20px;
19
+ background: #1a1a2e;
20
+ color: #eee;
21
+ min-height: 100vh;
22
+ }
23
+
24
+ h1 {
25
+ text-align: center;
26
+ color: #00d4ff;
27
+ margin-bottom: 5px;
28
+ }
29
+
30
+ .subtitle {
31
+ text-align: center;
32
+ color: #888;
33
+ font-size: 14px;
34
+ margin-bottom: 20px;
35
+ }
36
+
37
+ .status {
38
+ text-align: center;
39
+ padding: 10px;
40
+ border-radius: 8px;
41
+ margin: 20px 0;
42
+ background: #16213e;
43
+ }
44
+
45
+ .status.loading {
46
+ color: #ffa500;
47
+ }
48
+
49
+ .status.ready {
50
+ color: #00ff88;
51
+ }
52
+
53
+ .status.recording {
54
+ color: #ff4757;
55
+ }
56
+
57
+ .status.error {
58
+ color: #ff4757;
59
+ background: #2d1f1f;
60
+ }
61
+
62
+ button {
63
+ display: block;
64
+ width: 200px;
65
+ margin: 20px auto;
66
+ padding: 15px 30px;
67
+ font-size: 18px;
68
+ border: none;
69
+ border-radius: 50px;
70
+ cursor: pointer;
71
+ transition: all 0.3s;
72
+ }
73
+
74
+ button:disabled {
75
+ background: #555;
76
+ cursor: not-allowed;
77
+ }
78
+
79
+ #startBtn {
80
+ background: linear-gradient(135deg, #00d4ff, #00ff88);
81
+ color: #1a1a2e;
82
+ font-weight: bold;
83
+ }
84
+
85
+ #startBtn:hover:not(:disabled) {
86
+ transform: scale(1.05);
87
+ box-shadow: 0 0 20px rgba(0, 212, 255, 0.5);
88
+ }
89
+
90
+ #startBtn.recording {
91
+ background: linear-gradient(135deg, #ff4757, #ff6b81);
92
+ animation: pulse 1s infinite;
93
+ }
94
+
95
+ @keyframes pulse {
96
+ 0%, 100% {
97
+ box-shadow: 0 0 0 0 rgba(255, 71, 87, 0.4);
98
+ }
99
+ 50% {
100
+ box-shadow: 0 0 0 15px rgba(255, 71, 87, 0);
101
+ }
102
+ }
103
+
104
+ .slider-container {
105
+ margin: 20px 0;
106
+ padding: 15px;
107
+ background: #16213e;
108
+ border-radius: 8px;
109
+ }
110
+
111
+ .slider-container label {
112
+ display: block;
113
+ margin-bottom: 10px;
114
+ }
115
+
116
+ .slider-container input[type="range"] {
117
+ width: 100%;
118
+ cursor: pointer;
119
+ }
120
+
121
+ .slider-labels {
122
+ display: flex;
123
+ justify-content: space-between;
124
+ font-size: 12px;
125
+ color: #888;
126
+ margin-top: 5px;
127
+ }
128
+
129
+ #transcript {
130
+ background: #16213e;
131
+ border-radius: 12px;
132
+ padding: 20px;
133
+ min-height: 200px;
134
+ margin-top: 20px;
135
+ font-size: 18px;
136
+ line-height: 1.8;
137
+ white-space: pre-wrap;
138
+ word-wrap: break-word;
139
+ }
140
+
141
+ #transcript:empty::before {
142
+ content: "文字起こし結果がここに表示されます...";
143
+ color: #666;
144
+ }
145
+
146
+ #currentText {
147
+ color: #00d4ff;
148
+ font-style: italic;
149
+ min-height: 30px;
150
+ margin-top: 10px;
151
+ text-align: center;
152
+ }
153
+
154
+ .info {
155
+ background: #16213e;
156
+ border-radius: 8px;
157
+ padding: 15px;
158
+ margin: 20px 0;
159
+ font-size: 14px;
160
+ color: #aaa;
161
+ }
162
+
163
+ .info a {
164
+ color: #00d4ff;
165
+ text-decoration: none;
166
+ }
167
+
168
+ .info a:hover {
169
+ text-decoration: underline;
170
+ }
171
+
172
+ .progress-container {
173
+ background: #0f0f23;
174
+ border-radius: 10px;
175
+ height: 20px;
176
+ margin: 10px 0;
177
+ overflow: hidden;
178
+ }
179
+
180
+ .progress-bar {
181
+ height: 100%;
182
+ background: linear-gradient(90deg, #00d4ff, #00ff88);
183
+ width: 0%;
184
+ transition: width 0.3s;
185
+ }
186
+
187
+ .footer {
188
+ text-align: center;
189
+ margin-top: 30px;
190
+ padding-top: 20px;
191
+ border-top: 1px solid #333;
192
+ font-size: 12px;
193
+ color: #666;
194
+ }
195
+
196
+ .footer a {
197
+ color: #00d4ff;
198
+ text-decoration: none;
199
+ }
200
+ </style>
201
+ </head>
202
+
203
+ <body>
204
+ <h1>Moonshine Tiny JA</h1>
205
+ <p class="subtitle">ブラウザで動作する日本語リアルタイム文字起こし</p>
206
+
207
+ <div id="status" class="status loading">
208
+ モデルを読み込み中...
209
+ <div class="progress-container">
210
+ <div id="progressBar" class="progress-bar"></div>
211
+ </div>
212
+ </div>
213
+
214
+ <button id="startBtn" disabled>読み込み中...</button>
215
+
216
+ <div class="slider-container">
217
+ <label for="intervalSlider">
218
+ 録音間隔: <span id="intervalValue">3</span>秒
219
+ </label>
220
+ <input type="range" id="intervalSlider" min="1" max="6" step="0.5" value="3">
221
+ <div class="slider-labels">
222
+ <span>1秒 (高速)</span>
223
+ <span>6秒 (高精度)</span>
224
+ </div>
225
+ </div>
226
+
227
+ <div id="currentText"></div>
228
+ <div id="transcript"></div>
229
+
230
+ <div class="info">
231
+ <strong>使い方:</strong><br>
232
+ 1. モデルの読み込みを待つ(初回は数分かかります)<br>
233
+ 2. 「録音開始」ボタンをクリック<br>
234
+ 3. マイクに向かって話す<br>
235
+ 4. 指定間隔ごとに文字起こし結果が表示されます<br><br>
236
+ <strong>モデル:</strong> <a href="https://huggingface.co/wmoto-ai/moonshine-tiny-ja-ONNX" target="_blank">wmoto-ai/moonshine-tiny-ja-ONNX</a><br>
237
+ <strong>ベース:</strong> <a href="https://huggingface.co/UsefulSensors/moonshine-tiny-ja" target="_blank">UsefulSensors/moonshine-tiny-ja</a>
238
+ </div>
239
+
240
+ <div class="footer">
241
+ Powered by <a href="https://www.moonshine.ai/" target="_blank">Moonshine AI</a> |
242
+ <a href="https://huggingface.co/docs/transformers.js" target="_blank">Transformers.js</a><br>
243
+ Licensed under <a href="https://github.com/usefulsensors/moonshine/blob/main/LICENSE" target="_blank">Moonshine AI Community License</a>
244
+ </div>
245
+
246
+ <script type="module">
247
+ import {
248
+ MoonshineForConditionalGeneration,
249
+ AutoProcessor,
250
+ AutoTokenizer,
251
+ } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3';
252
+
253
// ---------------------------------------------------------------------------
// DOM handles used throughout the module.
// ---------------------------------------------------------------------------
const byId = (id) => document.getElementById(id);

const statusEl = byId('status');
const startBtn = byId('startBtn');
const transcriptEl = byId('transcript');
const currentTextEl = byId('currentText');
const progressBar = byId('progressBar');
const intervalSlider = byId('intervalSlider');
const intervalValue = byId('intervalValue');

// Mirror the slider position into the "<n>秒" label next to it.
const syncIntervalLabel = () => {
  intervalValue.textContent = intervalSlider.value;
};
intervalSlider.addEventListener('input', syncIntervalLabel);

// ---------------------------------------------------------------------------
// Shared mutable state.
// ---------------------------------------------------------------------------
// Inference objects; all three stay null until loadModel() has resolved.
let model = null;
let processor = null;
let tokenizer = null;

// Recording session state, owned by startRecording() / stopRecording().
let isRecording = false;
let mediaRecorder = null;
// Created lazily by transcribeAudio() the first time audio is decoded.
let audioContext = null;
// Encoded chunks collected from the recorder since the last transcription.
let audioChunks = [];

// Hugging Face Hub repository the model, processor and tokenizer load from.
const MODEL_ID = 'wmoto-ai/moonshine-tiny-ja-ONNX';
274
+
275
/**
 * Downloads the model, processor and tokenizer for MODEL_ID in parallel and
 * enables the start button once all three are ready.
 *
 * While loading, the status text and the progress bar are updated from the
 * model's progress callback. On failure the error is logged and shown in the
 * status box; the start button stays disabled.
 *
 * @returns {Promise<void>} Resolves in both the success and the failure case.
 */
async function loadModel() {
  // The progress bar is a child of the status element, so assigning
  // statusEl.textContent while loading would remove it from the page.
  // Only the leading text node is rewritten until loading has finished.
  const TEXT_NODE = 3;
  const setLoadingText = (text) => {
    const label = statusEl.firstChild;
    if (label && label.nodeType === TEXT_NODE) {
      label.nodeValue = text;
    } else {
      statusEl.insertBefore(document.createTextNode(text), label);
    }
  };

  try {
    setLoadingText('モデルを読み込み中... (初回は数分かかることがあります)');

    const progressCallback = (progress) => {
      if (progress.status !== 'progress') return;

      // A missing or zero total would otherwise render "NaN%" / "Infinity%".
      const ratio = progress.loaded / progress.total;
      if (!Number.isFinite(ratio)) return;

      const percent = Math.min(100, Math.max(0, Math.round(ratio * 100)));
      progressBar.style.width = `${percent}%`;
      setLoadingText(`モデルを読み込み中... ${percent}%`);
    };

    [model, processor, tokenizer] = await Promise.all([
      MoonshineForConditionalGeneration.from_pretrained(MODEL_ID, {
        dtype: 'fp32',
        progress_callback: progressCallback,
      }),
      AutoProcessor.from_pretrained(MODEL_ID),
      AutoTokenizer.from_pretrained(MODEL_ID),
    ]);

    // Replacing the whole content here also removes the finished progress bar.
    statusEl.textContent = '準備完了!録音を開始できます';
    statusEl.className = 'status ready';
    startBtn.textContent = '録音開始';
    startBtn.disabled = false;
  } catch (error) {
    console.error('Model loading error:', error);
    statusEl.textContent = `エラー: ${error.message}`;
    statusEl.className = 'status error';
  }
}
306
+
307
/**
 * Decodes one recorded chunk, converts it to 16 kHz mono, runs the model on
 * it and shows the result.
 *
 * Chunks are rejected before inference when they are shorter than 1600
 * samples or too quiet. Decoded text has long repetitions collapsed and is
 * dropped when it looks like a known hallucination. Accepted text is shown in
 * the "current" line and appended to the transcript.
 *
 * Does nothing until the model, processor and tokenizer are loaded. Errors
 * are logged and shown in the "current" line; they are never rethrown.
 *
 * @param {Blob} audioBlob - Encoded audio as produced by the recorder.
 * @returns {Promise<void>}
 */
async function transcribeAudio(audioBlob) {
  if (!model || !processor || !tokenizer) return;

  try {
    currentTextEl.textContent = '処理中...';

    const arrayBuffer = await audioBlob.arrayBuffer();

    if (!audioContext) {
      audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
    }

    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

    const targetSampleRate = 16000;
    const minSamples = 1600;
    const frameCount = Math.ceil(audioBuffer.duration * targetSampleRate);

    // Checked before the offline context is built: constructing it with a
    // zero length throws, which used to surface as a generic error instead of
    // the "too short" message.
    if (frameCount < minSamples) {
      currentTextEl.textContent = '(音声が短すぎます)';
      return;
    }

    // Render through a single-channel offline context to get mono samples at
    // the target rate.
    const offlineCtx = new OfflineAudioContext(1, frameCount, targetSampleRate);
    const source = offlineCtx.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(offlineCtx.destination);
    source.start();

    const resampled = await offlineCtx.startRendering();
    const audioData = resampled.getChannelData(0);

    // Skip inference on near-silent chunks (both RMS and peak must clear
    // their thresholds).
    let maxLevel = 0;
    let sumSquares = 0;
    for (const sample of audioData) {
      const magnitude = Math.abs(sample);
      if (magnitude > maxLevel) maxLevel = magnitude;
      sumSquares += sample * sample;
    }
    const rms = Math.sqrt(sumSquares / audioData.length);

    if (rms < 0.01 || maxLevel < 0.05) {
      currentTextEl.textContent = '(音声が小さすぎます)';
      return;
    }

    const inputs = await processor(audioData);

    // Token budget scales with the chunk interval: 25 per second, capped at 150.
    const intervalSec = parseFloat(intervalSlider.value);
    const maxTokens = Math.min(Math.round(intervalSec * 25), 150);
    const outputs = await model.generate({
      ...inputs,
      max_new_tokens: maxTokens,
    });

    let text = tokenizer.decode(outputs[0], { skip_special_tokens: true }).trim();

    // When some unit of 2+ characters occurs five or more times in a row,
    // collapse every run of four or more occurrences to a single one.
    const repeatPattern = /(.{2,}?)\1{4,}/;
    if (repeatPattern.test(text)) {
      text = text.replace(/(.{2,}?)\1{3,}/g, '$1');
    }

    // Phrases treated as hallucinations when they appear in a long output.
    const hallucinations = ['彼は私', '彼女は私', 'そう、そう'];
    const isHallucination = hallucinations.some((phrase) => text.includes(phrase) && text.length > 30);

    if (text && !isHallucination) {
      currentTextEl.textContent = text;
      transcriptEl.textContent += text + '\n';
    } else if (isHallucination) {
      currentTextEl.textContent = '(ノイズ検出)';
    } else {
      currentTextEl.textContent = '(音声が検出されませんでした)';
    }
  } catch (error) {
    console.error('Transcription error:', error);
    currentTextEl.textContent = `エラー: ${error.message}`;
  }
}
387
+
388
/**
 * Asks for microphone access and starts the record -> transcribe -> restart
 * loop.
 *
 * Each cycle records for the interval currently selected on the slider, stops
 * the recorder, passes the collected chunks to transcribeAudio(), and then
 * restarts the recorder. Recording is paused while a chunk is transcribed.
 *
 * Failures (permission denied, unsupported recorder, ...) are logged and shown
 * in the status box; the returned promise never rejects.
 *
 * @returns {Promise<void>}
 */
async function startRecording() {
  let stream = null;

  try {
    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        sampleRate: 16000,
      },
    });

    audioChunks = [];

    // Not every browser can record webm/opus; passing an unsupported mimeType
    // makes the constructor throw, so fall back to the browser default.
    const preferredMimeType = 'audio/webm;codecs=opus';
    const recorderOptions = MediaRecorder.isTypeSupported(preferredMimeType)
      ? { mimeType: preferredMimeType }
      : undefined;

    const recorder = new MediaRecorder(stream, recorderOptions);
    mediaRecorder = recorder;

    // Timers and stop handlers created here must only act on this session.
    // Without the identity check, leftovers from a stopped session would
    // operate on the recorder of a session started afterwards.
    const isCurrentSession = () => isRecording && mediaRecorder === recorder;

    recorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        audioChunks.push(event.data);
      }
    };

    // Ends the current chunk; the onstop handler below does the rest.
    const processAndRestart = () => {
      if (!isCurrentSession()) return;
      // stop() throws on a recorder that is already inactive.
      if (recorder.state === 'inactive') return;
      recorder.stop();
    };

    recorder.onstop = async () => {
      if (!isCurrentSession()) return;

      if (audioChunks.length > 0) {
        const audioBlob = new Blob(audioChunks, { type: recorder.mimeType || preferredMimeType });
        audioChunks = [];

        await transcribeAudio(audioBlob);
      }

      // Restart even when the cycle produced no data, so the loop cannot end
      // silently while the UI still shows "recording".
      if (isCurrentSession() && stream.active) {
        const nextIntervalMs = parseFloat(intervalSlider.value) * 1000;
        recorder.start(500);
        setTimeout(processAndRestart, nextIntervalMs);
      }
    };

    recorder.start(500);
    isRecording = true;

    const intervalMs = parseFloat(intervalSlider.value) * 1000;
    setTimeout(processAndRestart, intervalMs);

    statusEl.textContent = '録音中... マイクに向かって話してください';
    statusEl.className = 'status recording';
    startBtn.textContent = '録音停止';
    startBtn.classList.add('recording');
  } catch (error) {
    console.error('Recording error:', error);

    // Release the microphone if it was acquired but recording never started.
    if (stream && !isRecording) {
      stream.getTracks().forEach((track) => track.stop());
    }

    statusEl.textContent = `マイクエラー: ${error.message}`;
    statusEl.className = 'status error';
  }
}
445
+
446
/**
 * Stops the current recording session, releases the microphone and resets
 * the UI to the "ready" state.
 *
 * Safe to call when no recorder exists or when the recorder is already
 * inactive.
 */
function stopRecording() {
  if (mediaRecorder) {
    if (mediaRecorder.state !== 'inactive') {
      mediaRecorder.stop();
    }
    // Tracks are released regardless of recorder state. The recorder is
    // inactive while a chunk is being transcribed, and stopping at that
    // moment would otherwise leave the microphone open.
    mediaRecorder.stream.getTracks().forEach((track) => track.stop());
  }
  isRecording = false;
  audioChunks = [];

  statusEl.textContent = '録音停止。再開するにはボタンをクリック';
  statusEl.className = 'status ready';
  startBtn.textContent = '録音開始';
  startBtn.classList.remove('recording');
  currentTextEl.textContent = '';
}
460
+
461
// Toggle recording from the main button.
startBtn.addEventListener('click', async () => {
  if (isRecording) {
    stopRecording();
    return;
  }

  // isRecording only becomes true after the microphone permission prompt has
  // been answered. Without disabling the button, a second click during that
  // wait would start a second recording session.
  startBtn.disabled = true;
  try {
    await startRecording();
  } finally {
    startBtn.disabled = false;
  }
});

// Begin loading the model as soon as the page script runs; the button is
// enabled by loadModel() once loading succeeds.
loadModel();
470
+ </script>
471
+ </body>
472
+
473
  </html>