KikKoh commited on
Commit
250d7d9
·
1 Parent(s): 5b96b14

first upload

Browse files
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.9

# Run as an unprivileged user (required by Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies first so this layer stays cached until requirements change.
COPY --chown=user ./requirements.txt requirements.txt
# Merge the two pip invocations into one layer and avoid caching wheels,
# which keeps the image smaller than separate RUN steps.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
  title: Hokkien
3
- emoji: 🔥
4
- colorFrom: gray
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
9
- short_description: ' 台語語音辨識示範,使用 Wav2Vec2 模型將錄音轉成羅馬拼音;使用 whisper 模型將錄音轉成中文。'
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Hokkien
3
+ emoji: 👀
4
+ colorFrom: green
5
+ colorTo: pink
6
  sdk: docker
7
  pinned: false
8
  license: apache-2.0
9
+ short_description: 台語語音辨識示範,使用 Wav2Vec2 模型將錄音轉成羅馬拼音;使用 whisper 模型將錄音轉成中文。
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI entry point: mounts the static demo UI and the transcription API."""

from fastapi import FastAPI
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles

from demo.app.api import router as api_router  # API routes live under demo/app

app = FastAPI()
app.include_router(api_router)

# Serve the front-end assets (JS/CSS) under /static.
app.mount("/static", StaticFiles(directory="demo/static"), name="static")


@app.get("/")
async def index():
    """Serve the single-page demo UI."""
    return FileResponse("demo/static/index.html")
demo/app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Intentionally empty: marks demo/app as a Python package.
demo/app/api.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""API routes: accept an audio upload and return a Wav2Vec2 transcription."""

from fastapi import APIRouter, UploadFile
from fastapi.responses import JSONResponse
import os
import tempfile
import torch

from demo.app.preprocess import preprocess_audio
from demo.app.model_wav2vec2 import infer as wav2vec2_infer

router = APIRouter()


@router.post("/transcribe")
async def transcribe(file: UploadFile):
    """Transcribe an uploaded audio file with the Wav2Vec2 model.

    The upload is spilled to a temporary file so the preprocessing code can
    read it by path. Returns JSON with the romanized transcription and the
    mean per-frame confidence rounded to 4 decimal places.
    """
    # file.filename can be None for raw multipart parts; fall back to "".
    suffix = os.path.splitext(file.filename or "")[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        waveform, sr = preprocess_audio(tmp_path)
    finally:
        # Always remove the temp file, even when preprocessing raises;
        # the original leaked the file on failure.
        os.remove(tmp_path)

    wav2vec2_text, wav2vec2_conf = wav2vec2_infer(waveform.squeeze(0).cpu().numpy(), sr)

    return JSONResponse({
        "transcription": wav2vec2_text,
        "confidence": round(wav2vec2_conf, 4)
    })
demo/app/model_wav2vec2.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Wav2Vec2 CTC inference: the fine-tuned model is loaded once at import time."""

import torch
from torch.nn.functional import softmax
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Directory holding the fine-tuned checkpoint and its processor config.
_MODEL_DIR = "demo/my-wav2vec2"

# Use the GPU when one is available; inference-only, so eval() up front.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Wav2Vec2ForCTC.from_pretrained(_MODEL_DIR).to(device).eval()
processor = Wav2Vec2Processor.from_pretrained(_MODEL_DIR)
def ctc_decode(token_ids, blank_id=0):
    """Collapse a raw frame-wise CTC prediction into text.

    Drops blank tokens and consecutive duplicates, then maps the surviving
    ids back to characters with the processor's tokenizer; any pad tokens
    left in the string are stripped out.
    """
    decoded_ids = []
    previous = None
    for current in token_ids:
        # CTC collapse rule: keep a token only when it is not the blank
        # symbol and differs from the token that preceded it.
        if current != blank_id and current != previous:
            decoded_ids.append(current)
        previous = current
    tokens = processor.tokenizer.convert_ids_to_tokens(decoded_ids)
    return "".join(tokens).replace(processor.tokenizer.pad_token, "").strip()
19
+
def infer(waveform, sample_rate):
    """Run Wav2Vec2 CTC inference on a mono waveform.

    Returns a ``(transcription, confidence)`` pair, where confidence is the
    mean over frames of each frame's maximum softmax probability.
    """
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits

    best_ids = torch.argmax(logits, dim=-1)[0].tolist()
    transcription = ctc_decode(best_ids)

    # Confidence: average the highest class probability of every frame.
    frame_probs = softmax(logits, dim=-1)
    frame_max, _ = torch.max(frame_probs, dim=-1)
    confidence = frame_max.mean().item()

    return transcription, confidence
demo/app/preprocess.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Audio preprocessing: load, mix down to mono, and resample to 16 kHz."""

import os

import soundfile as sf
import torch
import torchaudio

# All model inputs are resampled to this rate (matches the Wav2Vec2 processor).
target_sample_rate = 16000
def preprocess_audio(file_path):
    """Load an audio file and return a ``(1, samples)`` float tensor at 16 kHz.

    Supports ``.pt`` tensors saved with ``torch.save`` (assumed to already be
    at the target sample rate — TODO confirm with callers) and any format
    soundfile can read. Multi-channel audio is averaged down to mono.

    Returns:
        (waveform, sample_rate): waveform of shape (1, num_samples) and the
        target sample rate (16000).
    """
    suffix = os.path.splitext(file_path)[1]
    if suffix == ".pt":
        waveform = torch.load(file_path)
        # Bug fix: a 1-D saved tensor needs a channel dimension, otherwise the
        # mono-mix below would treat samples as channels and average over time.
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)
        sr = target_sample_rate
    else:
        data, sr = sf.read(file_path)
        waveform = torch.tensor(data).float()
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)   # (samples,) -> (1, samples)
        else:
            waveform = waveform.permute(1, 0)  # (samples, ch) -> (ch, samples)

    # Mix multi-channel audio down to a single channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the source rate differs from the target.
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        waveform = resampler(waveform)

    return waveform, target_sample_rate
demo/my-wav2vec2/config.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "adapter_attn_dim": null,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_norm": "group",
51
+ "feat_proj_dropout": 0.1,
52
+ "feat_quantizer_dropout": 0.0,
53
+ "final_dropout": 0.0,
54
+ "freeze_feat_extract_train": true,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.1,
57
+ "hidden_size": 768,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 3072,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.0,
62
+ "mask_channel_length": 10,
63
+ "mask_channel_min_space": 1,
64
+ "mask_channel_other": 0.0,
65
+ "mask_channel_prob": 0.0,
66
+ "mask_channel_selection": "static",
67
+ "mask_feature_length": 10,
68
+ "mask_feature_min_masks": 0,
69
+ "mask_feature_prob": 0.0,
70
+ "mask_time_length": 10,
71
+ "mask_time_min_masks": 2,
72
+ "mask_time_min_space": 1,
73
+ "mask_time_other": 0.0,
74
+ "mask_time_prob": 0.05,
75
+ "mask_time_selection": "static",
76
+ "model_type": "wav2vec2",
77
+ "no_mask_channel_overlap": false,
78
+ "no_mask_time_overlap": false,
79
+ "num_adapter_layers": 3,
80
+ "num_attention_heads": 12,
81
+ "num_codevector_groups": 2,
82
+ "num_codevectors_per_group": 320,
83
+ "num_conv_pos_embedding_groups": 16,
84
+ "num_conv_pos_embeddings": 128,
85
+ "num_feat_extract_layers": 7,
86
+ "num_hidden_layers": 12,
87
+ "num_negatives": 100,
88
+ "output_hidden_size": 768,
89
+ "pad_token_id": 0,
90
+ "proj_codevector_dim": 256,
91
+ "tdnn_dilation": [
92
+ 1,
93
+ 2,
94
+ 3,
95
+ 1,
96
+ 1
97
+ ],
98
+ "tdnn_dim": [
99
+ 512,
100
+ 512,
101
+ 512,
102
+ 512,
103
+ 1500
104
+ ],
105
+ "tdnn_kernel": [
106
+ 5,
107
+ 3,
108
+ 3,
109
+ 1,
110
+ 1
111
+ ],
112
+ "torch_dtype": "float32",
113
+ "transformers_version": "4.53.3",
114
+ "use_weighted_layer_sum": false,
115
+ "vocab_size": 99,
116
+ "xvector_output_dim": 512
117
+ }
demo/my-wav2vec2/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
demo/my-wav2vec2/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
demo/my-wav2vec2/tokenizer.json ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<s>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "</s>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 98,
35
+ "content": "<pad>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": null,
44
+ "pre_tokenizer": {
45
+ "type": "Split",
46
+ "pattern": {
47
+ "String": ""
48
+ },
49
+ "behavior": "Isolated",
50
+ "invert": false
51
+ },
52
+ "post_processor": null,
53
+ "decoder": null,
54
+ "model": {
55
+ "type": "Unigram",
56
+ "unk_id": null,
57
+ "vocab": [
58
+ [
59
+ "<unk>",
60
+ 0.0
61
+ ],
62
+ [
63
+ "<s>",
64
+ 0.0
65
+ ],
66
+ [
67
+ "</s>",
68
+ 0.0
69
+ ],
70
+ [
71
+ " ",
72
+ -2.0491303237853096
73
+ ],
74
+ [
75
+ "-",
76
+ -2.438041258277112
77
+ ],
78
+ [
79
+ "n",
80
+ -2.501524477266583
81
+ ],
82
+ [
83
+ "t",
84
+ -2.638661273777304
85
+ ],
86
+ [
87
+ "h",
88
+ -2.692965388541229
89
+ ],
90
+ [
91
+ "i",
92
+ -2.694754512457008
93
+ ],
94
+ [
95
+ "s",
96
+ -2.8923410887869334
97
+ ],
98
+ [
99
+ "k",
100
+ -3.1970415940597174
101
+ ],
102
+ [
103
+ "u",
104
+ -3.2651700304177123
105
+ ],
106
+ [
107
+ "g",
108
+ -3.4248376939813774
109
+ ],
110
+ [
111
+ "a",
112
+ -3.4963570859315585
113
+ ],
114
+ [
115
+ "á",
116
+ -3.9240834819779824
117
+ ],
118
+ [
119
+ "o",
120
+ -3.975795370165889
121
+ ],
122
+ [
123
+ "l",
124
+ -4.007994364937257
125
+ ],
126
+ [
127
+ "̍",
128
+ -4.170577777562061
129
+ ],
130
+ [
131
+ ".",
132
+ -4.189869429609686
133
+ ],
134
+ [
135
+ "â",
136
+ -4.25267258530512
137
+ ],
138
+ [
139
+ "ā",
140
+ -4.298943240036779
141
+ ],
142
+ [
143
+ "p",
144
+ -4.3723955360015605
145
+ ],
146
+ [
147
+ "à",
148
+ -4.461337584827133
149
+ ],
150
+ [
151
+ "b",
152
+ -4.466076323196452
153
+ ],
154
+ [
155
+ ",",
156
+ -4.499891555448983
157
+ ],
158
+ [
159
+ "í",
160
+ -4.578458027626528
161
+ ],
162
+ [
163
+ "m",
164
+ -4.5786016851469995
165
+ ],
166
+ [
167
+ "e",
168
+ -4.749594791761277
169
+ ],
170
+ [
171
+ "ó",
172
+ -4.759358486978165
173
+ ],
174
+ [
175
+ "ī",
176
+ -4.845970910394813
177
+ ],
178
+ [
179
+ "ì",
180
+ -4.851051839639437
181
+ ],
182
+ [
183
+ "ê",
184
+ -4.877819521265044
185
+ ],
186
+ [
187
+ "ē",
188
+ -4.927070017931555
189
+ ],
190
+ [
191
+ "ō",
192
+ -5.013998122557426
193
+ ],
194
+ [
195
+ "î",
196
+ -5.099728488704239
197
+ ],
198
+ [
199
+ "ū",
200
+ -5.347765015858414
201
+ ],
202
+ [
203
+ "ô",
204
+ -5.365594702921749
205
+ ],
206
+ [
207
+ "T",
208
+ -5.418076380746381
209
+ ],
210
+ [
211
+ "ò",
212
+ -5.444021231995655
213
+ ],
214
+ [
215
+ "I",
216
+ -5.644843649981011
217
+ ],
218
+ [
219
+ "ú",
220
+ -5.656596146044344
221
+ ],
222
+ [
223
+ "̄",
224
+ -5.816993865454794
225
+ ],
226
+ [
227
+ "è",
228
+ -5.825455837126889
229
+ ],
230
+ [
231
+ "L",
232
+ -5.997384198304952
233
+ ],
234
+ [
235
+ "j",
236
+ -6.023851907952154
237
+ ],
238
+ [
239
+ "é",
240
+ -6.0893458766159565
241
+ ],
242
+ [
243
+ "û",
244
+ -6.36828010754205
245
+ ],
246
+ [
247
+ "ù",
248
+ -6.395308778217399
249
+ ],
250
+ [
251
+ "K",
252
+ -6.453533494168924
253
+ ],
254
+ [
255
+ "G",
256
+ -6.49075152735316
257
+ ],
258
+ [
259
+ "?",
260
+ -6.808536997604716
261
+ ],
262
+ [
263
+ "H",
264
+ -6.961241358131142
265
+ ],
266
+ [
267
+ "S",
268
+ -7.013969214753503
269
+ ],
270
+ [
271
+ "“",
272
+ -7.060997180654681
273
+ ],
274
+ [
275
+ "”",
276
+ -7.060997180654681
277
+ ],
278
+ [
279
+ "!",
280
+ -7.1527881697284315
281
+ ],
282
+ [
283
+ "̂",
284
+ -7.164162696539445
285
+ ],
286
+ [
287
+ "ǹ",
288
+ -7.416567092235995
289
+ ],
290
+ [
291
+ "B",
292
+ -7.561557318255249
293
+ ],
294
+ [
295
+ "ń",
296
+ -7.616965035855269
297
+ ],
298
+ [
299
+ "P",
300
+ -7.724493393771001
301
+ ],
302
+ [
303
+ "A",
304
+ -7.790034021304488
305
+ ],
306
+ [
307
+ "M",
308
+ -8.091475968442492
309
+ ],
310
+ [
311
+ "N",
312
+ -8.58636336389603
313
+ ],
314
+ [
315
+ "Ū",
316
+ -8.677334622535016
317
+ ],
318
+ [
319
+ "J",
320
+ -8.836547433198561
321
+ ],
322
+ [
323
+ "U",
324
+ -9.428393546992902
325
+ ],
326
+ [
327
+ "À",
328
+ -9.976578745300651
329
+ ],
330
+ [
331
+ "O",
332
+ -10.076652901770805
333
+ ],
334
+ [
335
+ "Í",
336
+ -10.112367187485088
337
+ ],
338
+ [
339
+ ";",
340
+ -10.187865762983664
341
+ ],
342
+ [
343
+ "E",
344
+ -10.227865762983663
345
+ ],
346
+ [
347
+ "Ī",
348
+ -10.45608428359349
349
+ ],
350
+ [
351
+ "Â",
352
+ -10.623094947508182
353
+ ],
354
+ [
355
+ "Ô",
356
+ -10.685594947508188
357
+ ],
358
+ [
359
+ "Ē",
360
+ -10.685594947508188
361
+ ],
362
+ [
363
+ "Ā",
364
+ -10.752261614174865
365
+ ],
366
+ [
367
+ "Á",
368
+ -10.752261614174865
369
+ ],
370
+ [
371
+ "─",
372
+ -10.98394659586001
373
+ ],
374
+ [
375
+ "Î",
376
+ -11.920490607456603
377
+ ],
378
+ [
379
+ "ḿ",
380
+ -11.920490607456603
381
+ ],
382
+ [
383
+ "‘",
384
+ -12.170490607456603
385
+ ],
386
+ [
387
+ "…",
388
+ -12.170490607456603
389
+ ],
390
+ [
391
+ "’",
392
+ -12.170490607456603
393
+ ],
394
+ [
395
+ "Ì",
396
+ -12.503823940789935
397
+ ],
398
+ [
399
+ "0",
400
+ -12.503823940789935
401
+ ],
402
+ [
403
+ "Ê",
404
+ -12.503823940789935
405
+ ],
406
+ [
407
+ "2",
408
+ -13.003823940789935
409
+ ],
410
+ [
411
+ "1",
412
+ -13.003823940789935
413
+ ],
414
+ [
415
+ "Ó",
416
+ -13.003823940789935
417
+ ],
418
+ [
419
+ "9",
420
+ -13.003823940789935
421
+ ],
422
+ [
423
+ "Ǹ",
424
+ -13.003823940789935
425
+ ],
426
+ [
427
+ "̋",
428
+ -13.003823940789935
429
+ ],
430
+ [
431
+ "Ō",
432
+ -14.003823940789935
433
+ ],
434
+ [
435
+ "Ú",
436
+ -14.003823940789935
437
+ ],
438
+ [
439
+ "3",
440
+ -14.003823940789935
441
+ ],
442
+ [
443
+ " ",
444
+ -14.003823940789935
445
+ ],
446
+ [
447
+ "4",
448
+ -14.003823940789935
449
+ ]
450
+ ],
451
+ "byte_fallback": false
452
+ }
453
+ }
demo/my-wav2vec2/tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "98": {
28
+ "content": "<pad>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": false,
38
+ "eos_token": "</s>",
39
+ "extra_special_tokens": {},
40
+ "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "<pad>",
42
+ "processor_class": "Wav2Vec2Processor",
43
+ "tokenizer_class": "PreTrainedTokenizerFast",
44
+ "unk_token": "<unk>"
45
+ }
demo/static/app.js ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
// Upload the selected audio file and show the transcription result.
async function uploadAudio() {
    const fileInput = document.getElementById('audio');
    if (!fileInput.files.length) {
        alert('請先選擇音檔!');
        return;
    }
    const formData = new FormData();
    formData.append("file", fileInput.files[0]);

    document.getElementById('result').innerText = '辨識中...';

    try {
        const response = await fetch("/transcribe", {
            method: "POST",
            body: formData
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const data = await response.json();
        document.getElementById('result').innerText =
            `辨識結果:${data.transcription}\n信心分數:${data.confidence}`;
    } catch (err) {
        // Bug fix: surface network/server failures instead of leaving the
        // UI stuck on "辨識中..." forever.
        document.getElementById('result').innerText = `辨識失敗:${err.message}`;
    }
}
// Recording state shared between the record/stop handlers.
let recorder;
let audioChunks = [];

document.getElementById("record").onclick = async () => {
    try {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

        // Pick the best container the browser supports (Safari: mp4, others: webm).
        let mimeType = "";
        if (MediaRecorder.isTypeSupported("audio/mp4")) {
            mimeType = "audio/mp4";
        } else if (MediaRecorder.isTypeSupported("audio/webm")) {
            mimeType = "audio/webm";
        }

        recorder = new MediaRecorder(stream, mimeType ? { mimeType } : {});
        audioChunks = [];

        recorder.ondataavailable = e => audioChunks.push(e.data);

        recorder.onstart = () => {
            document.getElementById("status").innerText = "錄音中...";
            document.getElementById("record").disabled = true;
            document.getElementById("stop").disabled = false;
            document.getElementById("result").innerText = "";
        };

        recorder.onstop = async () => {
            // Bug fix: label the blob with the container that was actually
            // recorded, not an unconditional "audio/webm" (Safari records mp4).
            const recordedBlob = new Blob(audioChunks, { type: recorder.mimeType || "audio/webm" });
            const arrayBuffer = await recordedBlob.arrayBuffer();
            const audioContext = new (window.AudioContext || window.webkitAudioContext)();
            const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

            // Re-encode as WAV so the backend can decode it with soundfile.
            const wavBuffer = audioBufferToWav(audioBuffer);
            const wavBlob = new Blob([wavBuffer], { type: "audio/wav" });

            const formData = new FormData();
            formData.append("file", wavBlob, "record.wav");

            document.getElementById("status").innerText = "辨識中...";

            const response = await fetch("/transcribe", {
                method: "POST",
                body: formData
            });

            const result = await response.json();
            document.getElementById("status").innerText = "辨識完成";
            document.getElementById("result").innerText =
                `辨識結果:${result.transcription}\n信心分數:${result.confidence}`;
        };

        recorder.start();
    } catch (e) {
        alert("無法存取麥克風,請確認權限。");
    }
};
// Stop recording; the actual transcription is kicked off by recorder.onstop.
document.getElementById("stop").onclick = () => {
    recorder.stop();
    // Flip the buttons back so another recording can be started.
    document.getElementById("record").disabled = false;
    document.getElementById("stop").disabled = true;
    document.getElementById("status").innerText = "停止錄音,等待辨識...";
};
// Encode an AudioBuffer as a 16-bit little-endian PCM WAV ArrayBuffer.
function audioBufferToWav(buffer) {
    const channelCount = buffer.numberOfChannels;
    const totalBytes = buffer.length * channelCount * 2 + 44;
    const out = new ArrayBuffer(totalBytes);
    const view = new DataView(out);

    function writeString(view, offset, string) {
        for (let i = 0; i < string.length; i++) {
            view.setUint8(offset + i, string.charCodeAt(i));
        }
    }

    // 44-byte RIFF/WAVE header for uncompressed PCM.
    writeString(view, 0, 'RIFF');
    view.setUint32(4, totalBytes - 8, true);
    writeString(view, 8, 'WAVE');
    writeString(view, 12, 'fmt ');
    view.setUint32(16, 16, true);                                   // fmt chunk size
    view.setUint16(20, 1, true);                                    // format = PCM
    view.setUint16(22, channelCount, true);
    view.setUint32(24, buffer.sampleRate, true);
    view.setUint32(28, buffer.sampleRate * 2 * channelCount, true); // byte rate
    view.setUint16(32, channelCount * 2, true);                     // block align
    view.setUint16(34, 16, true);                                   // bits per sample
    writeString(view, 36, 'data');
    view.setUint32(40, totalBytes - 44, true);

    const channels = [];
    for (let ch = 0; ch < channelCount; ch++) {
        channels.push(buffer.getChannelData(ch));
    }

    // Interleave channels sample-by-sample and write each sample as a
    // signed 16-bit integer, clamping floats to [-1, 1] first.
    let offset = 44;
    for (let i = 0; i < buffer.length; i++) {
        for (let ch = 0; ch < channelCount; ch++) {
            const s = Math.max(-1, Math.min(1, channels[ch][i]));
            view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
            offset += 2;
        }
    }
    return out;
}
demo/static/index.html ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8">
    <title>Wav2Vec2 台語羅馬拼音辨識 Demo</title>
    <link rel="stylesheet" href="/static/style.css">
</head>
<body>
    <h1>Wav2Vec2 台語羅馬拼音辨識 Demo</h1>
    <p>請使用手機或電腦錄音,然後上傳音檔,等待辨識結果。</p>

    <!-- File upload path -->
    <input type="file" id="audio" accept="audio/*">
    <button onclick="uploadAudio()">上傳並辨識</button>

    <hr>

    <!-- In-browser recording path -->
    <button id="record">開始錄音</button>
    <button id="stop" disabled>停止錄音</button>
    <p id="status"></p>

    <div id="result"></div>
    <script src="/static/app.js"></script>
<!-- Bug fix: the original omitted the closing body tag. -->
</body>
</html>
demo/static/style.css ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Demo page styles: centered layout with readable result text. */
body {
    font-family: sans-serif;
    margin: 40px;
    text-align: center;
}

input[type="file"] {
    margin: 20px 0;
}

button {
    padding: 10px 20px;
    margin: 5px;
}

/* white-space: pre-line preserves the \n between transcription and score. */
#result {
    margin-top: 20px;
    font-size: 1.2em;
    white-space: pre-line;
}
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ torch
4
+ transformers
5
+ torchaudio
6
+ soundfile
7
+ python-multipart