Yiru Yang committed on
Commit
5f2f308
·
0 Parent(s):

fresh LFS version for Hugging Face push

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +4 -0
  3. 1.png +3 -0
  4. README.md +26 -0
  5. configs/default_config.yaml +78 -0
  6. distil-whisper-lora-run5/training_config.json +11 -0
  7. requirements.txt +1 -0
  8. run_distillation.py +2137 -0
  9. scripts/.DS_Store +0 -0
  10. scripts/__pycache__/hyperparameter_search.cpython-312.pyc +0 -0
  11. scripts/distil-whisper-lora-run5/adapter/README.md +202 -0
  12. scripts/distil-whisper-lora-run5/adapter/adapter_config.json +68 -0
  13. scripts/distil-whisper-lora-run5/adapter/adapter_model.safetensors +3 -0
  14. scripts/distil-whisper-lora-run5/adapter/added_tokens.json +1609 -0
  15. scripts/distil-whisper-lora-run5/adapter/merges.txt +0 -0
  16. scripts/distil-whisper-lora-run5/adapter/normalizer.json +1742 -0
  17. scripts/distil-whisper-lora-run5/adapter/preprocessor_config.json +15 -0
  18. scripts/distil-whisper-lora-run5/adapter/projection.pt +3 -0
  19. scripts/distil-whisper-lora-run5/adapter/special_tokens_map.json +139 -0
  20. scripts/distil-whisper-lora-run5/adapter/tokenizer_config.json +0 -0
  21. scripts/distil-whisper-lora-run5/adapter/vocab.json +0 -0
  22. scripts/distil-whisper-lora-run5/training_config.json +11 -0
  23. scripts/distil-whisper-lora-run5/training_history.json +1 -0
  24. scripts/hyperparameter_search.py +215 -0
  25. scripts/inference.py +17 -0
  26. scripts/train.py +119 -0
  27. scripts/train.sh +52 -0
  28. scripts/train_taid.py +120 -0
  29. scripts/train_taid.sh +52 -0
  30. src/.DS_Store +0 -0
  31. src/__init__.py +1 -0
  32. src/__pycache__/__init__.cpython-311.pyc +0 -0
  33. src/__pycache__/__init__.cpython-312.pyc +0 -0
  34. src/data/__init__.py +4 -0
  35. src/data/__pycache__/__init__.cpython-311.pyc +0 -0
  36. src/data/__pycache__/__init__.cpython-312.pyc +0 -0
  37. src/data/__pycache__/dataloader.cpython-311.pyc +0 -0
  38. src/data/__pycache__/dataloader.cpython-312.pyc +0 -0
  39. src/data/__pycache__/dataset.cpython-311.pyc +0 -0
  40. src/data/__pycache__/dataset.cpython-312.pyc +0 -0
  41. src/data/dataloader.py +105 -0
  42. src/data/dataset.py +131 -0
  43. src/models/__init__.py +1 -0
  44. src/models/__pycache__/__init__.cpython-311.pyc +0 -0
  45. src/models/__pycache__/__init__.cpython-312.pyc +0 -0
  46. src/models/__pycache__/lora.cpython-311.pyc +0 -0
  47. src/models/__pycache__/lora.cpython-312.pyc +0 -0
  48. src/models/lora.py +71 -0
  49. src/trainers/__init__.py +1 -0
  50. src/trainers/__pycache__/__init__.cpython-312.pyc +0 -0
.DS_Store ADDED
Binary file (8.2 kB). View file
 
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.png filter=lfs diff=lfs merge=lfs -text
1.png ADDED

Git LFS Details

  • SHA256: fdf497f15eb44266b2d3fa3118e4d1198e83b1de4514e8387e4f75b3860cf0b6
  • Pointer size: 131 Bytes
  • Size of remote file: 129 kB
README.md ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ASR Knowledge Distillation Project
2
+
3
+ <br>
4
+
5
+ 1. The latest version of the code is in the Colab notebook linked below, not here.
6
+
7
+ 2. Training json, no need to use for the final version.
8
+
9
+ - https://drive.google.com/drive/folders/1puqQgObUfshk-NlqZa5VUnozoVCAz6bY?usp=sharing
10
+
11
+ 📍 3. Super-simple final codebase; wrap-up is not yet done.
12
+
13
+ - https://colab.research.google.com/drive/1YOhYEHmzkHj6tU195ym5W8ZZ9rCMDmh3?usp=sharing
14
+
15
+ <br>
16
+
17
+ ## Project Structure
18
+
19
+ <br>
20
+
21
+ ## Notice
22
+
23
+ 1. 在实际测试中,TAID 训练速度极慢。![训练截图](1.png) 但是显存占用不到 6GB,dataloader 的每个 batch 是以文件夹为单位。
24
+
25
+
26
+ <br>
configs/default_config.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ root_dir: /root/autodl-tmp/dataset
3
+ sample_cap: 4000
4
+ val_ratio: 0.2
5
+ batch_size: 4
6
+ max_frames: 3000
7
+ sample_rate: 16000
8
+ pin_memory: true
9
+ persistent_workers: true
10
+ prefetch_factor: 2
11
+ num_workers: 2
12
+ model:
13
+ teacher_model: openai/whisper-large-v2
14
+ student_model: distil-whisper/distil-small.en
15
+ student_hidden_dim: 768
16
+ teacher_hidden_dim: 1280
17
+ training:
18
+ max_steps: 1600
19
+ warmup_steps: 320
20
+ eval_steps: 100
21
+ log_steps: 50
22
+ batch_size: 8
23
+ grad_accum: 4
24
+ max_grad_norm: 0.5
25
+ optimizer:
26
+ lr: 3.752055855124284e-05
27
+ weight_decay: 0.01
28
+ lora:
29
+ r: 128
30
+ alpha: 32
31
+ dropout: 0.15798625466052896
32
+ target_modules:
33
+ - model.decoder.layers.0.q_proj
34
+ - model.decoder.layers.0.k_proj
35
+ - model.decoder.layers.0.v_proj
36
+ - model.decoder.layers.0.out_proj
37
+ - model.decoder.layers.1.q_proj
38
+ - model.decoder.layers.1.k_proj
39
+ - model.decoder.layers.1.v_proj
40
+ - model.decoder.layers.1.out_proj
41
+ - model.decoder.layers.2.q_proj
42
+ - model.decoder.layers.2.k_proj
43
+ - model.decoder.layers.2.v_proj
44
+ - model.decoder.layers.2.out_proj
45
+ - model.decoder.layers.3.q_proj
46
+ - model.decoder.layers.3.k_proj
47
+ - model.decoder.layers.3.v_proj
48
+ - model.decoder.layers.3.out_proj
49
+ - model.decoder.layers.4.q_proj
50
+ - model.decoder.layers.4.k_proj
51
+ - model.decoder.layers.4.v_proj
52
+ - model.decoder.layers.4.out_proj
53
+ - model.decoder.layers.5.q_proj
54
+ - model.decoder.layers.5.k_proj
55
+ - model.decoder.layers.5.v_proj
56
+ - model.decoder.layers.5.out_proj
57
+ - model.decoder.layers.0.fc1
58
+ - model.decoder.layers.0.fc2
59
+ - model.decoder.layers.1.fc1
60
+ - model.decoder.layers.1.fc2
61
+ - model.decoder.layers.2.fc1
62
+ - model.decoder.layers.2.fc2
63
+ - model.decoder.layers.3.fc1
64
+ - model.decoder.layers.3.fc2
65
+ - model.decoder.layers.4.fc1
66
+ - model.decoder.layers.4.fc2
67
+ - model.decoder.layers.5.fc1
68
+ - model.decoder.layers.5.fc2
69
+ distillation:
70
+ temperature: 2.1649165607921677
71
+ kl_weight: 0.6118528947223795
72
+ hidden_beta: 0.4184815819561255
73
+ taid:
74
+ start: 0.1
75
+ mid: 0.5
76
+ end: 0.9
77
+ output:
78
+ dir: ./distil-whisper-lora-run5
distil-whisper-lora-run5/training_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lr": 3.752055855124284e-05,
3
+ "lora_dropout": 0.15798625466052896,
4
+ "temperature": 2.1649165607921677,
5
+ "kl_weight": 0.6118528947223795,
6
+ "hidden_beta": 0.4184815819561255,
7
+ "grad_accum": 4,
8
+ "batch_size": 8,
9
+ "lora_r": 128,
10
+ "lora_alpha": 32
11
+ }
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
run_distillation.py ADDED
@@ -0,0 +1,2137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Training the Whisper model for sequence to sequence speech recognition via teacher-student distillation.
18
+ """
19
+ # You can also adapt this script for your own distillation tasks. Pointers for this are left as comments.
20
+
21
+ import logging
22
+ import os
23
+ import re
24
+ import shutil
25
+ import string
26
+ import sys
27
+ import time
28
+ from dataclasses import dataclass, field
29
+ from functools import partial
30
+ from pathlib import Path
31
+ from typing import Any, Callable, Dict, List, Optional, Union
32
+
33
+ import datasets
34
+ import evaluate
35
+ import flax
36
+ import jax
37
+ import jax.numpy as jnp
38
+ import numpy as np
39
+ import optax
40
+ import torch
41
+ import transformers
42
+ from datasets import (
43
+ DatasetDict,
44
+ IterableDataset,
45
+ IterableDatasetDict,
46
+ concatenate_datasets,
47
+ interleave_datasets,
48
+ load_dataset,
49
+ )
50
+ from flax import jax_utils, traverse_util
51
+ from flax.jax_utils import pad_shard_unpad, unreplicate
52
+ from flax.serialization import from_bytes, to_bytes
53
+ from flax.training import train_state
54
+ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
55
+ from huggingface_hub import Repository, create_repo
56
+ from jax.experimental.compilation_cache import compilation_cache as cc
57
+ from optax._src import linear_algebra
58
+ from torch.utils.data import DataLoader
59
+ from torchdata.datapipes.iter import IterableWrapper
60
+ from tqdm import tqdm
61
+ from transformers import (
62
+ AddedToken,
63
+ HfArgumentParser,
64
+ Seq2SeqTrainingArguments,
65
+ WhisperConfig,
66
+ WhisperFeatureExtractor,
67
+ WhisperProcessor,
68
+ WhisperTokenizerFast,
69
+ is_tensorboard_available,
70
+ is_wandb_available,
71
+ set_seed,
72
+ )
73
+ from transformers.file_utils import get_full_repo_name
74
+ from transformers.modeling_flax_outputs import FlaxBaseModelOutput
75
+ from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
76
+ from transformers.utils import check_min_version, send_example_telemetry
77
+ from transformers.utils.versions import require_version
78
+
79
+ from distil_whisper import FlaxWhisperForConditionalGeneration
80
+
81
+
82
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.27.0.dev0")

# Fixed typo in the hint path: "speech-recogintion" -> "speech-recognition".
require_version(
    "datasets>=1.18.0",
    "To fix: pip install -r examples/flax/speech-recognition/requirements.txt",
)

# Module-level logger; handlers/levels are expected to be configured by the entry point.
logger = logging.getLogger(__name__)
91
+
92
+
93
@flax.struct.dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Covers both the student (`model_name_or_path`) and the teacher
    (`teacher_model_name_or_path`) checkpoints used for distillation.
    """

    # Required: student checkpoint (local path or Hub model id).
    model_name_or_path: str = field(
        metadata={"help": ("Path to pretrained student model or model identifier from huggingface.co/models")}
    )
    # Required: teacher checkpoint (local path or Hub model id).
    teacher_model_name_or_path: str = field(
        metadata={"help": ("Path to pretrained teacher model or model identifier from huggingface.co/models")}
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    feature_extractor_name: Optional[str] = field(
        default=None,
        metadata={"help": "feature extractor name or path if not the same as model_name"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": ("Where to store the pretrained models downloaded from huggingface.co")},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": ("Whether to use one of the fast tokenizer (backed by the tokenizers library) or not.")},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": ("The specific model version to use (can be a branch name, tag name or commit id).")},
    )
    subfolder: str = field(
        default="",
        metadata={
            "help": "In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can"
            "specify the folder name here."
        },
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `transformers-cli login`"
                " (necessary to use this script with private models)."
            )
        },
    )
    # Weight/compute dtype as a string; valid values per the help text below.
    dtype: Optional[str] = field(
        default="float32",
        metadata={
            "help": (
                "Floating-point format in which the model weights should be initialized"
                " and trained. Choose one of `[float32, float16, bfloat16]`."
            )
        },
    )
    load_with_scan_weights: bool = field(
        default=False,
        metadata={
            "help": "Whether the pre-trained checkpoint has its weights stored in scan format. Set to True for scanned "
            "weights, defaults to False for non-scan (unrolled) weights."
        },
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout ratio for activations inside the fully connected layer."},
    )
    attention_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout ratio for the attention probabilities."},
    )
    dropout: float = field(
        default=0.0,
        metadata={
            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
        },
    )
175
+
176
+
177
@flax.struct.dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # NOTE: annotations for fields defaulting to None are Optional[...] to match their defaults.
    train_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
            " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
        },
    )
    train_dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset configs by a '+' symbol."
        },
    )
    train_dataset_samples: Optional[str] = field(
        default=None,
        metadata={
            "help": "Number of samples in the training data. Load and combine "
            "multiple datasets by separating dataset samples by a '+' symbol."
        },
    )
    eval_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
        },
    )
    eval_dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
        },
    )
    dataset_cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Path to cache directory for saving and loading datasets"},
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of"
                " training examples to this value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of"
                " evaluation examples to this value if set."
            )
        },
    )
    audio_column_name: str = field(
        default="audio",
        metadata={"help": ("The name of the dataset column containing the audio data. Defaults to 'audio'")},
    )
    train_text_column_name: str = field(
        default="whisper_transcript",
        metadata={
            "help": (
                "The name of the dataset column containing the text data. Defaults to"
                " 'whisper_transcript'which is the pseudo-labelled Whisper"
                " transcription data."
            )
        },
    )
    eval_text_column_name: str = field(
        default="text",
        metadata={
            "help": (
                "The name of the dataset column containing the text data. Defaults to"
                " 'text', which is the original text data"
            )
        },
    )
    max_duration_in_seconds: float = field(
        default=30.0,
        metadata={"help": ("Filter audio files that are longer than `max_duration_in_seconds` seconds")},
    )
    min_duration_in_seconds: float = field(
        default=0.0,
        metadata={"help": ("Filter audio files that are shorter than `min_duration_in_seconds` seconds")},
    )
    max_label_length: int = field(
        default=128,
        metadata={"help": "Truncate transcriptions that are longer `max_label_length` tokens."},
    )
    pad_target_to_multiple_of: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "If set will pad the target sequence to a multiple of the provided"
                " value. This is important to avoid triggering recompilations on TPU."
                " If unspecified, will default to padding the targets to max length."
            )
        },
    )
    preprocessing_only: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only do data preprocessing and skip training. This is"
                " especially useful when data preprocessing errors out in distributed"
                " training due to timeout. In this case, one should run the"
                " preprocessing in a non-distributed setup with"
                " `preprocessing_only=True` so that the cached datasets can"
                " consequently be loaded in distributed training"
            )
        },
    )
    train_split_name: str = field(
        default="train",
        metadata={
            "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
        },
    )
    eval_split_name: str = field(
        default="validation",
        metadata={
            "help": (
                "The name of the evaluation data set split to use (via the datasets"
                " library). Defaults to 'validation'"
            )
        },
    )
    wandb_project: str = field(
        default="distil-whisper",
        metadata={"help": "The name of the wandb project."},
    )
    wandb_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the wandb run."},
    )
    wandb_job_type: str = field(
        default="distil-whisper",
        metadata={"help": "The name of the wandb job type."},
    )
    wandb_dir: Optional[str] = field(
        default=None,
        metadata={"help": "The absolute path to save the wandb logs."},
    )
    save_code_to_wandb: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to save main script to wandb. This is valuable for improving"
                " experiment reproducibility and to diff code across experiments in"
                " the UI."
            )
        },
    )
    streaming: bool = field(
        default=True,
        metadata={"help": "Whether to use Datasets' streaming mode to load and the data."},
    )
    wer_threshold: Optional[float] = field(
        default=None,
        metadata={
            "help": "Filter training data with Whisper transcriptions that have greater than `wer_threshold` "
            "WER with the normalised transcriptions."
        },
    )
    prefetch_size: int = field(
        default=0,
        metadata={"help": "Number of samples to pre-fetch if using an iterable dataset."},
    )
    timestamp_probability: float = field(
        default=0.5, metadata={"help": "Probability for training on timestamped tokens if the data contains it."}
    )
    return_timestamps: bool = field(
        default=False, metadata={"help": "Whether or not to predict timestamps in the generation step."}
    )
    round_timestamps: bool = field(
        default=False,
        metadata={
            "help": "Whether or not to round the timestamp tokens to the nearest tenth of a second."
            "By default, Whisper predicts timestamps to the nearest hundredth of a second."
            "Reducing the timestamp precision to one tenth of a second simplifies the timestamp"
            "prediction task, at the expense of timestamp granularity."
        },
    )
376
+
377
+
378
@dataclass
class FlaxSeq2SeqTrainingArguments(Seq2SeqTrainingArguments):
    """
    Seq2Seq training arguments extended with Flax/JAX-specific options for
    knowledge distillation (scan, encoder freezing, KD loss weights,
    mixed-precision settings, compilation cache and train-state saving).
    """

    use_scan: Optional[bool] = field(
        default=True,
        metadata={
            "help": (
                "Whether or not to use `scan_with_axes` over the encoder and decoder blocks. Using scan results "
                "in faster compile times and more efficient memory use during training, since all of the layers "
                "in the encoder/decoder are stacked, and we perform a lax.scan over the stacked block to index "
                "each layer. However, it results in slower inference time due to the overhead of stacking the "
                "layers this way. Thus, we **always** default to disabling scan for the inference step."
            )
        },
    )
    freeze_encoder: Optional[bool] = field(
        default=False,
        metadata={
            "help": (
                "Whether to freeze the entire encoder model. Only recommended when the entire encoder has been "
                "copied from the teacher model."
            )
        },
    )
    temperature: Optional[float] = field(
        default=2.0, metadata={"help": "Temperature to anneal the logits when computing the softmax."}
    )
    # Fixed help text: it previously described the MSE loss (copy-paste from `mse_weight`).
    kl_weight: Optional[float] = field(
        default=1.0,
        metadata={
            "help": (
                "Weighting assigned to the KL divergence loss in the KD formulation. The KL loss is "
                "computed between the temperature-annealed teacher and student logits."
            )
        },
    )
    mse_weight: Optional[float] = field(
        default=0.0,
        metadata={
            "help": (
                "Weighting assigned to the MSE loss in the KD formulation. MSE loss is "
                "computed between the teacher-student hidden states and attentions."
            )
        },
    )
    precision: Optional[str] = field(
        default="half_mixed",
        metadata={
            "help": (
                "Precision with which run training, Can be one of `full`, `half_mixed` or `full_mixed`, the latter two"
                "of which enable *mixed-precision* training. **Note that this only specifies the dtype of the computation "
                "and optimizer state. It does not influence the dtype of model parameters.** An explanation of the three "
                "settings is provided below:"
                " 1. Full precision: forward pass, backward pass and optimiser states all in float32."
                " 2. Half mixed precision: forward pass in bfloat16, backward pass and optimiser states in float32. This "
                " corresponds to setting the dtype argument to bfloat16 when instantiating the model."
                " 3. Full mixed precision: forward pass, backward pass and optimiser states all in bfloat16. The dtype "
                " argument is set to bfloat16 for the forward pass, and the gradients computed with respect to the bfloat16 "
                " parameters in the backward pass (giving bfloat16 gradients). The new optimiser states and parameter "
                " updates are computed in float32 by upcasting the bfloat16 gradients and optimiser states to float32 "
                " prior to the optimiser update step. The optimiser states are returned in float32 (but not saved to "
                " memory) and then downcasted to bfloat16 (saved to memory) for the subsequent train step."
                "For further details, refer to https://github.com/deepmind/optax/discussions/336"
            )
        },
    )
    compilation_cache: Optional[bool] = field(
        default=False,
        metadata={
            "help": (
                "Whether to enable the JAX (experimental) compilation cache. The compilation step is *cached* the "
                "first time it is run. Successive compilation steps for the same function utilise the cache to reduce"
                "the compilation time."
            )
        },
    )
    save_train_state: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Whether or not to save the Flax Train State on each `save_steps` steps. Required if you intend"
            "to resume training from partial training runs. If False, only the model weights will be saved."
            "If True, both the model weights and Flax Train state will be saved."
        },
    )
462
+
463
def shift_tokens_right(label_ids: np.ndarray, decoder_start_token_id: int) -> np.ndarray:
    """
    Shift label ids one token to the right.

    Prepends `decoder_start_token_id` to every sequence and drops the final
    token, yielding the decoder input ids for teacher-forced training.
    (Annotation fixed: `np.array` is a function, not a type — use `np.ndarray`.)

    Args:
        label_ids: Integer array of shape (batch, seq_len) with the label token ids.
        decoder_start_token_id: Token id written to position 0 of every row.

    Returns:
        Array of the same shape as `label_ids`, with each row shifted right by one.
    """
    shifted_label_ids = np.zeros_like(label_ids)
    shifted_label_ids[:, 1:] = label_ids[:, :-1]
    shifted_label_ids[:, 0] = decoder_start_token_id

    return shifted_label_ids
472
+
473
+
474
@flax.struct.dataclass
class FlaxDataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor ([`Wav2Vec2Processor`])
            The processor used for processing the data.
        decoder_start_token_id (:obj: `int`)
            The start-of-sequence token id of the decoder.
        decoder_prev_token_id (:obj: `int`)
            The start-of-prompt token id of the decoder
        input_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned input sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        target_padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned target sequences (according to the model's padding side and padding index).
            See above for details.
        max_target_length (:obj:`int`, `optional`):
            Maximum length of the ``labels`` of the returned list and optionally padding length (see above).
    """

    processor: Any
    decoder_start_token_id: int
    decoder_prev_token_id: int
    input_padding: Union[bool, str] = "max_length"
    target_padding: Union[bool, str] = "max_length"
    max_target_length: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], np.ndarray]]]) -> Dict[str, np.ndarray]:
        """Collate a list of feature dicts into a padded batch with `labels` and `decoder_input_ids`."""
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        model_input_name = self.processor.model_input_names[0]

        # dataloader returns a list of features which we convert to a dict
        input_features = {model_input_name: [feature[model_input_name] for feature in features]}
        label_features = {"input_ids": [feature["labels"] for feature in features]}

        # reformat list to dict and set to pytorch format
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.input_padding,
            return_tensors="np",
        )

        labels_batch = self.processor.tokenizer.pad(
            label_features,
            max_length=self.max_target_length,
            padding=self.target_padding,
            return_tensors="np",
        )

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        labels = labels_batch["input_ids"]
        # When every sequence already starts with the decoder start or prompt token,
        # the labels themselves (minus the last token) serve as decoder inputs;
        # otherwise the labels are shifted right with the start token prepended.
        if set(np.unique(labels[:, 0])).issubset({self.decoder_start_token_id, self.decoder_prev_token_id}):
            decoder_input_ids = labels[:, :-1]
            labels = labels[:, 1:]
            labels_batch.attention_mask = labels_batch.attention_mask[:, 1:]
        else:
            decoder_input_ids = shift_tokens_right(labels, self.decoder_start_token_id)

        # replace padding with -100 to ignore correctly when computing the loss
        labels = np.ma.array(labels, mask=np.not_equal(labels_batch.attention_mask, 1))
        labels = labels.filled(fill_value=-100)

        # replace initial prompt tokens with -100 to ignore correctly when computing the loss
        # (bos_index is the position of the first decoder_start_token_id in each row;
        # everything before it is treated as prompt and masked out)
        bos_index = np.argmax(labels == self.decoder_start_token_id, axis=1)
        prompt_mask = np.arange(labels.shape[1]) < bos_index[:, None]
        labels = np.where(prompt_mask, -100, labels)

        batch["labels"] = labels
        batch["decoder_input_ids"] = decoder_input_ids

        return batch
554
+
555
+
556
def get_data_loader(
    seed: int,
    dataset: IterableDataset,
    batch_size: int,
    data_collator: "FlaxDataCollatorSpeechSeq2SeqWithPadding",
    shuffle: bool = True,
    drop_last: bool = True,
    dataloader_num_workers: int = 0,
    skip_batches: int = 0,
    pin_memory: bool = True,
    prefetch_size: int = 0,
) -> DataLoader:
    """
    Returns batches of size `batch_size` from `dataset`. If `drop_last` is set to `False`, the final batch may be incomplete,
    and range in size from 1 to `batch_size`. Shuffle batches if `shuffle` is `True`.

    Args:
        seed (int): Numpy seed for generating pseudo random numbers. Used if shuffling the dataset.
        dataset (IterableDataset): streaming dataset from which to load the data.
        batch_size (int): how many samples per batch to load.
        data_collator (FlaxDataCollatorSpeechSeq2SeqWithPadding, optional): merges a list of samples to form a
            mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
        shuffle (bool, optional): set to `True` to have the batches reshuffled.
        drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
            if the dataset size is not divisible by the batch size. If ``False`` and
            the size of dataset is not divisible by the batch size, then the last batch
            will be smaller. (default: ``True``)
        dataloader_num_workers (int, optional): how many subprocesses to use for data
            loading. ``0`` means that the data will be loaded in the main process.
            (default: ``0``)
        skip_batches (int, optional): Efficiently skip the first `skip_batches` batches
            (used when resuming training part-way through an epoch).
        pin_memory (bool, optional): If ``True``, the data loader will copy Tensors
            into device/CUDA pinned memory before returning them. If your data elements
            are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type,
            see the example below.
        prefetch_size (int, optional): number of samples to prefetch in the background via
            a torchdata ``IterableWrapper``. ``0`` disables prefetching. (default: ``0``)
    """
    if shuffle:
        dataset = dataset.shuffle(seed)

    # skipping is done in units of samples, hence the multiplication by batch_size
    if skip_batches > 0:
        dataset = dataset.skip(skip_batches * batch_size)

    if prefetch_size > 0:
        # requires torchdata's IterableWrapper (imported at file level)
        dataset = IterableWrapper(dataset)
        dataset = dataset.prefetch(prefetch_size)

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        drop_last=drop_last,
        pin_memory=pin_memory,
        collate_fn=data_collator,
        num_workers=dataloader_num_workers,
    )

    return data_loader
613
+
614
+
615
def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    """Return checkpoint directories under `output_dir`, sorted oldest-to-newest.

    Ordering is by the integer step suffix of the directory name
    (``{checkpoint_prefix}-<step>``), or by file modification time when `use_mtime`
    is True. Paths that match the glob but not the numeric pattern are ignored.
    """
    ordering_and_checkpoint_path = []

    # only keep directories; stray files matching the glob are skipped
    glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            # sort numerically on the step suffix; `.groups()` can never be None for a
            # successful match, so checking the match object alone is sufficient
            regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
            if regex_match is not None:
                ordering_and_checkpoint_path.append((int(regex_match.group(1)), path))

    return [path for _, path in sorted(ordering_and_checkpoint_path)]
631
+
632
+
633
def rotate_checkpoints(
    save_total_limit=None, use_mtime=False, output_dir=None, checkpoint_prefix="checkpoint"
) -> None:
    """Delete the oldest checkpoint directories in `output_dir` so that at most
    `save_total_limit` remain. A limit of ``None`` or <= 0 disables rotation."""
    if save_total_limit is None or save_total_limit <= 0:
        return

    existing = sorted_checkpoints(
        use_mtime=use_mtime, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
    )
    excess = len(existing) - save_total_limit
    if excess <= 0:
        return

    # `existing` is sorted oldest-first, so the head of the list is what we prune
    for stale in existing[:excess]:
        logger.info(f"Deleting older checkpoint [{stale}] due to args.save_total_limit")
        shutil.rmtree(stale, ignore_errors=True)
651
+
652
+
653
def to_fp32(t):
    """Upcast every bfloat16 leaf of pytree `t` to float32; other dtypes are untouched."""
    # jax.tree_map was deprecated (JAX 0.4.25) and later removed; jax.tree_util.tree_map
    # is the stable API and behaves identically.
    return jax.tree_util.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
655
+
656
+
657
def to_bf16(t):
    """Downcast every float32 leaf of pytree `t` to bfloat16; other dtypes are untouched."""
    # jax.tree_map was deprecated (JAX 0.4.25) and later removed; jax.tree_util.tree_map
    # is the stable API and behaves identically.
    return jax.tree_util.tree_map(lambda x: x.astype(jnp.bfloat16) if x.dtype == jnp.float32 else x, t)
659
+
660
+
661
class TrainState(train_state.TrainState):
    """Flax train state extended with a dropout PRNG key, global-norm gradient clipping,
    and support for keeping the optimizer state in a lower precision (bf16) dtype."""

    # PRNG key used for dropout; sharded across devices by `replicate()`
    dropout_rng: jnp.ndarray
    # threshold for global-l2-norm gradient clipping in `apply_gradients`
    max_grad_norm: float

    def apply_gradients(self, *, grads, to_dtype: Callable, **kwargs):
        """Updates `step`, `params`, `opt_state` and `**kwargs` in return value, clipping the
        gradients by the maximum grad norm.

        Note that internally this function calls `.tx.update()` followed by a call
        to `optax.apply_updates()` to update `params` and `opt_state`.

        Args:
            grads: Gradients that have the same pytree structure as `.params`.
            to_dtype: Casting function applied to the clip threshold and the new
                optimizer state (e.g. `to_bf16` for full-mixed precision, else `to_fp32`).
            **kwargs: Additional dataclass attributes that should be `.replace()`-ed.

        Returns:
            An updated instance of `self` with `step` incremented by one, `params`
            and `opt_state` updated by applying `grads`, and additional attributes
            replaced as specified by `kwargs`.
        """
        # clip gradients by global l2 norm: scale by max_norm / max(max_norm, g_norm),
        # which rescales only when the global norm exceeds the threshold
        casted_max_grad_norm = to_dtype(self.max_grad_norm)
        g_norm = linear_algebra.global_norm(grads)
        g_norm = jnp.maximum(casted_max_grad_norm, g_norm)
        grads = jax.tree_map(lambda t: (t / g_norm) * casted_max_grad_norm, grads)

        # perform update step in fp32 and subsequently downcast optimizer states if mixed precision training
        # grads and opt_state in bf16 (need to upcast), params in fp32 (leave as is)
        updates, new_opt_state = self.tx.update(to_fp32(grads), to_fp32(self.opt_state), self.params)

        new_params = optax.apply_updates(self.params, updates)

        return self.replace(
            step=self.step + 1,
            params=new_params,
            opt_state=to_dtype(new_opt_state),
            **kwargs,
        )

    @classmethod
    def create(cls, *, apply_fn, params, tx, to_dtype: Callable, **kwargs):
        """Creates a new instance with `step=0` and initialized `opt_state`."""
        # downcast optimizer state to bf16 if mixed-precision training
        opt_state = tx.init(to_dtype(params))
        return cls(
            step=0,
            apply_fn=apply_fn,
            params=params,
            tx=tx,
            opt_state=opt_state,
            **kwargs,
        )

    def replicate(self):
        """Replicate the state across local devices, sharding the dropout PRNG key."""
        return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))

    def unreplicate(self):
        """Return the per-host (unreplicated) view of the state."""
        return jax_utils.unreplicate(self)

    def save_state(self, output_dir, save_total_limit=None, checkpoint_prefix="checkpoint"):
        """Serialise the unreplicated train state to
        `{output_dir}/{checkpoint_prefix}-{step}/train_state.msgpack`, then rotate old
        checkpoints so at most `save_total_limit` remain."""
        step = int(jax.device_get(unreplicate(self.step)))
        serialized_state = to_bytes(self.unreplicate())

        output_file = Path(os.path.join(output_dir, f"{checkpoint_prefix}-{step}", "train_state.msgpack"))
        output_file.parent.mkdir(exist_ok=True, parents=True)

        with output_file.open("wb") as f:
            f.write(serialized_state)

        logger.info(f"Flax train state saved in {output_file}")
        rotate_checkpoints(
            save_total_limit=save_total_limit, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix
        )
734
+
735
+
736
def save_hf_weights(
    student_state: TrainState,
    student_model: FlaxWhisperForConditionalGeneration,
    processor: WhisperProcessor,
    output_dir: str,
    cur_step: int,
    total_train_steps: int,
    use_scan: bool = True,
    checkpoint_prefix: str = "checkpoint",
) -> None:
    """Save the student model weights and processor in Hugging Face format.

    Intermediate saves (cur_step != total_train_steps) go into a step-numbered
    checkpoint sub-directory; the final save goes directly into `output_dir`.
    """
    # scan must be disabled in both params and module so the checkpoint can be loaded
    # directly from PyTorch - this is a no-op when training without scan
    unrolled_params = student_model.convert_scan_to_unroll(unreplicate(student_state.params))
    host_params = jax.device_get(unrolled_params)
    student_model.disable_scan()

    if cur_step != total_train_steps:
        output_dir = os.path.join(output_dir, f"{checkpoint_prefix}-{cur_step}")
        os.makedirs(output_dir, exist_ok=True)

    student_model.save_pretrained(output_dir, params=host_params)
    processor.save_pretrained(output_dir)

    # restore scan if it is needed for the remainder of training
    if use_scan:
        student_model.enable_scan()
762
+
763
+
764
def write_train_metric(summary_writer, train_metrics, train_time, step, logging_steps):
    """Write accumulated training metrics and elapsed time to a tensorboard writer,
    replaying each logged value at the global step it was recorded at."""
    summary_writer.scalar("train/time", train_time, step)

    stacked_metrics = get_metrics(train_metrics)
    for key, vals in stacked_metrics.items():
        # reconstruct the global step each accumulated value corresponds to
        logged_steps = np.arange(0, step, logging_steps)[-len(vals) :]
        for logged_step, val in zip(logged_steps, vals):
            summary_writer.scalar(f"train/{key}", val, logged_step)
773
+
774
+
775
def write_eval_metric(summary_writer, eval_metrics, step, prefix="eval"):
    """Log a dict of evaluation metrics to a tensorboard writer under `prefix`."""
    for metric_name in eval_metrics:
        summary_writer.scalar(f"{prefix}/{metric_name}", eval_metrics[metric_name], step)
778
+
779
+
780
def write_wandb_metric(wandb_logger, metrics, train_time, step, epoch, prefix="train"):
    """Log a metrics dict plus elapsed time and epoch to wandb, namespaced by `prefix`."""
    payload = {f"{prefix}/{k}": v for k, v in metrics.items()}
    payload[f"{prefix}/time"] = train_time
    payload[f"{prefix}/epoch"] = epoch
    wandb_logger.log(payload, step)
787
+
788
+
789
def write_wandb_pred(
    wandb_logger, pred_str, label_str, norm_pred_str, norm_label_str, cur_step, prefix="eval", num_lines=200000
):
    """Log up to `num_lines` prediction/target pairs to wandb as two tables:
    one with every example, and one restricted to normalised mismatches."""
    # pretty name for current step: step 50000 -> step 50k
    step_tag = f"{int(cur_step // 1000)}k" if cur_step > 1000 else cur_step
    safe_prefix = prefix.replace("/", "-")

    # rows are [target, prediction, normalised target, normalised prediction]
    rows = [
        [target, prediction, norm_target, norm_prediction]
        for target, prediction, norm_target, norm_prediction in zip(
            label_str, pred_str, norm_label_str, norm_pred_str
        )
    ]

    # full table of predictions
    wandb_logger.log(
        {
            f"predictions/{safe_prefix}-step-{step_tag}": wandb_logger.Table(
                columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=rows[:num_lines]
            )
        },
        cur_step,
    )

    # keep only rows whose normalised prediction differs from the normalised target
    row_array = np.asarray(rows)
    mismatches = row_array[row_array[:, -2] != row_array[:, -1]]
    wandb_logger.log(
        {
            f"incorrect_predictions/{safe_prefix}-step-{step_tag}": wandb_logger.Table(
                columns=["Target", "Pred", "Norm Target", "Norm Pred"], data=mismatches[:num_lines]
            )
        },
        cur_step,
    )
817
+
818
+
819
def create_learning_rate_fn(
    num_train_steps: int, lr_scheduler_type: str, num_warmup_steps: int, learning_rate: float
) -> Callable[[int], jnp.array]:
    """Returns a linear warmup, linear_decay learning rate function."""
    lr_scheduler_types = ("linear", "constant_with_warmup")
    if lr_scheduler_type not in lr_scheduler_types:
        raise ValueError(
            f"lr_scheduler_type of type {lr_scheduler_type} not supported, choose from {lr_scheduler_types}."
        )

    # ramp from 0 to the peak learning rate over the warmup steps
    warmup = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)

    # "linear" decays to zero after warmup; "constant_with_warmup" keeps the LR flat
    final_value = 0 if lr_scheduler_type == "linear" else learning_rate
    decay = optax.linear_schedule(
        init_value=learning_rate,
        end_value=final_value,
        transition_steps=num_train_steps - num_warmup_steps,
    )

    return optax.join_schedules(schedules=[warmup, decay], boundaries=[num_warmup_steps])
838
+
839
+
840
def convert_dataset_str_to_list(
    dataset_names,
    dataset_config_names,
    splits=None,
    text_column_names=None,
    dataset_samples=None,
    default_split="train",
):
    """Normalise the dataset arguments into a list of per-dataset dicts.

    Each argument may be either a "+"-delimited string or a list/sequence (callers
    annotate them `Union[List, str]`; previously list inputs crashed on `.split`).
    Dataset names without an org prefix are assumed to live under `distil-whisper/`.
    Input lists are copied, never mutated.

    Args:
        dataset_names: dataset names, "+"-separated string or list.
        dataset_config_names: one config per dataset, same format.
        splits: optional per-dataset splits; defaults to `default_split`.
        text_column_names: optional per-dataset text columns; defaults to "text".
        dataset_samples: optional per-dataset sample counts (coerced to float).
        default_split: split used when `splits` is not given.

    Returns:
        list of dicts with keys "name", "config", "split", "text_column_name", "samples".

    Raises:
        ValueError: if any per-dataset argument has a different length than `dataset_names`.
    """

    def _listify(value):
        # accept "+"-delimited strings or sequences; copy sequences so callers'
        # arguments are never mutated in place
        if value is None:
            return None
        return value.split("+") if isinstance(value, str) else list(value)

    dataset_names = _listify(dataset_names)
    dataset_config_names = _listify(dataset_config_names)
    splits = _listify(splits)
    text_column_names = _listify(text_column_names)
    dataset_samples = _listify(dataset_samples)

    # we assume that all the datasets we're using derive from the distil-whisper org on the Hub - prepend the org name if necessary
    dataset_names = [name if "/" in name else f"distil-whisper/{name}" for name in dataset_names]

    # basic checks to ensure we've got the right number of datasets/configs/splits/columns/probs
    if len(dataset_names) != len(dataset_config_names):
        raise ValueError(
            f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
            f" {len(dataset_config_names)} configs."
        )

    if splits is not None and len(splits) != len(dataset_names):
        raise ValueError(
            f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
        )

    if text_column_names is not None and len(text_column_names) != len(dataset_names):
        raise ValueError(
            f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
            f" {len(text_column_names)} text column names."
        )

    if dataset_samples is not None:
        if len(dataset_samples) != len(dataset_names):
            raise ValueError(
                f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
                f"{len(dataset_samples)} samples."
            )
        dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
    else:
        dataset_samples = [None] * len(dataset_names)

    text_column_names = text_column_names if text_column_names is not None else ["text"] * len(dataset_names)
    splits = splits if splits is not None else [default_split] * len(dataset_names)

    return [
        {
            "name": name,
            "config": config,
            "split": split,
            "text_column_name": text_column_name,
            "samples": samples,
        }
        for name, config, split, text_column_name, samples in zip(
            dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
        )
    ]
906
+
907
+
908
def load_multiple_datasets(
    dataset_names: Union[List, str],
    dataset_config_names: Union[List, str],
    splits: Optional[Union[List, str]] = None,
    text_column_names: Optional[List] = None,
    sampling_rate: Optional[int] = 16000,
    stopping_strategy: Optional[str] = "first_exhausted",
    dataset_samples: Optional[Union[List, np.array]] = None,
    streaming: bool = True,
    seed: int = None,
    **kwargs,
) -> IterableDataset:
    """Load one or more datasets, resampling audio to `sampling_rate` and restricting
    each to the audio/text/transcript columns. Multiple datasets are interleaved
    (streaming) or concatenated (non-streaming); a single dataset is returned as-is."""
    dataset_names_dict = convert_dataset_str_to_list(
        dataset_names, dataset_config_names, splits, text_column_names, dataset_samples
    )

    # sampling probabilities proportional to the requested samples per dataset
    probabilities = None
    if dataset_samples is not None:
        samples = [entry["samples"] for entry in dataset_names_dict]
        probabilities = np.array(samples) / np.sum(samples)

    if len(dataset_names_dict) == 1:
        # single dataset: no interleaving required, return it directly
        only = dataset_names_dict[0]
        return load_dataset(
            only["name"],
            only["config"],
            split=only["split"],
            streaming=streaming,
            **kwargs,
        )

    all_datasets = []
    for entry in tqdm(dataset_names_dict, desc="Combining datasets..."):
        ds = load_dataset(
            entry["name"],
            entry["config"],
            split=entry["split"],
            streaming=streaming,
            **kwargs,
        )
        # resample to specified sampling rate and keep only the columns we need
        ds = ds.cast_column("audio", datasets.features.Audio(sampling_rate))
        wanted_columns = {"audio", entry["text_column_name"], "whisper_transcript"}
        ds = ds.remove_columns(set(ds.features.keys()) - wanted_columns)
        all_datasets.append(ds)

    if streaming:
        return interleave_datasets(
            all_datasets,
            stopping_strategy=stopping_strategy,
            probabilities=probabilities,
            seed=seed,
        )
    return concatenate_datasets(all_datasets)
969
+
970
+
971
def get_layers_to_supervise(student_layers: int, teacher_layers: int) -> dict:
    """Helper function to map the student layer i to the teacher layer j whose output we'd like them to emulate. Used
    for MSE loss terms in distillation (hidden-states and activations). Student layers are paired with teacher layers
    in equal increments, e.g. for a 12-layer model distilled to a 3-layer model, student layer 0 emulates teacher layer
    3 (such that it behaves like the first 4 teacher layers), student layer 1 emulates teacher layer 7, and student layer
    2 emulates teacher layer 11. This mapping is summarised by the dictionary: {0: 3, 1: 7, 2: 11}, which is precisely
    the output of this function for the arguments (student_layers=3, teacher_layers=12)."""
    # evenly spaced teacher layer indices, one per student layer
    teacher_targets = np.linspace(
        teacher_layers // student_layers - 1, teacher_layers - 1, student_layers, dtype=int
    )
    # guard against rounding: the last student layer always maps to the final teacher layer
    teacher_targets[-1] = teacher_layers - 1
    return {student: teacher for student, teacher in enumerate(teacher_targets)}
986
+
987
+
988
class FlaxWhisperFeatureExtractor(WhisperFeatureExtractor):
    def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
        """
        Compute the log-mel spectrogram of the provided audio using torch filters. Using the torch implementation
        computes stft filter banks approx 5x faster than its numpy counterpart, which is the native implementation
        in transformers, and matches to within 1e-5 abs tolerance.
        """
        audio = torch.from_numpy(waveform).type(torch.float32)

        # short-time Fourier transform -> power spectrum (drop the final frame)
        stft = torch.stft(
            audio, self.n_fft, self.hop_length, window=torch.hann_window(self.n_fft), return_complex=True
        )
        power_spectrum = stft[..., :-1].abs() ** 2

        # project onto the mel filterbank
        filters = torch.from_numpy(self.mel_filters).type(torch.float32)
        mel_spectrum = filters.T @ power_spectrum

        # log compression with dynamic-range clamping and normalisation
        log_mel = torch.clamp(mel_spectrum, min=1e-10).log10()
        log_mel = torch.maximum(log_mel, log_mel.max() - 8.0)
        log_mel = (log_mel + 4.0) / 4.0
        return log_mel.numpy()
1008
+
1009
+
1010
+ def main():
1011
+ # 1. Parse input arguments
1012
+ # See all possible arguments in src/transformers/training_args.py
1013
+ # or by passing the --help flag to this script.
1014
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
1015
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, FlaxSeq2SeqTrainingArguments))
1016
+
1017
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
1018
+ # If we pass only one argument to the script and it's the path to a json file,
1019
+ # let's parse it to get our arguments.
1020
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
1021
+ else:
1022
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
1023
+
1024
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
1025
+ # information sent is the one passed as arguments along with your JAX/Flax versions.
1026
+ send_example_telemetry("run_flax_speech_recognition_seq2seq", model_args, data_args, framework="flax")
1027
+
1028
+ # 2. Define remote logging - do this early so that we get the full traceback on our remote logs
1029
+ # Enable tensorboard only on the master node
1030
+ has_tensorboard = is_tensorboard_available()
1031
+ if has_tensorboard:
1032
+ if jax.process_index() == 0:
1033
+ try:
1034
+ from flax.metrics.tensorboard import SummaryWriter
1035
+
1036
+ summary_writer = SummaryWriter(log_dir=os.path.join(Path(training_args.output_dir), "runs"))
1037
+ except ImportError as ie:
1038
+ has_tensorboard = False
1039
+ logger.warning(
1040
+ "Unable to display metrics through TensorBoard because some package" f" are not installed: {ie}"
1041
+ )
1042
+ else:
1043
+ logger.warning(
1044
+ "Unable to display metrics through TensorBoard because the package is not"
1045
+ " installed: Please run `pip install tensorboard` to enable."
1046
+ )
1047
+
1048
+ # Enable wandb only on the master node
1049
+ has_wandb = is_wandb_available()
1050
+ if has_wandb:
1051
+ import wandb as wandb_logger
1052
+
1053
+ # Set up wandb run
1054
+ if jax.process_index() == 0:
1055
+ wandb_logger.init(
1056
+ project=data_args.wandb_project,
1057
+ name=data_args.wandb_name,
1058
+ job_type=data_args.wandb_job_type,
1059
+ dir=data_args.wandb_dir,
1060
+ save_code=data_args.save_code_to_wandb,
1061
+ )
1062
+ else:
1063
+ logger.warning("Wandb logging requires wandb to be installed. Run `pip install wandb` to enable.")
1064
+
1065
+ # 3. Setup local logging
1066
+ # Make one log on every process with the configuration for debugging.
1067
+ logging.basicConfig(
1068
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
1069
+ datefmt="%m/%d/%Y %H:%M:%S",
1070
+ handlers=[logging.StreamHandler(sys.stdout)],
1071
+ )
1072
+ # Set the verbosity to info of the Transformers logger.
1073
+ # We only want one process per machine to log things on the screen.
1074
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
1075
+ if jax.process_index() == 0:
1076
+ datasets.utils.logging.set_verbosity_warning()
1077
+ transformers.utils.logging.set_verbosity_info()
1078
+ else:
1079
+ datasets.utils.logging.set_verbosity_error()
1080
+ transformers.utils.logging.set_verbosity_error()
1081
+
1082
+ logger.info("Training/evaluation parameters %s", training_args)
1083
+
1084
+ # Check the output dir is valid
1085
+ if (
1086
+ os.path.exists(training_args.output_dir)
1087
+ and os.listdir(training_args.output_dir)
1088
+ and training_args.do_train
1089
+ and not training_args.overwrite_output_dir
1090
+ ):
1091
+ raise ValueError(
1092
+ f"Output directory ({training_args.output_dir}) already exists and is not"
1093
+ " empty. Use `--overwrite_output_dir` to overcome."
1094
+ )
1095
+
1096
+ # 4. Handle the repository creation
1097
+ if training_args.push_to_hub:
1098
+ if training_args.hub_model_id is None:
1099
+ repo_name = get_full_repo_name(
1100
+ Path(training_args.output_dir).absolute().name,
1101
+ token=training_args.hub_token,
1102
+ )
1103
+ else:
1104
+ repo_name = training_args.hub_model_id
1105
+ create_repo(repo_name, exist_ok=True, token=training_args.hub_token)
1106
+ repo = Repository(
1107
+ training_args.output_dir,
1108
+ clone_from=repo_name,
1109
+ token=training_args.hub_token,
1110
+ )
1111
+
1112
+ if training_args.compilation_cache:
1113
+ cc.initialize_cache(os.path.join(model_args.cache_dir, "jax_cache"))
1114
+
1115
+ # 5. Load dataset
1116
+ raw_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1117
+
1118
+ # set seed for determinism
1119
+ set_seed(training_args.seed)
1120
+
1121
+ if training_args.do_train:
1122
+ raw_datasets["train"] = load_multiple_datasets(
1123
+ data_args.train_dataset_name,
1124
+ data_args.train_dataset_config_name,
1125
+ splits=data_args.train_split_name,
1126
+ streaming=data_args.streaming,
1127
+ dataset_samples=data_args.train_dataset_samples,
1128
+ seed=training_args.seed,
1129
+ cache_dir=data_args.dataset_cache_dir,
1130
+ token=True if model_args.use_auth_token else None,
1131
+ )
1132
+
1133
+ if training_args.do_eval:
1134
+ dataset_names_dict = convert_dataset_str_to_list(
1135
+ data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
1136
+ (
1137
+ data_args.eval_dataset_config_name
1138
+ if data_args.eval_dataset_config_name
1139
+ else data_args.train_dataset_config_name
1140
+ ),
1141
+ splits=data_args.eval_split_name,
1142
+ text_column_names=data_args.eval_text_column_name,
1143
+ )
1144
+ all_eval_splits = []
1145
+ if len(dataset_names_dict) == 1:
1146
+ # load a single eval set
1147
+ dataset_dict = dataset_names_dict[0]
1148
+ all_eval_splits.append("eval")
1149
+ raw_datasets["eval"] = load_dataset(
1150
+ dataset_dict["name"],
1151
+ dataset_dict["config"],
1152
+ split=dataset_dict["split"],
1153
+ cache_dir=data_args.dataset_cache_dir,
1154
+ token=True if model_args.use_auth_token else None,
1155
+ streaming=data_args.streaming,
1156
+ )
1157
+ else:
1158
+ # load multiple eval sets
1159
+ for dataset_dict in dataset_names_dict:
1160
+ if dataset_dict["name"] == "esb/diagnostic-dataset":
1161
+ # for the ESB diagnostic dataset, the dataset name is effectively the config
1162
+ pretty_name = f"{dataset_dict['config']}-diagnostic/{dataset_dict['split']}"
1163
+ else:
1164
+ pretty_name = f"{dataset_dict['name'].split('/')[-1]}/{dataset_dict['split'].replace('.', '-')}"
1165
+ all_eval_splits.append(pretty_name)
1166
+ raw_datasets[pretty_name] = load_dataset(
1167
+ dataset_dict["name"],
1168
+ dataset_dict["config"],
1169
+ split=dataset_dict["split"],
1170
+ cache_dir=data_args.dataset_cache_dir,
1171
+ token=True if model_args.use_auth_token else None,
1172
+ streaming=data_args.streaming,
1173
+ )
1174
+ features = raw_datasets[pretty_name].features.keys()
1175
+ if "text" not in features:
1176
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].rename_column(
1177
+ dataset_dict["text_column_name"], "text"
1178
+ )
1179
+ raw_datasets[pretty_name] = raw_datasets[pretty_name].remove_columns(
1180
+ set(raw_datasets[pretty_name].features.keys()) - {"audio", "text"}
1181
+ )
1182
+
1183
+ if not training_args.do_train and not training_args.do_eval:
1184
+ raise ValueError(
1185
+ "Cannot not train and not do evaluation. At least one of training or evaluation has to be performed."
1186
+ )
1187
+
1188
+ raw_datasets_train_features = list(raw_datasets["train"].features.keys())
1189
+
1190
+ if data_args.audio_column_name not in raw_datasets_train_features:
1191
+ raise ValueError(
1192
+ f"--audio_column_name '{data_args.audio_column_name}' not found in dataset"
1193
+ f" '{data_args.dataset_name}'. Make sure to set `--audio_column_name` to"
1194
+ " the correct audio column - one of"
1195
+ f" {', '.join(raw_datasets_train_features)}."
1196
+ )
1197
+
1198
+ if data_args.train_text_column_name not in raw_datasets_train_features:
1199
+ raise ValueError(
1200
+ f"--train_text_column_name {data_args.train_text_column_name} not found in dataset"
1201
+ f" '{data_args.dataset_name}'. Make sure to set `--train_text_column_name` to the"
1202
+ " correct text column - one of"
1203
+ f" {', '.join(raw_datasets_train_features)}."
1204
+ )
1205
+
1206
+ # 6. Load pretrained model, tokenizer, and feature extractor
1207
+ config = WhisperConfig.from_pretrained(
1208
+ (model_args.config_name if model_args.config_name else model_args.model_name_or_path),
1209
+ cache_dir=model_args.cache_dir,
1210
+ revision=model_args.model_revision,
1211
+ token=True if model_args.use_auth_token else None,
1212
+ )
1213
+ feature_extractor = FlaxWhisperFeatureExtractor.from_pretrained(
1214
+ (model_args.feature_extractor_name if model_args.feature_extractor_name else model_args.model_name_or_path),
1215
+ cache_dir=model_args.cache_dir,
1216
+ revision=model_args.model_revision,
1217
+ token=True if model_args.use_auth_token else None,
1218
+ )
1219
+ tokenizer = WhisperTokenizerFast.from_pretrained(
1220
+ (model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path),
1221
+ cache_dir=model_args.cache_dir,
1222
+ use_fast=model_args.use_fast_tokenizer,
1223
+ revision=model_args.model_revision,
1224
+ token=True if model_args.use_auth_token else None,
1225
+ )
1226
+
1227
+ # override timestamp tokens until tokenizer issues are fixed in transformers
1228
+ timestamps = [AddedToken("<|%.2f|>" % (i * 0.02), lstrip=False, rstrip=False) for i in range(1500 + 1)]
1229
+ tokenizer.add_tokens(timestamps)
1230
+
1231
+ config.update(
1232
+ {
1233
+ "activation_dropout": model_args.activation_dropout,
1234
+ "attention_dropout": model_args.attention_dropout,
1235
+ "dropout": model_args.dropout,
1236
+ }
1237
+ )
1238
+
1239
+ if training_args.precision == "full_mixed":
1240
+ # forward pass, backward pass and optimiser states in bf16
1241
+ dtype = jnp.bfloat16
1242
+ to_dtype = to_bf16
1243
+ elif training_args.precision == "half_mixed" or model_args.dtype == "bfloat16":
1244
+ # forward pass in bf16, backward pass and optimiser states in fp32
1245
+ dtype = jnp.bfloat16
1246
+ to_dtype = to_fp32
1247
+ else:
1248
+ if training_args.precision != "full":
1249
+ raise ValueError(
1250
+ f"`precision` should be one of: `full`, `half_mixed` or `full_mixed`, got {training_args.precision}"
1251
+ )
1252
+ # forward pass, backward pass and optimiser states in fp32
1253
+ dtype = jnp.float32
1254
+ to_dtype = to_fp32
1255
+
1256
+ student_model, student_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1257
+ model_args.model_name_or_path,
1258
+ config=config,
1259
+ dtype=dtype,
1260
+ cache_dir=model_args.cache_dir,
1261
+ revision=model_args.model_revision,
1262
+ subfolder=model_args.subfolder,
1263
+ token=True if model_args.use_auth_token else None,
1264
+ _do_init=False,
1265
+ use_scan=model_args.load_with_scan_weights,
1266
+ )
1267
+
1268
+ teacher_model, teacher_params = FlaxWhisperForConditionalGeneration.from_pretrained(
1269
+ model_args.teacher_model_name_or_path,
1270
+ # config=config,
1271
+ dtype=dtype,
1272
+ cache_dir=model_args.cache_dir,
1273
+ # revision=model_args.model_revision,
1274
+ token=True if model_args.use_auth_token else None,
1275
+ _do_init=False,
1276
+ )
1277
+
1278
+ if student_model.config.decoder_start_token_id is None or teacher_model.config.decoder_start_token_id is None:
1279
+ raise ValueError(
1280
+ f"Make sure that `config.decoder_start_token_id` is correctly defined for both the "
1281
+ f"student and teacher model. Got {student_model.config.decoder_start_token_id} for the "
1282
+ f"student and {teacher_model.config.decoder_start_token_id} for the teacher."
1283
+ )
1284
+
1285
+ # enable scan / gradient checkpointing if necessary
1286
+ if training_args.use_scan:
1287
+ student_model.enable_scan() # to enable scan in the nn.Module
1288
+ student_params = student_model.convert_unroll_to_scan(student_params) # to convert the unrolled params to scan
1289
+
1290
+ teacher_model.enable_scan() # faster compile time (even though we don't train the teacher)
1291
+ teacher_params = teacher_model.convert_unroll_to_scan(teacher_params)
1292
+
1293
+ if training_args.gradient_checkpointing:
1294
+ student_model.enable_gradient_checkpointing() # to enable checkpointing in the nn.Module, there is no change to the params structure
1295
+ teacher_model.enable_gradient_checkpointing()
1296
+
1297
+ if hasattr(teacher_model.generation_config, "is_multilingual") and teacher_model.generation_config.is_multilingual:
1298
+ # We need to set the language and task ids for previously multilingual checkpoints - for now we hardcode this to English
1299
+ tokenizer.set_prefix_tokens(language="English", task="transcribe", predict_timestamps=False)
1300
+ student_model.generation_config.update(
1301
+ **{
1302
+ "language": "<|en|>",
1303
+ "task": "transcribe",
1304
+ }
1305
+ )
1306
+
1307
+ # 7. Resample speech dataset: `datasets` takes care of automatically loading and resampling the audio,
1308
+ # so we just need to set the correct target sampling rate.
1309
+ raw_datasets = raw_datasets.cast_column(
1310
+ data_args.audio_column_name,
1311
+ datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate),
1312
+ )
1313
+
1314
+ # 8. Preprocessing the datasets.
1315
+ # We need to read the audio files as arrays and tokenize the targets.
1316
+ max_input_length = int(data_args.max_duration_in_seconds * feature_extractor.sampling_rate)
1317
+ min_input_length = int(data_args.min_duration_in_seconds * feature_extractor.sampling_rate)
1318
+ max_label_length = (
1319
+ data_args.max_label_length if data_args.max_label_length is not None else student_model.config.max_length
1320
+ )
1321
+ audio_column_name = data_args.audio_column_name
1322
+ num_workers = data_args.preprocessing_num_workers
1323
+ dataloader_num_workers = training_args.dataloader_num_workers
1324
+ dataloader_prefetch_size = data_args.prefetch_size
1325
+ train_text_column_name = data_args.train_text_column_name
1326
+ eval_text_column_name = "text"
1327
+ model_input_name = feature_extractor.model_input_names[0]
1328
+ normalizer = EnglishTextNormalizer(tokenizer.english_spelling_normalizer)
1329
+ wer_threshold = data_args.wer_threshold
1330
+ round_timestamps = data_args.round_timestamps
1331
+
1332
+ if training_args.do_train and data_args.max_train_samples is not None:
1333
+ raw_datasets["train"] = (
1334
+ raw_datasets["train"].take(data_args.max_train_samples)
1335
+ if data_args.streaming
1336
+ else raw_datasets["train"].select(range(data_args.max_train_samples))
1337
+ )
1338
+
1339
+ if training_args.do_eval and data_args.max_eval_samples is not None:
1340
+ for eval_split in all_eval_splits:
1341
+ raw_datasets[eval_split] = (
1342
+ raw_datasets[eval_split].take(data_args.max_eval_samples)
1343
+ if data_args.streaming
1344
+ else raw_datasets[eval_split].select(range(data_args.max_eval_samples))
1345
+ )
1346
+
1347
def is_wer_in_range(ground_truth, whisper_transcript):
    """Return True when the pseudo-label's WER against the reference is below the threshold."""
    norm_ground_truth = normalizer(ground_truth)
    # guard: with an empty normalised reference or a missing transcript no
    # meaningful WER can be computed, so the sample is filtered out
    if len(norm_ground_truth) == 0 or whisper_transcript is None:
        return False
    norm_whisper_transcript = normalizer(whisper_transcript)
    wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
    return wer < wer_threshold
1356
+
1357
+ filter_by_wer_threshold = partial(
1358
+ raw_datasets["train"].filter,
1359
+ function=is_wer_in_range,
1360
+ input_columns=[eval_text_column_name, train_text_column_name],
1361
+ )
1362
+
1363
+ if wer_threshold is not None:
1364
+ raw_datasets["train"] = (
1365
+ filter_by_wer_threshold(num_proc=num_workers, desc="filtering train dataset by wer")
1366
+ if not data_args.streaming
1367
+ else filter_by_wer_threshold()
1368
+ )
1369
+
1370
def has_timestamp_tokens(input_str):
    """Return True if *input_str* contains a timestamp-style token.

    Whisper timestamp tokens have the form ``<|0.00|>``; we approximate the
    check by searching for any pair of left and right-angle brackets.
    """
    # raw string: the original "\<[^\>]*\>" used invalid escape sequences,
    # which raise a SyntaxWarning on Python 3.12+ (and will become an error)
    return bool(re.search(r"<[^>]*>", input_str))
1376
+
1377
def round_timestamp_tokens(input_str: str, ndigits: int = 1):
    """Round every timestamp token in *input_str* to *ndigits* decimal places.

    With the default ``ndigits=1``, ``<|6.24|>`` becomes ``<|6.20|>``: the time
    value is rounded to one decimal but re-rendered with two decimal places so
    the token keeps the canonical Whisper ``<|x.xx|>`` format.
    """
    # raw string: the original "\<[^\>]*\>" used invalid escape sequences,
    # which raise a SyntaxWarning on Python 3.12+ (and will become an error)
    timestamps = re.findall(r"<[^>]*>", input_str, re.DOTALL)
    for token in timestamps:
        # extract time digits from timestamp token, e.g. <|6.24|> to 6.24
        time_digit = token[2:-2]
        # round to specified number of digits, e.g. 6.24 to 6.2
        time_digit = round(float(time_digit), ndigits=ndigits)
        # replace in original string with the same precision, e.g. <|6.24|> to <|6.20|>
        input_str = input_str.replace(token, "<|{:.2f}|>".format(time_digit))
    return input_str
1387
+
1388
def prepare_train_dataset(batch):
    """Featurise the audio and tokenise the (pseudo-labelled) target text for one train sample."""
    # process audio input
    sample = batch[audio_column_name]
    inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch[model_input_name] = inputs.get(model_input_name)[0]
    # raw number of audio samples, used downstream to filter by duration
    batch["input_length"] = len(sample["array"])

    # process text targets
    input_str = batch[train_text_column_name]

    # prompt & timestamp processing: for now, we only do one or the other
    if input_str.startswith("<|startoftranscript|>") or input_str.startswith("<|startofprev|>"):
        # prompted target text already has special ids added, so don't add them here
        batch["labels"] = tokenizer(input_str, add_special_tokens=False).input_ids
        return batch

    has_timestamps = has_timestamp_tokens(input_str)

    if has_timestamps:
        # keep the timestamps for a random subset of samples, governed by timestamp_probability
        predict_timestamps = bool(np.random.binomial(1, data_args.timestamp_probability))
        if not predict_timestamps:
            # filter timestamp token ids if not part of the prediction task
            input_str = tokenizer._filter_timestamp_ids(input_str)
        elif round_timestamps:
            input_str = round_timestamp_tokens(input_str)
    else:
        predict_timestamps = False

    # NOTE(review): prefix tokens hardcode English transcription here — confirm
    # this matches the language of the training data
    tokenizer.set_prefix_tokens(language="English", task="transcribe", predict_timestamps=predict_timestamps)
    input_ids = tokenizer(input_str).input_ids
    batch["labels"] = input_ids
    return batch
1420
+
1421
def prepare_eval_dataset(batch):
    """Featurise the audio and tokenise the reference text for one eval sample."""
    # log-mel input features for the model
    audio = batch[audio_column_name]
    features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    batch[model_input_name] = features.get(model_input_name)[0]
    # raw sample count, used later to filter by audio length
    batch["input_length"] = len(audio["array"])
    # tokenise the reference transcription (special tokens included)
    reference = batch[eval_text_column_name]
    batch["labels"] = tokenizer(reference).input_ids
    return batch
1433
+
1434
+ vectorized_datasets = IterableDatasetDict() if data_args.streaming else DatasetDict()
1435
+ if training_args.do_train:
1436
+ map_fn_train = partial(
1437
+ raw_datasets["train"].map, function=prepare_train_dataset, remove_columns=raw_datasets_train_features
1438
+ )
1439
+ vectorized_datasets["train"] = (
1440
+ map_fn_train(num_proc=num_workers, desc="preprocess train dataset")
1441
+ if not data_args.streaming
1442
+ else map_fn_train()
1443
+ )
1444
+ if training_args.do_eval:
1445
+ for eval_split in all_eval_splits:
1446
+ raw_datasets_eval_features = list(raw_datasets[eval_split].features.keys())
1447
+ map_fn_eval = partial(
1448
+ raw_datasets[eval_split].map, function=prepare_eval_dataset, remove_columns=raw_datasets_eval_features
1449
+ )
1450
+ vectorized_datasets[eval_split] = (
1451
+ map_fn_eval(num_proc=num_workers, desc="preprocess eval dataset")
1452
+ if not data_args.streaming
1453
+ else map_fn_eval()
1454
+ )
1455
+
1456
+ # filter training data with inputs longer than max_input_length
1457
def is_audio_in_length_range(length):
    """Keep only samples whose raw audio length lies strictly within the configured bounds."""
    return length > min_input_length and length < max_input_length
1459
+
1460
+ filter_by_audio_fn = partial(
1461
+ vectorized_datasets.filter, function=is_audio_in_length_range, input_columns=["input_length"]
1462
+ )
1463
+ vectorized_datasets = (
1464
+ filter_by_audio_fn(num_proc=num_workers, desc="filtering train dataset by audio length")
1465
+ if not data_args.streaming
1466
+ else filter_by_audio_fn()
1467
+ )
1468
+
1469
+ # filter training data with labels longer than max_label_length
1470
def is_labels_in_length_range(labels):
    """Keep only samples with a non-empty label sequence shorter than the max label length."""
    n_tokens = len(labels)
    return n_tokens > 0 and n_tokens < max_label_length
1472
+
1473
+ filter_by_labels_fn = partial(
1474
+ vectorized_datasets.filter, function=is_labels_in_length_range, input_columns=["labels"]
1475
+ )
1476
+ vectorized_datasets = (
1477
+ filter_by_labels_fn(num_proc=num_workers, desc="filtering train dataset")
1478
+ if not data_args.streaming
1479
+ else filter_by_labels_fn()
1480
+ )
1481
+
1482
+ # for large datasets it is advised to run the preprocessing on a
1483
+ # single machine first with `args.preprocessing_only` since there will mostly likely
1484
+ # be a timeout when running the script in distributed mode.
1485
+ # In a second step `args.preprocessing_only` can then be set to `False` to load the
1486
+ # cached dataset
1487
+ if data_args.preprocessing_only:
1488
+ cache = {k: v.cache_files for k, v in vectorized_datasets.items()}
1489
+ logger.info(f"Data preprocessing finished. Files cached at {cache}.")
1490
+ return
1491
+
1492
+ # 8. Load Metric
1493
+ metric = evaluate.load("wer")
1494
+ # convention is that we space all punctuation *except* apostrophes
1495
+ all_punctuation = list(string.punctuation.replace("'", ""))
1496
+ return_timestamps = data_args.return_timestamps if data_args.timestamp_probability > 0 else False
1497
+
1498
def compute_metrics(preds, labels):
    """Compute orthographic and normalised WER for a batch of predictions.

    Returns the metrics dict plus the (filtered) decoded prediction/label
    strings in both raw and normalised form, for logging.
    """
    # replace padded labels by the padding token
    for idx in range(len(labels)):
        labels[idx][labels[idx] == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True, decode_with_timestamps=return_timestamps)
    # we do not want to group tokens when computing the metrics
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # space punctuation for orthographic WER (c.f. ESB paper https://arxiv.org/abs/2210.13352)
    # NOTE(review): each element below spaces only ONE punctuation mark, and the lists
    # have len(pred_str) * len(all_punctuation) entries — verify this is the intended
    # formulation rather than applying every replacement to each string
    spaced_pred_str = [
        pred_str[i].replace(punctuation, f" {punctuation} ")
        for punctuation in all_punctuation
        for i in range(len(pred_str))
    ]
    spaced_label_str = [
        label_str[i].replace(punctuation, f" {punctuation} ")
        for punctuation in all_punctuation
        for i in range(len(label_str))
    ]
    wer_ortho = 100 * metric.compute(predictions=spaced_pred_str, references=spaced_label_str)

    # normalize everything and re-compute the WER
    norm_pred_str = [normalizer(pred) for pred in pred_str]
    norm_label_str = [normalizer(label) for label in label_str]
    # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
    pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
    label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
    # filtering step to only evaluate the samples that correspond to non-zero normalized references:
    norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
    norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]

    wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)

    return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
1533
+
1534
+ # 9. Save feature extractor, tokenizer, config and generation config
1535
+ feature_extractor.save_pretrained(training_args.output_dir)
1536
+ tokenizer.save_pretrained(training_args.output_dir)
1537
+ config.save_pretrained(training_args.output_dir)
1538
+ student_model.generation_config.save_pretrained(
1539
+ training_args.output_dir
1540
+ ) # generation config stays bound to model to make it easy to jit
1541
+
1542
+ processor = WhisperProcessor.from_pretrained(training_args.output_dir)
1543
+
1544
+ data_collator = FlaxDataCollatorSpeechSeq2SeqWithPadding(
1545
+ processor=processor,
1546
+ decoder_start_token_id=student_model.config.decoder_start_token_id, # <|startoftranscript|>
1547
+ decoder_prev_token_id=tokenizer.all_special_ids[-3], # <|startofprev|>
1548
+ input_padding="longest",
1549
+ target_padding="max_length",
1550
+ max_target_length=max_label_length,
1551
+ )
1552
+
1553
+ # Initialize our training
1554
+ rng = jax.random.PRNGKey(training_args.seed)
1555
+ rng, dropout_rng = jax.random.split(rng)
1556
+
1557
+ # Store some constants
1558
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
1559
+ gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1560
+ per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1561
+ eval_batch_size = per_device_eval_batch_size * jax.device_count()
1562
+
1563
+ if not data_args.streaming and training_args.max_steps < 0:
1564
+ num_epochs = int(training_args.num_train_epochs)
1565
+ steps_per_epoch = len(vectorized_datasets["train"]) // train_batch_size
1566
+ total_train_steps = steps_per_epoch * num_epochs
1567
+ elif training_args.max_steps > 0:
1568
+ logger.info("max_steps is given, it will override any value given in num_train_epochs")
1569
+ total_train_steps = int(training_args.max_steps)
1570
+ # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1571
+ num_epochs = sys.maxsize
1572
+ steps_per_epoch = total_train_steps
1573
+ else:
1574
+ raise ValueError("max_steps must be specified when training with a streaming (iterable) dataset")
1575
+
1576
+ if training_args.eval_steps is None:
1577
+ logger.info(
1578
+ f"eval_steps is not set, evaluating at the end of {'each epoch' if not data_args.streaming else 'training'}"
1579
+ )
1580
+ eval_steps = steps_per_epoch
1581
+ else:
1582
+ eval_steps = training_args.eval_steps
1583
+
1584
+ # Create learning rate schedule
1585
+ linear_decay_lr_schedule_fn = create_learning_rate_fn(
1586
+ total_train_steps * gradient_accumulation_steps,
1587
+ training_args.lr_scheduler_type,
1588
+ training_args.warmup_steps * gradient_accumulation_steps,
1589
+ training_args.learning_rate,
1590
+ )
1591
+
1592
+ # We use Optax's "masking" functionality to not apply weight decay
1593
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
1594
+ # mask boolean with the same structure as the parameters.
1595
+ # The mask is True for parameters that should be decayed.
1596
def decay_mask_fn(params):
    """Build a boolean pytree marking which parameters receive weight decay.

    The mask is True for parameters that should be decayed and False for
    biases and LayerNorm scale parameters, which are conventionally excluded
    from weight decay.
    """
    flat_params = traverse_util.flatten_dict(params)
    # find out all LayerNorm parameters
    layer_norm_candidates = [
        "layer_norm",
        "self_attn_layer_norm",
        "final_layer_norm",
        "encoder_attn_layer_norm",
    ]
    # collect the trailing two path components of every LayerNorm parameter,
    # e.g. ("self_attn_layer_norm", "scale")
    layer_norm_named_params = {
        layer[-2:]
        for layer_norm_name in layer_norm_candidates
        for layer in flat_params.keys()
        if layer_norm_name in "".join(layer).lower()
    }
    # decay everything that is neither a bias nor a LayerNorm parameter
    flat_mask = {path: path[-1] != "bias" and path[-2:] not in layer_norm_named_params for path in flat_params}
    return traverse_util.unflatten_dict(flat_mask)
1613
+
1614
+ # create adam optimizer
1615
+ adamw = optax.adamw(
1616
+ learning_rate=linear_decay_lr_schedule_fn,
1617
+ b1=training_args.adam_beta1,
1618
+ b2=training_args.adam_beta2,
1619
+ eps=training_args.adam_epsilon,
1620
+ weight_decay=training_args.weight_decay,
1621
+ mask=decay_mask_fn,
1622
+ )
1623
+
1624
+ if gradient_accumulation_steps > 1:
1625
+ # accumulate gradients and apply once every k steps
1626
+ adamw = optax.MultiSteps(adamw, every_k_schedule=gradient_accumulation_steps)
1627
+
1628
+ share_hidden_states = training_args.freeze_encoder and student_model.config.d_model == teacher_model.config.d_model
1629
+ encoder_layer_mapping = get_layers_to_supervise(
1630
+ student_model.config.encoder_layers, teacher_model.config.encoder_layers
1631
+ )
1632
+ decoder_layer_mapping = get_layers_to_supervise(
1633
+ student_model.config.decoder_layers, teacher_model.config.decoder_layers
1634
+ )
1635
+
1636
+ # Setup train state
1637
+ student_state = TrainState.create(
1638
+ apply_fn=student_model.decode if share_hidden_states else student_model.__call__,
1639
+ params=student_params,
1640
+ tx=adamw,
1641
+ to_dtype=to_dtype,
1642
+ dropout_rng=dropout_rng,
1643
+ max_grad_norm=training_args.max_grad_norm,
1644
+ )
1645
+
1646
+ if training_args.resume_from_checkpoint is not None:
1647
+ if os.path.isfile(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")):
1648
+ logger.info(
1649
+ f"Checkpoint detected, resuming training at {training_args.resume_from_checkpoint}. To avoid "
1650
+ "this behavior, omit the resume_from_checkpoint argument."
1651
+ )
1652
+ with Path(os.path.join(training_args.resume_from_checkpoint, "train_state.msgpack")).open("rb") as f:
1653
+ student_state = from_bytes(student_state, f.read())
1654
+ else:
1655
+ logger.warning(
1656
+ f"Checkpoint {training_args.resume_from_checkpoint} not detected, training from scratch. Ensure "
1657
+ f"you pass the path to a folder with a valid checkpoint for your model."
1658
+ )
1659
+
1660
def cross_entropy_loss(logits, labels):
    """Masked cross-entropy between student logits and target token ids.

    Returns the summed (un-normalised) loss together with the number of
    non-padding labels, so the caller can normalise after a cross-device psum.
    """
    vocab_size = logits.shape[-1]
    # optax onehot always returns a float32 device array, need to downcast if performing mixed precision training
    onehot_targets = to_dtype(onehot(labels, vocab_size))
    loss = optax.softmax_cross_entropy(logits, onehot_targets)
    # ignore padded tokens from loss, i.e. where labels are not set to -100
    padding = labels >= 0
    loss = loss * padding
    loss = loss.sum()
    num_labels = padding.sum()
    return loss, num_labels
1671
+
1672
+ # temperature smoothed kl-divergence
1673
def kl_divergence(target_distribution, log_predicted_distribution, labels, eps=1e-20):
    """KL(target || predicted) summed over non-padded positions.

    *target_distribution* is a probability distribution (teacher softmax) and
    *log_predicted_distribution* is in log-space (student log-softmax); *eps*
    guards the log against zero-probability teacher entries.
    """
    divergence = -target_distribution * (log_predicted_distribution - jnp.log(target_distribution + eps))
    # ignore padded tokens from divergence, i.e. where labels are not set to -100
    padding_mask = labels >= 0
    padding_mask = jnp.expand_dims(padding_mask, axis=-1)
    divergence = (divergence * padding_mask).sum()
    return to_dtype(divergence)  # respect the dtype of the backprop
1680
+
1681
def mean_square_error_loss(student_outputs, teacher_outputs):
    """Layer-wise MSE between teacher and student hidden-states.

    Uses the pre-computed student-to-teacher layer mappings for encoder and
    decoder, offsetting indices by one to skip the embedding hidden-state.
    Attention-map supervision is present but currently commented out.
    """
    mse = dtype(0.0)

    # tie encoder embeddings
    mse += jnp.mean(
        jnp.square(teacher_outputs.encoder_hidden_states[0] - student_outputs.encoder_hidden_states[0])
    )

    for student_layer_id, teacher_layer_id in encoder_layer_mapping.items():
        # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
        student_hidden_state = student_outputs.encoder_hidden_states[student_layer_id + 1]
        teacher_hidden_state = teacher_outputs.encoder_hidden_states[teacher_layer_id + 1]
        mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))

        # student_attention = student_outputs.encoder_attentions[student_layer_id]
        # teacher_attention = teacher_outputs.encoder_attentions[teacher_layer_id]
        # mse += jnp.mean(jnp.square(student_attention - teacher_attention))

    # tie decoder embeddings
    mse += jnp.mean(
        jnp.square(teacher_outputs.decoder_hidden_states[0] - student_outputs.decoder_hidden_states[0])
    )

    for student_layer_id, teacher_layer_id in decoder_layer_mapping.items():
        # offset the hidden-state layer ids by 1 to account for the extra embedding hidden-state
        student_hidden_state = student_outputs.decoder_hidden_states[student_layer_id + 1]
        teacher_hidden_state = teacher_outputs.decoder_hidden_states[teacher_layer_id + 1]
        mse += jnp.mean(jnp.square(teacher_hidden_state - student_hidden_state))

        # student_attention = student_outputs.decoder_attentions[student_layer_id]
        # teacher_attention = teacher_outputs.decoder_attentions[teacher_layer_id]
        # mse += jnp.mean(jnp.square(student_attention - teacher_attention))

        # student_cross_attention = student_outputs.cross_attentions[student_layer_id]
        # teacher_cross_attention = teacher_outputs.cross_attentions[teacher_layer_id]
        # mse += jnp.mean(jnp.square(student_cross_attention - teacher_cross_attention))

    return to_dtype(mse)  # respect the dtype of the backprop
1719
+
1720
+ # Define gradient update step fn
1721
def train_step(
    student_state,
    teacher_params,
    batch,
    freeze_encoder,
    share_hidden_states,
    temperature=2.0,
):
    """One pmapped distillation update step.

    Computes the weighted CE + KL (+ optional MSE) loss, all-reduces loss and
    gradients over the "batch" pmap axis, and applies the optimiser update.
    Returns the new train state and a dict of scalar metrics for logging.
    """
    dropout_rng, new_dropout_rng = jax.random.split(student_state.dropout_rng)

    def compute_loss(student_params):
        # loss on one sharded minibatch, differentiated w.r.t. the student params
        labels = batch.pop("labels")
        # hidden states are only needed when the MSE term is active and the
        # student cannot simply reuse the teacher's encoder outputs
        output_hidden_states = not share_hidden_states and training_args.mse_weight > 0.0

        teacher_outputs = teacher_model(
            **batch,
            params=teacher_params,
            freeze_encoder=True,
            output_hidden_states=output_hidden_states,
            train=False,
        )

        if share_hidden_states:
            # if the student and teacher share the same frozen encoder then we don't have to recompute the
            # encoder hidden-states for the student model, we can just re-use from the teacher
            encoder_hidden_states = jax.lax.stop_gradient(teacher_outputs.encoder_last_hidden_state)
            encoder_outputs = FlaxBaseModelOutput(last_hidden_state=encoder_hidden_states)

            student_outputs = student_state.apply_fn(
                decoder_input_ids=batch["decoder_input_ids"],
                encoder_outputs=encoder_outputs,
                params=student_params,
                dropout_rng=dropout_rng,
                train=True,
            )
        else:
            # do the full forward pass for the student model (encoder + decoder)
            student_outputs = student_state.apply_fn(
                **batch,
                params=student_params,
                dropout_rng=dropout_rng,
                freeze_encoder=freeze_encoder,
                output_hidden_states=output_hidden_states,
                train=True,
            )

        # CE (data) loss
        ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)

        # rescale by temperature to ensure gradients scale correctly
        teacher_distribution = jax.nn.softmax(teacher_outputs.logits / temperature, axis=-1)
        # ensure no information flow backwards through teacher
        teacher_distribution = jax.lax.stop_gradient(teacher_distribution)
        # log softmax of student predictions for numerical stability
        student_distribution = jax.nn.log_softmax(student_outputs.logits / temperature, axis=-1)
        # KL-divergence loss (scaled by temperature)
        kl_loss = kl_divergence(teacher_distribution, student_distribution, labels) * temperature**2

        # MSE loss between enc-dec hidden-states and attentions
        mse_loss = (
            mean_square_error_loss(student_outputs, teacher_outputs)
            if output_hidden_states
            else jnp.zeros_like(kl_loss)
        )

        # use DistilBart formulation - only tune the MSE weight and take remaining HPs from DistilBERT
        ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
        loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss

        return loss, (
            ce_loss,
            kl_loss,
            mse_loss,
            num_labels,
        )

    grad_fn = jax.value_and_grad(compute_loss, has_aux=True)
    # params are cast to the compute dtype before differentiation (mixed precision)
    (loss, (ce_loss, kl_loss, mse_loss, num_labels)), grad = grad_fn(to_dtype(student_state.params))

    # true loss = total loss / total samples
    loss = jax.lax.psum(loss, "batch")
    num_labels = jax.lax.psum(num_labels, "batch")
    loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)

    # true grad = total grad / total samples
    grad = jax.lax.psum(grad, "batch")
    grad = jax.tree_util.tree_map(lambda x: x / num_labels, grad)
    new_state = student_state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng, to_dtype=to_dtype)

    # CE/KL/MSE losses for logging
    ce_loss = jax.lax.psum(ce_loss, "batch")
    ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)

    kl_loss = jax.lax.psum(kl_loss, "batch")
    kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)

    mse_loss = jax.lax.psum(mse_loss, "batch")
    mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)

    metrics = {
        "loss": loss,
        "learning_rate": linear_decay_lr_schedule_fn(student_state.step),
        "ce_loss": ce_loss,
        "kl_loss": kl_loss,
        "mse_loss": mse_loss,
    }
    return new_state, metrics
1828
+
1829
+ # Define eval fn
1830
def eval_step(student_params, teacher_params, batch):
    """One pmapped evaluation step: CE + KL (+ optional MSE) losses, no gradients.

    The distillation temperature is fixed to 1 at eval time; losses are
    all-reduced over the "batch" pmap axis and normalised by the number of
    non-padding labels.
    """
    labels = batch.pop("labels")
    output_hidden_states = not share_hidden_states and training_args.mse_weight > 0

    student_outputs = student_model(
        **batch,
        params=student_params,
        output_hidden_states=output_hidden_states,
        train=False,
    )
    student_distribution = jax.nn.log_softmax(student_outputs.logits, axis=-1)
    ce_loss, num_labels = cross_entropy_loss(student_outputs.logits, labels)

    teacher_outputs = teacher_model(
        **batch,
        params=teacher_params,
        output_hidden_states=output_hidden_states,
        train=False,
    )
    teacher_distribution = jax.nn.softmax(teacher_outputs.logits, axis=-1)
    # temperature is always 1 for eval
    kl_loss = kl_divergence(teacher_distribution, student_distribution, labels)

    mse_loss = (
        mean_square_error_loss(student_outputs, teacher_outputs)
        if output_hidden_states
        else jnp.zeros_like(kl_loss)
    )

    # same CE weighting as in training: 0.8 when distilling with a KL term, else 1.0
    ce_weight = 0.8 if training_args.kl_weight > 0 else 1.0
    loss = ce_weight * ce_loss + training_args.kl_weight * kl_loss + training_args.mse_weight * mse_loss
    # true loss = total loss / total samples
    loss = jax.lax.psum(loss, "batch")
    num_labels = jax.lax.psum(num_labels, "batch")
    loss = jax.tree_util.tree_map(lambda x: x / num_labels, loss)

    # CE/KL/MSE losses for logging
    ce_loss = jax.lax.psum(ce_loss, "batch")
    ce_loss = jax.tree_util.tree_map(lambda x: x / num_labels, ce_loss)

    kl_loss = jax.lax.psum(kl_loss, "batch")
    kl_loss = jax.tree_util.tree_map(lambda x: x / num_labels, kl_loss)

    mse_loss = jax.lax.psum(mse_loss, "batch")
    mse_loss = jax.tree_util.tree_map(lambda x: x / num_labels, mse_loss)

    metrics = {"loss": loss, "ce_loss": ce_loss, "kl_loss": kl_loss, "mse_loss": mse_loss}
    return metrics
1878
+
1879
+ # Define generation function
1880
+ num_beams = (
1881
+ training_args.generation_num_beams
1882
+ if training_args.generation_num_beams is not None
1883
+ else student_model.config.num_beams
1884
+ )
1885
+
1886
+ # forcing the language and task tokens helps the model in its generations
1887
+ gen_kwargs = {
1888
+ "max_length": max_label_length,
1889
+ "num_beams": num_beams,
1890
+ "language": "<|en|>",
1891
+ "task": "transcribe",
1892
+ "return_timestamps": return_timestamps,
1893
+ }
1894
+
1895
def generate_step(student_params, batch):
    """Auto-regressively generate token ids with the student model."""
    generated = student_model.generate(
        batch[model_input_name],
        params=student_params,
        attention_mask=batch.get("attention_mask"),
        **gen_kwargs,
    )
    return generated.sequences
1903
+
1904
+ # Replicate the train state on each device
1905
+ student_state = student_state.replicate()
1906
+
1907
+ # Replicate the teacher params on each device
1908
+ teacher_params = jax_utils.replicate(teacher_params)
1909
+
1910
+ # Create parallel version of the train and eval step
1911
+ p_train_step = jax.pmap(
1912
+ train_step,
1913
+ "batch",
1914
+ in_axes=(0, 0, 0, None, None, None),
1915
+ donate_argnums=(0,),
1916
+ static_broadcasted_argnums=(
1917
+ 3,
1918
+ 4,
1919
+ ),
1920
+ )
1921
+ p_eval_step = jax.pmap(eval_step, "batch")
1922
+ p_generate_step = jax.pmap(generate_step, "batch")
1923
+
1924
+ logger.info("***** Running training *****")
1925
+ logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1926
+ logger.info(" Instantaneous batch size per device =" f" {training_args.per_device_train_batch_size}")
1927
+ logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1928
+ logger.info(
1929
+ f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1930
+ )
1931
+ logger.info(f" Total optimization steps = {total_train_steps}")
1932
+
1933
+ # ======================== Training ================================
1934
+ train_time = 0
1935
+ train_start = time.time()
1936
+ train_metrics = []
1937
+ batches_to_skip = jax.device_get(unreplicate(student_state.step))
1938
+ cur_step = int(batches_to_skip) # will be zero if starting from scratch
1939
+ epochs_trained = batches_to_skip // steps_per_epoch
1940
+ steps_trained_progress_bar = tqdm(range(total_train_steps), desc="Train steps ... ", position=0)
1941
+ steps_trained_progress_bar.update(batches_to_skip)
1942
+ continue_training = True
1943
+ minibatch_steps = 0
1944
+
1945
+ if batches_to_skip > 0:
1946
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1947
+ logger.info(f" Continuing training from epoch {epochs_trained}")
1948
+ logger.info(f" Continuing training from global step {batches_to_skip}")
1949
+
1950
+ # Generate a training data loader by shuffling sampling indices from the train dataset
1951
+ train_loader = get_data_loader(
1952
+ training_args.seed,
1953
+ vectorized_datasets["train"],
1954
+ batch_size=train_batch_size,
1955
+ data_collator=data_collator,
1956
+ dataloader_num_workers=dataloader_num_workers,
1957
+ skip_batches=batches_to_skip,
1958
+ prefetch_size=dataloader_prefetch_size,
1959
+ )
1960
+
1961
+ for epoch in range(epochs_trained, num_epochs):
1962
+ if hasattr(train_loader, "dataset") and isinstance(train_loader.dataset, IterableDataset):
1963
+ train_loader.dataset.set_epoch(epoch)
1964
+
1965
+ for batch in train_loader:
1966
+ minibatch_steps += 1
1967
+ update_step = minibatch_steps == gradient_accumulation_steps
1968
+
1969
+ if update_step:
1970
+ steps_trained_progress_bar.update(1)
1971
+ cur_step += 1
1972
+ minibatch_steps = 0
1973
+
1974
+ batch = shard(batch.data)
1975
+ student_state, train_metric = p_train_step(
1976
+ student_state,
1977
+ teacher_params,
1978
+ batch,
1979
+ training_args.freeze_encoder,
1980
+ share_hidden_states,
1981
+ training_args.temperature,
1982
+ )
1983
+
1984
+ if cur_step % training_args.logging_steps == 0 and update_step:
1985
+ train_metrics.append(train_metric)
1986
+ train_metric_to_write = unreplicate(train_metric)
1987
+ steps_trained_progress_bar.write(
1988
+ f"Step... ({cur_step} / {total_train_steps} | Loss:"
1989
+ f" {train_metric_to_write['loss']}, Learning Rate:"
1990
+ f" {train_metric_to_write['learning_rate']})"
1991
+ )
1992
+ if has_wandb and jax.process_index() == 0:
1993
+ write_wandb_metric(
1994
+ wandb_logger,
1995
+ train_metric_to_write,
1996
+ train_time + time.time() - train_start,
1997
+ cur_step,
1998
+ epoch,
1999
+ prefix="train",
2000
+ )
2001
+
2002
+ # save checkpoint and weights after each save_steps and at the end of training
2003
+ if (cur_step % training_args.save_steps == 0 and update_step) or cur_step == total_train_steps:
2004
+ if jax.process_index() == 0:
2005
+ save_hf_weights(
2006
+ student_state,
2007
+ student_model,
2008
+ processor,
2009
+ training_args.output_dir,
2010
+ cur_step,
2011
+ total_train_steps,
2012
+ use_scan=training_args.use_scan,
2013
+ )
2014
+ if training_args.save_train_state:
2015
+ student_state.save_state(
2016
+ training_args.output_dir, save_total_limit=training_args.save_total_limit
2017
+ )
2018
+ if training_args.push_to_hub:
2019
+ repo.push_to_hub(
2020
+ commit_message=f"Saving train state of step {cur_step}",
2021
+ blocking=False,
2022
+ )
2023
+
2024
+ if training_args.do_eval and (
2025
+ (cur_step % eval_steps == 0 and update_step) or cur_step == total_train_steps
2026
+ ):
2027
+ train_time += time.time() - train_start
2028
+ # ======================== Evaluating ==============================
2029
+ for eval_split in all_eval_splits:
2030
+ eval_metrics = []
2031
+ eval_preds = []
2032
+ eval_labels = []
2033
+ eval_start = time.time()
2034
+
2035
+ eval_loader = get_data_loader(
2036
+ training_args.seed,
2037
+ vectorized_datasets[eval_split],
2038
+ batch_size=eval_batch_size,
2039
+ data_collator=data_collator,
2040
+ shuffle=False,
2041
+ drop_last=False,
2042
+ dataloader_num_workers=dataloader_num_workers,
2043
+ )
2044
+ for batch in tqdm(eval_loader, desc=f"Evaluating {eval_split}...", position=2):
2045
+ # Model forward
2046
+ labels = batch["labels"]
2047
+
2048
+ metrics = pad_shard_unpad(
2049
+ p_eval_step,
2050
+ static_argnums=(
2051
+ 0,
2052
+ 1,
2053
+ ),
2054
+ static_return=True,
2055
+ )(
2056
+ student_state.params,
2057
+ teacher_params,
2058
+ batch.data,
2059
+ min_device_batch=per_device_eval_batch_size,
2060
+ )
2061
+ eval_metrics.append(metrics)
2062
+
2063
+ # generation
2064
+ if training_args.predict_with_generate:
2065
+ generated_ids = pad_shard_unpad(p_generate_step)(
2066
+ student_state.params, batch.data, min_device_batch=per_device_eval_batch_size
2067
+ )
2068
+ eval_preds.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"])))
2069
+ eval_labels.extend(labels)
2070
+
2071
+ eval_time = time.time() - eval_start
2072
+
2073
+ # normalize eval metrics
2074
+ eval_metrics = get_metrics(eval_metrics)
2075
+ eval_metrics = jax.tree_util.tree_map(jnp.mean, eval_metrics)
2076
+
2077
+ # compute WER metric
2078
+ wer_desc = ""
2079
+ if training_args.predict_with_generate:
2080
+ wer_metric, pred_str, label_str, norm_pred_str, norm_label_str = compute_metrics(
2081
+ eval_preds, eval_labels
2082
+ )
2083
+ eval_metrics.update(wer_metric)
2084
+ wer_desc = " ".join([f"Eval {key}: {value} |" for key, value in wer_metric.items()])
2085
+
2086
+ # Print metrics and update progress bar
2087
+ steps_trained_progress_bar.write(
2088
+ f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
2089
+ f" {wer_desc})"
2090
+ )
2091
+
2092
+ if has_tensorboard and jax.process_index() == 0:
2093
+ write_eval_metric(
2094
+ summary_writer,
2095
+ eval_metrics,
2096
+ cur_step,
2097
+ prefix=eval_split,
2098
+ )
2099
+
2100
+ if has_wandb and jax.process_index() == 0:
2101
+ write_wandb_metric(wandb_logger, eval_metrics, eval_time, cur_step, epoch, prefix=eval_split)
2102
+ if training_args.predict_with_generate:
2103
+ write_wandb_pred(
2104
+ wandb_logger,
2105
+ pred_str,
2106
+ label_str,
2107
+ norm_pred_str,
2108
+ norm_label_str,
2109
+ cur_step,
2110
+ prefix=eval_split,
2111
+ )
2112
+
2113
+ if has_tensorboard and jax.process_index() == 0:
2114
+ # we'll only log to tensorboard every eval steps
2115
+ write_train_metric(
2116
+ summary_writer,
2117
+ train_metrics,
2118
+ train_time,
2119
+ cur_step,
2120
+ training_args.logging_steps,
2121
+ )
2122
+
2123
+ # flush the train metrics
2124
+ train_start = time.time()
2125
+ train_metrics = []
2126
+
2127
+ # break condition
2128
+ if cur_step == total_train_steps:
2129
+ continue_training = False
2130
+ break
2131
+
2132
+ if not continue_training:
2133
+ break
2134
+
2135
+
2136
+ if __name__ == "__main__":
2137
+ main()
scripts/.DS_Store ADDED
Binary file (6.15 kB). View file
 
scripts/__pycache__/hyperparameter_search.cpython-312.pyc ADDED
Binary file (10.5 kB). View file
 
scripts/distil-whisper-lora-run5/adapter/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: distil-whisper/distil-small.en
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2
scripts/distil-whisper-lora-run5/adapter/adapter_config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "distil-whisper/distil-small.en",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.08899690296270608,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "model.decoder.layers.1.q_proj",
28
+ "model.decoder.layers.4.fc2",
29
+ "model.decoder.layers.3.q_proj",
30
+ "model.decoder.layers.5.fc1",
31
+ "model.decoder.layers.0.q_proj",
32
+ "model.decoder.layers.1.fc2",
33
+ "model.decoder.layers.2.fc2",
34
+ "model.decoder.layers.0.v_proj",
35
+ "model.decoder.layers.5.fc2",
36
+ "model.decoder.layers.0.fc2",
37
+ "model.decoder.layers.3.out_proj",
38
+ "model.decoder.layers.5.q_proj",
39
+ "model.decoder.layers.5.k_proj",
40
+ "model.decoder.layers.4.k_proj",
41
+ "model.decoder.layers.0.out_proj",
42
+ "model.decoder.layers.2.fc1",
43
+ "model.decoder.layers.1.out_proj",
44
+ "model.decoder.layers.4.v_proj",
45
+ "model.decoder.layers.1.v_proj",
46
+ "model.decoder.layers.3.v_proj",
47
+ "model.decoder.layers.1.fc1",
48
+ "model.decoder.layers.3.fc1",
49
+ "model.decoder.layers.5.v_proj",
50
+ "model.decoder.layers.1.k_proj",
51
+ "model.decoder.layers.5.out_proj",
52
+ "model.decoder.layers.4.q_proj",
53
+ "model.decoder.layers.4.fc1",
54
+ "model.decoder.layers.2.out_proj",
55
+ "model.decoder.layers.0.fc1",
56
+ "model.decoder.layers.3.fc2",
57
+ "model.decoder.layers.2.q_proj",
58
+ "model.decoder.layers.0.k_proj",
59
+ "model.decoder.layers.2.v_proj",
60
+ "model.decoder.layers.3.k_proj",
61
+ "model.decoder.layers.4.out_proj",
62
+ "model.decoder.layers.2.k_proj"
63
+ ],
64
+ "task_type": "SEQ_2_SEQ_LM",
65
+ "trainable_token_indices": null,
66
+ "use_dora": false,
67
+ "use_rslora": false
68
+ }
scripts/distil-whisper-lora-run5/adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80c2ed208a00884a430a6d144943318470e3cad997b0791506d4db466518d2df
3
+ size 3934208
scripts/distil-whisper-lora-run5/adapter/added_tokens.json ADDED
@@ -0,0 +1,1609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|0.00|>": 50364,
3
+ "<|0.02|>": 50365,
4
+ "<|0.04|>": 50366,
5
+ "<|0.06|>": 50367,
6
+ "<|0.08|>": 50368,
7
+ "<|0.10|>": 50369,
8
+ "<|0.12|>": 50370,
9
+ "<|0.14|>": 50371,
10
+ "<|0.16|>": 50372,
11
+ "<|0.18|>": 50373,
12
+ "<|0.20|>": 50374,
13
+ "<|0.22|>": 50375,
14
+ "<|0.24|>": 50376,
15
+ "<|0.26|>": 50377,
16
+ "<|0.28|>": 50378,
17
+ "<|0.30|>": 50379,
18
+ "<|0.32|>": 50380,
19
+ "<|0.34|>": 50381,
20
+ "<|0.36|>": 50382,
21
+ "<|0.38|>": 50383,
22
+ "<|0.40|>": 50384,
23
+ "<|0.42|>": 50385,
24
+ "<|0.44|>": 50386,
25
+ "<|0.46|>": 50387,
26
+ "<|0.48|>": 50388,
27
+ "<|0.50|>": 50389,
28
+ "<|0.52|>": 50390,
29
+ "<|0.54|>": 50391,
30
+ "<|0.56|>": 50392,
31
+ "<|0.58|>": 50393,
32
+ "<|0.60|>": 50394,
33
+ "<|0.62|>": 50395,
34
+ "<|0.64|>": 50396,
35
+ "<|0.66|>": 50397,
36
+ "<|0.68|>": 50398,
37
+ "<|0.70|>": 50399,
38
+ "<|0.72|>": 50400,
39
+ "<|0.74|>": 50401,
40
+ "<|0.76|>": 50402,
41
+ "<|0.78|>": 50403,
42
+ "<|0.80|>": 50404,
43
+ "<|0.82|>": 50405,
44
+ "<|0.84|>": 50406,
45
+ "<|0.86|>": 50407,
46
+ "<|0.88|>": 50408,
47
+ "<|0.90|>": 50409,
48
+ "<|0.92|>": 50410,
49
+ "<|0.94|>": 50411,
50
+ "<|0.96|>": 50412,
51
+ "<|0.98|>": 50413,
52
+ "<|1.00|>": 50414,
53
+ "<|1.02|>": 50415,
54
+ "<|1.04|>": 50416,
55
+ "<|1.06|>": 50417,
56
+ "<|1.08|>": 50418,
57
+ "<|1.10|>": 50419,
58
+ "<|1.12|>": 50420,
59
+ "<|1.14|>": 50421,
60
+ "<|1.16|>": 50422,
61
+ "<|1.18|>": 50423,
62
+ "<|1.20|>": 50424,
63
+ "<|1.22|>": 50425,
64
+ "<|1.24|>": 50426,
65
+ "<|1.26|>": 50427,
66
+ "<|1.28|>": 50428,
67
+ "<|1.30|>": 50429,
68
+ "<|1.32|>": 50430,
69
+ "<|1.34|>": 50431,
70
+ "<|1.36|>": 50432,
71
+ "<|1.38|>": 50433,
72
+ "<|1.40|>": 50434,
73
+ "<|1.42|>": 50435,
74
+ "<|1.44|>": 50436,
75
+ "<|1.46|>": 50437,
76
+ "<|1.48|>": 50438,
77
+ "<|1.50|>": 50439,
78
+ "<|1.52|>": 50440,
79
+ "<|1.54|>": 50441,
80
+ "<|1.56|>": 50442,
81
+ "<|1.58|>": 50443,
82
+ "<|1.60|>": 50444,
83
+ "<|1.62|>": 50445,
84
+ "<|1.64|>": 50446,
85
+ "<|1.66|>": 50447,
86
+ "<|1.68|>": 50448,
87
+ "<|1.70|>": 50449,
88
+ "<|1.72|>": 50450,
89
+ "<|1.74|>": 50451,
90
+ "<|1.76|>": 50452,
91
+ "<|1.78|>": 50453,
92
+ "<|1.80|>": 50454,
93
+ "<|1.82|>": 50455,
94
+ "<|1.84|>": 50456,
95
+ "<|1.86|>": 50457,
96
+ "<|1.88|>": 50458,
97
+ "<|1.90|>": 50459,
98
+ "<|1.92|>": 50460,
99
+ "<|1.94|>": 50461,
100
+ "<|1.96|>": 50462,
101
+ "<|1.98|>": 50463,
102
+ "<|10.00|>": 50864,
103
+ "<|10.02|>": 50865,
104
+ "<|10.04|>": 50866,
105
+ "<|10.06|>": 50867,
106
+ "<|10.08|>": 50868,
107
+ "<|10.10|>": 50869,
108
+ "<|10.12|>": 50870,
109
+ "<|10.14|>": 50871,
110
+ "<|10.16|>": 50872,
111
+ "<|10.18|>": 50873,
112
+ "<|10.20|>": 50874,
113
+ "<|10.22|>": 50875,
114
+ "<|10.24|>": 50876,
115
+ "<|10.26|>": 50877,
116
+ "<|10.28|>": 50878,
117
+ "<|10.30|>": 50879,
118
+ "<|10.32|>": 50880,
119
+ "<|10.34|>": 50881,
120
+ "<|10.36|>": 50882,
121
+ "<|10.38|>": 50883,
122
+ "<|10.40|>": 50884,
123
+ "<|10.42|>": 50885,
124
+ "<|10.44|>": 50886,
125
+ "<|10.46|>": 50887,
126
+ "<|10.48|>": 50888,
127
+ "<|10.50|>": 50889,
128
+ "<|10.52|>": 50890,
129
+ "<|10.54|>": 50891,
130
+ "<|10.56|>": 50892,
131
+ "<|10.58|>": 50893,
132
+ "<|10.60|>": 50894,
133
+ "<|10.62|>": 50895,
134
+ "<|10.64|>": 50896,
135
+ "<|10.66|>": 50897,
136
+ "<|10.68|>": 50898,
137
+ "<|10.70|>": 50899,
138
+ "<|10.72|>": 50900,
139
+ "<|10.74|>": 50901,
140
+ "<|10.76|>": 50902,
141
+ "<|10.78|>": 50903,
142
+ "<|10.80|>": 50904,
143
+ "<|10.82|>": 50905,
144
+ "<|10.84|>": 50906,
145
+ "<|10.86|>": 50907,
146
+ "<|10.88|>": 50908,
147
+ "<|10.90|>": 50909,
148
+ "<|10.92|>": 50910,
149
+ "<|10.94|>": 50911,
150
+ "<|10.96|>": 50912,
151
+ "<|10.98|>": 50913,
152
+ "<|11.00|>": 50914,
153
+ "<|11.02|>": 50915,
154
+ "<|11.04|>": 50916,
155
+ "<|11.06|>": 50917,
156
+ "<|11.08|>": 50918,
157
+ "<|11.10|>": 50919,
158
+ "<|11.12|>": 50920,
159
+ "<|11.14|>": 50921,
160
+ "<|11.16|>": 50922,
161
+ "<|11.18|>": 50923,
162
+ "<|11.20|>": 50924,
163
+ "<|11.22|>": 50925,
164
+ "<|11.24|>": 50926,
165
+ "<|11.26|>": 50927,
166
+ "<|11.28|>": 50928,
167
+ "<|11.30|>": 50929,
168
+ "<|11.32|>": 50930,
169
+ "<|11.34|>": 50931,
170
+ "<|11.36|>": 50932,
171
+ "<|11.38|>": 50933,
172
+ "<|11.40|>": 50934,
173
+ "<|11.42|>": 50935,
174
+ "<|11.44|>": 50936,
175
+ "<|11.46|>": 50937,
176
+ "<|11.48|>": 50938,
177
+ "<|11.50|>": 50939,
178
+ "<|11.52|>": 50940,
179
+ "<|11.54|>": 50941,
180
+ "<|11.56|>": 50942,
181
+ "<|11.58|>": 50943,
182
+ "<|11.60|>": 50944,
183
+ "<|11.62|>": 50945,
184
+ "<|11.64|>": 50946,
185
+ "<|11.66|>": 50947,
186
+ "<|11.68|>": 50948,
187
+ "<|11.70|>": 50949,
188
+ "<|11.72|>": 50950,
189
+ "<|11.74|>": 50951,
190
+ "<|11.76|>": 50952,
191
+ "<|11.78|>": 50953,
192
+ "<|11.80|>": 50954,
193
+ "<|11.82|>": 50955,
194
+ "<|11.84|>": 50956,
195
+ "<|11.86|>": 50957,
196
+ "<|11.88|>": 50958,
197
+ "<|11.90|>": 50959,
198
+ "<|11.92|>": 50960,
199
+ "<|11.94|>": 50961,
200
+ "<|11.96|>": 50962,
201
+ "<|11.98|>": 50963,
202
+ "<|12.00|>": 50964,
203
+ "<|12.02|>": 50965,
204
+ "<|12.04|>": 50966,
205
+ "<|12.06|>": 50967,
206
+ "<|12.08|>": 50968,
207
+ "<|12.10|>": 50969,
208
+ "<|12.12|>": 50970,
209
+ "<|12.14|>": 50971,
210
+ "<|12.16|>": 50972,
211
+ "<|12.18|>": 50973,
212
+ "<|12.20|>": 50974,
213
+ "<|12.22|>": 50975,
214
+ "<|12.24|>": 50976,
215
+ "<|12.26|>": 50977,
216
+ "<|12.28|>": 50978,
217
+ "<|12.30|>": 50979,
218
+ "<|12.32|>": 50980,
219
+ "<|12.34|>": 50981,
220
+ "<|12.36|>": 50982,
221
+ "<|12.38|>": 50983,
222
+ "<|12.40|>": 50984,
223
+ "<|12.42|>": 50985,
224
+ "<|12.44|>": 50986,
225
+ "<|12.46|>": 50987,
226
+ "<|12.48|>": 50988,
227
+ "<|12.50|>": 50989,
228
+ "<|12.52|>": 50990,
229
+ "<|12.54|>": 50991,
230
+ "<|12.56|>": 50992,
231
+ "<|12.58|>": 50993,
232
+ "<|12.60|>": 50994,
233
+ "<|12.62|>": 50995,
234
+ "<|12.64|>": 50996,
235
+ "<|12.66|>": 50997,
236
+ "<|12.68|>": 50998,
237
+ "<|12.70|>": 50999,
238
+ "<|12.72|>": 51000,
239
+ "<|12.74|>": 51001,
240
+ "<|12.76|>": 51002,
241
+ "<|12.78|>": 51003,
242
+ "<|12.80|>": 51004,
243
+ "<|12.82|>": 51005,
244
+ "<|12.84|>": 51006,
245
+ "<|12.86|>": 51007,
246
+ "<|12.88|>": 51008,
247
+ "<|12.90|>": 51009,
248
+ "<|12.92|>": 51010,
249
+ "<|12.94|>": 51011,
250
+ "<|12.96|>": 51012,
251
+ "<|12.98|>": 51013,
252
+ "<|13.00|>": 51014,
253
+ "<|13.02|>": 51015,
254
+ "<|13.04|>": 51016,
255
+ "<|13.06|>": 51017,
256
+ "<|13.08|>": 51018,
257
+ "<|13.10|>": 51019,
258
+ "<|13.12|>": 51020,
259
+ "<|13.14|>": 51021,
260
+ "<|13.16|>": 51022,
261
+ "<|13.18|>": 51023,
262
+ "<|13.20|>": 51024,
263
+ "<|13.22|>": 51025,
264
+ "<|13.24|>": 51026,
265
+ "<|13.26|>": 51027,
266
+ "<|13.28|>": 51028,
267
+ "<|13.30|>": 51029,
268
+ "<|13.32|>": 51030,
269
+ "<|13.34|>": 51031,
270
+ "<|13.36|>": 51032,
271
+ "<|13.38|>": 51033,
272
+ "<|13.40|>": 51034,
273
+ "<|13.42|>": 51035,
274
+ "<|13.44|>": 51036,
275
+ "<|13.46|>": 51037,
276
+ "<|13.48|>": 51038,
277
+ "<|13.50|>": 51039,
278
+ "<|13.52|>": 51040,
279
+ "<|13.54|>": 51041,
280
+ "<|13.56|>": 51042,
281
+ "<|13.58|>": 51043,
282
+ "<|13.60|>": 51044,
283
+ "<|13.62|>": 51045,
284
+ "<|13.64|>": 51046,
285
+ "<|13.66|>": 51047,
286
+ "<|13.68|>": 51048,
287
+ "<|13.70|>": 51049,
288
+ "<|13.72|>": 51050,
289
+ "<|13.74|>": 51051,
290
+ "<|13.76|>": 51052,
291
+ "<|13.78|>": 51053,
292
+ "<|13.80|>": 51054,
293
+ "<|13.82|>": 51055,
294
+ "<|13.84|>": 51056,
295
+ "<|13.86|>": 51057,
296
+ "<|13.88|>": 51058,
297
+ "<|13.90|>": 51059,
298
+ "<|13.92|>": 51060,
299
+ "<|13.94|>": 51061,
300
+ "<|13.96|>": 51062,
301
+ "<|13.98|>": 51063,
302
+ "<|14.00|>": 51064,
303
+ "<|14.02|>": 51065,
304
+ "<|14.04|>": 51066,
305
+ "<|14.06|>": 51067,
306
+ "<|14.08|>": 51068,
307
+ "<|14.10|>": 51069,
308
+ "<|14.12|>": 51070,
309
+ "<|14.14|>": 51071,
310
+ "<|14.16|>": 51072,
311
+ "<|14.18|>": 51073,
312
+ "<|14.20|>": 51074,
313
+ "<|14.22|>": 51075,
314
+ "<|14.24|>": 51076,
315
+ "<|14.26|>": 51077,
316
+ "<|14.28|>": 51078,
317
+ "<|14.30|>": 51079,
318
+ "<|14.32|>": 51080,
319
+ "<|14.34|>": 51081,
320
+ "<|14.36|>": 51082,
321
+ "<|14.38|>": 51083,
322
+ "<|14.40|>": 51084,
323
+ "<|14.42|>": 51085,
324
+ "<|14.44|>": 51086,
325
+ "<|14.46|>": 51087,
326
+ "<|14.48|>": 51088,
327
+ "<|14.50|>": 51089,
328
+ "<|14.52|>": 51090,
329
+ "<|14.54|>": 51091,
330
+ "<|14.56|>": 51092,
331
+ "<|14.58|>": 51093,
332
+ "<|14.60|>": 51094,
333
+ "<|14.62|>": 51095,
334
+ "<|14.64|>": 51096,
335
+ "<|14.66|>": 51097,
336
+ "<|14.68|>": 51098,
337
+ "<|14.70|>": 51099,
338
+ "<|14.72|>": 51100,
339
+ "<|14.74|>": 51101,
340
+ "<|14.76|>": 51102,
341
+ "<|14.78|>": 51103,
342
+ "<|14.80|>": 51104,
343
+ "<|14.82|>": 51105,
344
+ "<|14.84|>": 51106,
345
+ "<|14.86|>": 51107,
346
+ "<|14.88|>": 51108,
347
+ "<|14.90|>": 51109,
348
+ "<|14.92|>": 51110,
349
+ "<|14.94|>": 51111,
350
+ "<|14.96|>": 51112,
351
+ "<|14.98|>": 51113,
352
+ "<|15.00|>": 51114,
353
+ "<|15.02|>": 51115,
354
+ "<|15.04|>": 51116,
355
+ "<|15.06|>": 51117,
356
+ "<|15.08|>": 51118,
357
+ "<|15.10|>": 51119,
358
+ "<|15.12|>": 51120,
359
+ "<|15.14|>": 51121,
360
+ "<|15.16|>": 51122,
361
+ "<|15.18|>": 51123,
362
+ "<|15.20|>": 51124,
363
+ "<|15.22|>": 51125,
364
+ "<|15.24|>": 51126,
365
+ "<|15.26|>": 51127,
366
+ "<|15.28|>": 51128,
367
+ "<|15.30|>": 51129,
368
+ "<|15.32|>": 51130,
369
+ "<|15.34|>": 51131,
370
+ "<|15.36|>": 51132,
371
+ "<|15.38|>": 51133,
372
+ "<|15.40|>": 51134,
373
+ "<|15.42|>": 51135,
374
+ "<|15.44|>": 51136,
375
+ "<|15.46|>": 51137,
376
+ "<|15.48|>": 51138,
377
+ "<|15.50|>": 51139,
378
+ "<|15.52|>": 51140,
379
+ "<|15.54|>": 51141,
380
+ "<|15.56|>": 51142,
381
+ "<|15.58|>": 51143,
382
+ "<|15.60|>": 51144,
383
+ "<|15.62|>": 51145,
384
+ "<|15.64|>": 51146,
385
+ "<|15.66|>": 51147,
386
+ "<|15.68|>": 51148,
387
+ "<|15.70|>": 51149,
388
+ "<|15.72|>": 51150,
389
+ "<|15.74|>": 51151,
390
+ "<|15.76|>": 51152,
391
+ "<|15.78|>": 51153,
392
+ "<|15.80|>": 51154,
393
+ "<|15.82|>": 51155,
394
+ "<|15.84|>": 51156,
395
+ "<|15.86|>": 51157,
396
+ "<|15.88|>": 51158,
397
+ "<|15.90|>": 51159,
398
+ "<|15.92|>": 51160,
399
+ "<|15.94|>": 51161,
400
+ "<|15.96|>": 51162,
401
+ "<|15.98|>": 51163,
402
+ "<|16.00|>": 51164,
403
+ "<|16.02|>": 51165,
404
+ "<|16.04|>": 51166,
405
+ "<|16.06|>": 51167,
406
+ "<|16.08|>": 51168,
407
+ "<|16.10|>": 51169,
408
+ "<|16.12|>": 51170,
409
+ "<|16.14|>": 51171,
410
+ "<|16.16|>": 51172,
411
+ "<|16.18|>": 51173,
412
+ "<|16.20|>": 51174,
413
+ "<|16.22|>": 51175,
414
+ "<|16.24|>": 51176,
415
+ "<|16.26|>": 51177,
416
+ "<|16.28|>": 51178,
417
+ "<|16.30|>": 51179,
418
+ "<|16.32|>": 51180,
419
+ "<|16.34|>": 51181,
420
+ "<|16.36|>": 51182,
421
+ "<|16.38|>": 51183,
422
+ "<|16.40|>": 51184,
423
+ "<|16.42|>": 51185,
424
+ "<|16.44|>": 51186,
425
+ "<|16.46|>": 51187,
426
+ "<|16.48|>": 51188,
427
+ "<|16.50|>": 51189,
428
+ "<|16.52|>": 51190,
429
+ "<|16.54|>": 51191,
430
+ "<|16.56|>": 51192,
431
+ "<|16.58|>": 51193,
432
+ "<|16.60|>": 51194,
433
+ "<|16.62|>": 51195,
434
+ "<|16.64|>": 51196,
435
+ "<|16.66|>": 51197,
436
+ "<|16.68|>": 51198,
437
+ "<|16.70|>": 51199,
438
+ "<|16.72|>": 51200,
439
+ "<|16.74|>": 51201,
440
+ "<|16.76|>": 51202,
441
+ "<|16.78|>": 51203,
442
+ "<|16.80|>": 51204,
443
+ "<|16.82|>": 51205,
444
+ "<|16.84|>": 51206,
445
+ "<|16.86|>": 51207,
446
+ "<|16.88|>": 51208,
447
+ "<|16.90|>": 51209,
448
+ "<|16.92|>": 51210,
449
+ "<|16.94|>": 51211,
450
+ "<|16.96|>": 51212,
451
+ "<|16.98|>": 51213,
452
+ "<|17.00|>": 51214,
453
+ "<|17.02|>": 51215,
454
+ "<|17.04|>": 51216,
455
+ "<|17.06|>": 51217,
456
+ "<|17.08|>": 51218,
457
+ "<|17.10|>": 51219,
458
+ "<|17.12|>": 51220,
459
+ "<|17.14|>": 51221,
460
+ "<|17.16|>": 51222,
461
+ "<|17.18|>": 51223,
462
+ "<|17.20|>": 51224,
463
+ "<|17.22|>": 51225,
464
+ "<|17.24|>": 51226,
465
+ "<|17.26|>": 51227,
466
+ "<|17.28|>": 51228,
467
+ "<|17.30|>": 51229,
468
+ "<|17.32|>": 51230,
469
+ "<|17.34|>": 51231,
470
+ "<|17.36|>": 51232,
471
+ "<|17.38|>": 51233,
472
+ "<|17.40|>": 51234,
473
+ "<|17.42|>": 51235,
474
+ "<|17.44|>": 51236,
475
+ "<|17.46|>": 51237,
476
+ "<|17.48|>": 51238,
477
+ "<|17.50|>": 51239,
478
+ "<|17.52|>": 51240,
479
+ "<|17.54|>": 51241,
480
+ "<|17.56|>": 51242,
481
+ "<|17.58|>": 51243,
482
+ "<|17.60|>": 51244,
483
+ "<|17.62|>": 51245,
484
+ "<|17.64|>": 51246,
485
+ "<|17.66|>": 51247,
486
+ "<|17.68|>": 51248,
487
+ "<|17.70|>": 51249,
488
+ "<|17.72|>": 51250,
489
+ "<|17.74|>": 51251,
490
+ "<|17.76|>": 51252,
491
+ "<|17.78|>": 51253,
492
+ "<|17.80|>": 51254,
493
+ "<|17.82|>": 51255,
494
+ "<|17.84|>": 51256,
495
+ "<|17.86|>": 51257,
496
+ "<|17.88|>": 51258,
497
+ "<|17.90|>": 51259,
498
+ "<|17.92|>": 51260,
499
+ "<|17.94|>": 51261,
500
+ "<|17.96|>": 51262,
501
+ "<|17.98|>": 51263,
502
+ "<|18.00|>": 51264,
503
+ "<|18.02|>": 51265,
504
+ "<|18.04|>": 51266,
505
+ "<|18.06|>": 51267,
506
+ "<|18.08|>": 51268,
507
+ "<|18.10|>": 51269,
508
+ "<|18.12|>": 51270,
509
+ "<|18.14|>": 51271,
510
+ "<|18.16|>": 51272,
511
+ "<|18.18|>": 51273,
512
+ "<|18.20|>": 51274,
513
+ "<|18.22|>": 51275,
514
+ "<|18.24|>": 51276,
515
+ "<|18.26|>": 51277,
516
+ "<|18.28|>": 51278,
517
+ "<|18.30|>": 51279,
518
+ "<|18.32|>": 51280,
519
+ "<|18.34|>": 51281,
520
+ "<|18.36|>": 51282,
521
+ "<|18.38|>": 51283,
522
+ "<|18.40|>": 51284,
523
+ "<|18.42|>": 51285,
524
+ "<|18.44|>": 51286,
525
+ "<|18.46|>": 51287,
526
+ "<|18.48|>": 51288,
527
+ "<|18.50|>": 51289,
528
+ "<|18.52|>": 51290,
529
+ "<|18.54|>": 51291,
530
+ "<|18.56|>": 51292,
531
+ "<|18.58|>": 51293,
532
+ "<|18.60|>": 51294,
533
+ "<|18.62|>": 51295,
534
+ "<|18.64|>": 51296,
535
+ "<|18.66|>": 51297,
536
+ "<|18.68|>": 51298,
537
+ "<|18.70|>": 51299,
538
+ "<|18.72|>": 51300,
539
+ "<|18.74|>": 51301,
540
+ "<|18.76|>": 51302,
541
+ "<|18.78|>": 51303,
542
+ "<|18.80|>": 51304,
543
+ "<|18.82|>": 51305,
544
+ "<|18.84|>": 51306,
545
+ "<|18.86|>": 51307,
546
+ "<|18.88|>": 51308,
547
+ "<|18.90|>": 51309,
548
+ "<|18.92|>": 51310,
549
+ "<|18.94|>": 51311,
550
+ "<|18.96|>": 51312,
551
+ "<|18.98|>": 51313,
552
+ "<|19.00|>": 51314,
553
+ "<|19.02|>": 51315,
554
+ "<|19.04|>": 51316,
555
+ "<|19.06|>": 51317,
556
+ "<|19.08|>": 51318,
557
+ "<|19.10|>": 51319,
558
+ "<|19.12|>": 51320,
559
+ "<|19.14|>": 51321,
560
+ "<|19.16|>": 51322,
561
+ "<|19.18|>": 51323,
562
+ "<|19.20|>": 51324,
563
+ "<|19.22|>": 51325,
564
+ "<|19.24|>": 51326,
565
+ "<|19.26|>": 51327,
566
+ "<|19.28|>": 51328,
567
+ "<|19.30|>": 51329,
568
+ "<|19.32|>": 51330,
569
+ "<|19.34|>": 51331,
570
+ "<|19.36|>": 51332,
571
+ "<|19.38|>": 51333,
572
+ "<|19.40|>": 51334,
573
+ "<|19.42|>": 51335,
574
+ "<|19.44|>": 51336,
575
+ "<|19.46|>": 51337,
576
+ "<|19.48|>": 51338,
577
+ "<|19.50|>": 51339,
578
+ "<|19.52|>": 51340,
579
+ "<|19.54|>": 51341,
580
+ "<|19.56|>": 51342,
581
+ "<|19.58|>": 51343,
582
+ "<|19.60|>": 51344,
583
+ "<|19.62|>": 51345,
584
+ "<|19.64|>": 51346,
585
+ "<|19.66|>": 51347,
586
+ "<|19.68|>": 51348,
587
+ "<|19.70|>": 51349,
588
+ "<|19.72|>": 51350,
589
+ "<|19.74|>": 51351,
590
+ "<|19.76|>": 51352,
591
+ "<|19.78|>": 51353,
592
+ "<|19.80|>": 51354,
593
+ "<|19.82|>": 51355,
594
+ "<|19.84|>": 51356,
595
+ "<|19.86|>": 51357,
596
+ "<|19.88|>": 51358,
597
+ "<|19.90|>": 51359,
598
+ "<|19.92|>": 51360,
599
+ "<|19.94|>": 51361,
600
+ "<|19.96|>": 51362,
601
+ "<|19.98|>": 51363,
602
+ "<|2.00|>": 50464,
603
+ "<|2.02|>": 50465,
604
+ "<|2.04|>": 50466,
605
+ "<|2.06|>": 50467,
606
+ "<|2.08|>": 50468,
607
+ "<|2.10|>": 50469,
608
+ "<|2.12|>": 50470,
609
+ "<|2.14|>": 50471,
610
+ "<|2.16|>": 50472,
611
+ "<|2.18|>": 50473,
612
+ "<|2.20|>": 50474,
613
+ "<|2.22|>": 50475,
614
+ "<|2.24|>": 50476,
615
+ "<|2.26|>": 50477,
616
+ "<|2.28|>": 50478,
617
+ "<|2.30|>": 50479,
618
+ "<|2.32|>": 50480,
619
+ "<|2.34|>": 50481,
620
+ "<|2.36|>": 50482,
621
+ "<|2.38|>": 50483,
622
+ "<|2.40|>": 50484,
623
+ "<|2.42|>": 50485,
624
+ "<|2.44|>": 50486,
625
+ "<|2.46|>": 50487,
626
+ "<|2.48|>": 50488,
627
+ "<|2.50|>": 50489,
628
+ "<|2.52|>": 50490,
629
+ "<|2.54|>": 50491,
630
+ "<|2.56|>": 50492,
631
+ "<|2.58|>": 50493,
632
+ "<|2.60|>": 50494,
633
+ "<|2.62|>": 50495,
634
+ "<|2.64|>": 50496,
635
+ "<|2.66|>": 50497,
636
+ "<|2.68|>": 50498,
637
+ "<|2.70|>": 50499,
638
+ "<|2.72|>": 50500,
639
+ "<|2.74|>": 50501,
640
+ "<|2.76|>": 50502,
641
+ "<|2.78|>": 50503,
642
+ "<|2.80|>": 50504,
643
+ "<|2.82|>": 50505,
644
+ "<|2.84|>": 50506,
645
+ "<|2.86|>": 50507,
646
+ "<|2.88|>": 50508,
647
+ "<|2.90|>": 50509,
648
+ "<|2.92|>": 50510,
649
+ "<|2.94|>": 50511,
650
+ "<|2.96|>": 50512,
651
+ "<|2.98|>": 50513,
652
+ "<|20.00|>": 51364,
653
+ "<|20.02|>": 51365,
654
+ "<|20.04|>": 51366,
655
+ "<|20.06|>": 51367,
656
+ "<|20.08|>": 51368,
657
+ "<|20.10|>": 51369,
658
+ "<|20.12|>": 51370,
659
+ "<|20.14|>": 51371,
660
+ "<|20.16|>": 51372,
661
+ "<|20.18|>": 51373,
662
+ "<|20.20|>": 51374,
663
+ "<|20.22|>": 51375,
664
+ "<|20.24|>": 51376,
665
+ "<|20.26|>": 51377,
666
+ "<|20.28|>": 51378,
667
+ "<|20.30|>": 51379,
668
+ "<|20.32|>": 51380,
669
+ "<|20.34|>": 51381,
670
+ "<|20.36|>": 51382,
671
+ "<|20.38|>": 51383,
672
+ "<|20.40|>": 51384,
673
+ "<|20.42|>": 51385,
674
+ "<|20.44|>": 51386,
675
+ "<|20.46|>": 51387,
676
+ "<|20.48|>": 51388,
677
+ "<|20.50|>": 51389,
678
+ "<|20.52|>": 51390,
679
+ "<|20.54|>": 51391,
680
+ "<|20.56|>": 51392,
681
+ "<|20.58|>": 51393,
682
+ "<|20.60|>": 51394,
683
+ "<|20.62|>": 51395,
684
+ "<|20.64|>": 51396,
685
+ "<|20.66|>": 51397,
686
+ "<|20.68|>": 51398,
687
+ "<|20.70|>": 51399,
688
+ "<|20.72|>": 51400,
689
+ "<|20.74|>": 51401,
690
+ "<|20.76|>": 51402,
691
+ "<|20.78|>": 51403,
692
+ "<|20.80|>": 51404,
693
+ "<|20.82|>": 51405,
694
+ "<|20.84|>": 51406,
695
+ "<|20.86|>": 51407,
696
+ "<|20.88|>": 51408,
697
+ "<|20.90|>": 51409,
698
+ "<|20.92|>": 51410,
699
+ "<|20.94|>": 51411,
700
+ "<|20.96|>": 51412,
701
+ "<|20.98|>": 51413,
702
+ "<|21.00|>": 51414,
703
+ "<|21.02|>": 51415,
704
+ "<|21.04|>": 51416,
705
+ "<|21.06|>": 51417,
706
+ "<|21.08|>": 51418,
707
+ "<|21.10|>": 51419,
708
+ "<|21.12|>": 51420,
709
+ "<|21.14|>": 51421,
710
+ "<|21.16|>": 51422,
711
+ "<|21.18|>": 51423,
712
+ "<|21.20|>": 51424,
713
+ "<|21.22|>": 51425,
714
+ "<|21.24|>": 51426,
715
+ "<|21.26|>": 51427,
716
+ "<|21.28|>": 51428,
717
+ "<|21.30|>": 51429,
718
+ "<|21.32|>": 51430,
719
+ "<|21.34|>": 51431,
720
+ "<|21.36|>": 51432,
721
+ "<|21.38|>": 51433,
722
+ "<|21.40|>": 51434,
723
+ "<|21.42|>": 51435,
724
+ "<|21.44|>": 51436,
725
+ "<|21.46|>": 51437,
726
+ "<|21.48|>": 51438,
727
+ "<|21.50|>": 51439,
728
+ "<|21.52|>": 51440,
729
+ "<|21.54|>": 51441,
730
+ "<|21.56|>": 51442,
731
+ "<|21.58|>": 51443,
732
+ "<|21.60|>": 51444,
733
+ "<|21.62|>": 51445,
734
+ "<|21.64|>": 51446,
735
+ "<|21.66|>": 51447,
736
+ "<|21.68|>": 51448,
737
+ "<|21.70|>": 51449,
738
+ "<|21.72|>": 51450,
739
+ "<|21.74|>": 51451,
740
+ "<|21.76|>": 51452,
741
+ "<|21.78|>": 51453,
742
+ "<|21.80|>": 51454,
743
+ "<|21.82|>": 51455,
744
+ "<|21.84|>": 51456,
745
+ "<|21.86|>": 51457,
746
+ "<|21.88|>": 51458,
747
+ "<|21.90|>": 51459,
748
+ "<|21.92|>": 51460,
749
+ "<|21.94|>": 51461,
750
+ "<|21.96|>": 51462,
751
+ "<|21.98|>": 51463,
752
+ "<|22.00|>": 51464,
753
+ "<|22.02|>": 51465,
754
+ "<|22.04|>": 51466,
755
+ "<|22.06|>": 51467,
756
+ "<|22.08|>": 51468,
757
+ "<|22.10|>": 51469,
758
+ "<|22.12|>": 51470,
759
+ "<|22.14|>": 51471,
760
+ "<|22.16|>": 51472,
761
+ "<|22.18|>": 51473,
762
+ "<|22.20|>": 51474,
763
+ "<|22.22|>": 51475,
764
+ "<|22.24|>": 51476,
765
+ "<|22.26|>": 51477,
766
+ "<|22.28|>": 51478,
767
+ "<|22.30|>": 51479,
768
+ "<|22.32|>": 51480,
769
+ "<|22.34|>": 51481,
770
+ "<|22.36|>": 51482,
771
+ "<|22.38|>": 51483,
772
+ "<|22.40|>": 51484,
773
+ "<|22.42|>": 51485,
774
+ "<|22.44|>": 51486,
775
+ "<|22.46|>": 51487,
776
+ "<|22.48|>": 51488,
777
+ "<|22.50|>": 51489,
778
+ "<|22.52|>": 51490,
779
+ "<|22.54|>": 51491,
780
+ "<|22.56|>": 51492,
781
+ "<|22.58|>": 51493,
782
+ "<|22.60|>": 51494,
783
+ "<|22.62|>": 51495,
784
+ "<|22.64|>": 51496,
785
+ "<|22.66|>": 51497,
786
+ "<|22.68|>": 51498,
787
+ "<|22.70|>": 51499,
788
+ "<|22.72|>": 51500,
789
+ "<|22.74|>": 51501,
790
+ "<|22.76|>": 51502,
791
+ "<|22.78|>": 51503,
792
+ "<|22.80|>": 51504,
793
+ "<|22.82|>": 51505,
794
+ "<|22.84|>": 51506,
795
+ "<|22.86|>": 51507,
796
+ "<|22.88|>": 51508,
797
+ "<|22.90|>": 51509,
798
+ "<|22.92|>": 51510,
799
+ "<|22.94|>": 51511,
800
+ "<|22.96|>": 51512,
801
+ "<|22.98|>": 51513,
802
+ "<|23.00|>": 51514,
803
+ "<|23.02|>": 51515,
804
+ "<|23.04|>": 51516,
805
+ "<|23.06|>": 51517,
806
+ "<|23.08|>": 51518,
807
+ "<|23.10|>": 51519,
808
+ "<|23.12|>": 51520,
809
+ "<|23.14|>": 51521,
810
+ "<|23.16|>": 51522,
811
+ "<|23.18|>": 51523,
812
+ "<|23.20|>": 51524,
813
+ "<|23.22|>": 51525,
814
+ "<|23.24|>": 51526,
815
+ "<|23.26|>": 51527,
816
+ "<|23.28|>": 51528,
817
+ "<|23.30|>": 51529,
818
+ "<|23.32|>": 51530,
819
+ "<|23.34|>": 51531,
820
+ "<|23.36|>": 51532,
821
+ "<|23.38|>": 51533,
822
+ "<|23.40|>": 51534,
823
+ "<|23.42|>": 51535,
824
+ "<|23.44|>": 51536,
825
+ "<|23.46|>": 51537,
826
+ "<|23.48|>": 51538,
827
+ "<|23.50|>": 51539,
828
+ "<|23.52|>": 51540,
829
+ "<|23.54|>": 51541,
830
+ "<|23.56|>": 51542,
831
+ "<|23.58|>": 51543,
832
+ "<|23.60|>": 51544,
833
+ "<|23.62|>": 51545,
834
+ "<|23.64|>": 51546,
835
+ "<|23.66|>": 51547,
836
+ "<|23.68|>": 51548,
837
+ "<|23.70|>": 51549,
838
+ "<|23.72|>": 51550,
839
+ "<|23.74|>": 51551,
840
+ "<|23.76|>": 51552,
841
+ "<|23.78|>": 51553,
842
+ "<|23.80|>": 51554,
843
+ "<|23.82|>": 51555,
844
+ "<|23.84|>": 51556,
845
+ "<|23.86|>": 51557,
846
+ "<|23.88|>": 51558,
847
+ "<|23.90|>": 51559,
848
+ "<|23.92|>": 51560,
849
+ "<|23.94|>": 51561,
850
+ "<|23.96|>": 51562,
851
+ "<|23.98|>": 51563,
852
+ "<|24.00|>": 51564,
853
+ "<|24.02|>": 51565,
854
+ "<|24.04|>": 51566,
855
+ "<|24.06|>": 51567,
856
+ "<|24.08|>": 51568,
857
+ "<|24.10|>": 51569,
858
+ "<|24.12|>": 51570,
859
+ "<|24.14|>": 51571,
860
+ "<|24.16|>": 51572,
861
+ "<|24.18|>": 51573,
862
+ "<|24.20|>": 51574,
863
+ "<|24.22|>": 51575,
864
+ "<|24.24|>": 51576,
865
+ "<|24.26|>": 51577,
866
+ "<|24.28|>": 51578,
867
+ "<|24.30|>": 51579,
868
+ "<|24.32|>": 51580,
869
+ "<|24.34|>": 51581,
870
+ "<|24.36|>": 51582,
871
+ "<|24.38|>": 51583,
872
+ "<|24.40|>": 51584,
873
+ "<|24.42|>": 51585,
874
+ "<|24.44|>": 51586,
875
+ "<|24.46|>": 51587,
876
+ "<|24.48|>": 51588,
877
+ "<|24.50|>": 51589,
878
+ "<|24.52|>": 51590,
879
+ "<|24.54|>": 51591,
880
+ "<|24.56|>": 51592,
881
+ "<|24.58|>": 51593,
882
+ "<|24.60|>": 51594,
883
+ "<|24.62|>": 51595,
884
+ "<|24.64|>": 51596,
885
+ "<|24.66|>": 51597,
886
+ "<|24.68|>": 51598,
887
+ "<|24.70|>": 51599,
888
+ "<|24.72|>": 51600,
889
+ "<|24.74|>": 51601,
890
+ "<|24.76|>": 51602,
891
+ "<|24.78|>": 51603,
892
+ "<|24.80|>": 51604,
893
+ "<|24.82|>": 51605,
894
+ "<|24.84|>": 51606,
895
+ "<|24.86|>": 51607,
896
+ "<|24.88|>": 51608,
897
+ "<|24.90|>": 51609,
898
+ "<|24.92|>": 51610,
899
+ "<|24.94|>": 51611,
900
+ "<|24.96|>": 51612,
901
+ "<|24.98|>": 51613,
902
+ "<|25.00|>": 51614,
903
+ "<|25.02|>": 51615,
904
+ "<|25.04|>": 51616,
905
+ "<|25.06|>": 51617,
906
+ "<|25.08|>": 51618,
907
+ "<|25.10|>": 51619,
908
+ "<|25.12|>": 51620,
909
+ "<|25.14|>": 51621,
910
+ "<|25.16|>": 51622,
911
+ "<|25.18|>": 51623,
912
+ "<|25.20|>": 51624,
913
+ "<|25.22|>": 51625,
914
+ "<|25.24|>": 51626,
915
+ "<|25.26|>": 51627,
916
+ "<|25.28|>": 51628,
917
+ "<|25.30|>": 51629,
918
+ "<|25.32|>": 51630,
919
+ "<|25.34|>": 51631,
920
+ "<|25.36|>": 51632,
921
+ "<|25.38|>": 51633,
922
+ "<|25.40|>": 51634,
923
+ "<|25.42|>": 51635,
924
+ "<|25.44|>": 51636,
925
+ "<|25.46|>": 51637,
926
+ "<|25.48|>": 51638,
927
+ "<|25.50|>": 51639,
928
+ "<|25.52|>": 51640,
929
+ "<|25.54|>": 51641,
930
+ "<|25.56|>": 51642,
931
+ "<|25.58|>": 51643,
932
+ "<|25.60|>": 51644,
933
+ "<|25.62|>": 51645,
934
+ "<|25.64|>": 51646,
935
+ "<|25.66|>": 51647,
936
+ "<|25.68|>": 51648,
937
+ "<|25.70|>": 51649,
938
+ "<|25.72|>": 51650,
939
+ "<|25.74|>": 51651,
940
+ "<|25.76|>": 51652,
941
+ "<|25.78|>": 51653,
942
+ "<|25.80|>": 51654,
943
+ "<|25.82|>": 51655,
944
+ "<|25.84|>": 51656,
945
+ "<|25.86|>": 51657,
946
+ "<|25.88|>": 51658,
947
+ "<|25.90|>": 51659,
948
+ "<|25.92|>": 51660,
949
+ "<|25.94|>": 51661,
950
+ "<|25.96|>": 51662,
951
+ "<|25.98|>": 51663,
952
+ "<|26.00|>": 51664,
953
+ "<|26.02|>": 51665,
954
+ "<|26.04|>": 51666,
955
+ "<|26.06|>": 51667,
956
+ "<|26.08|>": 51668,
957
+ "<|26.10|>": 51669,
958
+ "<|26.12|>": 51670,
959
+ "<|26.14|>": 51671,
960
+ "<|26.16|>": 51672,
961
+ "<|26.18|>": 51673,
962
+ "<|26.20|>": 51674,
963
+ "<|26.22|>": 51675,
964
+ "<|26.24|>": 51676,
965
+ "<|26.26|>": 51677,
966
+ "<|26.28|>": 51678,
967
+ "<|26.30|>": 51679,
968
+ "<|26.32|>": 51680,
969
+ "<|26.34|>": 51681,
970
+ "<|26.36|>": 51682,
971
+ "<|26.38|>": 51683,
972
+ "<|26.40|>": 51684,
973
+ "<|26.42|>": 51685,
974
+ "<|26.44|>": 51686,
975
+ "<|26.46|>": 51687,
976
+ "<|26.48|>": 51688,
977
+ "<|26.50|>": 51689,
978
+ "<|26.52|>": 51690,
979
+ "<|26.54|>": 51691,
980
+ "<|26.56|>": 51692,
981
+ "<|26.58|>": 51693,
982
+ "<|26.60|>": 51694,
983
+ "<|26.62|>": 51695,
984
+ "<|26.64|>": 51696,
985
+ "<|26.66|>": 51697,
986
+ "<|26.68|>": 51698,
987
+ "<|26.70|>": 51699,
988
+ "<|26.72|>": 51700,
989
+ "<|26.74|>": 51701,
990
+ "<|26.76|>": 51702,
991
+ "<|26.78|>": 51703,
992
+ "<|26.80|>": 51704,
993
+ "<|26.82|>": 51705,
994
+ "<|26.84|>": 51706,
995
+ "<|26.86|>": 51707,
996
+ "<|26.88|>": 51708,
997
+ "<|26.90|>": 51709,
998
+ "<|26.92|>": 51710,
999
+ "<|26.94|>": 51711,
1000
+ "<|26.96|>": 51712,
1001
+ "<|26.98|>": 51713,
1002
+ "<|27.00|>": 51714,
1003
+ "<|27.02|>": 51715,
1004
+ "<|27.04|>": 51716,
1005
+ "<|27.06|>": 51717,
1006
+ "<|27.08|>": 51718,
1007
+ "<|27.10|>": 51719,
1008
+ "<|27.12|>": 51720,
1009
+ "<|27.14|>": 51721,
1010
+ "<|27.16|>": 51722,
1011
+ "<|27.18|>": 51723,
1012
+ "<|27.20|>": 51724,
1013
+ "<|27.22|>": 51725,
1014
+ "<|27.24|>": 51726,
1015
+ "<|27.26|>": 51727,
1016
+ "<|27.28|>": 51728,
1017
+ "<|27.30|>": 51729,
1018
+ "<|27.32|>": 51730,
1019
+ "<|27.34|>": 51731,
1020
+ "<|27.36|>": 51732,
1021
+ "<|27.38|>": 51733,
1022
+ "<|27.40|>": 51734,
1023
+ "<|27.42|>": 51735,
1024
+ "<|27.44|>": 51736,
1025
+ "<|27.46|>": 51737,
1026
+ "<|27.48|>": 51738,
1027
+ "<|27.50|>": 51739,
1028
+ "<|27.52|>": 51740,
1029
+ "<|27.54|>": 51741,
1030
+ "<|27.56|>": 51742,
1031
+ "<|27.58|>": 51743,
1032
+ "<|27.60|>": 51744,
1033
+ "<|27.62|>": 51745,
1034
+ "<|27.64|>": 51746,
1035
+ "<|27.66|>": 51747,
1036
+ "<|27.68|>": 51748,
1037
+ "<|27.70|>": 51749,
1038
+ "<|27.72|>": 51750,
1039
+ "<|27.74|>": 51751,
1040
+ "<|27.76|>": 51752,
1041
+ "<|27.78|>": 51753,
1042
+ "<|27.80|>": 51754,
1043
+ "<|27.82|>": 51755,
1044
+ "<|27.84|>": 51756,
1045
+ "<|27.86|>": 51757,
1046
+ "<|27.88|>": 51758,
1047
+ "<|27.90|>": 51759,
1048
+ "<|27.92|>": 51760,
1049
+ "<|27.94|>": 51761,
1050
+ "<|27.96|>": 51762,
1051
+ "<|27.98|>": 51763,
1052
+ "<|28.00|>": 51764,
1053
+ "<|28.02|>": 51765,
1054
+ "<|28.04|>": 51766,
1055
+ "<|28.06|>": 51767,
1056
+ "<|28.08|>": 51768,
1057
+ "<|28.10|>": 51769,
1058
+ "<|28.12|>": 51770,
1059
+ "<|28.14|>": 51771,
1060
+ "<|28.16|>": 51772,
1061
+ "<|28.18|>": 51773,
1062
+ "<|28.20|>": 51774,
1063
+ "<|28.22|>": 51775,
1064
+ "<|28.24|>": 51776,
1065
+ "<|28.26|>": 51777,
1066
+ "<|28.28|>": 51778,
1067
+ "<|28.30|>": 51779,
1068
+ "<|28.32|>": 51780,
1069
+ "<|28.34|>": 51781,
1070
+ "<|28.36|>": 51782,
1071
+ "<|28.38|>": 51783,
1072
+ "<|28.40|>": 51784,
1073
+ "<|28.42|>": 51785,
1074
+ "<|28.44|>": 51786,
1075
+ "<|28.46|>": 51787,
1076
+ "<|28.48|>": 51788,
1077
+ "<|28.50|>": 51789,
1078
+ "<|28.52|>": 51790,
1079
+ "<|28.54|>": 51791,
1080
+ "<|28.56|>": 51792,
1081
+ "<|28.58|>": 51793,
1082
+ "<|28.60|>": 51794,
1083
+ "<|28.62|>": 51795,
1084
+ "<|28.64|>": 51796,
1085
+ "<|28.66|>": 51797,
1086
+ "<|28.68|>": 51798,
1087
+ "<|28.70|>": 51799,
1088
+ "<|28.72|>": 51800,
1089
+ "<|28.74|>": 51801,
1090
+ "<|28.76|>": 51802,
1091
+ "<|28.78|>": 51803,
1092
+ "<|28.80|>": 51804,
1093
+ "<|28.82|>": 51805,
1094
+ "<|28.84|>": 51806,
1095
+ "<|28.86|>": 51807,
1096
+ "<|28.88|>": 51808,
1097
+ "<|28.90|>": 51809,
1098
+ "<|28.92|>": 51810,
1099
+ "<|28.94|>": 51811,
1100
+ "<|28.96|>": 51812,
1101
+ "<|28.98|>": 51813,
1102
+ "<|29.00|>": 51814,
1103
+ "<|29.02|>": 51815,
1104
+ "<|29.04|>": 51816,
1105
+ "<|29.06|>": 51817,
1106
+ "<|29.08|>": 51818,
1107
+ "<|29.10|>": 51819,
1108
+ "<|29.12|>": 51820,
1109
+ "<|29.14|>": 51821,
1110
+ "<|29.16|>": 51822,
1111
+ "<|29.18|>": 51823,
1112
+ "<|29.20|>": 51824,
1113
+ "<|29.22|>": 51825,
1114
+ "<|29.24|>": 51826,
1115
+ "<|29.26|>": 51827,
1116
+ "<|29.28|>": 51828,
1117
+ "<|29.30|>": 51829,
1118
+ "<|29.32|>": 51830,
1119
+ "<|29.34|>": 51831,
1120
+ "<|29.36|>": 51832,
1121
+ "<|29.38|>": 51833,
1122
+ "<|29.40|>": 51834,
1123
+ "<|29.42|>": 51835,
1124
+ "<|29.44|>": 51836,
1125
+ "<|29.46|>": 51837,
1126
+ "<|29.48|>": 51838,
1127
+ "<|29.50|>": 51839,
1128
+ "<|29.52|>": 51840,
1129
+ "<|29.54|>": 51841,
1130
+ "<|29.56|>": 51842,
1131
+ "<|29.58|>": 51843,
1132
+ "<|29.60|>": 51844,
1133
+ "<|29.62|>": 51845,
1134
+ "<|29.64|>": 51846,
1135
+ "<|29.66|>": 51847,
1136
+ "<|29.68|>": 51848,
1137
+ "<|29.70|>": 51849,
1138
+ "<|29.72|>": 51850,
1139
+ "<|29.74|>": 51851,
1140
+ "<|29.76|>": 51852,
1141
+ "<|29.78|>": 51853,
1142
+ "<|29.80|>": 51854,
1143
+ "<|29.82|>": 51855,
1144
+ "<|29.84|>": 51856,
1145
+ "<|29.86|>": 51857,
1146
+ "<|29.88|>": 51858,
1147
+ "<|29.90|>": 51859,
1148
+ "<|29.92|>": 51860,
1149
+ "<|29.94|>": 51861,
1150
+ "<|29.96|>": 51862,
1151
+ "<|29.98|>": 51863,
1152
+ "<|3.00|>": 50514,
1153
+ "<|3.02|>": 50515,
1154
+ "<|3.04|>": 50516,
1155
+ "<|3.06|>": 50517,
1156
+ "<|3.08|>": 50518,
1157
+ "<|3.10|>": 50519,
1158
+ "<|3.12|>": 50520,
1159
+ "<|3.14|>": 50521,
1160
+ "<|3.16|>": 50522,
1161
+ "<|3.18|>": 50523,
1162
+ "<|3.20|>": 50524,
1163
+ "<|3.22|>": 50525,
1164
+ "<|3.24|>": 50526,
1165
+ "<|3.26|>": 50527,
1166
+ "<|3.28|>": 50528,
1167
+ "<|3.30|>": 50529,
1168
+ "<|3.32|>": 50530,
1169
+ "<|3.34|>": 50531,
1170
+ "<|3.36|>": 50532,
1171
+ "<|3.38|>": 50533,
1172
+ "<|3.40|>": 50534,
1173
+ "<|3.42|>": 50535,
1174
+ "<|3.44|>": 50536,
1175
+ "<|3.46|>": 50537,
1176
+ "<|3.48|>": 50538,
1177
+ "<|3.50|>": 50539,
1178
+ "<|3.52|>": 50540,
1179
+ "<|3.54|>": 50541,
1180
+ "<|3.56|>": 50542,
1181
+ "<|3.58|>": 50543,
1182
+ "<|3.60|>": 50544,
1183
+ "<|3.62|>": 50545,
1184
+ "<|3.64|>": 50546,
1185
+ "<|3.66|>": 50547,
1186
+ "<|3.68|>": 50548,
1187
+ "<|3.70|>": 50549,
1188
+ "<|3.72|>": 50550,
1189
+ "<|3.74|>": 50551,
1190
+ "<|3.76|>": 50552,
1191
+ "<|3.78|>": 50553,
1192
+ "<|3.80|>": 50554,
1193
+ "<|3.82|>": 50555,
1194
+ "<|3.84|>": 50556,
1195
+ "<|3.86|>": 50557,
1196
+ "<|3.88|>": 50558,
1197
+ "<|3.90|>": 50559,
1198
+ "<|3.92|>": 50560,
1199
+ "<|3.94|>": 50561,
1200
+ "<|3.96|>": 50562,
1201
+ "<|3.98|>": 50563,
1202
+ "<|30.00|>": 51864,
1203
+ "<|4.00|>": 50564,
1204
+ "<|4.02|>": 50565,
1205
+ "<|4.04|>": 50566,
1206
+ "<|4.06|>": 50567,
1207
+ "<|4.08|>": 50568,
1208
+ "<|4.10|>": 50569,
1209
+ "<|4.12|>": 50570,
1210
+ "<|4.14|>": 50571,
1211
+ "<|4.16|>": 50572,
1212
+ "<|4.18|>": 50573,
1213
+ "<|4.20|>": 50574,
1214
+ "<|4.22|>": 50575,
1215
+ "<|4.24|>": 50576,
1216
+ "<|4.26|>": 50577,
1217
+ "<|4.28|>": 50578,
1218
+ "<|4.30|>": 50579,
1219
+ "<|4.32|>": 50580,
1220
+ "<|4.34|>": 50581,
1221
+ "<|4.36|>": 50582,
1222
+ "<|4.38|>": 50583,
1223
+ "<|4.40|>": 50584,
1224
+ "<|4.42|>": 50585,
1225
+ "<|4.44|>": 50586,
1226
+ "<|4.46|>": 50587,
1227
+ "<|4.48|>": 50588,
1228
+ "<|4.50|>": 50589,
1229
+ "<|4.52|>": 50590,
1230
+ "<|4.54|>": 50591,
1231
+ "<|4.56|>": 50592,
1232
+ "<|4.58|>": 50593,
1233
+ "<|4.60|>": 50594,
1234
+ "<|4.62|>": 50595,
1235
+ "<|4.64|>": 50596,
1236
+ "<|4.66|>": 50597,
1237
+ "<|4.68|>": 50598,
1238
+ "<|4.70|>": 50599,
1239
+ "<|4.72|>": 50600,
1240
+ "<|4.74|>": 50601,
1241
+ "<|4.76|>": 50602,
1242
+ "<|4.78|>": 50603,
1243
+ "<|4.80|>": 50604,
1244
+ "<|4.82|>": 50605,
1245
+ "<|4.84|>": 50606,
1246
+ "<|4.86|>": 50607,
1247
+ "<|4.88|>": 50608,
1248
+ "<|4.90|>": 50609,
1249
+ "<|4.92|>": 50610,
1250
+ "<|4.94|>": 50611,
1251
+ "<|4.96|>": 50612,
1252
+ "<|4.98|>": 50613,
1253
+ "<|5.00|>": 50614,
1254
+ "<|5.02|>": 50615,
1255
+ "<|5.04|>": 50616,
1256
+ "<|5.06|>": 50617,
1257
+ "<|5.08|>": 50618,
1258
+ "<|5.10|>": 50619,
1259
+ "<|5.12|>": 50620,
1260
+ "<|5.14|>": 50621,
1261
+ "<|5.16|>": 50622,
1262
+ "<|5.18|>": 50623,
1263
+ "<|5.20|>": 50624,
1264
+ "<|5.22|>": 50625,
1265
+ "<|5.24|>": 50626,
1266
+ "<|5.26|>": 50627,
1267
+ "<|5.28|>": 50628,
1268
+ "<|5.30|>": 50629,
1269
+ "<|5.32|>": 50630,
1270
+ "<|5.34|>": 50631,
1271
+ "<|5.36|>": 50632,
1272
+ "<|5.38|>": 50633,
1273
+ "<|5.40|>": 50634,
1274
+ "<|5.42|>": 50635,
1275
+ "<|5.44|>": 50636,
1276
+ "<|5.46|>": 50637,
1277
+ "<|5.48|>": 50638,
1278
+ "<|5.50|>": 50639,
1279
+ "<|5.52|>": 50640,
1280
+ "<|5.54|>": 50641,
1281
+ "<|5.56|>": 50642,
1282
+ "<|5.58|>": 50643,
1283
+ "<|5.60|>": 50644,
1284
+ "<|5.62|>": 50645,
1285
+ "<|5.64|>": 50646,
1286
+ "<|5.66|>": 50647,
1287
+ "<|5.68|>": 50648,
1288
+ "<|5.70|>": 50649,
1289
+ "<|5.72|>": 50650,
1290
+ "<|5.74|>": 50651,
1291
+ "<|5.76|>": 50652,
1292
+ "<|5.78|>": 50653,
1293
+ "<|5.80|>": 50654,
1294
+ "<|5.82|>": 50655,
1295
+ "<|5.84|>": 50656,
1296
+ "<|5.86|>": 50657,
1297
+ "<|5.88|>": 50658,
1298
+ "<|5.90|>": 50659,
1299
+ "<|5.92|>": 50660,
1300
+ "<|5.94|>": 50661,
1301
+ "<|5.96|>": 50662,
1302
+ "<|5.98|>": 50663,
1303
+ "<|6.00|>": 50664,
1304
+ "<|6.02|>": 50665,
1305
+ "<|6.04|>": 50666,
1306
+ "<|6.06|>": 50667,
1307
+ "<|6.08|>": 50668,
1308
+ "<|6.10|>": 50669,
1309
+ "<|6.12|>": 50670,
1310
+ "<|6.14|>": 50671,
1311
+ "<|6.16|>": 50672,
1312
+ "<|6.18|>": 50673,
1313
+ "<|6.20|>": 50674,
1314
+ "<|6.22|>": 50675,
1315
+ "<|6.24|>": 50676,
1316
+ "<|6.26|>": 50677,
1317
+ "<|6.28|>": 50678,
1318
+ "<|6.30|>": 50679,
1319
+ "<|6.32|>": 50680,
1320
+ "<|6.34|>": 50681,
1321
+ "<|6.36|>": 50682,
1322
+ "<|6.38|>": 50683,
1323
+ "<|6.40|>": 50684,
1324
+ "<|6.42|>": 50685,
1325
+ "<|6.44|>": 50686,
1326
+ "<|6.46|>": 50687,
1327
+ "<|6.48|>": 50688,
1328
+ "<|6.50|>": 50689,
1329
+ "<|6.52|>": 50690,
1330
+ "<|6.54|>": 50691,
1331
+ "<|6.56|>": 50692,
1332
+ "<|6.58|>": 50693,
1333
+ "<|6.60|>": 50694,
1334
+ "<|6.62|>": 50695,
1335
+ "<|6.64|>": 50696,
1336
+ "<|6.66|>": 50697,
1337
+ "<|6.68|>": 50698,
1338
+ "<|6.70|>": 50699,
1339
+ "<|6.72|>": 50700,
1340
+ "<|6.74|>": 50701,
1341
+ "<|6.76|>": 50702,
1342
+ "<|6.78|>": 50703,
1343
+ "<|6.80|>": 50704,
1344
+ "<|6.82|>": 50705,
1345
+ "<|6.84|>": 50706,
1346
+ "<|6.86|>": 50707,
1347
+ "<|6.88|>": 50708,
1348
+ "<|6.90|>": 50709,
1349
+ "<|6.92|>": 50710,
1350
+ "<|6.94|>": 50711,
1351
+ "<|6.96|>": 50712,
1352
+ "<|6.98|>": 50713,
1353
+ "<|7.00|>": 50714,
1354
+ "<|7.02|>": 50715,
1355
+ "<|7.04|>": 50716,
1356
+ "<|7.06|>": 50717,
1357
+ "<|7.08|>": 50718,
1358
+ "<|7.10|>": 50719,
1359
+ "<|7.12|>": 50720,
1360
+ "<|7.14|>": 50721,
1361
+ "<|7.16|>": 50722,
1362
+ "<|7.18|>": 50723,
1363
+ "<|7.20|>": 50724,
1364
+ "<|7.22|>": 50725,
1365
+ "<|7.24|>": 50726,
1366
+ "<|7.26|>": 50727,
1367
+ "<|7.28|>": 50728,
1368
+ "<|7.30|>": 50729,
1369
+ "<|7.32|>": 50730,
1370
+ "<|7.34|>": 50731,
1371
+ "<|7.36|>": 50732,
1372
+ "<|7.38|>": 50733,
1373
+ "<|7.40|>": 50734,
1374
+ "<|7.42|>": 50735,
1375
+ "<|7.44|>": 50736,
1376
+ "<|7.46|>": 50737,
1377
+ "<|7.48|>": 50738,
1378
+ "<|7.50|>": 50739,
1379
+ "<|7.52|>": 50740,
1380
+ "<|7.54|>": 50741,
1381
+ "<|7.56|>": 50742,
1382
+ "<|7.58|>": 50743,
1383
+ "<|7.60|>": 50744,
1384
+ "<|7.62|>": 50745,
1385
+ "<|7.64|>": 50746,
1386
+ "<|7.66|>": 50747,
1387
+ "<|7.68|>": 50748,
1388
+ "<|7.70|>": 50749,
1389
+ "<|7.72|>": 50750,
1390
+ "<|7.74|>": 50751,
1391
+ "<|7.76|>": 50752,
1392
+ "<|7.78|>": 50753,
1393
+ "<|7.80|>": 50754,
1394
+ "<|7.82|>": 50755,
1395
+ "<|7.84|>": 50756,
1396
+ "<|7.86|>": 50757,
1397
+ "<|7.88|>": 50758,
1398
+ "<|7.90|>": 50759,
1399
+ "<|7.92|>": 50760,
1400
+ "<|7.94|>": 50761,
1401
+ "<|7.96|>": 50762,
1402
+ "<|7.98|>": 50763,
1403
+ "<|8.00|>": 50764,
1404
+ "<|8.02|>": 50765,
1405
+ "<|8.04|>": 50766,
1406
+ "<|8.06|>": 50767,
1407
+ "<|8.08|>": 50768,
1408
+ "<|8.10|>": 50769,
1409
+ "<|8.12|>": 50770,
1410
+ "<|8.14|>": 50771,
1411
+ "<|8.16|>": 50772,
1412
+ "<|8.18|>": 50773,
1413
+ "<|8.20|>": 50774,
1414
+ "<|8.22|>": 50775,
1415
+ "<|8.24|>": 50776,
1416
+ "<|8.26|>": 50777,
1417
+ "<|8.28|>": 50778,
1418
+ "<|8.30|>": 50779,
1419
+ "<|8.32|>": 50780,
1420
+ "<|8.34|>": 50781,
1421
+ "<|8.36|>": 50782,
1422
+ "<|8.38|>": 50783,
1423
+ "<|8.40|>": 50784,
1424
+ "<|8.42|>": 50785,
1425
+ "<|8.44|>": 50786,
1426
+ "<|8.46|>": 50787,
1427
+ "<|8.48|>": 50788,
1428
+ "<|8.50|>": 50789,
1429
+ "<|8.52|>": 50790,
1430
+ "<|8.54|>": 50791,
1431
+ "<|8.56|>": 50792,
1432
+ "<|8.58|>": 50793,
1433
+ "<|8.60|>": 50794,
1434
+ "<|8.62|>": 50795,
1435
+ "<|8.64|>": 50796,
1436
+ "<|8.66|>": 50797,
1437
+ "<|8.68|>": 50798,
1438
+ "<|8.70|>": 50799,
1439
+ "<|8.72|>": 50800,
1440
+ "<|8.74|>": 50801,
1441
+ "<|8.76|>": 50802,
1442
+ "<|8.78|>": 50803,
1443
+ "<|8.80|>": 50804,
1444
+ "<|8.82|>": 50805,
1445
+ "<|8.84|>": 50806,
1446
+ "<|8.86|>": 50807,
1447
+ "<|8.88|>": 50808,
1448
+ "<|8.90|>": 50809,
1449
+ "<|8.92|>": 50810,
1450
+ "<|8.94|>": 50811,
1451
+ "<|8.96|>": 50812,
1452
+ "<|8.98|>": 50813,
1453
+ "<|9.00|>": 50814,
1454
+ "<|9.02|>": 50815,
1455
+ "<|9.04|>": 50816,
1456
+ "<|9.06|>": 50817,
1457
+ "<|9.08|>": 50818,
1458
+ "<|9.10|>": 50819,
1459
+ "<|9.12|>": 50820,
1460
+ "<|9.14|>": 50821,
1461
+ "<|9.16|>": 50822,
1462
+ "<|9.18|>": 50823,
1463
+ "<|9.20|>": 50824,
1464
+ "<|9.22|>": 50825,
1465
+ "<|9.24|>": 50826,
1466
+ "<|9.26|>": 50827,
1467
+ "<|9.28|>": 50828,
1468
+ "<|9.30|>": 50829,
1469
+ "<|9.32|>": 50830,
1470
+ "<|9.34|>": 50831,
1471
+ "<|9.36|>": 50832,
1472
+ "<|9.38|>": 50833,
1473
+ "<|9.40|>": 50834,
1474
+ "<|9.42|>": 50835,
1475
+ "<|9.44|>": 50836,
1476
+ "<|9.46|>": 50837,
1477
+ "<|9.48|>": 50838,
1478
+ "<|9.50|>": 50839,
1479
+ "<|9.52|>": 50840,
1480
+ "<|9.54|>": 50841,
1481
+ "<|9.56|>": 50842,
1482
+ "<|9.58|>": 50843,
1483
+ "<|9.60|>": 50844,
1484
+ "<|9.62|>": 50845,
1485
+ "<|9.64|>": 50846,
1486
+ "<|9.66|>": 50847,
1487
+ "<|9.68|>": 50848,
1488
+ "<|9.70|>": 50849,
1489
+ "<|9.72|>": 50850,
1490
+ "<|9.74|>": 50851,
1491
+ "<|9.76|>": 50852,
1492
+ "<|9.78|>": 50853,
1493
+ "<|9.80|>": 50854,
1494
+ "<|9.82|>": 50855,
1495
+ "<|9.84|>": 50856,
1496
+ "<|9.86|>": 50857,
1497
+ "<|9.88|>": 50858,
1498
+ "<|9.90|>": 50859,
1499
+ "<|9.92|>": 50860,
1500
+ "<|9.94|>": 50861,
1501
+ "<|9.96|>": 50862,
1502
+ "<|9.98|>": 50863,
1503
+ "<|af|>": 50327,
1504
+ "<|am|>": 50334,
1505
+ "<|ar|>": 50272,
1506
+ "<|as|>": 50350,
1507
+ "<|az|>": 50304,
1508
+ "<|ba|>": 50355,
1509
+ "<|be|>": 50330,
1510
+ "<|bg|>": 50292,
1511
+ "<|bn|>": 50302,
1512
+ "<|bo|>": 50347,
1513
+ "<|br|>": 50309,
1514
+ "<|bs|>": 50315,
1515
+ "<|ca|>": 50270,
1516
+ "<|cs|>": 50283,
1517
+ "<|cy|>": 50297,
1518
+ "<|da|>": 50285,
1519
+ "<|de|>": 50261,
1520
+ "<|el|>": 50281,
1521
+ "<|en|>": 50259,
1522
+ "<|es|>": 50262,
1523
+ "<|et|>": 50307,
1524
+ "<|eu|>": 50310,
1525
+ "<|fa|>": 50300,
1526
+ "<|fi|>": 50277,
1527
+ "<|fo|>": 50338,
1528
+ "<|fr|>": 50265,
1529
+ "<|gl|>": 50319,
1530
+ "<|gu|>": 50333,
1531
+ "<|haw|>": 50352,
1532
+ "<|ha|>": 50354,
1533
+ "<|he|>": 50279,
1534
+ "<|hi|>": 50276,
1535
+ "<|hr|>": 50291,
1536
+ "<|ht|>": 50339,
1537
+ "<|hu|>": 50286,
1538
+ "<|hy|>": 50312,
1539
+ "<|id|>": 50275,
1540
+ "<|is|>": 50311,
1541
+ "<|it|>": 50274,
1542
+ "<|ja|>": 50266,
1543
+ "<|jw|>": 50356,
1544
+ "<|ka|>": 50329,
1545
+ "<|kk|>": 50316,
1546
+ "<|km|>": 50323,
1547
+ "<|kn|>": 50306,
1548
+ "<|ko|>": 50264,
1549
+ "<|la|>": 50294,
1550
+ "<|lb|>": 50345,
1551
+ "<|ln|>": 50353,
1552
+ "<|lo|>": 50336,
1553
+ "<|lt|>": 50293,
1554
+ "<|lv|>": 50301,
1555
+ "<|mg|>": 50349,
1556
+ "<|mi|>": 50295,
1557
+ "<|mk|>": 50308,
1558
+ "<|ml|>": 50296,
1559
+ "<|mn|>": 50314,
1560
+ "<|mr|>": 50320,
1561
+ "<|ms|>": 50282,
1562
+ "<|mt|>": 50343,
1563
+ "<|my|>": 50346,
1564
+ "<|ne|>": 50313,
1565
+ "<|nl|>": 50271,
1566
+ "<|nn|>": 50342,
1567
+ "<|nocaptions|>": 50362,
1568
+ "<|notimestamps|>": 50363,
1569
+ "<|no|>": 50288,
1570
+ "<|oc|>": 50328,
1571
+ "<|pa|>": 50321,
1572
+ "<|pl|>": 50269,
1573
+ "<|ps|>": 50340,
1574
+ "<|pt|>": 50267,
1575
+ "<|ro|>": 50284,
1576
+ "<|ru|>": 50263,
1577
+ "<|sa|>": 50344,
1578
+ "<|sd|>": 50332,
1579
+ "<|si|>": 50322,
1580
+ "<|sk|>": 50298,
1581
+ "<|sl|>": 50305,
1582
+ "<|sn|>": 50324,
1583
+ "<|so|>": 50326,
1584
+ "<|sq|>": 50317,
1585
+ "<|sr|>": 50303,
1586
+ "<|startoflm|>": 50360,
1587
+ "<|startofprev|>": 50361,
1588
+ "<|startoftranscript|>": 50258,
1589
+ "<|su|>": 50357,
1590
+ "<|sv|>": 50273,
1591
+ "<|sw|>": 50318,
1592
+ "<|ta|>": 50287,
1593
+ "<|te|>": 50299,
1594
+ "<|tg|>": 50331,
1595
+ "<|th|>": 50289,
1596
+ "<|tk|>": 50341,
1597
+ "<|tl|>": 50348,
1598
+ "<|transcribe|>": 50359,
1599
+ "<|translate|>": 50358,
1600
+ "<|tr|>": 50268,
1601
+ "<|tt|>": 50351,
1602
+ "<|uk|>": 50280,
1603
+ "<|ur|>": 50290,
1604
+ "<|uz|>": 50337,
1605
+ "<|vi|>": 50278,
1606
+ "<|yi|>": 50335,
1607
+ "<|yo|>": 50325,
1608
+ "<|zh|>": 50260
1609
+ }
scripts/distil-whisper-lora-run5/adapter/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
scripts/distil-whisper-lora-run5/adapter/normalizer.json ADDED
@@ -0,0 +1,1742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "accessorise": "accessorize",
3
+ "accessorised": "accessorized",
4
+ "accessorises": "accessorizes",
5
+ "accessorising": "accessorizing",
6
+ "acclimatisation": "acclimatization",
7
+ "acclimatise": "acclimatize",
8
+ "acclimatised": "acclimatized",
9
+ "acclimatises": "acclimatizes",
10
+ "acclimatising": "acclimatizing",
11
+ "accoutrements": "accouterments",
12
+ "aeon": "eon",
13
+ "aeons": "eons",
14
+ "aerogramme": "aerogram",
15
+ "aerogrammes": "aerograms",
16
+ "aeroplane": "airplane",
17
+ "aeroplanes": "airplanes",
18
+ "aesthete": "esthete",
19
+ "aesthetes": "esthetes",
20
+ "aesthetic": "esthetic",
21
+ "aesthetically": "esthetically",
22
+ "aesthetics": "esthetics",
23
+ "aetiology": "etiology",
24
+ "ageing": "aging",
25
+ "aggrandisement": "aggrandizement",
26
+ "agonise": "agonize",
27
+ "agonised": "agonized",
28
+ "agonises": "agonizes",
29
+ "agonising": "agonizing",
30
+ "agonisingly": "agonizingly",
31
+ "almanack": "almanac",
32
+ "almanacks": "almanacs",
33
+ "aluminium": "aluminum",
34
+ "amortisable": "amortizable",
35
+ "amortisation": "amortization",
36
+ "amortisations": "amortizations",
37
+ "amortise": "amortize",
38
+ "amortised": "amortized",
39
+ "amortises": "amortizes",
40
+ "amortising": "amortizing",
41
+ "amphitheatre": "amphitheater",
42
+ "amphitheatres": "amphitheaters",
43
+ "anaemia": "anemia",
44
+ "anaemic": "anemic",
45
+ "anaesthesia": "anesthesia",
46
+ "anaesthetic": "anesthetic",
47
+ "anaesthetics": "anesthetics",
48
+ "anaesthetise": "anesthetize",
49
+ "anaesthetised": "anesthetized",
50
+ "anaesthetises": "anesthetizes",
51
+ "anaesthetising": "anesthetizing",
52
+ "anaesthetist": "anesthetist",
53
+ "anaesthetists": "anesthetists",
54
+ "anaesthetize": "anesthetize",
55
+ "anaesthetized": "anesthetized",
56
+ "anaesthetizes": "anesthetizes",
57
+ "anaesthetizing": "anesthetizing",
58
+ "analogue": "analog",
59
+ "analogues": "analogs",
60
+ "analyse": "analyze",
61
+ "analysed": "analyzed",
62
+ "analyses": "analyzes",
63
+ "analysing": "analyzing",
64
+ "anglicise": "anglicize",
65
+ "anglicised": "anglicized",
66
+ "anglicises": "anglicizes",
67
+ "anglicising": "anglicizing",
68
+ "annualised": "annualized",
69
+ "antagonise": "antagonize",
70
+ "antagonised": "antagonized",
71
+ "antagonises": "antagonizes",
72
+ "antagonising": "antagonizing",
73
+ "apologise": "apologize",
74
+ "apologised": "apologized",
75
+ "apologises": "apologizes",
76
+ "apologising": "apologizing",
77
+ "appal": "appall",
78
+ "appals": "appalls",
79
+ "appetiser": "appetizer",
80
+ "appetisers": "appetizers",
81
+ "appetising": "appetizing",
82
+ "appetisingly": "appetizingly",
83
+ "arbour": "arbor",
84
+ "arbours": "arbors",
85
+ "archaeologically": "archeologically",
86
+ "archaeologist": "archeologist",
87
+ "archaeologists": "archeologists",
88
+ "archaeology": "archeology",
89
+ "archeological": "archaeological",
90
+ "ardour": "ardor",
91
+ "armour": "armor",
92
+ "armoured": "armored",
93
+ "armourer": "armorer",
94
+ "armourers": "armorers",
95
+ "armouries": "armories",
96
+ "armoury": "armory",
97
+ "artefact": "artifact",
98
+ "artefacts": "artifacts",
99
+ "authorise": "authorize",
100
+ "authorised": "authorized",
101
+ "authorises": "authorizes",
102
+ "authorising": "authorizing",
103
+ "axe": "ax",
104
+ "backpedalled": "backpedaled",
105
+ "backpedalling": "backpedaling",
106
+ "bannister": "banister",
107
+ "bannisters": "banisters",
108
+ "baptise": "baptize",
109
+ "baptised": "baptized",
110
+ "baptises": "baptizes",
111
+ "baptising": "baptizing",
112
+ "bastardise": "bastardize",
113
+ "bastardised": "bastardized",
114
+ "bastardises": "bastardizes",
115
+ "bastardising": "bastardizing",
116
+ "battleax": "battleaxe",
117
+ "baulk": "balk",
118
+ "baulked": "balked",
119
+ "baulking": "balking",
120
+ "baulks": "balks",
121
+ "bedevilled": "bedeviled",
122
+ "bedevilling": "bedeviling",
123
+ "behaviour": "behavior",
124
+ "behavioural": "behavioral",
125
+ "behaviourism": "behaviorism",
126
+ "behaviourist": "behaviorist",
127
+ "behaviourists": "behaviorists",
128
+ "behaviours": "behaviors",
129
+ "behove": "behoove",
130
+ "behoved": "behooved",
131
+ "behoves": "behooves",
132
+ "bejewelled": "bejeweled",
133
+ "belabour": "belabor",
134
+ "belaboured": "belabored",
135
+ "belabouring": "belaboring",
136
+ "belabours": "belabors",
137
+ "bevelled": "beveled",
138
+ "bevvies": "bevies",
139
+ "bevvy": "bevy",
140
+ "biassed": "biased",
141
+ "biassing": "biasing",
142
+ "bingeing": "binging",
143
+ "bougainvillaea": "bougainvillea",
144
+ "bougainvillaeas": "bougainvilleas",
145
+ "bowdlerise": "bowdlerize",
146
+ "bowdlerised": "bowdlerized",
147
+ "bowdlerises": "bowdlerizes",
148
+ "bowdlerising": "bowdlerizing",
149
+ "breathalyse": "breathalyze",
150
+ "breathalysed": "breathalyzed",
151
+ "breathalyser": "breathalyzer",
152
+ "breathalysers": "breathalyzers",
153
+ "breathalyses": "breathalyzes",
154
+ "breathalysing": "breathalyzing",
155
+ "brutalise": "brutalize",
156
+ "brutalised": "brutalized",
157
+ "brutalises": "brutalizes",
158
+ "brutalising": "brutalizing",
159
+ "busses": "buses",
160
+ "bussing": "busing",
161
+ "caesarean": "cesarean",
162
+ "caesareans": "cesareans",
163
+ "calibre": "caliber",
164
+ "calibres": "calibers",
165
+ "calliper": "caliper",
166
+ "callipers": "calipers",
167
+ "callisthenics": "calisthenics",
168
+ "canalise": "canalize",
169
+ "canalised": "canalized",
170
+ "canalises": "canalizes",
171
+ "canalising": "canalizing",
172
+ "cancelation": "cancellation",
173
+ "cancelations": "cancellations",
174
+ "cancelled": "canceled",
175
+ "cancelling": "canceling",
176
+ "candour": "candor",
177
+ "cannibalise": "cannibalize",
178
+ "cannibalised": "cannibalized",
179
+ "cannibalises": "cannibalizes",
180
+ "cannibalising": "cannibalizing",
181
+ "canonise": "canonize",
182
+ "canonised": "canonized",
183
+ "canonises": "canonizes",
184
+ "canonising": "canonizing",
185
+ "capitalise": "capitalize",
186
+ "capitalised": "capitalized",
187
+ "capitalises": "capitalizes",
188
+ "capitalising": "capitalizing",
189
+ "caramelise": "caramelize",
190
+ "caramelised": "caramelized",
191
+ "caramelises": "caramelizes",
192
+ "caramelising": "caramelizing",
193
+ "carbonise": "carbonize",
194
+ "carbonised": "carbonized",
195
+ "carbonises": "carbonizes",
196
+ "carbonising": "carbonizing",
197
+ "carolled": "caroled",
198
+ "carolling": "caroling",
199
+ "catalogue": "catalog",
200
+ "catalogued": "cataloged",
201
+ "catalogues": "catalogs",
202
+ "cataloguing": "cataloging",
203
+ "catalyse": "catalyze",
204
+ "catalysed": "catalyzed",
205
+ "catalyses": "catalyzes",
206
+ "catalysing": "catalyzing",
207
+ "categorise": "categorize",
208
+ "categorised": "categorized",
209
+ "categorises": "categorizes",
210
+ "categorising": "categorizing",
211
+ "cauterise": "cauterize",
212
+ "cauterised": "cauterized",
213
+ "cauterises": "cauterizes",
214
+ "cauterising": "cauterizing",
215
+ "cavilled": "caviled",
216
+ "cavilling": "caviling",
217
+ "centigramme": "centigram",
218
+ "centigrammes": "centigrams",
219
+ "centilitre": "centiliter",
220
+ "centilitres": "centiliters",
221
+ "centimetre": "centimeter",
222
+ "centimetres": "centimeters",
223
+ "centralise": "centralize",
224
+ "centralised": "centralized",
225
+ "centralises": "centralizes",
226
+ "centralising": "centralizing",
227
+ "centre": "center",
228
+ "centred": "centered",
229
+ "centrefold": "centerfold",
230
+ "centrefolds": "centerfolds",
231
+ "centrepiece": "centerpiece",
232
+ "centrepieces": "centerpieces",
233
+ "centres": "centers",
234
+ "channelled": "channeled",
235
+ "channelling": "channeling",
236
+ "characterise": "characterize",
237
+ "characterised": "characterized",
238
+ "characterises": "characterizes",
239
+ "characterising": "characterizing",
240
+ "cheque": "check",
241
+ "chequebook": "checkbook",
242
+ "chequebooks": "checkbooks",
243
+ "chequered": "checkered",
244
+ "cheques": "checks",
245
+ "chilli": "chili",
246
+ "chimaera": "chimera",
247
+ "chimaeras": "chimeras",
248
+ "chiselled": "chiseled",
249
+ "chiselling": "chiseling",
250
+ "circularise": "circularize",
251
+ "circularised": "circularized",
252
+ "circularises": "circularizes",
253
+ "circularising": "circularizing",
254
+ "civilise": "civilize",
255
+ "civilised": "civilized",
256
+ "civilises": "civilizes",
257
+ "civilising": "civilizing",
258
+ "clamour": "clamor",
259
+ "clamoured": "clamored",
260
+ "clamouring": "clamoring",
261
+ "clamours": "clamors",
262
+ "clangour": "clangor",
263
+ "clarinettist": "clarinetist",
264
+ "clarinettists": "clarinetists",
265
+ "collectivise": "collectivize",
266
+ "collectivised": "collectivized",
267
+ "collectivises": "collectivizes",
268
+ "collectivising": "collectivizing",
269
+ "colonisation": "colonization",
270
+ "colonise": "colonize",
271
+ "colonised": "colonized",
272
+ "coloniser": "colonizer",
273
+ "colonisers": "colonizers",
274
+ "colonises": "colonizes",
275
+ "colonising": "colonizing",
276
+ "colour": "color",
277
+ "colourant": "colorant",
278
+ "colourants": "colorants",
279
+ "coloured": "colored",
280
+ "coloureds": "coloreds",
281
+ "colourful": "colorful",
282
+ "colourfully": "colorfully",
283
+ "colouring": "coloring",
284
+ "colourize": "colorize",
285
+ "colourized": "colorized",
286
+ "colourizes": "colorizes",
287
+ "colourizing": "colorizing",
288
+ "colourless": "colorless",
289
+ "colours": "colors",
290
+ "commercialise": "commercialize",
291
+ "commercialised": "commercialized",
292
+ "commercialises": "commercializes",
293
+ "commercialising": "commercializing",
294
+ "compartmentalise": "compartmentalize",
295
+ "compartmentalised": "compartmentalized",
296
+ "compartmentalises": "compartmentalizes",
297
+ "compartmentalising": "compartmentalizing",
298
+ "computerise": "computerize",
299
+ "computerised": "computerized",
300
+ "computerises": "computerizes",
301
+ "computerising": "computerizing",
302
+ "conceptualise": "conceptualize",
303
+ "conceptualised": "conceptualized",
304
+ "conceptualises": "conceptualizes",
305
+ "conceptualising": "conceptualizing",
306
+ "connexion": "connection",
307
+ "connexions": "connections",
308
+ "contextualise": "contextualize",
309
+ "contextualised": "contextualized",
310
+ "contextualises": "contextualizes",
311
+ "contextualising": "contextualizing",
312
+ "cosier": "cozier",
313
+ "cosies": "cozies",
314
+ "cosiest": "coziest",
315
+ "cosily": "cozily",
316
+ "cosiness": "coziness",
317
+ "cosy": "cozy",
318
+ "councillor": "councilor",
319
+ "councillors": "councilors",
320
+ "counselled": "counseled",
321
+ "counselling": "counseling",
322
+ "counsellor": "counselor",
323
+ "counsellors": "counselors",
324
+ "crenelated": "crenellated",
325
+ "criminalise": "criminalize",
326
+ "criminalised": "criminalized",
327
+ "criminalises": "criminalizes",
328
+ "criminalising": "criminalizing",
329
+ "criticise": "criticize",
330
+ "criticised": "criticized",
331
+ "criticises": "criticizes",
332
+ "criticising": "criticizing",
333
+ "crueller": "crueler",
334
+ "cruellest": "cruelest",
335
+ "crystallisation": "crystallization",
336
+ "crystallise": "crystallize",
337
+ "crystallised": "crystallized",
338
+ "crystallises": "crystallizes",
339
+ "crystallising": "crystallizing",
340
+ "cudgelled": "cudgeled",
341
+ "cudgelling": "cudgeling",
342
+ "customise": "customize",
343
+ "customised": "customized",
344
+ "customises": "customizes",
345
+ "customising": "customizing",
346
+ "cypher": "cipher",
347
+ "cyphers": "ciphers",
348
+ "decentralisation": "decentralization",
349
+ "decentralise": "decentralize",
350
+ "decentralised": "decentralized",
351
+ "decentralises": "decentralizes",
352
+ "decentralising": "decentralizing",
353
+ "decriminalisation": "decriminalization",
354
+ "decriminalise": "decriminalize",
355
+ "decriminalised": "decriminalized",
356
+ "decriminalises": "decriminalizes",
357
+ "decriminalising": "decriminalizing",
358
+ "defence": "defense",
359
+ "defenceless": "defenseless",
360
+ "defences": "defenses",
361
+ "dehumanisation": "dehumanization",
362
+ "dehumanise": "dehumanize",
363
+ "dehumanised": "dehumanized",
364
+ "dehumanises": "dehumanizes",
365
+ "dehumanising": "dehumanizing",
366
+ "demeanour": "demeanor",
367
+ "demilitarisation": "demilitarization",
368
+ "demilitarise": "demilitarize",
369
+ "demilitarised": "demilitarized",
370
+ "demilitarises": "demilitarizes",
371
+ "demilitarising": "demilitarizing",
372
+ "demobilisation": "demobilization",
373
+ "demobilise": "demobilize",
374
+ "demobilised": "demobilized",
375
+ "demobilises": "demobilizes",
376
+ "demobilising": "demobilizing",
377
+ "democratisation": "democratization",
378
+ "democratise": "democratize",
379
+ "democratised": "democratized",
380
+ "democratises": "democratizes",
381
+ "democratising": "democratizing",
382
+ "demonise": "demonize",
383
+ "demonised": "demonized",
384
+ "demonises": "demonizes",
385
+ "demonising": "demonizing",
386
+ "demoralisation": "demoralization",
387
+ "demoralise": "demoralize",
388
+ "demoralised": "demoralized",
389
+ "demoralises": "demoralizes",
390
+ "demoralising": "demoralizing",
391
+ "denationalisation": "denationalization",
392
+ "denationalise": "denationalize",
393
+ "denationalised": "denationalized",
394
+ "denationalises": "denationalizes",
395
+ "denationalising": "denationalizing",
396
+ "deodorise": "deodorize",
397
+ "deodorised": "deodorized",
398
+ "deodorises": "deodorizes",
399
+ "deodorising": "deodorizing",
400
+ "depersonalise": "depersonalize",
401
+ "depersonalised": "depersonalized",
402
+ "depersonalises": "depersonalizes",
403
+ "depersonalising": "depersonalizing",
404
+ "deputise": "deputize",
405
+ "deputised": "deputized",
406
+ "deputises": "deputizes",
407
+ "deputising": "deputizing",
408
+ "desensitisation": "desensitization",
409
+ "desensitise": "desensitize",
410
+ "desensitised": "desensitized",
411
+ "desensitises": "desensitizes",
412
+ "desensitising": "desensitizing",
413
+ "destabilisation": "destabilization",
414
+ "destabilise": "destabilize",
415
+ "destabilised": "destabilized",
416
+ "destabilises": "destabilizes",
417
+ "destabilising": "destabilizing",
418
+ "dialled": "dialed",
419
+ "dialling": "dialing",
420
+ "dialogue": "dialog",
421
+ "dialogues": "dialogs",
422
+ "diarrhoea": "diarrhea",
423
+ "digitise": "digitize",
424
+ "digitised": "digitized",
425
+ "digitises": "digitizes",
426
+ "digitising": "digitizing",
427
+ "disc": "disk",
428
+ "discolour": "discolor",
429
+ "discoloured": "discolored",
430
+ "discolouring": "discoloring",
431
+ "discolours": "discolors",
432
+ "discs": "disks",
433
+ "disembowelled": "disemboweled",
434
+ "disembowelling": "disemboweling",
435
+ "disfavour": "disfavor",
436
+ "dishevelled": "disheveled",
437
+ "dishonour": "dishonor",
438
+ "dishonourable": "dishonorable",
439
+ "dishonourably": "dishonorably",
440
+ "dishonoured": "dishonored",
441
+ "dishonouring": "dishonoring",
442
+ "dishonours": "dishonors",
443
+ "disorganisation": "disorganization",
444
+ "disorganised": "disorganized",
445
+ "distil": "distill",
446
+ "distils": "distills",
447
+ "dramatisation": "dramatization",
448
+ "dramatisations": "dramatizations",
449
+ "dramatise": "dramatize",
450
+ "dramatised": "dramatized",
451
+ "dramatises": "dramatizes",
452
+ "dramatising": "dramatizing",
453
+ "draught": "draft",
454
+ "draughtboard": "draftboard",
455
+ "draughtboards": "draftboards",
456
+ "draughtier": "draftier",
457
+ "draughtiest": "draftiest",
458
+ "draughts": "drafts",
459
+ "draughtsman": "draftsman",
460
+ "draughtsmanship": "draftsmanship",
461
+ "draughtsmen": "draftsmen",
462
+ "draughtswoman": "draftswoman",
463
+ "draughtswomen": "draftswomen",
464
+ "draughty": "drafty",
465
+ "drivelled": "driveled",
466
+ "drivelling": "driveling",
467
+ "duelled": "dueled",
468
+ "duelling": "dueling",
469
+ "economise": "economize",
470
+ "economised": "economized",
471
+ "economises": "economizes",
472
+ "economising": "economizing",
473
+ "editorialise": "editorialize",
474
+ "editorialised": "editorialized",
475
+ "editorialises": "editorializes",
476
+ "editorialising": "editorializing",
477
+ "edoema": "edema",
478
+ "empathise": "empathize",
479
+ "empathised": "empathized",
480
+ "empathises": "empathizes",
481
+ "empathising": "empathizing",
482
+ "emphasise": "emphasize",
483
+ "emphasised": "emphasized",
484
+ "emphasises": "emphasizes",
485
+ "emphasising": "emphasizing",
486
+ "enamelled": "enameled",
487
+ "enamelling": "enameling",
488
+ "enamoured": "enamored",
489
+ "encyclopaedia": "encyclopedia",
490
+ "encyclopaedias": "encyclopedias",
491
+ "encyclopaedic": "encyclopedic",
492
+ "endeavour": "endeavor",
493
+ "endeavoured": "endeavored",
494
+ "endeavouring": "endeavoring",
495
+ "endeavours": "endeavors",
496
+ "energise": "energize",
497
+ "energised": "energized",
498
+ "energises": "energizes",
499
+ "energising": "energizing",
500
+ "enrol": "enroll",
501
+ "enrols": "enrolls",
502
+ "enthral": "enthrall",
503
+ "enthrals": "enthralls",
504
+ "epaulette": "epaulet",
505
+ "epaulettes": "epaulets",
506
+ "epicentre": "epicenter",
507
+ "epicentres": "epicenters",
508
+ "epilogue": "epilog",
509
+ "epilogues": "epilogs",
510
+ "epitomise": "epitomize",
511
+ "epitomised": "epitomized",
512
+ "epitomises": "epitomizes",
513
+ "epitomising": "epitomizing",
514
+ "equalisation": "equalization",
515
+ "equalise": "equalize",
516
+ "equalised": "equalized",
517
+ "equaliser": "equalizer",
518
+ "equalisers": "equalizers",
519
+ "equalises": "equalizes",
520
+ "equalising": "equalizing",
521
+ "eulogise": "eulogize",
522
+ "eulogised": "eulogized",
523
+ "eulogises": "eulogizes",
524
+ "eulogising": "eulogizing",
525
+ "evangelise": "evangelize",
526
+ "evangelised": "evangelized",
527
+ "evangelises": "evangelizes",
528
+ "evangelising": "evangelizing",
529
+ "exorcise": "exorcize",
530
+ "exorcised": "exorcized",
531
+ "exorcises": "exorcizes",
532
+ "exorcising": "exorcizing",
533
+ "extemporisation": "extemporization",
534
+ "extemporise": "extemporize",
535
+ "extemporised": "extemporized",
536
+ "extemporises": "extemporizes",
537
+ "extemporising": "extemporizing",
538
+ "externalisation": "externalization",
539
+ "externalisations": "externalizations",
540
+ "externalise": "externalize",
541
+ "externalised": "externalized",
542
+ "externalises": "externalizes",
543
+ "externalising": "externalizing",
544
+ "factorise": "factorize",
545
+ "factorised": "factorized",
546
+ "factorises": "factorizes",
547
+ "factorising": "factorizing",
548
+ "faecal": "fecal",
549
+ "faeces": "feces",
550
+ "familiarisation": "familiarization",
551
+ "familiarise": "familiarize",
552
+ "familiarised": "familiarized",
553
+ "familiarises": "familiarizes",
554
+ "familiarising": "familiarizing",
555
+ "fantasise": "fantasize",
556
+ "fantasised": "fantasized",
557
+ "fantasises": "fantasizes",
558
+ "fantasising": "fantasizing",
559
+ "favour": "favor",
560
+ "favourable": "favorable",
561
+ "favourably": "favorably",
562
+ "favoured": "favored",
563
+ "favouring": "favoring",
564
+ "favourite": "favorite",
565
+ "favourites": "favorites",
566
+ "favouritism": "favoritism",
567
+ "favours": "favors",
568
+ "feminise": "feminize",
569
+ "feminised": "feminized",
570
+ "feminises": "feminizes",
571
+ "feminising": "feminizing",
572
+ "fertilisation": "fertilization",
573
+ "fertilise": "fertilize",
574
+ "fertilised": "fertilized",
575
+ "fertiliser": "fertilizer",
576
+ "fertilisers": "fertilizers",
577
+ "fertilises": "fertilizes",
578
+ "fertilising": "fertilizing",
579
+ "fervour": "fervor",
580
+ "fibre": "fiber",
581
+ "fibreglass": "fiberglass",
582
+ "fibres": "fibers",
583
+ "fictionalisation": "fictionalization",
584
+ "fictionalisations": "fictionalizations",
585
+ "fictionalise": "fictionalize",
586
+ "fictionalised": "fictionalized",
587
+ "fictionalises": "fictionalizes",
588
+ "fictionalising": "fictionalizing",
589
+ "fillet": "filet",
590
+ "filleted": "fileted",
591
+ "filleting": "fileting",
592
+ "fillets": "filets",
593
+ "finalisation": "finalization",
594
+ "finalise": "finalize",
595
+ "finalised": "finalized",
596
+ "finalises": "finalizes",
597
+ "finalising": "finalizing",
598
+ "flautist": "flutist",
599
+ "flautists": "flutists",
600
+ "flavour": "flavor",
601
+ "flavoured": "flavored",
602
+ "flavouring": "flavoring",
603
+ "flavourings": "flavorings",
604
+ "flavourless": "flavorless",
605
+ "flavours": "flavors",
606
+ "flavoursome": "flavorsome",
607
+ "flyer / flier": "flier / flyer",
608
+ "foetal": "fetal",
609
+ "foetid": "fetid",
610
+ "foetus": "fetus",
611
+ "foetuses": "fetuses",
612
+ "formalisation": "formalization",
613
+ "formalise": "formalize",
614
+ "formalised": "formalized",
615
+ "formalises": "formalizes",
616
+ "formalising": "formalizing",
617
+ "fossilisation": "fossilization",
618
+ "fossilise": "fossilize",
619
+ "fossilised": "fossilized",
620
+ "fossilises": "fossilizes",
621
+ "fossilising": "fossilizing",
622
+ "fraternisation": "fraternization",
623
+ "fraternise": "fraternize",
624
+ "fraternised": "fraternized",
625
+ "fraternises": "fraternizes",
626
+ "fraternising": "fraternizing",
627
+ "fulfil": "fulfill",
628
+ "fulfilment": "fulfillment",
629
+ "fulfils": "fulfills",
630
+ "funnelled": "funneled",
631
+ "funnelling": "funneling",
632
+ "gage": "gauge",
633
+ "gaged": "gauged",
634
+ "gages": "gauges",
635
+ "gaging": "gauging",
636
+ "galvanise": "galvanize",
637
+ "galvanised": "galvanized",
638
+ "galvanises": "galvanizes",
639
+ "galvanising": "galvanizing",
640
+ "gambolled": "gamboled",
641
+ "gambolling": "gamboling",
642
+ "gaol": "jail",
643
+ "gaolbird": "jailbird",
644
+ "gaolbirds": "jailbirds",
645
+ "gaolbreak": "jailbreak",
646
+ "gaolbreaks": "jailbreaks",
647
+ "gaoled": "jailed",
648
+ "gaoler": "jailer",
649
+ "gaolers": "jailers",
650
+ "gaoling": "jailing",
651
+ "gaols": "jails",
652
+ "gasses": "gases",
653
+ "generalisation": "generalization",
654
+ "generalisations": "generalizations",
655
+ "generalise": "generalize",
656
+ "generalised": "generalized",
657
+ "generalises": "generalizes",
658
+ "generalising": "generalizing",
659
+ "ghettoise": "ghettoize",
660
+ "ghettoised": "ghettoized",
661
+ "ghettoises": "ghettoizes",
662
+ "ghettoising": "ghettoizing",
663
+ "gipsies": "gypsies",
664
+ "glamor": "glamour",
665
+ "glamorise": "glamorize",
666
+ "glamorised": "glamorized",
667
+ "glamorises": "glamorizes",
668
+ "glamorising": "glamorizing",
669
+ "globalisation": "globalization",
670
+ "globalise": "globalize",
671
+ "globalised": "globalized",
672
+ "globalises": "globalizes",
673
+ "globalising": "globalizing",
674
+ "glueing": "gluing",
675
+ "goitre": "goiter",
676
+ "goitres": "goiters",
677
+ "gonorrhoea": "gonorrhea",
678
+ "gramme": "gram",
679
+ "grammes": "grams",
680
+ "gravelled": "graveled",
681
+ "grey": "gray",
682
+ "greyed": "grayed",
683
+ "greying": "graying",
684
+ "greyish": "grayish",
685
+ "greyness": "grayness",
686
+ "greys": "grays",
687
+ "grovelled": "groveled",
688
+ "grovelling": "groveling",
689
+ "groyne": "groin",
690
+ "groynes": "groins",
691
+ "gruelling": "grueling",
692
+ "gruellingly": "gruelingly",
693
+ "gryphon": "griffin",
694
+ "gryphons": "griffins",
695
+ "gynaecological": "gynecological",
696
+ "gynaecologist": "gynecologist",
697
+ "gynaecologists": "gynecologists",
698
+ "gynaecology": "gynecology",
699
+ "haematological": "hematological",
700
+ "haematologist": "hematologist",
701
+ "haematologists": "hematologists",
702
+ "haematology": "hematology",
703
+ "haemoglobin": "hemoglobin",
704
+ "haemophilia": "hemophilia",
705
+ "haemophiliac": "hemophiliac",
706
+ "haemophiliacs": "hemophiliacs",
707
+ "haemorrhage": "hemorrhage",
708
+ "haemorrhaged": "hemorrhaged",
709
+ "haemorrhages": "hemorrhages",
710
+ "haemorrhaging": "hemorrhaging",
711
+ "haemorrhoids": "hemorrhoids",
712
+ "harbour": "harbor",
713
+ "harboured": "harbored",
714
+ "harbouring": "harboring",
715
+ "harbours": "harbors",
716
+ "harmonisation": "harmonization",
717
+ "harmonise": "harmonize",
718
+ "harmonised": "harmonized",
719
+ "harmonises": "harmonizes",
720
+ "harmonising": "harmonizing",
721
+ "homoeopath": "homeopath",
722
+ "homoeopathic": "homeopathic",
723
+ "homoeopaths": "homeopaths",
724
+ "homoeopathy": "homeopathy",
725
+ "homogenise": "homogenize",
726
+ "homogenised": "homogenized",
727
+ "homogenises": "homogenizes",
728
+ "homogenising": "homogenizing",
729
+ "honour": "honor",
730
+ "honourable": "honorable",
731
+ "honourably": "honorably",
732
+ "honoured": "honored",
733
+ "honouring": "honoring",
734
+ "honours": "honors",
735
+ "hospitalisation": "hospitalization",
736
+ "hospitalise": "hospitalize",
737
+ "hospitalised": "hospitalized",
738
+ "hospitalises": "hospitalizes",
739
+ "hospitalising": "hospitalizing",
740
+ "humanise": "humanize",
741
+ "humanised": "humanized",
742
+ "humanises": "humanizes",
743
+ "humanising": "humanizing",
744
+ "humour": "humor",
745
+ "humoured": "humored",
746
+ "humouring": "humoring",
747
+ "humourless": "humorless",
748
+ "humours": "humors",
749
+ "hybridise": "hybridize",
750
+ "hybridised": "hybridized",
751
+ "hybridises": "hybridizes",
752
+ "hybridising": "hybridizing",
753
+ "hypnotise": "hypnotize",
754
+ "hypnotised": "hypnotized",
755
+ "hypnotises": "hypnotizes",
756
+ "hypnotising": "hypnotizing",
757
+ "hypothesise": "hypothesize",
758
+ "hypothesised": "hypothesized",
759
+ "hypothesises": "hypothesizes",
760
+ "hypothesising": "hypothesizing",
761
+ "idealisation": "idealization",
762
+ "idealise": "idealize",
763
+ "idealised": "idealized",
764
+ "idealises": "idealizes",
765
+ "idealising": "idealizing",
766
+ "idolise": "idolize",
767
+ "idolised": "idolized",
768
+ "idolises": "idolizes",
769
+ "idolising": "idolizing",
770
+ "immobilisation": "immobilization",
771
+ "immobilise": "immobilize",
772
+ "immobilised": "immobilized",
773
+ "immobiliser": "immobilizer",
774
+ "immobilisers": "immobilizers",
775
+ "immobilises": "immobilizes",
776
+ "immobilising": "immobilizing",
777
+ "immortalise": "immortalize",
778
+ "immortalised": "immortalized",
779
+ "immortalises": "immortalizes",
780
+ "immortalising": "immortalizing",
781
+ "immunisation": "immunization",
782
+ "immunise": "immunize",
783
+ "immunised": "immunized",
784
+ "immunises": "immunizes",
785
+ "immunising": "immunizing",
786
+ "impanelled": "impaneled",
787
+ "impanelling": "impaneling",
788
+ "imperilled": "imperiled",
789
+ "imperilling": "imperiling",
790
+ "individualise": "individualize",
791
+ "individualised": "individualized",
792
+ "individualises": "individualizes",
793
+ "individualising": "individualizing",
794
+ "industrialise": "industrialize",
795
+ "industrialised": "industrialized",
796
+ "industrialises": "industrializes",
797
+ "industrialising": "industrializing",
798
+ "inflexion": "inflection",
799
+ "inflexions": "inflections",
800
+ "initialise": "initialize",
801
+ "initialised": "initialized",
802
+ "initialises": "initializes",
803
+ "initialising": "initializing",
804
+ "initialled": "initialed",
805
+ "initialling": "initialing",
806
+ "instal": "install",
807
+ "instalment": "installment",
808
+ "instalments": "installments",
809
+ "instals": "installs",
810
+ "instil": "instill",
811
+ "instils": "instills",
812
+ "institutionalisation": "institutionalization",
813
+ "institutionalise": "institutionalize",
814
+ "institutionalised": "institutionalized",
815
+ "institutionalises": "institutionalizes",
816
+ "institutionalising": "institutionalizing",
817
+ "intellectualise": "intellectualize",
818
+ "intellectualised": "intellectualized",
819
+ "intellectualises": "intellectualizes",
820
+ "intellectualising": "intellectualizing",
821
+ "internalisation": "internalization",
822
+ "internalise": "internalize",
823
+ "internalised": "internalized",
824
+ "internalises": "internalizes",
825
+ "internalising": "internalizing",
826
+ "internationalisation": "internationalization",
827
+ "internationalise": "internationalize",
828
+ "internationalised": "internationalized",
829
+ "internationalises": "internationalizes",
830
+ "internationalising": "internationalizing",
831
+ "ionisation": "ionization",
832
+ "ionise": "ionize",
833
+ "ionised": "ionized",
834
+ "ioniser": "ionizer",
835
+ "ionisers": "ionizers",
836
+ "ionises": "ionizes",
837
+ "ionising": "ionizing",
838
+ "italicise": "italicize",
839
+ "italicised": "italicized",
840
+ "italicises": "italicizes",
841
+ "italicising": "italicizing",
842
+ "itemise": "itemize",
843
+ "itemised": "itemized",
844
+ "itemises": "itemizes",
845
+ "itemising": "itemizing",
846
+ "jeopardise": "jeopardize",
847
+ "jeopardised": "jeopardized",
848
+ "jeopardises": "jeopardizes",
849
+ "jeopardising": "jeopardizing",
850
+ "jewelled": "jeweled",
851
+ "jeweller": "jeweler",
852
+ "jewellers": "jewelers",
853
+ "jewellery": "jewelry",
854
+ "judgement": "judgment",
855
+ "kilogramme": "kilogram",
856
+ "kilogrammes": "kilograms",
857
+ "kilometre": "kilometer",
858
+ "kilometres": "kilometers",
859
+ "labelled": "labeled",
860
+ "labelling": "labeling",
861
+ "labour": "labor",
862
+ "laboured": "labored",
863
+ "labourer": "laborer",
864
+ "labourers": "laborers",
865
+ "labouring": "laboring",
866
+ "labours": "labors",
867
+ "lacklustre": "lackluster",
868
+ "legalisation": "legalization",
869
+ "legalise": "legalize",
870
+ "legalised": "legalized",
871
+ "legalises": "legalizes",
872
+ "legalising": "legalizing",
873
+ "legitimise": "legitimize",
874
+ "legitimised": "legitimized",
875
+ "legitimises": "legitimizes",
876
+ "legitimising": "legitimizing",
877
+ "leukaemia": "leukemia",
878
+ "levelled": "leveled",
879
+ "leveller": "leveler",
880
+ "levellers": "levelers",
881
+ "levelling": "leveling",
882
+ "libelled": "libeled",
883
+ "libelling": "libeling",
884
+ "libellous": "libelous",
885
+ "liberalisation": "liberalization",
886
+ "liberalise": "liberalize",
887
+ "liberalised": "liberalized",
888
+ "liberalises": "liberalizes",
889
+ "liberalising": "liberalizing",
890
+ "licence": "license",
891
+ "licenced": "licensed",
892
+ "licences": "licenses",
893
+ "licencing": "licensing",
894
+ "likeable": "likable",
895
+ "lionisation": "lionization",
896
+ "lionise": "lionize",
897
+ "lionised": "lionized",
898
+ "lionises": "lionizes",
899
+ "lionising": "lionizing",
900
+ "liquidise": "liquidize",
901
+ "liquidised": "liquidized",
902
+ "liquidiser": "liquidizer",
903
+ "liquidisers": "liquidizers",
904
+ "liquidises": "liquidizes",
905
+ "liquidising": "liquidizing",
906
+ "litre": "liter",
907
+ "litres": "liters",
908
+ "localise": "localize",
909
+ "localised": "localized",
910
+ "localises": "localizes",
911
+ "localising": "localizing",
912
+ "louvre": "louver",
913
+ "louvred": "louvered",
914
+ "louvres": "louvers",
915
+ "lustre": "luster",
916
+ "magnetise": "magnetize",
917
+ "magnetised": "magnetized",
918
+ "magnetises": "magnetizes",
919
+ "magnetising": "magnetizing",
920
+ "manoeuvrability": "maneuverability",
921
+ "manoeuvrable": "maneuverable",
922
+ "manoeuvre": "maneuver",
923
+ "manoeuvred": "maneuvered",
924
+ "manoeuvres": "maneuvers",
925
+ "manoeuvring": "maneuvering",
926
+ "manoeuvrings": "maneuverings",
927
+ "marginalisation": "marginalization",
928
+ "marginalise": "marginalize",
929
+ "marginalised": "marginalized",
930
+ "marginalises": "marginalizes",
931
+ "marginalising": "marginalizing",
932
+ "marshalled": "marshaled",
933
+ "marshalling": "marshaling",
934
+ "marvelled": "marveled",
935
+ "marvelling": "marveling",
936
+ "marvellous": "marvelous",
937
+ "marvellously": "marvelously",
938
+ "materialisation": "materialization",
939
+ "materialise": "materialize",
940
+ "materialised": "materialized",
941
+ "materialises": "materializes",
942
+ "materialising": "materializing",
943
+ "maximisation": "maximization",
944
+ "maximise": "maximize",
945
+ "maximised": "maximized",
946
+ "maximises": "maximizes",
947
+ "maximising": "maximizing",
948
+ "meagre": "meager",
949
+ "mechanisation": "mechanization",
950
+ "mechanise": "mechanize",
951
+ "mechanised": "mechanized",
952
+ "mechanises": "mechanizes",
953
+ "mechanising": "mechanizing",
954
+ "mediaeval": "medieval",
955
+ "memorialise": "memorialize",
956
+ "memorialised": "memorialized",
957
+ "memorialises": "memorializes",
958
+ "memorialising": "memorializing",
959
+ "memorise": "memorize",
960
+ "memorised": "memorized",
961
+ "memorises": "memorizes",
962
+ "memorising": "memorizing",
963
+ "mesmerise": "mesmerize",
964
+ "mesmerised": "mesmerized",
965
+ "mesmerises": "mesmerizes",
966
+ "mesmerising": "mesmerizing",
967
+ "metabolise": "metabolize",
968
+ "metabolised": "metabolized",
969
+ "metabolises": "metabolizes",
970
+ "metabolising": "metabolizing",
971
+ "metre": "meter",
972
+ "metres": "meters",
973
+ "mhm": "hmm",
974
+ "micrometre": "micrometer",
975
+ "micrometres": "micrometers",
976
+ "militarise": "militarize",
977
+ "militarised": "militarized",
978
+ "militarises": "militarizes",
979
+ "militarising": "militarizing",
980
+ "milligramme": "milligram",
981
+ "milligrammes": "milligrams",
982
+ "millilitre": "milliliter",
983
+ "millilitres": "milliliters",
984
+ "millimetre": "millimeter",
985
+ "millimetres": "millimeters",
986
+ "miniaturisation": "miniaturization",
987
+ "miniaturise": "miniaturize",
988
+ "miniaturised": "miniaturized",
989
+ "miniaturises": "miniaturizes",
990
+ "miniaturising": "miniaturizing",
991
+ "minibusses": "minibuses",
992
+ "minimise": "minimize",
993
+ "minimised": "minimized",
994
+ "minimises": "minimizes",
995
+ "minimising": "minimizing",
996
+ "misbehaviour": "misbehavior",
997
+ "misdemeanour": "misdemeanor",
998
+ "misdemeanours": "misdemeanors",
999
+ "misspelt": "misspelled",
1000
+ "mitre": "miter",
1001
+ "mitres": "miters",
1002
+ "mm": "hmm",
1003
+ "mmm": "hmm",
1004
+ "mobilisation": "mobilization",
1005
+ "mobilise": "mobilize",
1006
+ "mobilised": "mobilized",
1007
+ "mobilises": "mobilizes",
1008
+ "mobilising": "mobilizing",
1009
+ "modelled": "modeled",
1010
+ "modeller": "modeler",
1011
+ "modellers": "modelers",
1012
+ "modelling": "modeling",
1013
+ "modernise": "modernize",
1014
+ "modernised": "modernized",
1015
+ "modernises": "modernizes",
1016
+ "modernising": "modernizing",
1017
+ "moisturise": "moisturize",
1018
+ "moisturised": "moisturized",
1019
+ "moisturiser": "moisturizer",
1020
+ "moisturisers": "moisturizers",
1021
+ "moisturises": "moisturizes",
1022
+ "moisturising": "moisturizing",
1023
+ "monologue": "monolog",
1024
+ "monologues": "monologs",
1025
+ "monopolisation": "monopolization",
1026
+ "monopolise": "monopolize",
1027
+ "monopolised": "monopolized",
1028
+ "monopolises": "monopolizes",
1029
+ "monopolising": "monopolizing",
1030
+ "moralise": "moralize",
1031
+ "moralised": "moralized",
1032
+ "moralises": "moralizes",
1033
+ "moralising": "moralizing",
1034
+ "motorised": "motorized",
1035
+ "mould": "mold",
1036
+ "moulded": "molded",
1037
+ "moulder": "molder",
1038
+ "mouldered": "moldered",
1039
+ "mouldering": "moldering",
1040
+ "moulders": "molders",
1041
+ "mouldier": "moldier",
1042
+ "mouldiest": "moldiest",
1043
+ "moulding": "molding",
1044
+ "mouldings": "moldings",
1045
+ "moulds": "molds",
1046
+ "mouldy": "moldy",
1047
+ "moult": "molt",
1048
+ "moulted": "molted",
1049
+ "moulting": "molting",
1050
+ "moults": "molts",
1051
+ "moustache": "mustache",
1052
+ "moustached": "mustached",
1053
+ "moustaches": "mustaches",
1054
+ "moustachioed": "mustachioed",
1055
+ "multicoloured": "multicolored",
1056
+ "nationalisation": "nationalization",
1057
+ "nationalisations": "nationalizations",
1058
+ "nationalise": "nationalize",
1059
+ "nationalised": "nationalized",
1060
+ "nationalises": "nationalizes",
1061
+ "nationalising": "nationalizing",
1062
+ "naturalisation": "naturalization",
1063
+ "naturalise": "naturalize",
1064
+ "naturalised": "naturalized",
1065
+ "naturalises": "naturalizes",
1066
+ "naturalising": "naturalizing",
1067
+ "neighbour": "neighbor",
1068
+ "neighbourhood": "neighborhood",
1069
+ "neighbourhoods": "neighborhoods",
1070
+ "neighbouring": "neighboring",
1071
+ "neighbourliness": "neighborliness",
1072
+ "neighbourly": "neighborly",
1073
+ "neighbours": "neighbors",
1074
+ "neutralisation": "neutralization",
1075
+ "neutralise": "neutralize",
1076
+ "neutralised": "neutralized",
1077
+ "neutralises": "neutralizes",
1078
+ "neutralising": "neutralizing",
1079
+ "normalisation": "normalization",
1080
+ "normalise": "normalize",
1081
+ "normalised": "normalized",
1082
+ "normalises": "normalizes",
1083
+ "normalising": "normalizing",
1084
+ "odour": "odor",
1085
+ "odourless": "odorless",
1086
+ "odours": "odors",
1087
+ "oesophagus": "esophagus",
1088
+ "oesophaguses": "esophaguses",
1089
+ "oestrogen": "estrogen",
1090
+ "offence": "offense",
1091
+ "offences": "offenses",
1092
+ "omelette": "omelet",
1093
+ "omelettes": "omelets",
1094
+ "optimise": "optimize",
1095
+ "optimised": "optimized",
1096
+ "optimises": "optimizes",
1097
+ "optimising": "optimizing",
1098
+ "organisation": "organization",
1099
+ "organisational": "organizational",
1100
+ "organisations": "organizations",
1101
+ "organise": "organize",
1102
+ "organised": "organized",
1103
+ "organiser": "organizer",
1104
+ "organisers": "organizers",
1105
+ "organises": "organizes",
1106
+ "organising": "organizing",
1107
+ "orthopaedic": "orthopedic",
1108
+ "orthopaedics": "orthopedics",
1109
+ "ostracise": "ostracize",
1110
+ "ostracised": "ostracized",
1111
+ "ostracises": "ostracizes",
1112
+ "ostracising": "ostracizing",
1113
+ "outmanoeuvre": "outmaneuver",
1114
+ "outmanoeuvred": "outmaneuvered",
1115
+ "outmanoeuvres": "outmaneuvers",
1116
+ "outmanoeuvring": "outmaneuvering",
1117
+ "overemphasise": "overemphasize",
1118
+ "overemphasised": "overemphasized",
1119
+ "overemphasises": "overemphasizes",
1120
+ "overemphasising": "overemphasizing",
1121
+ "oxidisation": "oxidization",
1122
+ "oxidise": "oxidize",
1123
+ "oxidised": "oxidized",
1124
+ "oxidises": "oxidizes",
1125
+ "oxidising": "oxidizing",
1126
+ "paederast": "pederast",
1127
+ "paederasts": "pederasts",
1128
+ "paediatric": "pediatric",
1129
+ "paediatrician": "pediatrician",
1130
+ "paediatricians": "pediatricians",
1131
+ "paediatrics": "pediatrics",
1132
+ "paedophile": "pedophile",
1133
+ "paedophiles": "pedophiles",
1134
+ "paedophilia": "pedophilia",
1135
+ "palaeolithic": "paleolithic",
1136
+ "palaeontologist": "paleontologist",
1137
+ "palaeontologists": "paleontologists",
1138
+ "palaeontology": "paleontology",
1139
+ "panelled": "paneled",
1140
+ "panelling": "paneling",
1141
+ "panellist": "panelist",
1142
+ "panellists": "panelists",
1143
+ "paralyse": "paralyze",
1144
+ "paralysed": "paralyzed",
1145
+ "paralyses": "paralyzes",
1146
+ "paralysing": "paralyzing",
1147
+ "parcelled": "parceled",
1148
+ "parcelling": "parceling",
1149
+ "parlour": "parlor",
1150
+ "parlours": "parlors",
1151
+ "particularise": "particularize",
1152
+ "particularised": "particularized",
1153
+ "particularises": "particularizes",
1154
+ "particularising": "particularizing",
1155
+ "passivisation": "passivization",
1156
+ "passivise": "passivize",
1157
+ "passivised": "passivized",
1158
+ "passivises": "passivizes",
1159
+ "passivising": "passivizing",
1160
+ "pasteurisation": "pasteurization",
1161
+ "pasteurise": "pasteurize",
1162
+ "pasteurised": "pasteurized",
1163
+ "pasteurises": "pasteurizes",
1164
+ "pasteurising": "pasteurizing",
1165
+ "patronise": "patronize",
1166
+ "patronised": "patronized",
1167
+ "patronises": "patronizes",
1168
+ "patronising": "patronizing",
1169
+ "patronisingly": "patronizingly",
1170
+ "pedalled": "pedaled",
1171
+ "pedalling": "pedaling",
1172
+ "pedestrianisation": "pedestrianization",
1173
+ "pedestrianise": "pedestrianize",
1174
+ "pedestrianised": "pedestrianized",
1175
+ "pedestrianises": "pedestrianizes",
1176
+ "pedestrianising": "pedestrianizing",
1177
+ "penalise": "penalize",
1178
+ "penalised": "penalized",
1179
+ "penalises": "penalizes",
1180
+ "penalising": "penalizing",
1181
+ "pencilled": "penciled",
1182
+ "pencilling": "penciling",
1183
+ "personalise": "personalize",
1184
+ "personalised": "personalized",
1185
+ "personalises": "personalizes",
1186
+ "personalising": "personalizing",
1187
+ "pharmacopoeia": "pharmacopeia",
1188
+ "pharmacopoeias": "pharmacopeias",
1189
+ "philosophise": "philosophize",
1190
+ "philosophised": "philosophized",
1191
+ "philosophises": "philosophizes",
1192
+ "philosophising": "philosophizing",
1193
+ "philtre": "filter",
1194
+ "philtres": "filters",
1195
+ "phoney": "phony",
1196
+ "plagiarise": "plagiarize",
1197
+ "plagiarised": "plagiarized",
1198
+ "plagiarises": "plagiarizes",
1199
+ "plagiarising": "plagiarizing",
1200
+ "plough": "plow",
1201
+ "ploughed": "plowed",
1202
+ "ploughing": "plowing",
1203
+ "ploughman": "plowman",
1204
+ "ploughmen": "plowmen",
1205
+ "ploughs": "plows",
1206
+ "ploughshare": "plowshare",
1207
+ "ploughshares": "plowshares",
1208
+ "polarisation": "polarization",
1209
+ "polarise": "polarize",
1210
+ "polarised": "polarized",
1211
+ "polarises": "polarizes",
1212
+ "polarising": "polarizing",
1213
+ "politicisation": "politicization",
1214
+ "politicise": "politicize",
1215
+ "politicised": "politicized",
1216
+ "politicises": "politicizes",
1217
+ "politicising": "politicizing",
1218
+ "popularisation": "popularization",
1219
+ "popularise": "popularize",
1220
+ "popularised": "popularized",
1221
+ "popularises": "popularizes",
1222
+ "popularising": "popularizing",
1223
+ "pouffe": "pouf",
1224
+ "pouffes": "poufs",
1225
+ "practise": "practice",
1226
+ "practised": "practiced",
1227
+ "practises": "practices",
1228
+ "practising": "practicing",
1229
+ "praesidium": "presidium",
1230
+ "praesidiums": "presidiums",
1231
+ "pressurisation": "pressurization",
1232
+ "pressurise": "pressurize",
1233
+ "pressurised": "pressurized",
1234
+ "pressurises": "pressurizes",
1235
+ "pressurising": "pressurizing",
1236
+ "pretence": "pretense",
1237
+ "pretences": "pretenses",
1238
+ "primaeval": "primeval",
1239
+ "prioritisation": "prioritization",
1240
+ "prioritise": "prioritize",
1241
+ "prioritised": "prioritized",
1242
+ "prioritises": "prioritizes",
1243
+ "prioritising": "prioritizing",
1244
+ "privatisation": "privatization",
1245
+ "privatisations": "privatizations",
1246
+ "privatise": "privatize",
1247
+ "privatised": "privatized",
1248
+ "privatises": "privatizes",
1249
+ "privatising": "privatizing",
1250
+ "professionalisation": "professionalization",
1251
+ "professionalise": "professionalize",
1252
+ "professionalised": "professionalized",
1253
+ "professionalises": "professionalizes",
1254
+ "professionalising": "professionalizing",
1255
+ "programme": "program",
1256
+ "programmes": "programs",
1257
+ "prologue": "prolog",
1258
+ "prologues": "prologs",
1259
+ "propagandise": "propagandize",
1260
+ "propagandised": "propagandized",
1261
+ "propagandises": "propagandizes",
1262
+ "propagandising": "propagandizing",
1263
+ "proselytise": "proselytize",
1264
+ "proselytised": "proselytized",
1265
+ "proselytiser": "proselytizer",
1266
+ "proselytisers": "proselytizers",
1267
+ "proselytises": "proselytizes",
1268
+ "proselytising": "proselytizing",
1269
+ "psychoanalyse": "psychoanalyze",
1270
+ "psychoanalysed": "psychoanalyzed",
1271
+ "psychoanalyses": "psychoanalyzes",
1272
+ "psychoanalysing": "psychoanalyzing",
1273
+ "publicise": "publicize",
1274
+ "publicised": "publicized",
1275
+ "publicises": "publicizes",
1276
+ "publicising": "publicizing",
1277
+ "pulverisation": "pulverization",
1278
+ "pulverise": "pulverize",
1279
+ "pulverised": "pulverized",
1280
+ "pulverises": "pulverizes",
1281
+ "pulverising": "pulverizing",
1282
+ "pummelled": "pummeled",
1283
+ "pummelling": "pummeling",
1284
+ "pyjama": "pajama",
1285
+ "pyjamas": "pajamas",
1286
+ "pzazz": "pizzazz",
1287
+ "quarrelled": "quarreled",
1288
+ "quarrelling": "quarreling",
1289
+ "radicalise": "radicalize",
1290
+ "radicalised": "radicalized",
1291
+ "radicalises": "radicalizes",
1292
+ "radicalising": "radicalizing",
1293
+ "rancour": "rancor",
1294
+ "randomise": "randomize",
1295
+ "randomised": "randomized",
1296
+ "randomises": "randomizes",
1297
+ "randomising": "randomizing",
1298
+ "rationalisation": "rationalization",
1299
+ "rationalisations": "rationalizations",
1300
+ "rationalise": "rationalize",
1301
+ "rationalised": "rationalized",
1302
+ "rationalises": "rationalizes",
1303
+ "rationalising": "rationalizing",
1304
+ "ravelled": "raveled",
1305
+ "ravelling": "raveling",
1306
+ "realisable": "realizable",
1307
+ "realisation": "realization",
1308
+ "realisations": "realizations",
1309
+ "realise": "realize",
1310
+ "realised": "realized",
1311
+ "realises": "realizes",
1312
+ "realising": "realizing",
1313
+ "recognisable": "recognizable",
1314
+ "recognisably": "recognizably",
1315
+ "recognisance": "recognizance",
1316
+ "recognise": "recognize",
1317
+ "recognised": "recognized",
1318
+ "recognises": "recognizes",
1319
+ "recognising": "recognizing",
1320
+ "reconnoitre": "reconnoiter",
1321
+ "reconnoitred": "reconnoitered",
1322
+ "reconnoitres": "reconnoiters",
1323
+ "reconnoitring": "reconnoitering",
1324
+ "refuelled": "refueled",
1325
+ "refuelling": "refueling",
1326
+ "regularisation": "regularization",
1327
+ "regularise": "regularize",
1328
+ "regularised": "regularized",
1329
+ "regularises": "regularizes",
1330
+ "regularising": "regularizing",
1331
+ "remodelled": "remodeled",
1332
+ "remodelling": "remodeling",
1333
+ "remould": "remold",
1334
+ "remoulded": "remolded",
1335
+ "remoulding": "remolding",
1336
+ "remoulds": "remolds",
1337
+ "reorganisation": "reorganization",
1338
+ "reorganisations": "reorganizations",
1339
+ "reorganise": "reorganize",
1340
+ "reorganised": "reorganized",
1341
+ "reorganises": "reorganizes",
1342
+ "reorganising": "reorganizing",
1343
+ "revelled": "reveled",
1344
+ "reveller": "reveler",
1345
+ "revellers": "revelers",
1346
+ "revelling": "reveling",
1347
+ "revitalise": "revitalize",
1348
+ "revitalised": "revitalized",
1349
+ "revitalises": "revitalizes",
1350
+ "revitalising": "revitalizing",
1351
+ "revolutionise": "revolutionize",
1352
+ "revolutionised": "revolutionized",
1353
+ "revolutionises": "revolutionizes",
1354
+ "revolutionising": "revolutionizing",
1355
+ "rhapsodise": "rhapsodize",
1356
+ "rhapsodised": "rhapsodized",
1357
+ "rhapsodises": "rhapsodizes",
1358
+ "rhapsodising": "rhapsodizing",
1359
+ "rigour": "rigor",
1360
+ "rigours": "rigors",
1361
+ "ritualised": "ritualized",
1362
+ "rivalled": "rivaled",
1363
+ "rivalling": "rivaling",
1364
+ "romanticise": "romanticize",
1365
+ "romanticised": "romanticized",
1366
+ "romanticises": "romanticizes",
1367
+ "romanticising": "romanticizing",
1368
+ "rumour": "rumor",
1369
+ "rumoured": "rumored",
1370
+ "rumours": "rumors",
1371
+ "sabre": "saber",
1372
+ "sabres": "sabers",
1373
+ "saltpetre": "saltpeter",
1374
+ "sanitise": "sanitize",
1375
+ "sanitised": "sanitized",
1376
+ "sanitises": "sanitizes",
1377
+ "sanitising": "sanitizing",
1378
+ "satirise": "satirize",
1379
+ "satirised": "satirized",
1380
+ "satirises": "satirizes",
1381
+ "satirising": "satirizing",
1382
+ "saviour": "savior",
1383
+ "saviours": "saviors",
1384
+ "savour": "savor",
1385
+ "savoured": "savored",
1386
+ "savouries": "savories",
1387
+ "savouring": "savoring",
1388
+ "savours": "savors",
1389
+ "savoury": "savory",
1390
+ "scandalise": "scandalize",
1391
+ "scandalised": "scandalized",
1392
+ "scandalises": "scandalizes",
1393
+ "scandalising": "scandalizing",
1394
+ "sceptic": "skeptic",
1395
+ "sceptical": "skeptical",
1396
+ "sceptically": "skeptically",
1397
+ "scepticism": "skepticism",
1398
+ "sceptics": "skeptics",
1399
+ "sceptre": "scepter",
1400
+ "sceptres": "scepters",
1401
+ "scrutinise": "scrutinize",
1402
+ "scrutinised": "scrutinized",
1403
+ "scrutinises": "scrutinizes",
1404
+ "scrutinising": "scrutinizing",
1405
+ "secularisation": "secularization",
1406
+ "secularise": "secularize",
1407
+ "secularised": "secularized",
1408
+ "secularises": "secularizes",
1409
+ "secularising": "secularizing",
1410
+ "sensationalise": "sensationalize",
1411
+ "sensationalised": "sensationalized",
1412
+ "sensationalises": "sensationalizes",
1413
+ "sensationalising": "sensationalizing",
1414
+ "sensitise": "sensitize",
1415
+ "sensitised": "sensitized",
1416
+ "sensitises": "sensitizes",
1417
+ "sensitising": "sensitizing",
1418
+ "sentimentalise": "sentimentalize",
1419
+ "sentimentalised": "sentimentalized",
1420
+ "sentimentalises": "sentimentalizes",
1421
+ "sentimentalising": "sentimentalizing",
1422
+ "sepulchre": "sepulcher",
1423
+ "sepulchres": "sepulchers",
1424
+ "serialisation": "serialization",
1425
+ "serialisations": "serializations",
1426
+ "serialise": "serialize",
1427
+ "serialised": "serialized",
1428
+ "serialises": "serializes",
1429
+ "serialising": "serializing",
1430
+ "sermonise": "sermonize",
1431
+ "sermonised": "sermonized",
1432
+ "sermonises": "sermonizes",
1433
+ "sermonising": "sermonizing",
1434
+ "sheikh": "sheik",
1435
+ "shovelled": "shoveled",
1436
+ "shovelling": "shoveling",
1437
+ "shrivelled": "shriveled",
1438
+ "shrivelling": "shriveling",
1439
+ "signalise": "signalize",
1440
+ "signalised": "signalized",
1441
+ "signalises": "signalizes",
1442
+ "signalising": "signalizing",
1443
+ "signalled": "signaled",
1444
+ "signalling": "signaling",
1445
+ "smoulder": "smolder",
1446
+ "smouldered": "smoldered",
1447
+ "smouldering": "smoldering",
1448
+ "smoulders": "smolders",
1449
+ "snivelled": "sniveled",
1450
+ "snivelling": "sniveling",
1451
+ "snorkelled": "snorkeled",
1452
+ "snorkelling": "snorkeling",
1453
+ "snowplough": "snowplow",
1454
+ "snowploughs": "snowplows",
1455
+ "socialisation": "socialization",
1456
+ "socialise": "socialize",
1457
+ "socialised": "socialized",
1458
+ "socialises": "socializes",
1459
+ "socialising": "socializing",
1460
+ "sodomise": "sodomize",
1461
+ "sodomised": "sodomized",
1462
+ "sodomises": "sodomizes",
1463
+ "sodomising": "sodomizing",
1464
+ "solemnise": "solemnize",
1465
+ "solemnised": "solemnized",
1466
+ "solemnises": "solemnizes",
1467
+ "solemnising": "solemnizing",
1468
+ "sombre": "somber",
1469
+ "specialisation": "specialization",
1470
+ "specialisations": "specializations",
1471
+ "specialise": "specialize",
1472
+ "specialised": "specialized",
1473
+ "specialises": "specializes",
1474
+ "specialising": "specializing",
1475
+ "spectre": "specter",
1476
+ "spectres": "specters",
1477
+ "spiralled": "spiraled",
1478
+ "spiralling": "spiraling",
1479
+ "splendour": "splendor",
1480
+ "splendours": "splendors",
1481
+ "squirrelled": "squirreled",
1482
+ "squirrelling": "squirreling",
1483
+ "stabilisation": "stabilization",
1484
+ "stabilise": "stabilize",
1485
+ "stabilised": "stabilized",
1486
+ "stabiliser": "stabilizer",
1487
+ "stabilisers": "stabilizers",
1488
+ "stabilises": "stabilizes",
1489
+ "stabilising": "stabilizing",
1490
+ "standardisation": "standardization",
1491
+ "standardise": "standardize",
1492
+ "standardised": "standardized",
1493
+ "standardises": "standardizes",
1494
+ "standardising": "standardizing",
1495
+ "stencilled": "stenciled",
1496
+ "stencilling": "stenciling",
1497
+ "sterilisation": "sterilization",
1498
+ "sterilisations": "sterilizations",
1499
+ "sterilise": "sterilize",
1500
+ "sterilised": "sterilized",
1501
+ "steriliser": "sterilizer",
1502
+ "sterilisers": "sterilizers",
1503
+ "sterilises": "sterilizes",
1504
+ "sterilising": "sterilizing",
1505
+ "stigmatisation": "stigmatization",
1506
+ "stigmatise": "stigmatize",
1507
+ "stigmatised": "stigmatized",
1508
+ "stigmatises": "stigmatizes",
1509
+ "stigmatising": "stigmatizing",
1510
+ "storey": "story",
1511
+ "storeys": "stories",
1512
+ "subsidisation": "subsidization",
1513
+ "subsidise": "subsidize",
1514
+ "subsidised": "subsidized",
1515
+ "subsidiser": "subsidizer",
1516
+ "subsidisers": "subsidizers",
1517
+ "subsidises": "subsidizes",
1518
+ "subsidising": "subsidizing",
1519
+ "succour": "succor",
1520
+ "succoured": "succored",
1521
+ "succouring": "succoring",
1522
+ "succours": "succors",
1523
+ "sulphate": "sulfate",
1524
+ "sulphates": "sulfates",
1525
+ "sulphide": "sulfide",
1526
+ "sulphides": "sulfides",
1527
+ "sulphur": "sulfur",
1528
+ "sulphurous": "sulfurous",
1529
+ "summarise": "summarize",
1530
+ "summarised": "summarized",
1531
+ "summarises": "summarizes",
1532
+ "summarising": "summarizing",
1533
+ "swivelled": "swiveled",
1534
+ "swivelling": "swiveling",
1535
+ "symbolise": "symbolize",
1536
+ "symbolised": "symbolized",
1537
+ "symbolises": "symbolizes",
1538
+ "symbolising": "symbolizing",
1539
+ "sympathise": "sympathize",
1540
+ "sympathised": "sympathized",
1541
+ "sympathiser": "sympathizer",
1542
+ "sympathisers": "sympathizers",
1543
+ "sympathises": "sympathizes",
1544
+ "sympathising": "sympathizing",
1545
+ "synchronisation": "synchronization",
1546
+ "synchronise": "synchronize",
1547
+ "synchronised": "synchronized",
1548
+ "synchronises": "synchronizes",
1549
+ "synchronising": "synchronizing",
1550
+ "synthesise": "synthesize",
1551
+ "synthesised": "synthesized",
1552
+ "synthesiser": "synthesizer",
1553
+ "synthesisers": "synthesizers",
1554
+ "synthesises": "synthesizes",
1555
+ "synthesising": "synthesizing",
1556
+ "syphon": "siphon",
1557
+ "syphoned": "siphoned",
1558
+ "syphoning": "siphoning",
1559
+ "syphons": "siphons",
1560
+ "systematisation": "systematization",
1561
+ "systematise": "systematize",
1562
+ "systematised": "systematized",
1563
+ "systematises": "systematizes",
1564
+ "systematising": "systematizing",
1565
+ "tantalise": "tantalize",
1566
+ "tantalised": "tantalized",
1567
+ "tantalises": "tantalizes",
1568
+ "tantalising": "tantalizing",
1569
+ "tantalisingly": "tantalizingly",
1570
+ "tasselled": "tasseled",
1571
+ "technicolour": "technicolor",
1572
+ "temporise": "temporize",
1573
+ "temporised": "temporized",
1574
+ "temporises": "temporizes",
1575
+ "temporising": "temporizing",
1576
+ "tenderise": "tenderize",
1577
+ "tenderised": "tenderized",
1578
+ "tenderises": "tenderizes",
1579
+ "tenderising": "tenderizing",
1580
+ "terrorise": "terrorize",
1581
+ "terrorised": "terrorized",
1582
+ "terrorises": "terrorizes",
1583
+ "terrorising": "terrorizing",
1584
+ "theatre": "theater",
1585
+ "theatregoer": "theatergoer",
1586
+ "theatregoers": "theatergoers",
1587
+ "theatres": "theaters",
1588
+ "theorise": "theorize",
1589
+ "theorised": "theorized",
1590
+ "theorises": "theorizes",
1591
+ "theorising": "theorizing",
1592
+ "tonne": "ton",
1593
+ "tonnes": "tons",
1594
+ "towelled": "toweled",
1595
+ "towelling": "toweling",
1596
+ "toxaemia": "toxemia",
1597
+ "tranquillise": "tranquilize",
1598
+ "tranquillised": "tranquilized",
1599
+ "tranquilliser": "tranquilizer",
1600
+ "tranquillisers": "tranquilizers",
1601
+ "tranquillises": "tranquilizes",
1602
+ "tranquillising": "tranquilizing",
1603
+ "tranquillity": "tranquility",
1604
+ "tranquillize": "tranquilize",
1605
+ "tranquillized": "tranquilized",
1606
+ "tranquillizer": "tranquilizer",
1607
+ "tranquillizers": "tranquilizers",
1608
+ "tranquillizes": "tranquilizes",
1609
+ "tranquillizing": "tranquilizing",
1610
+ "tranquilly": "tranquility",
1611
+ "transistorised": "transistorized",
1612
+ "traumatise": "traumatize",
1613
+ "traumatised": "traumatized",
1614
+ "traumatises": "traumatizes",
1615
+ "traumatising": "traumatizing",
1616
+ "travelled": "traveled",
1617
+ "traveller": "traveler",
1618
+ "travellers": "travelers",
1619
+ "travelling": "traveling",
1620
+ "travelogue": "travelog",
1621
+ "travelogues": "travelogs",
1622
+ "trialled": "trialed",
1623
+ "trialling": "trialing",
1624
+ "tricolour": "tricolor",
1625
+ "tricolours": "tricolors",
1626
+ "trivialise": "trivialize",
1627
+ "trivialised": "trivialized",
1628
+ "trivialises": "trivializes",
1629
+ "trivialising": "trivializing",
1630
+ "tumour": "tumor",
1631
+ "tumours": "tumors",
1632
+ "tunnelled": "tunneled",
1633
+ "tunnelling": "tunneling",
1634
+ "tyrannise": "tyrannize",
1635
+ "tyrannised": "tyrannized",
1636
+ "tyrannises": "tyrannizes",
1637
+ "tyrannising": "tyrannizing",
1638
+ "tyre": "tire",
1639
+ "tyres": "tires",
1640
+ "unauthorised": "unauthorized",
1641
+ "uncivilised": "uncivilized",
1642
+ "underutilised": "underutilized",
1643
+ "unequalled": "unequaled",
1644
+ "unfavourable": "unfavorable",
1645
+ "unfavourably": "unfavorably",
1646
+ "unionisation": "unionization",
1647
+ "unionise": "unionize",
1648
+ "unionised": "unionized",
1649
+ "unionises": "unionizes",
1650
+ "unionising": "unionizing",
1651
+ "unorganised": "unorganized",
1652
+ "unravelled": "unraveled",
1653
+ "unravelling": "unraveling",
1654
+ "unrecognisable": "unrecognizable",
1655
+ "unrecognised": "unrecognized",
1656
+ "unrivalled": "unrivaled",
1657
+ "unsavoury": "unsavory",
1658
+ "untrammelled": "untrammeled",
1659
+ "urbanisation": "urbanization",
1660
+ "urbanise": "urbanize",
1661
+ "urbanised": "urbanized",
1662
+ "urbanises": "urbanizes",
1663
+ "urbanising": "urbanizing",
1664
+ "utilisable": "utilizable",
1665
+ "utilisation": "utilization",
1666
+ "utilise": "utilize",
1667
+ "utilised": "utilized",
1668
+ "utilises": "utilizes",
1669
+ "utilising": "utilizing",
1670
+ "valour": "valor",
1671
+ "vandalise": "vandalize",
1672
+ "vandalised": "vandalized",
1673
+ "vandalises": "vandalizes",
1674
+ "vandalising": "vandalizing",
1675
+ "vaporisation": "vaporization",
1676
+ "vaporise": "vaporize",
1677
+ "vaporised": "vaporized",
1678
+ "vaporises": "vaporizes",
1679
+ "vaporising": "vaporizing",
1680
+ "vapour": "vapor",
1681
+ "vapours": "vapors",
1682
+ "verbalise": "verbalize",
1683
+ "verbalised": "verbalized",
1684
+ "verbalises": "verbalizes",
1685
+ "verbalising": "verbalizing",
1686
+ "victimisation": "victimization",
1687
+ "victimise": "victimize",
1688
+ "victimised": "victimized",
1689
+ "victimises": "victimizes",
1690
+ "victimising": "victimizing",
1691
+ "videodisc": "videodisk",
1692
+ "videodiscs": "videodisks",
1693
+ "vigour": "vigor",
1694
+ "visualisation": "visualization",
1695
+ "visualisations": "visualizations",
1696
+ "visualise": "visualize",
1697
+ "visualised": "visualized",
1698
+ "visualises": "visualizes",
1699
+ "visualising": "visualizing",
1700
+ "vocalisation": "vocalization",
1701
+ "vocalisations": "vocalizations",
1702
+ "vocalise": "vocalize",
1703
+ "vocalised": "vocalized",
1704
+ "vocalises": "vocalizes",
1705
+ "vocalising": "vocalizing",
1706
+ "vulcanised": "vulcanized",
1707
+ "vulgarisation": "vulgarization",
1708
+ "vulgarise": "vulgarize",
1709
+ "vulgarised": "vulgarized",
1710
+ "vulgarises": "vulgarizes",
1711
+ "vulgarising": "vulgarizing",
1712
+ "waggon": "wagon",
1713
+ "waggons": "wagons",
1714
+ "watercolour": "watercolor",
1715
+ "watercolours": "watercolors",
1716
+ "weaselled": "weaseled",
1717
+ "weaselling": "weaseling",
1718
+ "westernisation": "westernization",
1719
+ "westernise": "westernize",
1720
+ "westernised": "westernized",
1721
+ "westernises": "westernizes",
1722
+ "westernising": "westernizing",
1723
+ "womanise": "womanize",
1724
+ "womanised": "womanized",
1725
+ "womaniser": "womanizer",
1726
+ "womanisers": "womanizers",
1727
+ "womanises": "womanizes",
1728
+ "womanising": "womanizing",
1729
+ "woollen": "woolen",
1730
+ "woollens": "woolens",
1731
+ "woollies": "woolies",
1732
+ "woolly": "wooly",
1733
+ "worshipped": "worshiped",
1734
+ "worshipper": "worshiper",
1735
+ "worshipping": "worshiping",
1736
+ "yodelled": "yodeled",
1737
+ "yodelling": "yodeling",
1738
+ "yoghourt": "yogurt",
1739
+ "yoghourts": "yogurts",
1740
+ "yoghurt": "yogurt",
1741
+ "yoghurts": "yogurts"
1742
+ }
scripts/distil-whisper-lora-run5/adapter/preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 80,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "WhisperProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
scripts/distil-whisper-lora-run5/adapter/projection.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e85d8e6c11656df9da335aa15192b77641333f3386d0ddf95659bec1f81bf478
3
+ size 3938794
scripts/distil-whisper-lora-run5/adapter/special_tokens_map.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|startoftranscript|>",
5
+ "<|en|>",
6
+ "<|zh|>",
7
+ "<|de|>",
8
+ "<|es|>",
9
+ "<|ru|>",
10
+ "<|ko|>",
11
+ "<|fr|>",
12
+ "<|ja|>",
13
+ "<|pt|>",
14
+ "<|tr|>",
15
+ "<|pl|>",
16
+ "<|ca|>",
17
+ "<|nl|>",
18
+ "<|ar|>",
19
+ "<|sv|>",
20
+ "<|it|>",
21
+ "<|id|>",
22
+ "<|hi|>",
23
+ "<|fi|>",
24
+ "<|vi|>",
25
+ "<|he|>",
26
+ "<|uk|>",
27
+ "<|el|>",
28
+ "<|ms|>",
29
+ "<|cs|>",
30
+ "<|ro|>",
31
+ "<|da|>",
32
+ "<|hu|>",
33
+ "<|ta|>",
34
+ "<|no|>",
35
+ "<|th|>",
36
+ "<|ur|>",
37
+ "<|hr|>",
38
+ "<|bg|>",
39
+ "<|lt|>",
40
+ "<|la|>",
41
+ "<|mi|>",
42
+ "<|ml|>",
43
+ "<|cy|>",
44
+ "<|sk|>",
45
+ "<|te|>",
46
+ "<|fa|>",
47
+ "<|lv|>",
48
+ "<|bn|>",
49
+ "<|sr|>",
50
+ "<|az|>",
51
+ "<|sl|>",
52
+ "<|kn|>",
53
+ "<|et|>",
54
+ "<|mk|>",
55
+ "<|br|>",
56
+ "<|eu|>",
57
+ "<|is|>",
58
+ "<|hy|>",
59
+ "<|ne|>",
60
+ "<|mn|>",
61
+ "<|bs|>",
62
+ "<|kk|>",
63
+ "<|sq|>",
64
+ "<|sw|>",
65
+ "<|gl|>",
66
+ "<|mr|>",
67
+ "<|pa|>",
68
+ "<|si|>",
69
+ "<|km|>",
70
+ "<|sn|>",
71
+ "<|yo|>",
72
+ "<|so|>",
73
+ "<|af|>",
74
+ "<|oc|>",
75
+ "<|ka|>",
76
+ "<|be|>",
77
+ "<|tg|>",
78
+ "<|sd|>",
79
+ "<|gu|>",
80
+ "<|am|>",
81
+ "<|yi|>",
82
+ "<|lo|>",
83
+ "<|uz|>",
84
+ "<|fo|>",
85
+ "<|ht|>",
86
+ "<|ps|>",
87
+ "<|tk|>",
88
+ "<|nn|>",
89
+ "<|mt|>",
90
+ "<|sa|>",
91
+ "<|lb|>",
92
+ "<|my|>",
93
+ "<|bo|>",
94
+ "<|tl|>",
95
+ "<|mg|>",
96
+ "<|as|>",
97
+ "<|tt|>",
98
+ "<|haw|>",
99
+ "<|ln|>",
100
+ "<|ha|>",
101
+ "<|ba|>",
102
+ "<|jw|>",
103
+ "<|su|>",
104
+ "<|translate|>",
105
+ "<|transcribe|>",
106
+ "<|startoflm|>",
107
+ "<|startofprev|>",
108
+ "<|nocaptions|>",
109
+ "<|notimestamps|>"
110
+ ],
111
+ "bos_token": {
112
+ "content": "<|endoftext|>",
113
+ "lstrip": false,
114
+ "normalized": false,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "eos_token": {
119
+ "content": "<|endoftext|>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
+ "pad_token": {
126
+ "content": "<|endoftext|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false
131
+ },
132
+ "unk_token": {
133
+ "content": "<|endoftext|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false
138
+ }
139
+ }
scripts/distil-whisper-lora-run5/adapter/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/distil-whisper-lora-run5/adapter/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/distil-whisper-lora-run5/training_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lr": 3.752055855124284e-05,
3
+ "lora_dropout": 0.15798625466052896,
4
+ "temperature": 2.1649165607921677,
5
+ "kl_weight": 0.6118528947223795,
6
+ "hidden_beta": 0.4184815819561255,
7
+ "grad_accum": 4,
8
+ "batch_size": 8,
9
+ "lora_r": 128,
10
+ "lora_alpha": 32
11
+ }
scripts/distil-whisper-lora-run5/training_history.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"steps": [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600], "train": [12.822773933410645, 12.107272148132324, 11.161516189575195, 10.361945152282715, 10.062054634094238, 9.982120513916016, 9.8756742477417, 9.892913818359375, 9.879964828491211, 9.774263381958008, 9.635120391845703, 9.672687530517578, 9.555618286132812, 9.515853881835938, 9.418013572692871, 9.39698314666748, 9.18628215789795, 9.527929306030273, 9.209389686584473, 9.22828483581543, 9.14479923248291, 9.032608032226562, 8.780029296875, 8.873854637145996, 8.795372009277344, 8.598912239074707, 8.615389823913574, 8.336470603942871, 8.273077964782715, 8.088512420654297, 7.692420482635498, 7.881086826324463], "val": [12.244791666666666, 10.263020833333334, 9.984375, 9.877604166666666, 9.828125, 9.713541666666666, 9.669270833333334, 9.5703125, 9.528645833333334, 9.4921875, 9.4765625, 9.455729166666666, 9.471354166666666, 9.458333333333334, 9.604166666666666, 9.598958333333334]}
scripts/hyperparameter_search.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import warnings
5
+ import itertools
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.cuda.amp import autocast
13
+ from torch.utils.data import DataLoader
14
+ from transformers import WhisperForConditionalGeneration, set_seed
15
+ from peft import LoraConfig, get_peft_model, TaskType
16
+
17
+ import optuna
18
+ from optuna.samplers import TPESampler
19
+
20
+ # 添加项目路径
21
+ sys.path.append(str(Path(__file__).parent.parent))
22
+
23
+ from src.utils.config import load_config
24
+
25
+
26
def objective(trial, train_ds, safe_collate_fn, device):
    """Optuna objective: short proxy training run scored by mean student CE loss.

    Args:
        trial: optuna.Trial used to sample hyperparameters.
        train_ds: training dataset handed to the DataLoader.
        safe_collate_fn: collate function that never raises on bad samples.
        device: torch.device the student model is trained on.

    Returns:
        float: average training loss over the first few micro-batches
        (lower is better; Optuna minimizes this value).
    """
    # Base configuration (model names, LoRA target modules, weight decay, ...).
    config = load_config("../configs/default_config.yaml")

    # Hyperparameters explored by Optuna.
    lr = trial.suggest_float("lr", 1e-6, 1e-3, log=True)
    lora_dropout = trial.suggest_float("lora_dropout", 0.05, 0.3)
    temperature = trial.suggest_float("temperature", 1.0, 5.0)
    kl_weight = trial.suggest_float("kl_weight", 0.0, 1.0)
    hidden_beta = trial.suggest_float("hidden_beta", 0.0, 3.0)
    grad_accum = trial.suggest_categorical("grad_accum", [1, 2, 4])
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    lora_r = trial.suggest_categorical("lora_r", [32, 64, 128])
    lora_alpha = trial.suggest_categorical("lora_alpha", [8, 16, 32])
    # NOTE(review): temperature / kl_weight are sampled but the proxy loss below
    # only uses the student CE loss (the teacher outputs are never consumed), so
    # those two parameters cannot influence the objective — confirm this is intended.

    # Frozen fp16 teacher (currently unused by the proxy objective, see NOTE above).
    teacher = WhisperForConditionalGeneration.from_pretrained(
        config.model.teacher_model, torch_dtype=torch.float16, use_cache=False
    ).eval()
    for p in teacher.parameters():
        p.requires_grad = False

    # Student: frozen Whisper backbone + trainable LoRA adapters.
    base = WhisperForConditionalGeneration.from_pretrained(
        config.model.student_model, torch_dtype=torch.float16, use_cache=False
    )
    for p in itertools.chain(base.model.encoder.parameters(),
                             base.model.decoder.parameters()):
        p.requires_grad = False

    lcfg = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=config.lora.target_modules,
        bias="none"
    )
    student = get_peft_model(base, lcfg).to(device)
    # Projection used by hidden-state distillation; only built when enabled.
    proj = nn.Linear(config.model.student_hidden_dim, config.model.teacher_hidden_dim).to(device) \
        if hidden_beta > 0 else None

    opt = torch.optim.AdamW(
        list(student.parameters()) + ([] if proj is None else list(proj.parameters())),
        lr=lr, weight_decay=config.optimizer.weight_decay
    )

    loader = DataLoader(train_ds,
                        batch_size=batch_size,
                        shuffle=True,
                        collate_fn=safe_collate_fn,
                        num_workers=0,
                        drop_last=True)

    total_loss, count = 0.0, 0
    opt.zero_grad()
    for i, batch in enumerate(loader):
        if i >= 5:  # proxy run: only a handful of micro-batches per trial
            break
        feats = batch["input_features"].half().to(device)
        labels = batch["labels"].to(device)
        mask = (feats.sum(1) != 0).long()

        with autocast():
            out = student.model(input_features=feats,
                                attention_mask=mask,
                                labels=labels,
                                output_hidden_states=True)
            loss = out.loss

        # Record the unscaled loss so the reported objective stays comparable.
        total_loss += loss.item()
        count += 1

        # Fix: honour the sampled grad_accum — scale the loss and only step the
        # optimizer every `grad_accum` micro-batches. The original stepped after
        # every batch, so the grad_accum hyperparameter had no effect at all.
        (loss / grad_accum).backward()
        if count % grad_accum == 0:
            opt.step()
            opt.zero_grad()

    # Free GPU memory before the next trial.
    del teacher, student, base
    if proj:
        del proj
    torch.cuda.empty_cache()

    return total_loss / max(1, count)
109
+
110
+
111
# Where each searched hyperparameter lives inside the config: name -> (section, key).
# Shared by the in-memory config update and the YAML write-back below, replacing
# two previously duplicated hand-written if/elif chains that had to be kept in sync.
_PARAM_LOCATIONS = {
    "lr": ("optimizer", "lr"),
    "lora_dropout": ("lora", "dropout"),
    "temperature": ("distillation", "temperature"),
    "kl_weight": ("distillation", "kl_weight"),
    "hidden_beta": ("distillation", "hidden_beta"),
    "grad_accum": ("training", "grad_accum"),
    "batch_size": ("training", "batch_size"),
    "lora_r": ("lora", "r"),
    "lora_alpha": ("lora", "alpha"),
}


def run_hyperparameter_search(train_dataset, collate_function):
    """Run a 50-trial Optuna TPE search and persist the best hyperparameters.

    Side effects:
        - writes the best parameters to <output.dir>/training_config.json
        - writes the best parameters back into ../configs/default_config.yaml

    Args:
        train_dataset: dataset used by the short proxy training runs.
        collate_function: collate_fn handed to the per-trial DataLoaders.

    Returns:
        A deep copy of the loaded config, updated in memory with the best parameters.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Reproducibility across python / numpy / torch / transformers RNGs.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    set_seed(SEED)
    warnings.filterwarnings("ignore")

    print(f"Using device: {device}")

    # Minimize the proxy training loss with a seeded TPE sampler.
    study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=SEED))
    study.optimize(
        lambda trial: objective(trial, train_dataset, collate_function, device),
        n_trials=50
    )

    config = load_config("../configs/default_config.yaml")
    best_params = study.best_params

    # Apply the best parameters to an in-memory copy of the config.
    import copy
    updated_config = copy.deepcopy(config)
    for name, value in best_params.items():
        section, key = _PARAM_LOCATIONS[name]
        setattr(getattr(updated_config, section), key, value)

    os.makedirs(config.output.dir, exist_ok=True)

    # Record the raw best parameters for bookkeeping.
    with open(os.path.join(config.output.dir, "training_config.json"), "w") as f:
        json.dump(best_params, f, indent=2)

    # Persist the best parameters back into the YAML config on disk so the
    # subsequent training step picks them up via load_config().
    import yaml
    config_path = "../configs/default_config.yaml"

    with open(config_path, 'r', encoding='utf-8') as f:
        yaml_data = yaml.safe_load(f)

    for name, value in best_params.items():
        section, key = _PARAM_LOCATIONS[name]
        yaml_data[section][key] = value

    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(yaml_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"已将最佳参数更新到 {config_path}")

    # Report the chosen values (dict preserves the declaration order above).
    print("=== Hyperparameters from 50-run auto-search ===")
    for name in _PARAM_LOCATIONS:
        print(f" {name}: {best_params[name]}")

    return updated_config
scripts/inference.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference script for trained ASR model.
3
+ """
4
+
5
+ import argparse
6
+
7
+ def parse_args():
8
+ parser = argparse.ArgumentParser()
9
+ # Add arguments here
10
+ return parser.parse_args()
11
+
12
+ def main():
13
+ args = parse_args()
14
+ # Inference logic here
15
+
16
+ if __name__ == "__main__":
17
+ main()
scripts/train.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gc
3
+ import torch
4
+ from transformers import WhisperProcessor, set_seed
5
+ from typing import Optional, Dict, Any
6
+
7
+ # 设置HuggingFace镜像
8
+ os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
9
+ os.environ['HF_HOME'] = '/root/.cache/huggingface'
10
+ os.environ['TRANSFORMERS_CACHE'] = '/root/.cache/huggingface/transformers'
11
+
12
+ from src.utils.config import load_config
13
+ from src.data.dataset import load_dataset
14
+ from src.data.dataloader import ASRDataLoader
15
+ from src.trainers.lora_trainer import LoRADistillationTrainer
16
+
17
+ # 导入超参数搜索功能
18
+ from scripts.hyperparameter_search import run_hyperparameter_search
19
+
20
def run_training(config_path: str = "../configs/default_config.yaml") -> float:
    """Run the full pipeline: hyperparameter search, then LoRA distillation training.

    Step one runs a 50-trial Optuna search whose best parameters are written
    back into the YAML config on disk; step two reloads that config and trains
    with the optimized settings.

    Args:
        config_path: path to the YAML configuration file.

    Returns:
        float: best validation metric reached during training.
    """

    print("========================================")
    print("开始完整训练流程")
    print("========================================")

    # 1. Configuration and environment.
    cfg = load_config(config_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    set_seed(42)

    # 2. Processor and datasets.
    processor = WhisperProcessor.from_pretrained(cfg.model.teacher_model)
    train_ds, val_ds = load_dataset(
        root_dir=cfg.data.root_dir,
        processor=processor,
        sample_cap=cfg.data.sample_cap,
        val_ratio=cfg.data.val_ratio
    )

    # 3. Data loader (supplies the collate_fn used by the search trials).
    data_loader = ASRDataLoader(
        processor=processor,
        batch_size=cfg.training.batch_size,
        num_workers=cfg.data.num_workers,
        max_frames=cfg.data.max_frames,
        sample_rate=cfg.data.sample_rate,
        pin_memory=cfg.data.pin_memory,
        persistent_workers=cfg.data.persistent_workers,
        prefetch_factor=cfg.data.prefetch_factor
    )

    # 4. Step one: hyperparameter optimization.
    print("\n第一步: 开始超参数优化...")
    print("正在运行 50 次试验以找到最佳超参数...")

    # Fix: the return value is an updated config object (not a params dict); it
    # was previously bound to a misleading, unused `best_params` name. We reload
    # the config from disk below instead, so discard the return explicitly.
    run_hyperparameter_search(train_ds, data_loader.safe_collate_fn)
    print("✓ 超参数优化完成,最佳参数已更新到配置文件")

    # 5. Reload the config now containing the best parameters.
    cfg = load_config(config_path)

    # 6. Step two: train with the optimized hyperparameters.
    print("\n第二步: 开始正式训练...")
    print("使用优化后的超参数进行模型训练...")

    # Rebuild the data loader because batch_size may have changed in the search.
    data_loader = ASRDataLoader(
        processor=processor,
        batch_size=cfg.training.batch_size,
        num_workers=cfg.data.num_workers,
        max_frames=cfg.data.max_frames,
        sample_rate=cfg.data.sample_rate,
        pin_memory=cfg.data.pin_memory,
        persistent_workers=cfg.data.persistent_workers,
        prefetch_factor=cfg.data.prefetch_factor
    )
    train_loader = data_loader.get_loader(train_ds, shuffle=True, drop_last=True)
    val_loader = data_loader.get_loader(val_ds, shuffle=False, drop_last=False)

    # 7. Trainer.
    trainer = LoRADistillationTrainer(
        config=cfg,
        processor=processor,
        train_dl=train_loader,
        val_dl=val_loader,
        device=device
    )

    # 8. Train.
    best_metric = trainer.train()

    # 9. Release GPU memory before returning.
    del trainer
    torch.cuda.empty_cache()
    gc.collect()

    print("\n========================================")
    print("✓ 完整训练流程成功完成!")
    print(" 1. 超参数优化 ✓")
    print(" 2. 模型训练 ✓")
    print("========================================")

    return best_metric


if __name__ == "__main__":
    run_training()
scripts/train.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Training script for ASR model

# Print CLI usage/help text.
print_usage() {
    echo "用法: $0 [选项]"
    echo "选项:"
    echo " -h, --help 显示帮助信息"
    echo ""
    echo "执行流程:"
    echo " 1. 运行超参数优化 (50次试验)"
    echo " 2. 将最佳参数自动更新到配置文件"
    echo " 3. 使用最佳参数进行完整训练"
    echo ""
    echo "注意: 整个流程会自动完成,无需手动干预"
}

# Parse command-line arguments (only -h/--help is recognized).
while [ $# -gt 0 ]; do
    case $1 in
        -h|--help)
            print_usage
            exit 0
            ;;
        *)
            echo "错误: 未知参数 $1"
            print_usage
            exit 1
            ;;
    esac
done

# Put the project root on PYTHONPATH.
# Fix: avoid producing a leading ":" (which silently adds the current
# directory to the module path) when PYTHONPATH is unset or empty.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$(cd .. && pwd)"

# Run the full training pipeline.
echo "启动完整训练流程..."
echo "执行命令: python train.py"
echo ""

# Fix: test the command's exit status directly instead of a separate $? check,
# which is fragile if any command is ever inserted between the two.
if python train.py; then
    echo ""
    echo "🎉 训练流程成功完成!"
else
    echo ""
    echo "❌ 训练流程执行失败"
    exit 1
fi
scripts/train_taid.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 使用 TAID (Temperature-Aware Interpolation Distillation) 进行 LoRA 知识蒸馏训练。
3
+ 包含超参数调优和完整训练流程。
4
+ """
5
+
6
+ import os
7
+ import gc
8
+ import json
9
+ import torch
10
+ from transformers import WhisperProcessor, set_seed
11
+
12
+ from src.data.dataset import load_dataset
13
+ from src.data.dataloader import ASRDataLoader
14
+ from src.utils.config import load_config
15
+ from src.trainers.taid_trainer import TAIDDistillationTrainer
16
+
17
+ # 导入超参数搜索功能
18
+ from scripts.hyperparameter_search import run_hyperparameter_search
19
+
20
def run_training(config_path: str = "../configs/default_config.yaml") -> float:
    """Run the full TAID pipeline: hyperparameter search, then TAID distillation.

    Step one runs a 50-trial Optuna search whose best parameters are written
    back into the YAML config on disk; step two reloads that config and trains
    with the optimized settings, then reports the TAID lambda progression.

    Args:
        config_path: path to the YAML configuration file.

    Returns:
        float: best validation metric reached during training.
    """
    print("========================================")
    print("开始完整训练流程")
    print("========================================")

    # 1. Environment.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    set_seed(42)

    # 2. Configuration.
    cfg = load_config(config_path)

    # 3. Processor and datasets.
    processor = WhisperProcessor.from_pretrained(cfg.model.student_model)
    train_ds, val_ds = load_dataset(
        root_dir=cfg.data.root_dir,
        processor=processor,
        sample_cap=cfg.data.sample_cap,
        val_ratio=cfg.data.val_ratio
    )

    # 4. Data loader (supplies the collate_fn used by the search trials).
    data_loader = ASRDataLoader(
        processor=processor,
        batch_size=cfg.training.batch_size,
        num_workers=cfg.data.num_workers,
        max_frames=cfg.data.max_frames,
        sample_rate=cfg.data.sample_rate,
        pin_memory=cfg.data.pin_memory,
        persistent_workers=cfg.data.persistent_workers,
        prefetch_factor=cfg.data.prefetch_factor
    )

    # 5. Step one: hyperparameter optimization.
    print("\n第一步: 开始超参数优化...")
    # Fix: the return value is an updated config object (not a params dict) and
    # was previously bound to a misleading, unused `best_params` name; the config
    # is reloaded from disk below, so discard the return explicitly.
    run_hyperparameter_search(train_ds, data_loader.safe_collate_fn)
    print("✓ 超参数优化完成,最佳参数已更新到配置文件")

    # 6. Reload the config now containing the best parameters.
    cfg = load_config(config_path)

    # 7. Step two: train with the optimized hyperparameters.
    print("\n第二步: 开始正式训练...")
    print("使用优化后的超参数进行模型训练...")

    # Rebuild the data loader because batch_size may have changed in the search.
    data_loader = ASRDataLoader(
        processor=processor,
        batch_size=cfg.training.batch_size,
        num_workers=cfg.data.num_workers,
        max_frames=cfg.data.max_frames,
        sample_rate=cfg.data.sample_rate,
        pin_memory=cfg.data.pin_memory,
        persistent_workers=cfg.data.persistent_workers,
        prefetch_factor=cfg.data.prefetch_factor
    )
    train_loader = data_loader.get_loader(train_ds, shuffle=True, drop_last=True)
    val_loader = data_loader.get_loader(val_ds, shuffle=False, drop_last=False)

    # 8. Trainer.
    trainer = TAIDDistillationTrainer(cfg, processor, train_loader, val_loader, device)
    best_metric = trainer.train()

    # 9. Release resources.
    trainer.cleanup()
    torch.cuda.empty_cache()
    gc.collect()

    # 10. Show what was produced.
    out_dir = cfg.output.dir
    print("\n训练完成!")
    print("输出目录内容:", os.listdir(out_dir))
    print("适配器目录内容:", os.listdir(os.path.join(out_dir, "adapter")))

    # 11. TAID lambda progression (roughly five evenly spaced samples).
    # Fix: close the history file deterministically — was json.load(open(...)),
    # which leaks the file handle.
    with open(os.path.join(out_dir, "training_history.json")) as f:
        hist = json.load(f)
    if "taid_lambda" in hist:
        print("\nTAID Lambda 进展:")
        for i in range(0, len(hist["taid_lambda"]), max(1, len(hist["taid_lambda"])//5)):
            step = hist["steps"][i]
            lambda_val = hist["taid_lambda"][i]
            print(f" Step {step}: λ = {lambda_val:.3f}")

    print("\n========================================")
    print("✓ 完整训练流程成功完成!")
    print(" 1. 超参数优化 ✓")
    print(" 2. 模型训练 ✓")
    print("========================================")

    return best_metric


if __name__ == "__main__":
    run_training()
scripts/train_taid.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Training script for ASR model with TAID

# Print CLI usage/help text.
print_usage() {
    echo "用法: $0 [选项]"
    echo "选项:"
    echo " -h, --help 显示帮助信息"
    echo ""
    echo "执行流程:"
    echo " 1. 运行超参数优化 (50次试验)"
    echo " 2. 将最佳参数自动更新到配置文件"
    echo " 3. 使用最佳参数进行完整 TAID 训练"
    echo ""
    echo "注意: 整个流程会自动完成,无需手动干预"
}

# Parse command-line arguments (only -h/--help is recognized).
while [ $# -gt 0 ]; do
    case $1 in
        -h|--help)
            print_usage
            exit 0
            ;;
        *)
            echo "错误: 未知参数 $1"
            print_usage
            exit 1
            ;;
    esac
done

# Put the project root on PYTHONPATH.
# Fix: avoid producing a leading ":" (which silently adds the current
# directory to the module path) when PYTHONPATH is unset or empty.
export PYTHONPATH="${PYTHONPATH:+$PYTHONPATH:}$(cd .. && pwd)"

# Run the full training pipeline.
echo "启动完整 TAID 训练流程..."
echo "执行命令: python train_taid.py"
echo ""

# Fix: test the command's exit status directly instead of a separate $? check,
# which is fragile if any command is ever inserted between the two.
if python train_taid.py; then
    echo ""
    echo "🎉 TAID 训练流程成功完成!"
else
    echo ""
    echo "❌ TAID 训练流程执行失败"
    exit 1
fi
src/.DS_Store ADDED
Binary file (6.15 kB). View file
 
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (183 Bytes). View file
 
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (133 Bytes). View file
 
src/data/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .dataset import LibriSpeechDataset
2
+ from .dataloader import ASRDataLoader
3
+
4
+ __all__ = ['LibriSpeechDataset', 'ASRDataLoader']
src/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (351 Bytes). View file
 
src/data/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (277 Bytes). View file
 
src/data/__pycache__/dataloader.cpython-311.pyc ADDED
Binary file (4.84 kB). View file
 
src/data/__pycache__/dataloader.cpython-312.pyc ADDED
Binary file (4.53 kB). View file
 
src/data/__pycache__/dataset.cpython-311.pyc ADDED
Binary file (7.14 kB). View file
 
src/data/__pycache__/dataset.cpython-312.pyc ADDED
Binary file (6.8 kB). View file
 
src/data/dataloader.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DataLoader implementation for ASR training.
3
+ """
4
+
5
+ import torch
6
+ from torch.utils.data import DataLoader
7
+ from transformers import WhisperProcessor
8
+
9
+ class ASRDataLoader:
10
+ def __init__(
11
+ self,
12
+ processor: WhisperProcessor,
13
+ batch_size: int = 32,
14
+ num_workers: int = 2,
15
+ max_frames: int = 3000,
16
+ sample_rate: int = 16000,
17
+ pin_memory: bool = True,
18
+ persistent_workers: bool = True,
19
+ prefetch_factor: int = 2
20
+ ):
21
+
22
+ self.processor = processor
23
+ self.batch_size = batch_size
24
+ self.num_workers = num_workers
25
+ self.max_frames = max_frames
26
+ self.sample_rate = sample_rate
27
+ self.pin_memory = pin_memory
28
+ self.persistent_workers = persistent_workers
29
+ self.prefetch_factor = prefetch_factor
30
+
31
+ def collate_fn(self, batch):
32
+ batch = [b for b in batch if b and b["transcription"].strip()]
33
+ if not batch:
34
+ return None
35
+
36
+ processed_features = []
37
+ processed_labels = []
38
+
39
+ for item in batch:
40
+ audio_array = item["audio"]["array"]
41
+ text = item["transcription"]
42
+
43
+ audio_inputs = self.processor.feature_extractor(
44
+ audio_array,
45
+ sampling_rate=self.sample_rate,
46
+ return_tensors="pt"
47
+ )
48
+
49
+ text_inputs = self.processor.tokenizer(
50
+ text,
51
+ padding="max_length",
52
+ truncation=True,
53
+ max_length=256,
54
+ return_tensors="pt"
55
+ )
56
+
57
+ feats = audio_inputs.input_features[0]
58
+ if feats.shape[-1] > self.max_frames:
59
+ feats = feats[..., :self.max_frames]
60
+ elif feats.shape[-1] < self.max_frames:
61
+ pad_size = self.max_frames - feats.shape[-1]
62
+ feats = torch.nn.functional.pad(feats, (0, pad_size))
63
+
64
+ processed_features.append(feats)
65
+
66
+ labels = text_inputs.input_ids[0]
67
+ labels[labels == self.processor.tokenizer.pad_token_id] = -100
68
+ processed_labels.append(labels)
69
+
70
+ batch_features = torch.stack(processed_features, dim=0)
71
+ batch_labels = torch.stack(processed_labels, dim=0)
72
+
73
+ return {
74
+ "input_features": batch_features,
75
+ "labels": batch_labels
76
+ }
77
+
78
+ def safe_collate_fn(self, batch):
79
+ try:
80
+ result = self.collate_fn(batch)
81
+ if result is None:
82
+ return {
83
+ "input_features": torch.zeros(1, 80, self.max_frames),
84
+ "labels": torch.full((1, 256), -100, dtype=torch.long)
85
+ }
86
+ return result
87
+ except Exception as e:
88
+ print(f"Collate error: {e}")
89
+ return {
90
+ "input_features": torch.zeros(1, 80, self.max_frames),
91
+ "labels": torch.full((1, 256), -100, dtype=torch.long)
92
+ }
93
+
94
+ def get_loader(self, dataset, shuffle=True, drop_last=True):
95
+ return DataLoader(
96
+ dataset,
97
+ batch_size=self.batch_size,
98
+ shuffle=shuffle,
99
+ num_workers=self.num_workers,
100
+ pin_memory=self.pin_memory,
101
+ collate_fn=self.safe_collate_fn,
102
+ persistent_workers=self.persistent_workers,
103
+ prefetch_factor=self.prefetch_factor,
104
+ drop_last=drop_last
105
+ )
src/data/dataset.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dataset implementation for LibriSpeech ASR.
3
+ """
4
+
5
+ import os
6
+ import glob
7
+ from pathlib import Path
8
+ from typing import List, Optional, Tuple, Dict, Any
9
+
10
+ import torch
11
+ from datasets import Dataset, Audio
12
+ from sklearn.model_selection import train_test_split
13
+
14
def load_dataset(
    root_dir: str,
    processor: Any,
    sample_cap: Optional[int] = None,
    val_ratio: float = 0.2
) -> Tuple[Dataset, Dataset]:
    """Load LibriSpeech from disk and return (train_ds, val_ds).

    Args:
        root_dir: dataset root directory.
        processor: Whisper processor (accepted for API symmetry; not used here).
        sample_cap: optional cap on the number of samples loaded.
        val_ratio: fraction of samples held out for validation.

    Returns:
        Tuple of (train dataset, validation dataset); the test split is discarded.
    """
    train_ds, val_ds, _test_ds = LibriSpeechDataset(
        root_dir=root_dir,
        sample_cap=sample_cap,
        val_ratio=val_ratio
    ).prepare_datasets()
    return train_ds, val_ds
38
+
39
class LibriSpeechDataset:
    """Scans a LibriSpeech directory tree and builds HuggingFace audio Datasets."""

    def __init__(
        self,
        root_dir: str,
        sample_rate: int = 16000,
        sample_cap: Optional[int] = None,
        val_ratio: float = 0.2
    ):
        # Root directory containing the LibriSpeech split folders.
        self.root_dir = root_dir
        # Sampling rate the audio column is decoded at.
        self.sample_rate = sample_rate
        # Optional upper bound on how many (audio, transcript) pairs to load.
        self.sample_cap = sample_cap
        # Fraction of the train+dev pool held out as the validation split.
        self.val_ratio = val_ratio
        # Populated by prepare_datasets().
        self.train_ds = None
        self.val_ds = None
        self.test_ds = None

    def load_split(self, splits: list, cap: int = None) -> Dataset:
        """Collect .flac files and their transcripts from the given split folders.

        Args:
            splits: split directory names to scan (e.g. ["train-clean-100"]).
            cap: stop after this many samples; None (or 0) means unlimited.

        Returns:
            Dataset with "audio" (decoded at self.sample_rate) and "transcription".

        Raises:
            ValueError: when no audio files were found for the requested splits.
        """
        audio_paths, transcripts = [], []
        for split in splits:
            split_dir = os.path.join(self.root_dir, split)
            if not os.path.isdir(split_dir):
                # Missing split directories are tolerated, not fatal.
                print(f"Warning: missing {split_dir}, skipping")
                continue
            for flac_path in glob.glob(f"{split_dir}/**/*.flac", recursive=True):
                stem = Path(flac_path).stem
                # Each chapter directory holds *.trans.txt files mapping
                # "<utterance-id> <text>" — find the line for this clip.
                for txt in glob.glob(f"{os.path.dirname(flac_path)}/*.trans.txt"):
                    with open(txt, "r", encoding="utf-8") as f:
                        for line in f:
                            if line.startswith(stem):
                                text = line.strip().split(" ", 1)[1]
                                audio_paths.append(flac_path)
                                transcripts.append(text)
                                break
                    # Early exit once the cap is reached (None → no limit;
                    # note cap=0 is also treated as "no limit" here).
                    if len(audio_paths) >= (cap or float("inf")):
                        break
                if cap and len(audio_paths) >= cap:
                    break
            if cap and len(audio_paths) >= cap:
                break
        if not audio_paths:
            raise ValueError(f"No audio files found under {self.root_dir} for {splits}")
        print(f"Found {len(audio_paths)} audio files")
        ds = Dataset.from_dict({"audio": audio_paths, "transcription": transcripts})
        # Lazily decode audio at the configured sampling rate.
        return ds.cast_column("audio", Audio(sampling_rate=self.sample_rate))

    def prepare_datasets(self) -> Tuple[Dataset, Dataset, Dataset]:
        """Load all splits and return (train, val, test) datasets.

        Train/val come from a shuffled split of train-clean-100 + dev-clean
        (seeded for reproducibility); test-clean is returned unsplit.
        """
        # Load the raw pools.
        train_val_ds = self.load_split(["train-clean-100", "dev-clean"], cap=self.sample_cap)
        test_ds = self.load_split(["test-clean"], cap=None)

        # Deterministic train/validation split over sample indices.
        idx = list(range(len(train_val_ds)))
        train_idx, val_idx = train_test_split(
            idx,
            test_size=self.val_ratio,
            random_state=42,
            shuffle=True
        )

        self.train_ds = train_val_ds.select(train_idx)
        self.val_ds = train_val_ds.select(val_idx)
        self.test_ds = test_ds

        print(f"Samples → train: {len(self.train_ds)}, "
              f"val: {len(self.val_ds)}, test: {len(self.test_ds)}")

        return self.train_ds, self.val_ds, self.test_ds

    @staticmethod
    def summarize_real_durations(ds, name, sr=16_000):
        """Print sample count, average clip length and total hours for a dataset."""
        if not isinstance(ds, Dataset):
            raise RuntimeError(f"{name}_ds 不是 HuggingFace Dataset,请重新运行 Cell 1")

        # Re-decode audio at the given rate and compute per-clip durations
        # in parallel batches; drop the audio column to keep memory bounded.
        ds2 = ds.cast_column("audio", Audio(sampling_rate=sr))
        ds2 = ds2.map(
            lambda batch: {
                "duration": [
                    len(item["array"]) / sr
                    for item in batch["audio"]
                ]
            },
            batched=True,
            batch_size=32,
            num_proc=4,
            remove_columns=["audio"],
            load_from_cache_file=False,
        )

        durations = ds2["duration"]
        total_h = sum(durations) / 3600.0
        avg_s = sum(durations) / len(durations)
        print(f"{name:5s} | Samples: {len(ds2):4d} | Avg: {avg_s:5.1f}s | Total: {total_h:5.1f}h")
src/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
src/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (140 Bytes). View file
 
src/models/__pycache__/lora.cpython-311.pyc ADDED
Binary file (4.56 kB). View file
 
src/models/__pycache__/lora.cpython-312.pyc ADDED
Binary file (4.35 kB). View file
 
src/models/lora.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LoRA (Low-Rank Adaptation) implementation.
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from transformers import WhisperForConditionalGeneration
8
+ from peft import LoraConfig, get_peft_model, TaskType, PeftModel
9
+ from typing import Optional
10
+ import itertools
11
+
12
+
13
def build_lora_student_model(
    config,
    device: torch.device
) -> tuple[PeftModel, Optional[nn.Linear]]:
    """Create the LoRA-wrapped student model and optional hidden-state projection.

    The Whisper encoder/decoder backbone is frozen; only the injected LoRA
    adapters (plus the projection layer when hidden-state distillation is
    enabled) remain trainable.
    """
    base = WhisperForConditionalGeneration.from_pretrained(
        config.model.student_model,
        torch_dtype=torch.float16,
        use_cache=False
    )
    base.gradient_checkpointing_enable()

    # Freeze the whole backbone before injecting the adapters.
    for param in itertools.chain(base.model.encoder.parameters(),
                                 base.model.decoder.parameters()):
        param.requires_grad = False

    # LoRA configuration from the project config.
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=config.lora.r,
        lora_alpha=config.lora.alpha,
        lora_dropout=config.lora.dropout,
        target_modules=config.lora.target_modules,
        bias="none"
    )
    student = get_peft_model(base, lora_config).to(device)

    # Projection from student to teacher hidden size, built only when
    # hidden-state distillation is active.
    proj = None
    if config.distillation.hidden_beta > 0:
        proj = nn.Linear(
            config.model.student_hidden_dim,
            config.model.teacher_hidden_dim
        ).to(device)

    # Report the trainable-parameter budget (adapters + projection vs. total).
    student_params = list(student.parameters())
    proj_params = list(proj.parameters()) if proj else []
    trainable = sum(p.numel() for p in student_params if p.requires_grad)
    trainable += sum(p.numel() for p in proj_params)
    total = sum(p.numel() for p in student_params) + sum(p.numel() for p in proj_params)
    print(f"Trainable parameters: {trainable:,} ({trainable/total*100:.2f}%)")

    return student, proj
54
+
55
+
56
def build_teacher_model(
    config,
    device: torch.device
) -> WhisperForConditionalGeneration:
    """Load the frozen fp16 teacher model in eval mode.

    NOTE(review): the `device` parameter is accepted but the teacher is not
    moved onto it here — presumably the caller does that; confirm.
    """
    teacher = WhisperForConditionalGeneration.from_pretrained(
        config.model.teacher_model,
        torch_dtype=torch.float16,
        use_cache=False
    )
    teacher.eval()

    # The teacher never receives gradient updates.
    for param in teacher.parameters():
        param.requires_grad = False

    return teacher
src/trainers/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/trainers/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (142 Bytes). View file