---
tags:
- espnet
- audio
---

Repository: `ms180/mini_an4_integration_test`

## Training

- System: `ASRSystem`
- Recipe: `mini_an4/asr`
- Creator: `ms180`
- Created: `2026-01-14T00:36:01.951598`
- Git: `8509faad9811b58d5024f29fb9d68ffb026b5e73` (dirty)

## Pack

- Archive: `model_pack`
- Strategy: `espnet3`
- Exp dir: `exp/train_asr_rnn_data_aug_debug`

## Train config

<details><summary>expand</summary>

```yaml
num_device: 1
num_nodes: 1
task: espnet3.systems.asr.task.ASRTask
recipe_dir: .
data_dir: ./data
exp_tag: train_asr_rnn_data_aug_debug
exp_dir: ./exp/train_asr_rnn_data_aug_debug
stats_dir: ./exp/stats
decode_dir: ./exp/train_asr_rnn_data_aug_debug/decode
dataset_dir: ./data/mini_an4
create_dataset:
  func: src.create_dataset.create_dataset
  dataset_dir: ./data/mini_an4
  archive_path: ./../../egs2/mini_an4/asr1/downloads.tar.gz
dataset:
  _target_: espnet3.components.data.data_organizer.DataOrganizer
  train:
  - name: train_nodev
    dataset:
      _target_: src.dataset.MiniAN4Dataset
      manifest_path: ./data/mini_an4/manifest/train_nodev.tsv
  valid:
  - name: train_dev
    dataset:
      _target_: src.dataset.MiniAN4Dataset
      manifest_path: ./data/mini_an4/manifest/train_dev.tsv
  preprocessor:
    _target_: espnet2.train.preprocessor.CommonPreprocessor
    _convert_: all
    fs: 16000
    train: true
    data_aug_effects:
    - - 0.1
      - contrast
      - enhancement_amount: 75.0
    - - 0.1
      - highpass
      - cutoff_freq: 5000
        Q: 0.707
    - - 0.1
      - equalization
      - center_freq: 1000
        gain: 0
        Q: 0.707
    - - 0.1
      - - - 0.3
          - speed_perturb
          - factor: 0.9
        - - 0.3
          - speed_perturb
          - factor: 1.1
        - - 0.3
          - speed_perturb
          - factor: 1.3
    data_aug_num:
    - 1
    - 4
    data_aug_prob: 1.0
    token_type: bpe
    token_list: ./data/bpe_30/tokens.txt
    bpemodel: ./data/bpe_30/bpe.model
parallel:
  env: local
  n_workers: 1
dataloader:
  collate_fn:
    _target_: espnet2.train.collate_fn.CommonCollateFn
    int_pad_value: -1
  train:
    multiple_iterator: false
    num_shards: 1
    iter_factory:
      _target_: espnet2.iterators.sequence_iter_factory.SequenceIterFactory
      shuffle: true
      collate_fn:
        _target_: espnet2.train.collate_fn.CommonCollateFn
        int_pad_value: -1
      num_workers: 0
      batches:
        type: sorted
        shape_files:
        - ./exp/stats/train/feats_shape
        batch_size: 2
        batch_bins: 200000
  valid:
    multiple_iterator: false
    num_shards: 1
    iter_factory:
      _target_: espnet2.iterators.sequence_iter_factory.SequenceIterFactory
      shuffle: false
      collate_fn:
        _target_: espnet2.train.collate_fn.CommonCollateFn
        int_pad_value: -1
      batches:
        type: sorted
        shape_files:
        - ./exp/stats/valid/feats_shape
        batch_size: 2
        batch_bins: 200000
optim:
  _target_: torch.optim.Adam
  lr: 0.001
  weight_decay: 0.0
scheduler:
  _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
  mode: min
  factor: 0.5
  patience: 1
val_scheduler_criterion: valid/loss
best_model_criterion:
- - valid/acc
  - 1
  - max
trainer:
  accelerator: auto
  devices: 1
  num_nodes: 1
  accumulate_grad_batches: 1
  check_val_every_n_epoch: 1
  gradient_clip_val: 1.0
  log_every_n_steps: 1
  max_epochs: 1
  limit_train_batches: 1
  limit_val_batches: 1
  precision: 32
  logger:
  - _target_: lightning.pytorch.loggers.TensorBoardLogger
    save_dir: ./exp/train_asr_rnn_data_aug_debug/tensorboard
    name: tb_logger
  strategy: auto
tokenizer:
  vocab_size: 30
  character_coverage: 1.0
  model_type: bpe
  save_path: ./data/bpe_30
  text_builder:
    func: src.tokenizer.gather_training_text
    manifest_path: ./data/mini_an4/manifest/train_nodev.tsv
model:
  vocab_size: 30
  token_list: ./data/bpe_30/tokens.txt
  encoder: vgg_rnn
  encoder_conf:
    num_layers: 1
    hidden_size: 2
    output_size: 2
  decoder: rnn
  decoder_conf:
    hidden_size: 2
  normalize: utterance_mvn
  normalize_conf: {}
  model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
  frontend: default
  frontend_conf:
    n_fft: 512
    win_length: 400
    hop_length: 160
```

</details>

### Citing ESPnet

```bibtex
@inproceedings{watanabe2018espnet,
  author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and
    Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner
    and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
  title={{ESPnet}: End-to-End Speech Processing Toolkit},
  year={2018},
  booktitle={Proceedings of Interspeech},
  pages={2207--2211},
  doi={10.21437/Interspeech.2018-1456}
}
```