flexthink
/

soundchoice-g2p

@@ -10,60 +10,41 @@
 # Seed needs to be set at top of yaml, before objects with parameters are made
 seed: 1234
-__set_seed: !apply:torch.manual_seed [1234]  # literal argument repeats the `seed` value above
 # Tokenizers
-char_tokenize: false
 char_token_type: unigram  # ["unigram", "bpe", "char"]
 char_token_output: 512
-char_token_wordwise: true
-phn_tokenize: false
 phn_token_type: unigram  # ["unigram", "bpe", "char"]
 phn_token_output: 512  # index(blank/eos/bos/unk) = 0
-phn_token_wordwise: true
 character_coverage: 1.0
 phonemes_count: 43  # same number as the `false` entry of the `output_neurons` choice
 graphemes_count: 31  # same number as the `false` entry of the `enc_num_embeddings` choice
-phonemes_enable_space: true
-# Training Parameters
-lexicon_epochs: 50
-lexicon_ctc_epochs: 10
-lexicon_limit_to_stop: 50                    # No stopping by default, can override
-lexicon_limit_warmup: 50                    # No stopping by default, can override
-sentence_epochs: 13
-sentence_ctc_epochs: 10
-sentence_limit_to_stop: 3
-sentence_limit_warmup: 3
-homograph_epochs: 50
-homograph_ctc_epochs: 10
-homograph_limit_to_stop: 5
-homograph_limit_warmup: 10
-lexicon_batch_size: 1024
-sentence_batch_size: 32
-homograph_batch_size: 32
 ctc_weight: 0.5  # referenced as the `ctc` weight in the scorer configuration
 ctc_window_size: 0
 homograph_loss_weight: 2.0
-lr: 0.002  # same value as `opt_class.lr` and `lr_annealing.initial_value`
-save_for_pretrained: true
 # Model parameters
-output_neurons: &id004 !apply:speechbrain.utils.hparams.choice  # anchor id004 is aliased by lin, ctc_lin and emb
-  value: false
   choices:
-    true: 513
-    false: 43
-enc_num_embeddings: &id005 !apply:speechbrain.utils.hparams.choice  # anchor id005 is aliased by encoder_emb
-  value: false
   choices:
-    true: 513
-    false: 31
 enc_dropout: 0.5
 enc_neurons: 512
@@ -118,7 +99,7 @@ word_emb_dim: 768
 word_emb_enc_dim: 256
 word_emb_norm_type: batch
-graphemes: &id028
 - A
 - B
 - C
@@ -148,9 +129,7 @@ graphemes: &id028
 - "'"
 - ' '
-phonemes: &id001
 - AA
 - AE
 - AH
@@ -192,94 +171,91 @@ phonemes: &id001
 - ZH
 - ' '
-enc_input_dim: &id003 !apply:speechbrain.lobes.models.g2p.model.input_dim  # anchor id003 is the last entry of `enc.input_shape`
-  use_word_emb: true
-  word_emb_enc_dim: 256
-  embedding_dim: 512
-phn_char_map: &id002 !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map  # built from the phoneme list (alias id001)
-# Models (section heading; note it sits between `phn_char_map` and its `tokens` argument)
-  tokens: *id001
-char_phn_map: &id023 !apply:speechbrain.lobes.models.g2p.dataio.flip_map  # built from `phn_char_map` (alias id002)
-  map_dict: *id002
-enc: &id006 !new:speechbrain.nnet.RNN.LSTM  # encoder; anchor id006 is aliased in `modules`
-  input_shape: [null, null, *id003]  # only the last dimension is specified (alias of `enc_input_dim`)
-  bidirectional: true
-  hidden_size: 512
-  num_layers: 4
-  dropout: 0.5
-lin: &id010 !new:speechbrain.nnet.linear.Linear  # output projection; aliased by the model and both beam searchers
-  input_size: 512
-  n_neurons: *id004
   bias: false
-ctc_lin: &id013 !new:speechbrain.nnet.linear.Linear  # separate projection; aliased in `modules` and `pretrainer.loadables`
-  input_size: 1024  # same value as `dec.enc_dim`
-  n_neurons: *id004
-encoder_emb: &id007 !new:speechbrain.nnet.embedding.Embedding  # sized by `enc_num_embeddings` (alias id005)
-  num_embeddings: *id005
-  embedding_dim: 512
-emb: &id008 !new:speechbrain.nnet.embedding.Embedding  # sized by `output_neurons` (alias id004); also the beam searchers' `embedding`
-  num_embeddings: *id004
-  embedding_dim: 512
-dec: &id009 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder  # decoder; aliased by the model and both beam searchers
-  enc_dim: 1024  # twice the encoder `hidden_size` of 512
-  input_size: 512
   rnn_type: gru
   attn_type: content
-  dropout: 0.5
-  hidden_size: 512
-  attn_dim: 256
-  num_layers: 4
-word_emb_enc: &id012 !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder  # aliased by the model and `modules`
-  word_emb_dim: 768
-  word_emb_enc_dim: 256
   norm_type: batch
 word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init  # receives the `init` callable below
   init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
     model: bert-base-uncased
-log_softmax: &id011 !new:speechbrain.nnet.activations.Softmax  # aliased as the `out` entry of the model and `modules`
   apply_log: true
 modules:
-  model: &id014 !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq  # model is defined inline here and anchored as id014
-    enc: *id006
-    encoder_emb: *id007
-    emb: *id008
-    dec: *id009
-    lin: *id010
-    out: *id011
-    use_word_emb: true
-    word_emb_enc: *id012
-  enc: *id006
-  encoder_emb: *id007
-  emb: *id008
-  dec: *id009
-  lin: *id010
-  ctc_lin: *id013
-  out: *id011
   word_emb: !ref <word_emb>
-  word_emb_enc: *id012
-model: *id014  # top-level `model` is an alias of the object anchored under `modules`
-lm_model: &id015 !new:speechbrain.lobes.models.RNNLM.RNNLM  # anchor id015 is not aliased in the visible lines
-  embedding_dim: 256
-  rnn_layers: 2
-  rnn_neurons: 512
-  output_neurons: 43
-  return_hidden: true
-opt_class: !name:torch.optim.Adam  # !name entry for torch.optim.Adam with lr preset
-  lr: 0.002
 ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
   eos_index: !ref <eos_index>
@@ -296,66 +272,38 @@ scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
       coverage: !ref <beam_search_coverage_penalty>
       ctc: !ref <ctc_weight>
-beam_searcher: &id029 !new:speechbrain.decoders.S2SRNNBeamSearcher  # anchor id029 is aliased by the decode pipeline's beam_search_pipeline step
-  embedding: *id008
-  decoder: *id009
-  linear: *id010
   bos_index: !ref <bos_index>
   eos_index: !ref <eos_index>
-  min_decode_ratio: 0
-  max_decode_ratio: 1.0
-  beam_size: 16
-  eos_threshold: 10.0
-  using_max_attn_shift: false
-  max_attn_shift: 10
   scorer: !ref <scorer>
 beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher  # second searcher instance; arguments match `beam_searcher` line for line
-  embedding: *id008
-  decoder: *id009
-  linear: *id010
   bos_index: !ref <bos_index>
   eos_index: !ref <eos_index>
-  min_decode_ratio: 0
-  max_decode_ratio: 1.0
-  beam_size: 16
-  eos_threshold: 10.0
-  using_max_attn_shift: false
-  max_attn_shift: 10
   scorer: !ref <scorer>
-lr_annealing: &id018 !new:speechbrain.nnet.schedulers.NewBobScheduler  # anchor id018 is not aliased in the visible lines
-  initial_value: 0.002  # same value as `lr`
-  improvement_threshold: 0.0
-  annealing_factor: 0.8
-  patient: 0
 homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
-seq_cost: &id016 !name:speechbrain.nnet.losses.nll_loss  # aliased as `homograph_cost.seq_cost`
-  label_smoothing: 0.1
-ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
-  blank_index: 2
-seq_cost_metric: &id017 !name:speechbrain.nnet.losses.nll_loss  # same loss as `seq_cost` plus `reduction: batch`; aliased by both seq_stats entries
-  label_smoothing: 0.1
-  reduction: batch
-homograph_cost: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceLoss
-  seq_cost: *id016
-seq_stats: !name:speechbrain.utils.metric_stats.MetricStats
-  metric: *id017
-seq_stats_homograph: !name:speechbrain.utils.metric_stats.MetricStats
-  metric: *id017
-classification_stats_homograph: !name:speechbrain.utils.metric_stats.ClassificationStats
-per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
-per_stats_homograph: !name:speechbrain.utils.metric_stats.ErrorRateStats
 model_output_keys:
 - p_seq
 - char_lens
@@ -368,46 +316,45 @@ phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
 grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init  # receives the SentencePiece `init` callable below
   init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
     model_dir: grapheme_tokenizer
-    bos_id: 0
-    eos_id: 1
-    unk_id: 2
-    vocab_size: 512  # same value as `char_token_output`
-    annotation_train: tokenizer_annotation_train.json
     annotation_read: char
-    model_type: unigram                    # ["unigram", "bpe", "char"]
-    character_coverage: 1.0
     annotation_format: json
     text_file: grapheme_annotations.txt
-phoneme_tokenizer: &id022 !apply:speechbrain.lobes.models.g2p.dataio.lazy_init  # anchor id022 is aliased as the `tokenizer` of char_map_detokenize
   init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
     model_dir: phoneme_tokenizer
-    bos_id: 0
-    eos_id: 1
-    unk_id: 2
-    vocab_size: 512  # same value as `phn_token_output`
-    annotation_train: tokenizer_annotation_train.json
     annotation_read: phn
-    model_type: unigram                   # ["unigram", "bpe", "char"]
-    character_coverage: 1.0
-    annotation_list_to_check: [tokenizer_annotation_valid.json]
     annotation_format: json
-    text_file: phoneme_annotations.txt
-out_phoneme_decoder_tok: &id025 !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize  # `true` branch of `out_phoneme_decoder`
-  tokenizer: *id022
-  char_map: *id023
-  token_space_index: 512
-  wordwise: true
-out_phoneme_decoder_raw: &id026 !name:speechbrain.lobes.models.g2p.dataio.text_decode  # `false` branch of `out_phoneme_decoder`
-  encoder: *id024
 out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice  # selects between the tokenized (id025) and raw (id026) decoders
   value: false
   choices:
-    true: *id025
-    false: *id026
 encode_pipeline:
   batch: false
   use_padded_data: true
@@ -418,22 +365,22 @@ encode_pipeline:
   - word_emb
   init:
   - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
-      encoder: *id027
-      tokens: *id028
-      bos_index: 0
-      eos_index: 1
   - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
-      encoder: *id024
-      tokens: *id001
-      bos_index: 0
-      eos_index: 1
   steps:
   - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
-      graphemes: *id028
     takes: txt
     provides: txt_cleaned
   - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
-      grapheme_encoder: *id027
     takes: txt_cleaned
     provides:
     - grapheme_list
@@ -441,7 +388,7 @@ encode_pipeline:
     - grapheme_encoded_raw
   - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
-      encoder: *id027
     takes: grapheme_encoded_list
     provides:
     - grapheme_encoded
@@ -464,7 +411,7 @@ decode_pipeline:
   - phonemes
   steps:
   - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
-      beam_searcher: *id029
     takes:
     - char_lens
     - encoder_out
@@ -474,13 +421,13 @@ decode_pipeline:
   - func: !apply:speechbrain.utils.hparams.choice
       value: false
       choices:
-        true: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
-          tokenizer: *id022
-          char_map: *id023
-          token_space_index: 512
-          wordwise: true
-        false: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
-          phoneme_encoder: *id024
     takes:
     - hyps
     provides:
@@ -489,6 +436,5 @@ decode_pipeline:
 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer  # registers the objects listed under `loadables`
   loadables:
-    model: *id014
-    ctc_lin: *id013

 # Seed needs to be set at top of yaml, before objects with parameters are made
 seed: 1234
+__set_seed: !apply:torch.manual_seed [!ref <seed>]  # passes the `seed` value above to torch.manual_seed
 # Tokenizers
+char_tokenize: False  # selector of the `enc_num_embeddings` choice
 char_token_type: unigram  # ["unigram", "bpe", "char"]
 char_token_output: 512  # grapheme tokenizer `vocab_size`; `enc_num_embeddings` is this + 1 when char_tokenize is True
+char_token_wordwise: True
+phn_tokenize: False  # selector of the `output_neurons` choice
 phn_token_type: unigram  # ["unigram", "bpe", "char"]
 phn_token_output: 512  # index(blank/eos/bos/unk) = 0
+phn_token_wordwise: True  # forwarded as `wordwise` to char_map_detokenize
 character_coverage: 1.0  # referenced by both the grapheme and the phoneme SentencePiece tokenizer
 phonemes_count: 43  # `output_neurons` when phn_tokenize is False
 graphemes_count: 31  # `enc_num_embeddings` when char_tokenize is False
+phonemes_enable_space: True
 ctc_weight: 0.5  # referenced as the `ctc` weight in the scorer configuration
 ctc_window_size: 0
 homograph_loss_weight: 2.0
 # Model parameters
+output_neurons: !apply:speechbrain.utils.hparams.choice  # size used by lin, ctc_lin and emb; selected by `phn_tokenize`
+  value: !ref <phn_tokenize>
   choices:
+    True: !ref <phn_token_output> + 1  # phoneme tokenizer vocab_size plus one
+    False: !ref <phonemes_count>
+enc_num_embeddings: !apply:speechbrain.utils.hparams.choice  # size used by encoder_emb; selected by `char_tokenize`
+  value: !ref <char_tokenize>
   choices:
+    True: !ref <char_token_output> + 1  # grapheme tokenizer vocab_size plus one
+    False: !ref <graphemes_count>
 enc_dropout: 0.5  # `dropout` of the LSTM encoder `enc`
 enc_neurons: 512  # `hidden_size` of `enc`; doubled for `ctc_lin.input_size` and `dec.enc_dim`
 word_emb_enc_dim: 256  # passed as `word_emb_enc_dim` to input_dim and WordEmbeddingEncoder
 word_emb_norm_type: batch  # NOTE(review): `word_emb_enc.norm_type` is written as the literal `batch`, not as a reference to this key
+graphemes:
 - A
 - B
 - C
 - "'"
 - ' '
+phonemes:
 - AA
 - AE
 - AH
 - ZH
 - ' '
+enc_input_dim: !apply:speechbrain.lobes.models.g2p.model.input_dim  # result becomes the last entry of `enc.input_shape`
+  use_word_emb: !ref <use_word_emb>
+  word_emb_enc_dim: !ref <word_emb_enc_dim>
+  embedding_dim: !ref <embedding_dim>
+phn_char_map: !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map  # built from the `phonemes` list
+  tokens: !ref <phonemes>
+char_phn_map: !apply:speechbrain.lobes.models.g2p.dataio.flip_map  # built from `phn_char_map`; passed as `char_map` to char_map_detokenize
+  map_dict: !ref <phn_char_map>
+enc: !new:speechbrain.nnet.RNN.LSTM  # encoder; referenced by `model` and `modules`
+  input_shape: [null, null, !ref <enc_input_dim>]  # only the last dimension is specified
+  bidirectional: True
+  hidden_size: !ref <enc_neurons>
+  num_layers: !ref <enc_num_layers>  # `enc_num_layers` is defined outside the visible lines
+  dropout: !ref <enc_dropout>
+lin: !new:speechbrain.nnet.linear.Linear  # output projection; referenced by `model`, `modules` and both beam searchers
+  input_size: !ref <dec_neurons>
+  n_neurons: !ref <output_neurons>
   bias: false
+ctc_lin: !new:speechbrain.nnet.linear.Linear  # separate projection; referenced by `modules` and `pretrainer.loadables`
+  input_size: !ref 2 * <enc_neurons>  # same value as `dec.enc_dim`; presumably doubled because `enc` is bidirectional
+  n_neurons: !ref <output_neurons>
+encoder_emb: !new:speechbrain.nnet.embedding.Embedding  # embedding table sized by `enc_num_embeddings`
+  num_embeddings: !ref <enc_num_embeddings>
+  embedding_dim: !ref <embedding_dim>
+emb: !new:speechbrain.nnet.embedding.Embedding  # embedding table sized by `output_neurons`; also the beam searchers' `embedding`
+  num_embeddings: !ref <output_neurons>
+  embedding_dim: !ref <embedding_dim>
+dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder  # decoder; referenced by `model`, `modules` and both beam searchers
+  enc_dim: !ref <enc_neurons> * 2  # same value as `ctc_lin.input_size`
+  input_size: !ref <embedding_dim>
   rnn_type: gru
   attn_type: content
+  dropout: !ref <dec_dropout>
+  hidden_size: !ref <dec_neurons>  # also the `input_size` of `lin`
+  attn_dim: !ref <dec_att_neurons>
+  num_layers: !ref <dec_num_layers>
+word_emb_enc: !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder  # referenced by `model` and `modules`
+  word_emb_dim: !ref <word_emb_dim>
+  word_emb_enc_dim: !ref <word_emb_enc_dim>
   norm_type: batch  # literal; NOTE(review): `word_emb_norm_type` holds the same value and could be referenced instead
 word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init  # receives the `init` callable below; presumably defers construction, per the helper's name — confirm
   init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
     model: bert-base-uncased
+log_softmax: !new:speechbrain.nnet.activations.Softmax  # referenced as `out` by `model` and `modules`
   apply_log: true
+model: !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq  # assembled from the top-level components defined above; referenced by `modules` and `pretrainer`
+  enc: !ref <enc>
+  encoder_emb: !ref <encoder_emb>
+  emb: !ref <emb>
+  dec: !ref <dec>
+  lin: !ref <lin>
+  out: !ref <log_softmax>
+  use_word_emb: !ref <use_word_emb>
+  word_emb_enc: !ref <word_emb_enc>
 modules:  # name -> object map; lists `model`, its components, and additionally `ctc_lin` and `word_emb`
+  model: !ref <model>
+  enc: !ref <enc>
+  encoder_emb: !ref <encoder_emb>
+  emb: !ref <emb>
+  dec: !ref <dec>
+  lin: !ref <lin>
+  ctc_lin: !ref <ctc_lin>
+  out: !ref <log_softmax>
   word_emb: !ref <word_emb>
+  word_emb_enc: !ref <word_emb_enc>
+lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM  # all `lm_*` keys referenced here are defined outside the visible lines
+  embedding_dim: !ref <lm_emb_dim>
+  rnn_layers: !ref <lm_layers>
+  rnn_neurons: !ref <lm_rnn_size>
+  output_neurons: !ref <lm_output_neurons>
+  return_hidden: True
 ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
   eos_index: !ref <eos_index>
       coverage: !ref <beam_search_coverage_penalty>
       ctc: !ref <ctc_weight>
+beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher  # referenced by the decode pipeline's beam_search_pipeline step
+  embedding: !ref <emb>
+  decoder: !ref <dec>
+  linear: !ref <lin>
   bos_index: !ref <bos_index>
   eos_index: !ref <eos_index>
+  min_decode_ratio: !ref <beam_search_min_decode_ratio>
+  max_decode_ratio: !ref <beam_search_max_decode_ratio>
+  beam_size: !ref <beam_search_beam_size>
+  eos_threshold: !ref <beam_search_eos_threshold>
+  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
+  max_attn_shift: !ref <beam_search_max_attn_shift>
+  temperature: !ref <beam_search_temperature>  # `beam_search_*` keys are defined outside the visible lines
   scorer: !ref <scorer>
 beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher  # second searcher instance; arguments match `beam_searcher` line for line
+  embedding: !ref <emb>
+  decoder: !ref <dec>
+  linear: !ref <lin>
   bos_index: !ref <bos_index>
   eos_index: !ref <eos_index>
+  min_decode_ratio: !ref <beam_search_min_decode_ratio>
+  max_decode_ratio: !ref <beam_search_max_decode_ratio>
+  beam_size: !ref <beam_search_beam_size>
+  eos_threshold: !ref <beam_search_eos_threshold>
+  using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
+  max_attn_shift: !ref <beam_search_max_attn_shift>
+  temperature: !ref <beam_search_temperature>
   scorer: !ref <scorer>
 homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
 model_output_keys:
 - p_seq
 - char_lens
 grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init  # receives the SentencePiece `init` callable below
   init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
     model_dir: grapheme_tokenizer
+    bos_id: !ref <bos_index>
+    eos_id: !ref <eos_index>
+    unk_id: !ref <unk_index>
+    vocab_size: !ref <char_token_output>
+    annotation_train: null  # no training annotation file is configured here
     annotation_read: char
+    model_type: !ref <char_token_type> # ["unigram", "bpe", "char"]
+    character_coverage: !ref <character_coverage>
     annotation_format: json
     text_file: grapheme_annotations.txt
+phoneme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init  # referenced as the `tokenizer` of char_map_detokenize
   init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
     model_dir: phoneme_tokenizer
+    bos_id: !ref <bos_index>
+    eos_id: !ref <eos_index>
+    unk_id: !ref <unk_index>
+    vocab_size: !ref <phn_token_output>
+    annotation_train: null  # no training annotation file is configured here
     annotation_read: phn
+    model_type: !ref <phn_token_type> # ["unigram", "bpe", "char"]
+    character_coverage: !ref <character_coverage>
     annotation_format: json
+    text_file: null  # unlike grapheme_tokenizer, no text file is configured
+out_phoneme_decoder_tok: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize  # `True` branch of `out_phoneme_decoder`
+  tokenizer: !ref <phoneme_tokenizer>
+  char_map: !ref <char_phn_map>
+  token_space_index: !ref <token_space_index>  # `token_space_index` is defined outside the visible lines
+  wordwise: !ref <phn_token_wordwise>
+out_phoneme_decoder_raw: !name:speechbrain.lobes.models.g2p.dataio.text_decode  # `False` branch of `out_phoneme_decoder`: text_decode with `encoder` preset
+  encoder: !ref <phoneme_encoder>
 out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice  # selects the tokenized or the raw phoneme decoder
+  value: !ref <phn_tokenize>  # same switch that sizes `output_neurons`, so model size and decoder cannot disagree
   choices:
+    True: !ref <out_phoneme_decoder_tok>
+    False: !ref <out_phoneme_decoder_raw>
 encode_pipeline:
   batch: false
   use_padded_data: true
   - word_emb
   init:
   - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
+      encoder: !ref <grapheme_encoder>
+      tokens: !ref <graphemes>
+      bos_index: !ref <bos_index>
+      eos_index: !ref <eos_index>
   - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
+      encoder: !ref <phoneme_encoder>
+      tokens: !ref <phonemes>
+      bos_index: !ref <bos_index>
+      eos_index: !ref <eos_index>
   steps:
   - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
+      graphemes: !ref <graphemes>
     takes: txt
     provides: txt_cleaned
   - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
+      grapheme_encoder: !ref <grapheme_encoder>
     takes: txt_cleaned
     provides:
     - grapheme_list
     - grapheme_encoded_raw
   - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
+      encoder: !ref <grapheme_encoder>
     takes: grapheme_encoded_list
     provides:
     - grapheme_encoded
   - phonemes
   steps:
   - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
+      beam_searcher: !ref <beam_searcher>
     takes:
     - char_lens
     - encoder_out
   - func: !apply:speechbrain.utils.hparams.choice  # step function: tokenized detokenizer or raw phoneme decoder
+      value: !ref <phn_tokenize>  # same switch that sizes `output_neurons`, so model size and decoder cannot disagree
       choices:
+        True: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
+          tokenizer: !ref <phoneme_tokenizer>
+          char_map: !ref <char_phn_map>
+          token_space_index: !ref <token_space_index>
+          wordwise: !ref <phn_token_wordwise>
+        False: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
+          phoneme_encoder: !ref <phoneme_encoder>
     takes:
     - hyps
     provides:
 pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer  # registers the objects listed under `loadables`
   loadables:
+    model: !ref <model>
+    ctc_lin: !ref <ctc_lin>  # `ctc_lin` is not an argument of `model`, so it is listed separately