| { |
| "architectures": [ |
| "CohereAsrForConditionalGeneration" |
| ], |
| "auto_map": { |
| "AutoConfig": "configuration_cohere_asr.CohereAsrConfig", |
| "AutoFeatureExtractor": "processing_cohere_asr.CohereAsrFeatureExtractor", |
| "AutoModel": "modeling_cohere_asr.CohereAsrModel", |
| "AutoModelForSpeechSeq2Seq": "modeling_cohere_asr.CohereAsrForConditionalGeneration", |
| "AutoProcessor": "processing_cohere_asr.CohereAsrProcessor", |
| "AutoTokenizer": "tokenization_cohere_asr.CohereAsrTokenizer" |
| }, |
| "batch_size": 128, |
| "decoding": { |
| "beam": { |
| "beam_size": 1, |
| "len_pen": 0.0, |
| "max_generation_delta": 50 |
| }, |
| "return_best_hypothesis": true, |
| "strategy": "beam" |
| }, |
| "encoder": { |
| "att_context_size": [ |
| -1, |
| -1 |
| ], |
| "causal_downsampling": false, |
| "conv_context_size": null, |
| "conv_kernel_size": 9, |
| "conv_norm_type": "batch_norm", |
| "d_model": 1280, |
| "dropout": 0, |
| "dropout_att": 0, |
| "dropout_emb": 0, |
| "dropout_pre_encoder": 0, |
| "feat_in": 128, |
| "feat_out": -1, |
| "ff_expansion_factor": 4, |
| "n_heads": 8, |
| "n_layers": 48, |
| "pos_emb_max_len": 5000, |
| "reduction": null, |
| "reduction_factor": 1, |
| "reduction_position": null, |
| "self_attention_model": "rel_pos", |
| "subsampling": "dw_striding", |
| "subsampling_conv_channels": 256, |
| "subsampling_factor": 8, |
| "untie_biases": true, |
| "xscaling": false |
| }, |
| "head": { |
| "activation": "relu", |
| "dropout": 0, |
| "hidden_size": 1024, |
| "log_softmax": true, |
| "num_classes": 16384, |
| "num_layers": 1, |
| "use_transformer_init": true |
| }, |
| "is_encoder_decoder": true, |
| "log_batch_stats": false, |
| "log_prediction": true, |
| "max_audio_clip_s": 35, |
| "max_seq_len": 1024, |
| "model_defaults": { |
| "asr_enc_hidden": 1280, |
| "lm_dec_hidden": 1024, |
| "lm_enc_hidden": 1024 |
| }, |
| "model_type": "cohere_asr", |
| "multitask_metrics_cfg": { |
| "log_predictions": true, |
| "metrics": { |
| "wer": { |
| "constraint": ".source_lang==.target_lang" |
| } |
| } |
| }, |
| "overlap_chunk_second": 5, |
| "preprocessor": { |
| "dither": 1e-05, |
| "features": 128, |
| "frame_splicing": 1, |
| "log": true, |
| "n_fft": 512, |
| "normalize": "per_feature", |
| "pad_to": 0, |
| "pad_value": 0.0, |
| "sample_rate": 16000, |
| "window": "hann", |
| "window_size": 0.025, |
| "window_stride": 0.01 |
| }, |
| "prompt_defaults": [ |
| { |
| "role": "user", |
| "slots": { |
| "decodercontext": "", |
| "diarize": "<|nodiarize|>", |
| "emotion": "<|emo:undefined|>", |
| "itn": "<|noitn|>", |
| "pnc": "<|pnc|>", |
| "source_lang": "<|en|>", |
| "target_lang": "<|en|>", |
| "timestamp": "<|notimestamp|>" |
| } |
| }, |
| { |
| "role": "user_partial", |
| "slots": { |
| "decodercontext": "" |
| } |
| } |
| ], |
| "prompt_format": "cohere_asr", |
| "sample_rate": 16000, |
| "supported_languages": [ |
| "en", |
| "fr", |
| "de", |
| "es", |
| "it", |
| "pt", |
| "nl", |
| "pl", |
| "el", |
| "ar", |
| "ja", |
| "zh", |
| "vi", |
| "ko" |
| ], |
| "transf_decoder": { |
| "config_dict": { |
| "attn_layer_dropout": 0, |
| "attn_score_dropout": 0, |
| "embedding_dropout": 0, |
| "ffn_dropout": 0, |
| "hidden_act": "relu", |
| "hidden_size": 1024, |
| "inner_size": 4096, |
| "learn_positional_encodings": false, |
| "lm_dec_hidden": 1280, |
| "max_sequence_length": 1024, |
| "num_attention_heads": 8, |
| "num_layers": 8, |
| "num_token_types": 0, |
| "pre_ln": true, |
| "vocab_size": null |
| }, |
| "encoder": null, |
| "model_name": null, |
| "pre_ln_final_layer_norm": true, |
| "pretrained": false |
| }, |
| "transf_encoder": { |
| "attn_layer_dropout": 0, |
| "attn_score_dropout": 0, |
| "ffn_dropout": 0, |
| "hidden_size": 1024, |
| "inner_size": 4096, |
| "mask_future": false, |
| "num_attention_heads": 8, |
| "num_layers": 0, |
| "pre_ln": true, |
| "pre_ln_final_layer_norm": true |
| }, |
| "use_loss_mask_for_prompt": false, |
| "vocab_size": 16384 |
| } |
|
|