{
"architectures": [
"CohereAsrForConditionalGeneration"
],
"auto_map": {
"AutoConfig": "configuration_cohere_asr.CohereAsrConfig",
"AutoFeatureExtractor": "processing_cohere_asr.CohereAsrFeatureExtractor",
"AutoModel": "modeling_cohere_asr.CohereAsrModel",
"AutoModelForSpeechSeq2Seq": "modeling_cohere_asr.CohereAsrForConditionalGeneration",
"AutoProcessor": "processing_cohere_asr.CohereAsrProcessor",
"AutoTokenizer": "tokenization_cohere_asr.CohereAsrTokenizer"
},
"batch_size": 128,
"decoding": {
"beam": {
"beam_size": 1,
"len_pen": 0.0,
"max_generation_delta": 50
},
"return_best_hypothesis": true,
"strategy": "beam"
},
"encoder": {
"att_context_size": [
-1,
-1
],
"causal_downsampling": false,
"conv_context_size": null,
"conv_kernel_size": 9,
"conv_norm_type": "batch_norm",
"d_model": 1280,
"dropout": 0,
"dropout_att": 0,
"dropout_emb": 0,
"dropout_pre_encoder": 0,
"feat_in": 128,
"feat_out": -1,
"ff_expansion_factor": 4,
"n_heads": 8,
"n_layers": 48,
"pos_emb_max_len": 5000,
"reduction": null,
"reduction_factor": 1,
"reduction_position": null,
"self_attention_model": "rel_pos",
"subsampling": "dw_striding",
"subsampling_conv_channels": 256,
"subsampling_factor": 8,
"untie_biases": true,
"xscaling": false
},
"head": {
"activation": "relu",
"dropout": 0,
"hidden_size": 1024,
"log_softmax": true,
"num_classes": 16384,
"num_layers": 1,
"use_transformer_init": true
},
"is_encoder_decoder": true,
"log_batch_stats": false,
"log_prediction": true,
"max_audio_clip_s": 35,
"max_seq_len": 1024,
"model_defaults": {
"asr_enc_hidden": 1280,
"lm_dec_hidden": 1024,
"lm_enc_hidden": 1024
},
"model_type": "cohere_asr",
"multitask_metrics_cfg": {
"log_predictions": true,
"metrics": {
"wer": {
"constraint": ".source_lang==.target_lang"
}
}
},
"overlap_chunk_second": 5,
"preprocessor": {
"dither": 1e-05,
"features": 128,
"frame_splicing": 1,
"log": true,
"n_fft": 512,
"normalize": "per_feature",
"pad_to": 0,
"pad_value": 0.0,
"sample_rate": 16000,
"window": "hann",
"window_size": 0.025,
"window_stride": 0.01
},
"prompt_defaults": [
{
"role": "user",
"slots": {
"decodercontext": "",
"diarize": "<|nodiarize|>",
"emotion": "<|emo:undefined|>",
"itn": "<|noitn|>",
"pnc": "<|pnc|>",
"source_lang": "<|en|>",
"target_lang": "<|en|>",
"timestamp": "<|notimestamp|>"
}
},
{
"role": "user_partial",
"slots": {
"decodercontext": ""
}
}
],
"prompt_format": "cohere_asr",
"sample_rate": 16000,
"supported_languages": [
"en",
"fr",
"de",
"es",
"it",
"pt",
"nl",
"pl",
"el",
"ar",
"ja",
"zh",
"vi",
"ko"
],
"transf_decoder": {
"config_dict": {
"attn_layer_dropout": 0,
"attn_score_dropout": 0,
"embedding_dropout": 0,
"ffn_dropout": 0,
"hidden_act": "relu",
"hidden_size": 1024,
"inner_size": 4096,
"learn_positional_encodings": false,
"lm_dec_hidden": 1280,
"max_sequence_length": 1024,
"num_attention_heads": 8,
"num_layers": 8,
"num_token_types": 0,
"pre_ln": true,
"vocab_size": null
},
"encoder": null,
"model_name": null,
"pre_ln_final_layer_norm": true,
"pretrained": false
},
"transf_encoder": {
"attn_layer_dropout": 0,
"attn_score_dropout": 0,
"ffn_dropout": 0,
"hidden_size": 1024,
"inner_size": 4096,
"mask_future": false,
"num_attention_heads": 8,
"num_layers": 0,
"pre_ln": true,
"pre_ln_final_layer_norm": true
},
"use_loss_mask_for_prompt": false,
"vocab_size": 16384
}