jxie
/

sma-language-pretrained

Model card Files Files and versions

sma-language-pretrained / config.json

jxie's picture

Upload SMAForSSL

e44c65c verified about 2 years ago

history blame contribute delete

2.49 kB

	{
	"_name_or_path": null,
	"architectures": [
	"SMAForSSL"
	],
	"attention_dropout_prob": 0.0,
	"cross_attention_widening_factor": 1,
	"cross_eval_noising_args": null,
	"cross_train_noising_args": [
	[
	"RandomlySelectedCrossAttentionMasking",
	{
	"exclude_seen_reconstruction": true,
	"masking_ratio": 0.15,
	"num_per_query": 4,
	"varying_length": true
	}
	]
	],
	"decoder_attention_channels": 512,
	"decoder_heads": 8,
	"decoder_latent_channels": 512,
	"decoder_type": "cross_attention",
	"dense_use_bias": true,
	"drop_path_rate": 0.0,
	"embedded_channels": 512,
	"encoder_cross_attention_channels": 256,
	"encoder_type": "cross_attention",
	"final_project": true,
	"hidden_act": "gelu",
	"hidden_dropout_prob": 0.0,
	"initializer_range": 0.02,
	"input_channels": 3,
	"input_type": "discrete",
	"latent_channels": 1024,
	"layer_norm_eps": 1e-12,
	"layernorm_eps": 1e-12,
	"loss_fn": "mse",
	"max_position_embeddings": 1024,
	"model_type": "sma",
	"num_blocks": 1,
	"num_cross_attention_heads": 8,
	"num_discrete_tokens": 262,
	"num_latents": 256,
	"num_outputs": 1024,
	"num_self_attends_per_block": 16,
	"num_self_attention_heads": 8,
	"output_channels": 262,
	"pe_initializer_range": 0.02,
	"post_decoder_layers": null,
	"project_after_concat": true,
	"qk_channels": 256,
	"self_attention_widening_factor": 1,
	"share_decoder_queries": true,
	"share_embedding_weights": true,
	"teacher_args": {
	"auxiliary_loss_fn": "mse",
	"auxiliary_loss_weight": 1.0,
	"ema_args": {
	"ema_decay_end": 0.0,
	"ema_decay_start": 0.0
	},
	"eval_transform_args": null,
	"mask_replace": 3,
	"num_layer_target_avg": null,
	"reconstruction_decoder_args": {
	"num_heads": 1,
	"num_outputs": 1024,
	"output_channels": 262,
	"qk_channels": 256,
	"query_num_channels": 512,
	"share_decoder_queries": true,
	"share_embedding_weights": true,
	"use_query_residual": true,
	"v_channels": 512
	},
	"reconstruction_loss_fn": "crossentropy",
	"reconstruction_loss_weight": 1.0,
	"reconstruction_weighted_loss": false,
	"target_normalization_fn": "layernorm",
	"train_transform_args": null
	},
	"teacher_name": "ReconstructionTeacher",
	"torch_dtype": "float32",
	"transformers_version": "4.26.0.dev0",
	"use_decoder": false,
	"use_position_embeddings": true,
	"use_query_residual": true,
	"v_channels": 1024
	}