{
  "architectures": [
    "LlamaForCausalLMWithGNN"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "gnn_config": {
    "GIN_after_attention": true,
    "GIN_after_attention_pre_GIN_norm": true,
    "GIN_after_attention_skip": true,
    "GIN_edge_weight_scaling": true,
    "GIN_hidden_dim_multiplier": 1,
    "GIN_use_MLP": true,
    "GIN_use_norm": false,
    "LlamaAttentionHierarchicalPerceiverAR_use_rope": true,
    "LlamaAttentionHierarchicalVariant_2_PerceiverAR_use_skip": true,
    "MLP_type": "standard_MLP",
    "N_GNN_from_attention_layers": 3,
    "activation": "prelu",
    "add_rope": false,
    "adj_construction_method": "sum",
    "adj_transform_hidden_dim": 128,
    "attention_GIN_MLP_GIN_MLP_mode": "shared",
    "attention_GIN_MLP_GIN_MLP_pre_aggregate": true,
    "attention_GIN_MLP_GIN_binary_scale": 1.0,
    "attention_GIN_MLP_GIN_fuse_mode": "epsilon",
    "attention_GIN_MLP_GIN_learnable_threshold": false,
    "attention_GIN_MLP_GIN_mode": "default",
    "attention_GIN_MLP_GIN_sharp_softplus_beta": 10.0,
    "attention_GIN_MLP_GIN_softmax_temperature": 1.0,
    "attention_GIN_MLP_GIN_threshold_mode": "none",
    "attention_GIN_MLP_GIN_threshold_value": 0.2,
    "attention_GIN_MLP_GIN_top_k_fraction_of_sequence_length": 0.1,
    "attention_GIN_MLP_GIN_use_ReLU_instead_of_softmax": true,
    "attention_GIN_MLP_GIN_use_softmax": false,
    "attention_GIN_MLP_attention_mix_mode": "A",
    "attention_GIN_MLP_multiplier": 2,
    "attention_GIN_MLP_o_proj_at_end": false,
    "attention_GIN_MLP_scoring_hidden_dim": 512,
    "attention_GIN_MLP_second_order_factor": 0.1,
    "attention_GIN_MLP_separate_attention": false,
    "attention_GIN_MLP_use_scoring_fnct": true,
    "attention_GIN_MLP_use_second_order": false,
    "attention_epsilon_strategy": "default",
    "attention_epsilon_uniform_value": 0.5,
    "combined_norm": false,
    "continuous_transform_alpha": 10.0,
    "distance_scaling_method": "power",
    "distance_weight_strength": 1.0,
    "dropout": 0.1,
    "enforce_causality": true,
    "epsilon_threshold": 0.6,
    "gnn_logic": "before_MLP",
    "gnn_mode": "single",
    "gnn_residual": false,
    "gnn_type": "causal_gin",
    "group_tokens_for_coarse_graining": false,
    "hidden_dim": 155,
    "hierarchical_enc_dec_type": "PerceiverAR",
    "initial_sharpening_value": 1.0,
    "lambda_GNN": 0.5,
    "lambda_GNN_initial": 0.0,
    "learnable_aggregate_activation": "softmax",
    "max_position_embeddings": 2048,
    "mix_weights_initial": 0.5,
    "model_type": "",
    "norm_to_hidden_states": false,
    "num_latent_layers": 4,
    "num_latents": 32,
    "num_latents_list": [
      64,
      32,
      8
    ],
    "num_layers": 1,
    "per_head_ff": false,
    "plot_for_debugging": false,
    "remove_self_connections": false,
    "residual_epsilon_strategy": "default",
    "residual_epsilon_uniform_value": 0.1,
    "rms_norm_eps": 1e-05,
    "sharpening_value_init": "value",
    "soft_masking_initial_threshold": 0.01,
    "soft_masking_k": 10.0,
    "threshold": 0.1,
    "threshold_any_tau": 0.1,
    "tokenizer": null,
    "top_k": 8,
    "use_GNN_from_attention": "none",
    "use_GNN_from_attention_add_RoPE_at_every_layer": false,
    "use_differential_attention": false,
    "use_differential_attention_group_norm": false,
    "use_distance_scaling": false,
    "use_fixed_number_of_tokens_per_latent": false,
    "use_graph_property_modulation": false,
    "use_graph_property_modulation_with_norm": false,
    "use_graph_property_modulation_with_norm_use_causal_clustering": true,
    "use_hierarchical_attention": false,
    "use_layer_norm": true,
    "use_layer_norm_in_GIN_MLP": false,
    "use_no_norm_in_GIN_MLP": false,
    "use_original_hidden_states": false,
    "use_original_hidden_states_add_attention": false,
    "use_projection": true,
    "use_sharpening": false,
    "use_soft_masking": false,
    "zero_below_epsilon_threshold": true
  },
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 128256
}
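
This is the checkpoint's config.json: a standard Llama-3.2-3B-shaped base configuration (model_type "llama", 28 layers, hidden size 3072, GQA with 8 KV heads) extended with a nested gnn_config block and a custom LlamaForCausalLMWithGNN architecture. Below is a minimal inspection sketch, assuming the file is saved as config.json inside a checkpoint directory; the "path/to/checkpoint" path is hypothetical, and instantiating the model itself would require the checkpoint's own LlamaForCausalLMWithGNN implementation, which is not part of stock transformers.

import json

from transformers import AutoConfig

# Read the raw file: the GNN-specific settings live in a nested dict
# alongside the standard Llama fields.
with open("config.json") as f:
    cfg = json.load(f)

gnn = cfg["gnn_config"]
print(cfg["architectures"][0])                      # LlamaForCausalLMWithGNN
print(cfg["model_type"], cfg["num_hidden_layers"])  # llama 28
print(gnn["gnn_type"], gnn["num_latents_list"])     # causal_gin [64, 32, 8]

# Loading through transformers resolves model_type "llama" to a
# LlamaConfig; keys it does not recognize, such as gnn_config, are
# kept as plain attributes on the resulting config object.
config = AutoConfig.from_pretrained("path/to/checkpoint")  # hypothetical path
assert config.gnn_config["gnn_type"] == "causal_gin"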