{
  "architectures": [
    "LlamaForCausalLMWithGNN"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "gnn_config": {
    "GIN_after_attention": true,
    "GIN_after_attention_pre_GIN_norm": true,
    "GIN_after_attention_skip": true,
    "GIN_edge_weight_scaling": true,
    "GIN_hidden_dim_multiplier": 1,
    "GIN_use_MLP": true,
    "GIN_use_norm": false,
    "LlamaAttentionHierarchicalPerceiverAR_use_rope": true,
    "LlamaAttentionHierarchicalVariant_2_PerceiverAR_use_skip": true,
    "MLP_type": "standard_MLP",
    "N_GNN_from_attention_layers": 3,
    "activation": "prelu",
    "add_rope": false,
    "adj_construction_method": "sum",
    "adj_transform_hidden_dim": 128,
    "attention_GIN_MLP_GIN_MLP_mode": "shared",
    "attention_GIN_MLP_GIN_MLP_pre_aggregate": true,
    "attention_GIN_MLP_GIN_binary_scale": 1.0,
    "attention_GIN_MLP_GIN_fuse_mode": "epsilon",
    "attention_GIN_MLP_GIN_learnable_threshold": false,
    "attention_GIN_MLP_GIN_mode": "default",
    "attention_GIN_MLP_GIN_sharp_softplus_beta": 10.0,
    "attention_GIN_MLP_GIN_softmax_temperature": 1.0,
    "attention_GIN_MLP_GIN_threshold_mode": "none",
    "attention_GIN_MLP_GIN_threshold_value": 0.2,
    "attention_GIN_MLP_GIN_top_k_fraction_of_sequence_length": 0.1,
    "attention_GIN_MLP_GIN_use_ReLU_instead_of_softmax": true,
    "attention_GIN_MLP_GIN_use_softmax": false,
    "attention_GIN_MLP_attention_mix_mode": "A",
    "attention_GIN_MLP_multiplier": 2,
    "attention_GIN_MLP_o_proj_at_end": false,
    "attention_GIN_MLP_scoring_hidden_dim": 512,
    "attention_GIN_MLP_second_order_factor": 0.1,
    "attention_GIN_MLP_separate_attention": false,
    "attention_GIN_MLP_use_scoring_fnct": true,
    "attention_GIN_MLP_use_second_order": false,
    "attention_epsilon_strategy": "default",
    "attention_epsilon_uniform_value": 0.5,
    "combined_norm": false,
    "continuous_transform_alpha": 10.0,
    "distance_scaling_method": "power",
    "distance_weight_strength": 1.0,
    "dropout": 0.1,
    "enforce_causality": true,
    "epsilon_threshold": 0.6,
    "gnn_logic": "before_MLP",
    "gnn_mode": "single",
    "gnn_residual": false,
    "gnn_type": "causal_gin",
    "group_tokens_for_coarse_graining": false,
    "hidden_dim": 155,
    "hierarchical_enc_dec_type": "PerceiverAR",
    "initial_sharpening_value": 1.0,
    "lambda_GNN": 0.5,
    "lambda_GNN_initial": 0.0,
    "learnable_aggregate_activation": "softmax",
    "max_position_embeddings": 2048,
    "mix_weights_initial": 0.5,
    "model_type": "",
    "norm_to_hidden_states": false,
    "num_latent_layers": 4,
    "num_latents": 32,
    "num_latents_list": [
      64,
      32,
      8
    ],
    "num_layers": 1,
    "per_head_ff": false,
    "plot_for_debugging": false,
    "remove_self_connections": false,
    "residual_epsilon_strategy": "default",
    "residual_epsilon_uniform_value": 0.1,
    "rms_norm_eps": 1e-05,
    "sharpening_value_init": "value",
    "soft_masking_initial_threshold": 0.01,
    "soft_masking_k": 10.0,
    "threshold": 0.1,
    "threshold_any_tau": 0.1,
    "tokenizer": null,
    "top_k": 8,
    "use_GNN_from_attention": "none",
    "use_GNN_from_attention_add_RoPE_at_every_layer": false,
    "use_differential_attention": false,
    "use_differential_attention_group_norm": false,
    "use_distance_scaling": false,
    "use_fixed_number_of_tokens_per_latent": false,
    "use_graph_property_modulation": false,
    "use_graph_property_modulation_with_norm": false,
    "use_graph_property_modulation_with_norm_use_causal_clustering": true,
    "use_hierarchical_attention": false,
    "use_layer_norm": true,
    "use_layer_norm_in_GIN_MLP": false,
    "use_no_norm_in_GIN_MLP": false,
    "use_original_hidden_states": false,
    "use_original_hidden_states_add_attention": false,
    "use_projection": true,
    "use_sharpening": false,
    "use_soft_masking": false,
    "zero_below_epsilon_threshold": true
  },
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.46.1",
  "use_cache": false,
  "vocab_size": 128256
}
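
This is the checkpoint's config.json: a standard Llama-3.2-3B-shaped base configuration (model_type "llama", 28 layers, hidden size 3072, GQA with 8 KV heads) extended with a nested gnn_config block and a custom LlamaForCausalLMWithGNN architecture. Below is a minimal inspection sketch, assuming the file is saved as config.json inside a checkpoint directory; the "path/to/checkpoint" path is hypothetical, and instantiating the model itself would require the checkpoint's own LlamaForCausalLMWithGNN implementation, which is not part of stock transformers.

import json

from transformers import AutoConfig

# Read the raw file: the GNN-specific settings live in a nested dict
# alongside the standard Llama fields.
with open("config.json") as f:
    cfg = json.load(f)

gnn = cfg["gnn_config"]
print(cfg["architectures"][0])                      # LlamaForCausalLMWithGNN
print(cfg["model_type"], cfg["num_hidden_layers"])  # llama 28
print(gnn["gnn_type"], gnn["num_latents_list"])     # causal_gin [64, 32, 8]

# Loading through transformers resolves model_type "llama" to a
# LlamaConfig; keys it does not recognize, such as gnn_config, are
# kept as plain attributes on the resulting config object.
config = AutoConfig.from_pretrained("path/to/checkpoint")  # hypothetical path
assert config.gnn_config["gnn_type"] == "causal_gin"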