david-thrower's picture
Upload HelixForCausalLM
33e7263 verified
{
"act_threshold": 0.99,
"architectures": [
"HelixForCausalLM"
],
"attention_mode": "hybrid",
"batch_size": 8,
"bos_token_id": 0,
"cca_min_scale": 0.05,
"cca_ramp_mode": "cubic_ease",
"cca_warmup_steps": 10000,
"chat_template": null,
"d_model": 256,
"device": "auto",
"do_sample": true,
"dropout": 0.05,
"dtype": "float32",
"eos_token_id": 50256,
"epochs": 3,
"ffn_expansion": 2.0,
"fusion_strategy": "perceiver",
"gate_sinkhorn_iters": 5,
"grad_clip": 1.0,
"hybrid_full_attention_interval": 4,
"initializer_range": 0.02,
"is_vlm": false,
"k_proj_dim": 32,
"lateral_p": 0.5,
"linear_feature_dim": 64,
"loop_dim_ratio": 0.125,
"lr": 0.001,
"max_new_tokens": 20,
"memory_efficient_forward": false,
"model_type": "helix",
"n_columns": 2,
"n_heads": 4,
"n_loops": 2,
"nodes_per_column": [
2,
2
],
"pad_token_id": 50256,
"repetition_penalty": 1.0,
"rope_theta": 10000.0,
"seq_len": 512,
"ssm_bias": false,
"ssm_conv_bias": true,
"ssm_d_conv": 4,
"ssm_d_state": 64,
"ssm_dt_rank": "auto",
"ssm_expand": 2,
"stop_strings": [
"<|endoftext|>",
"<|im_end|>",
"</s>"
],
"temperature": 0.8,
"tie_word_embeddings": true,
"titans_always_select": true,
"titans_dropout": 0.0,
"titans_eta_init": 0.01,
"titans_feature_dim": 64,
"titans_n_heads": 4,
"tokenizer_name": "gpt2",
"top_k": 50,
"top_p": 0.95,
"transformers_version": "5.8.1",
"use_cache": true,
"use_cca": true,
"use_rope": true,
"use_ssm": false,
"use_titans_memory": false,
"vertical_depth": 2,
"vertical_p": 0.7,
"vision_encoder": null,
"vision_hidden_size": 768,
"vision_image_size": 448,
"vision_intermediate_size": 3072,
"vision_num_attention_heads": 16,
"vision_num_hidden_layers": 24,
"vision_patch_size": 16,
"vocab_size": 50257,
"warmup_steps": 200,
"weight_decay": 0.01
}