{
  "n_layers": 2,
  "d_model": 256,
  "n_ctx": 1024,
  "d_head": 32,
  "model_name": "custom",
  "n_heads": 8,
  "d_mlp": null,
  "act_fn": null,
  "d_vocab": 5000,
  "eps": 1e-05,
  "use_attn_result": false,
  "use_attn_scale": true,
  "attn_scale": 5.656854249492381,
  "use_split_qkv_input": false,
  "use_hook_mlp_in": false,
  "use_attn_in": false,
  "use_local_attn": false,
  "original_architecture": null,
  "from_checkpoint": false,
  "checkpoint_index": null,
  "checkpoint_label_type": null,
  "checkpoint_value": null,
  "tokenizer_name": "georgeyw/TinyStories-tokenizer-5k",
  "window_size": null,
  "attn_types": null,
  "init_mode": "gpt2",
  "normalization_type": "LN",
  "device": "cuda",
  "n_devices": 1,
  "attention_dir": "causal",
  "attn_only": true,
  "seed": 1,
  "initializer_range": 0.05,
  "init_weights": true,
  "scale_attn_by_inverse_layer_idx": false,
  "positional_embedding_type": "shortformer",
  "final_rms": false,
  "d_vocab_out": 5000,
  "parallel_attn_mlp": false,
  "rotary_dim": null,
  "n_params": 524288,
  "use_hook_tokens": false,
  "gated_mlp": false,
  "default_prepend_bos": true,
  "dtype": "torch.float32",
  "tokenizer_prepends_bos": false,
  "n_key_value_heads": null,
  "post_embedding_ln": false,
  "rotary_base": 10000,
  "trust_remote_code": false,
  "rotary_adjacent_pairs": false,
  "load_in_4bit": false,
  "num_experts": null,
  "experts_per_token": null,
  "relative_attention_max_distance": null,
  "relative_attention_num_buckets": null,
  "decoder_start_token_id": null,
  "tie_word_embeddings": false,
  "use_normalization_before_and_after": false,
  "attn_scores_soft_cap": -1.0,
  "output_logits_soft_cap": -1.0
}