{
  "architectures": [
    "CuriousForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "curious_architecture_type": "instruction_tuned_transformer",
  "curious_base_model": "adam-2b-original",
  "curious_capabilities": [
    "Natural conversations",
    "Question answering",
    "Creative writing",
    "Code assistance",
    "Mathematical reasoning",
    "Instruction following"
  ],
  "curious_features": [
    "Enhanced 2B parameter architecture",
    "Instruction-tuned for better conversations",
    "Advanced attention mechanisms",
    "Optimized activation functions",
    "Sliding window attention",
    "Logit softcapping for stability"
  ],
  "curious_instruction_tuned": true,
  "curious_model_size": "2B",
  "curious_training": "Instruction tuned on conversational data",
  "curious_version": "2.0",
  "dtype": "float32",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "layer_types": [
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 8192,
  "model_type": "curious_text",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "transformers_version": "4.56.2",
  "use_bidirectional_attention": null,
  "use_cache": true,
  "vocab_size": 256000
}
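
A minimal sketch of how a config like this would typically be loaded with the Hugging Face transformers library. Note that the model_type "curious_text" and architecture "CuriousForCausalLM" are custom (not built into transformers), so a real checkpoint would need to ship matching modeling code and be loaded with trust_remote_code=True; the repository path below is a hypothetical placeholder, not a published model id.

# Sketch only, assuming this file is saved as config.json in a
# transformers-style model repository; the repo path is hypothetical.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "your-org/curious-2b-it",  # hypothetical path containing this config.json
    trust_remote_code=True,    # required: CuriousForCausalLM is not a built-in architecture
)

# Sanity checks against the values above: 26 layers alternating
# sliding-window and full attention, with head_dim set explicitly
# (8 heads x 256 = 2048, decoupled from hidden_size 2304).
assert config.num_hidden_layers == len(config.layer_types) == 26
print(config.model_type, config.hidden_size, config.head_dim)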