{
  "_name_or_path": "rpt-torch-1",
  "add_null_attn": true,
  "architectures": [
    "RPTForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "augment_across_neighbors": true,
  "augment_neighbors": true,
  "aux_loss_schedule_steps": 12500,
  "aux_scale": 0.1,
  "bos_token_id": 0,
  "cca_freq": 1,
  "chunk_size": 64,
  "document_length": 16384,
  "embd_pdrop": 0.0,
  "eos_token_id": 1,
  "fcm_max_ratio": 0.0,
  "fcm_min_ratio": 0.0,
  "gated_ff": true,
  "hidden_size": 2048,
  "initializer_range": 1,
  "intermediate_size": 5504,
  "margin_schedule_steps": 56250,
  "max_margin": 4,
  "max_sequence_length": 4096,
  "mesh_dim": null,
  "model_type": "rpt",
  "mult_in_complex": false,
  "n_windows": 1,
  "num_attention_heads": 16,
  "num_document_chunks": 256,
  "num_hidden_layers": 22,
  "num_key_value_heads": 16,
  "num_neighbors": 2,
  "num_scored_neighbors": 20,
  "num_sequence_chunks": 64,
  "palm_init": true,
  "remat_attention": "",
  "remat_block": "nothing_saveable",
  "remat_mlp": "",
  "resid_pdrop": 0.0,
  "retriever_fill_value": -10000.0,
  "return_ret_metrics": true,
  "rms_norm_eps": 1e-06,
  "rms_one_baseline": true,
  "rot_dim": 0,
  "run_modules": "all",
  "scan_attention": false,
  "scan_key_chunk_size": 2048,
  "scan_mlp": false,
  "scan_mlp_chunk_size": 1024,
  "scan_query_chunk_size": 1024,
  "scheduled_sampling_max_prob": 1.0,
  "scheduled_sampling_min_prob": 0.01,
  "sliding_window": false,
  "ss_schedule_steps": 56250,
  "stride": 1024,
  "threshold_nei_scores": 0.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.38.0",
  "use_cache": true,
  "use_cca_norm2": false,
  "use_xnei_bias": true,
  "vocab_size": 50277,
  "window_length": 2048
}