File size: 2,185 Bytes
24bcd35 7159e17 24bcd35 7159e17 24bcd35 7159e17 24bcd35 7159e17 24bcd35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
{
"model_name": "radon-dark-ultima",
"model_type": "mistral",
"hidden_size": 16384,
"num_layers": 200,
"num_attention_heads": 128,
"num_kv_heads": 16,
"intermediate_size": 65536,
"vocab_size": 256000,
"max_position_embeddings": 32768,
"sliding_window": 16384,
"rope_theta": 100000.0,
"rms_norm_eps": 1e-06,
"activation_function": "silu",
"layer_norm_eps": 1e-06,
"use_cache": true,
"output_attentions": false,
"output_hidden_states": false,
"torch_dtype": "float16",
"pad_token_id": 0,
"eos_token_id": 2,
"bos_token_id": 1,
"unk_token_id": 3,
"attention_dropout": 0.0,
"hidden_dropout": 0.0,
"initializer_range": 0.02,
"use_flash_attention_2": true,
"gradient_checkpointing": true,
"tie_word_embeddings": false,
"architectures": [
"MistralForCausalLM"
],
"auto_map": {
"AutoModelForCausalLM": "models.mistral_model.MistralForCausalLM"
},
"transformers_version": "4.36.0",
"model_size": "5TB",
"parameters": 2500000000000,
"context_length": 32768,
"languages": [
"russian",
"english",
"code",
"multilingual"
],
"optimizations": [
"flash_attention_2",
"gradient_checkpointing",
"fp16",
"int8_hybrid",
"sharded_weights",
"tensor_parallel",
"pipeline_parallel",
"expert_parallel"
],
"performance": {
"memory_efficient": true,
"speed_optimized": true,
"production_ready": false,
"experimental": true,
"ultra_large_scale": true
},
"sharding": {
"enabled": true,
"total_shards": 100,
"shard_size_gb": 50,
"strategy": "layer_wise",
"quantization": "fp16_int8_hybrid"
},
"hardware_requirements": {
"minimum_vram": "5TB",
"recommended_vram": "10TB+",
"minimum_ram": "10TB",
"recommended_ram": "20TB+",
"storage": "15TB+",
"gpu_types": [
"A100",
"H100",
"RTX 4090 x16+"
]
},
"creator": "MagistrTheOne",
"description": "RadonDarkUltima: 5TB parameter ultra-large scale Mistral-based Russian-English transformer. Experimental model requiring massive computational resources."
}