RadonDarkUltima / config.json
Commit 7159e17: Add RadonDarkUltima framework (5TB model - weights pending)
{
"model_name": "radon-dark-ultima",
"model_type": "mistral",
"hidden_size": 16384,
"num_layers": 200,
"num_attention_heads": 128,
"num_kv_heads": 16,
"intermediate_size": 65536,
"vocab_size": 256000,
"max_position_embeddings": 32768,
"sliding_window": 16384,
"rope_theta": 100000.0,
"rms_norm_eps": 1e-06,
"activation_function": "silu",
"layer_norm_eps": 1e-06,
"use_cache": true,
"output_attentions": false,
"output_hidden_states": false,
"torch_dtype": "float16",
"pad_token_id": 0,
"eos_token_id": 2,
"bos_token_id": 1,
"unk_token_id": 3,
"attention_dropout": 0.0,
"hidden_dropout": 0.0,
"initializer_range": 0.02,
"use_flash_attention_2": true,
"gradient_checkpointing": true,
"tie_word_embeddings": false,
"architectures": [
"MistralForCausalLM"
],
"auto_map": {
"AutoModelForCausalLM": "models.mistral_model.MistralForCausalLM"
},
"transformers_version": "4.36.0",
"model_size": "5TB",
"parameters": 2500000000000,
"context_length": 32768,
"languages": [
"russian",
"english",
"code",
"multilingual"
],
"optimizations": [
"flash_attention_2",
"gradient_checkpointing",
"fp16",
"int8_hybrid",
"sharded_weights",
"tensor_parallel",
"pipeline_parallel",
"expert_parallel"
],
"performance": {
"memory_efficient": true,
"speed_optimized": true,
"production_ready": false,
"experimental": true,
"ultra_large_scale": true
},
"sharding": {
"enabled": true,
"total_shards": 100,
"shard_size_gb": 50,
"strategy": "layer_wise",
"quantization": "fp16_int8_hybrid"
},
"hardware_requirements": {
"minimum_vram": "5TB",
"recommended_vram": "10TB+",
"minimum_ram": "10TB",
"recommended_ram": "20TB+",
"storage": "15TB+",
"gpu_types": [
"A100",
"H100",
"RTX 4090 x16+"
]
},
"creator": "MagistrTheOne",
"description": "RadonDarkUltima: 5TB parameter ultra-large scale Mistral-based Russian-English transformer. Experimental model requiring massive computational resources."
}
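
Since "model_type" is "mistral", the file can be read with the stock transformers AutoConfig. A minimal sketch follows; the repo id is an assumption taken from the page header, and because the weights are still pending, only config.json is fetched. Loading the model itself would additionally require trust_remote_code=True, since "auto_map" points at the custom models.mistral_model module.

```python
# Reading this config with Hugging Face transformers.
# Assumption: the repo id is "MagistrTheOne/RadonDarkUltima"
# (taken from the page header above).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("MagistrTheOne/RadonDarkUltima")

print(config.model_type)         # "mistral"
print(config.num_hidden_layers)  # 200
print(config.hidden_size)        # 16384
# Non-standard keys such as "parameters" and "model_size" are kept
# as plain attributes on the config object:
print(config.parameters)         # 2500000000000
```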
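
The size figures in the file are mutually consistent, as the arithmetic below shows: 2.5T parameters at 2 bytes each (fp16) is 5 TB, matching both "model_size" and the 100 x 50 GB sharding plan. The same config values also bound the fp16 KV cache at full context.

```python
# Back-of-the-envelope arithmetic using only values from this config.
PARAMS     = 2_500_000_000_000  # "parameters"
FP16_BYTES = 2                  # "torch_dtype": "float16"
LAYERS     = 200                # "num_hidden_layers"
KV_HEADS   = 16                 # "num_key_value_heads"
HEAD_DIM   = 16384 // 128       # hidden_size / num_attention_heads = 128
CONTEXT    = 32768              # "max_position_embeddings"

weights_tb = PARAMS * FP16_BYTES / 1e12
print(f"fp16 weights: {weights_tb:.1f} TB")   # 5.0 TB == "model_size"

shard_tb = 100 * 50 / 1000                    # "total_shards" x "shard_size_gb"
print(f"sharded size: {shard_tb:.1f} TB")     # 5.0 TB, consistent

# fp16 KV cache: 2 (K and V) x layers x kv_heads x head_dim x 2 bytes per token
kv_per_token = 2 * LAYERS * KV_HEADS * HEAD_DIM * FP16_BYTES
print(f"KV cache: {kv_per_token * CONTEXT / 1e9:.1f} GB per full-context sequence")
# ~53.7 GB -- grouped-query attention (16 KV heads vs 128 query heads)
# keeps this 8x smaller than full multi-head attention would.
```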
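
The "sharding" block declares a "layer_wise" strategy over 100 shards, which for 200 decoder layers works out to two layers per shard. The sketch below illustrates one possible contiguous layer-to-shard assignment; the shard file names and the assignment scheme are assumptions for illustration, not something this config specifies.

```python
# Hypothetical illustration of the "layer_wise" sharding strategy:
# 200 layers over 100 shards = 2 decoder layers per shard.
NUM_LAYERS = 200   # "num_hidden_layers"
NUM_SHARDS = 100   # "sharding.total_shards"

layers_per_shard = NUM_LAYERS // NUM_SHARDS  # 2

def shard_for_layer(layer_idx: int) -> str:
    """Map a decoder layer index to a (hypothetical) shard file name."""
    shard_idx = layer_idx // layers_per_shard
    return f"model-{shard_idx + 1:05d}-of-{NUM_SHARDS:05d}.safetensors"

print(shard_for_layer(0))    # model-00001-of-00100.safetensors
print(shard_for_layer(199))  # model-00100-of-00100.safetensors
```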