{ "model_name": "radon-dark-ultima", "model_type": "mistral", "hidden_size": 16384, "num_layers": 200, "num_attention_heads": 128, "num_kv_heads": 16, "intermediate_size": 65536, "vocab_size": 256000, "max_position_embeddings": 32768, "sliding_window": 16384, "rope_theta": 100000.0, "rms_norm_eps": 1e-06, "activation_function": "silu", "layer_norm_eps": 1e-06, "use_cache": true, "output_attentions": false, "output_hidden_states": false, "torch_dtype": "float16", "pad_token_id": 0, "eos_token_id": 2, "bos_token_id": 1, "unk_token_id": 3, "attention_dropout": 0.0, "hidden_dropout": 0.0, "initializer_range": 0.02, "use_flash_attention_2": true, "gradient_checkpointing": true, "tie_word_embeddings": false, "architectures": [ "MistralForCausalLM" ], "auto_map": { "AutoModelForCausalLM": "models.mistral_model.MistralForCausalLM" }, "transformers_version": "4.36.0", "model_size": "5TB", "parameters": 2500000000000, "context_length": 32768, "languages": [ "russian", "english", "code", "multilingual" ], "optimizations": [ "flash_attention_2", "gradient_checkpointing", "fp16", "int8_hybrid", "sharded_weights", "tensor_parallel", "pipeline_parallel", "expert_parallel" ], "performance": { "memory_efficient": true, "speed_optimized": true, "production_ready": false, "experimental": true, "ultra_large_scale": true }, "sharding": { "enabled": true, "total_shards": 100, "shard_size_gb": 50, "strategy": "layer_wise", "quantization": "fp16_int8_hybrid" }, "hardware_requirements": { "minimum_vram": "5TB", "recommended_vram": "10TB+", "minimum_ram": "10TB", "recommended_ram": "20TB+", "storage": "15TB+", "gpu_types": [ "A100", "H100", "RTX 4090 x16+" ] }, "creator": "MagistrTheOne", "description": "RadonDarkUltima: 5TB parameter ultra-large scale Mistral-based Russian-English transformer. Experimental model requiring massive computational resources." }