RadonDarkUltima / config.json
Commit 7159e17: Add RadonDarkUltima framework (5TB model - weights pending)
{
"model_name": "radon-dark-ultima",
"model_type": "mistral",
"hidden_size": 16384,
"num_layers": 200,
"num_attention_heads": 128,
"num_kv_heads": 16,
"intermediate_size": 65536,
"vocab_size": 256000,
"max_position_embeddings": 32768,
"sliding_window": 16384,
"rope_theta": 100000.0,
"rms_norm_eps": 1e-06,
"activation_function": "silu",
"layer_norm_eps": 1e-06,
"use_cache": true,
"output_attentions": false,
"output_hidden_states": false,
"torch_dtype": "float16",
"pad_token_id": 0,
"eos_token_id": 2,
"bos_token_id": 1,
"unk_token_id": 3,
"attention_dropout": 0.0,
"hidden_dropout": 0.0,
"initializer_range": 0.02,
"use_flash_attention_2": true,
"gradient_checkpointing": true,
"tie_word_embeddings": false,
"architectures": [
"MistralForCausalLM"
],
"auto_map": {
"AutoModelForCausalLM": "models.mistral_model.MistralForCausalLM"
},
"transformers_version": "4.36.0",
"model_size": "5TB",
"parameters": 2500000000000,
"context_length": 32768,
"languages": [
"russian",
"english",
"code",
"multilingual"
],
"optimizations": [
"flash_attention_2",
"gradient_checkpointing",
"fp16",
"int8_hybrid",
"sharded_weights",
"tensor_parallel",
"pipeline_parallel",
"expert_parallel"
],
"performance": {
"memory_efficient": true,
"speed_optimized": true,
"production_ready": false,
"experimental": true,
"ultra_large_scale": true
},
"sharding": {
"enabled": true,
"total_shards": 100,
"shard_size_gb": 50,
"strategy": "layer_wise",
"quantization": "fp16_int8_hybrid"
},
"hardware_requirements": {
"minimum_vram": "5TB",
"recommended_vram": "10TB+",
"minimum_ram": "10TB",
"recommended_ram": "20TB+",
"storage": "15TB+",
"gpu_types": [
"A100",
"H100",
"RTX 4090 x16+"
]
},
"creator": "MagistrTheOne",
"description": "RadonDarkUltima: 5TB parameter ultra-large scale Mistral-based Russian-English transformer. Experimental model requiring massive computational resources."
}
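
Since "model_type" is "mistral", the file can be read with the stock transformers AutoConfig. A minimal sketch follows; the repo id is an assumption taken from the page header, and because the weights are still pending, only config.json is fetched. Loading the model itself would additionally require trust_remote_code=True, since "auto_map" points at the custom models.mistral_model module.

```python
# Reading this config with Hugging Face transformers.
# Assumption: the repo id is "MagistrTheOne/RadonDarkUltima"
# (taken from the page header above).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("MagistrTheOne/RadonDarkUltima")

print(config.model_type)         # "mistral"
print(config.num_hidden_layers)  # 200
print(config.hidden_size)        # 16384
# Non-standard keys such as "parameters" and "model_size" are kept
# as plain attributes on the config object:
print(config.parameters)         # 2500000000000
```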
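
The size figures in the file are mutually consistent, as the arithmetic below shows: 2.5T parameters at 2 bytes each (fp16) is 5 TB, matching both "model_size" and the 100 x 50 GB sharding plan. The same config values also bound the fp16 KV cache at full context.

```python
# Back-of-the-envelope arithmetic using only values from this config.
PARAMS     = 2_500_000_000_000  # "parameters"
FP16_BYTES = 2                  # "torch_dtype": "float16"
LAYERS     = 200                # "num_hidden_layers"
KV_HEADS   = 16                 # "num_key_value_heads"
HEAD_DIM   = 16384 // 128       # hidden_size / num_attention_heads = 128
CONTEXT    = 32768              # "max_position_embeddings"

weights_tb = PARAMS * FP16_BYTES / 1e12
print(f"fp16 weights: {weights_tb:.1f} TB")   # 5.0 TB == "model_size"

shard_tb = 100 * 50 / 1000                    # "total_shards" x "shard_size_gb"
print(f"sharded size: {shard_tb:.1f} TB")     # 5.0 TB, consistent

# fp16 KV cache: 2 (K and V) x layers x kv_heads x head_dim x 2 bytes per token
kv_per_token = 2 * LAYERS * KV_HEADS * HEAD_DIM * FP16_BYTES
print(f"KV cache: {kv_per_token * CONTEXT / 1e9:.1f} GB per full-context sequence")
# ~53.7 GB -- grouped-query attention (16 KV heads vs 128 query heads)
# keeps this 8x smaller than full multi-head attention would.
```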
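
The "sharding" block declares a "layer_wise" strategy over 100 shards, which for 200 decoder layers works out to two layers per shard. The sketch below illustrates one possible contiguous layer-to-shard assignment; the shard file names and the assignment scheme are assumptions for illustration, not something this config specifies.

```python
# Hypothetical illustration of the "layer_wise" sharding strategy:
# 200 layers over 100 shards = 2 decoder layers per shard.
NUM_LAYERS = 200   # "num_hidden_layers"
NUM_SHARDS = 100   # "sharding.total_shards"

layers_per_shard = NUM_LAYERS // NUM_SHARDS  # 2

def shard_for_layer(layer_idx: int) -> str:
    """Map a decoder layer index to a (hypothetical) shard file name."""
    shard_idx = layer_idx // layers_per_shard
    return f"model-{shard_idx + 1:05d}-of-{NUM_SHARDS:05d}.safetensors"

print(shard_for_layer(0))    # model-00001-of-00100.safetensors
print(shard_for_layer(199))  # model-00100-of-00100.safetensors
```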