AnthonyPa57
/

ChemMultiRegressor-65M-38H

Text Generation

Mixture of Experts

Eval Results (legacy)

Model card Files Files and versions

ChemMultiRegressor-65M-38H / config.json

AnthonyPa57's picture

Push model using huggingface_hub.

ecfbea3 verified 4 months ago

history blame contribute delete

545 Bytes

	{
	"args": {
	"capacity_factor": 1.0,
	"context_window": 512,
	"d_ff": 256,
	"device": "cuda:0",
	"dim": 256,
	"dtype_str": "bfloat16",
	"fp8_recipe": "tensorwise",
	"impl": "grouped",
	"k": 1,
	"moe_type": "pytorch",
	"moe_zloss_weight": 0.1,
	"n_heads": 2,
	"n_kv_heads": 2,
	"n_layers": 1,
	"n_regression_heads": 48,
	"num_experts": 2,
	"output_moe_weights": false,
	"soft_cap": 20,
	"theta": 10000.0,
	"torch_compile": false,
	"use_sparse": false,
	"vocab_size": 30000
	}
	}