{
  "model_type": "Transformer",
  "_name_or_path": "Transformer-340M-0408",
  "architectures": ["TransformerForCausalLM"],
  "dim": 1024,
  "num_heads": 4,
  "num_layers": 11,
  "seq_len": 4096,
  "vocab_size": 200064,
  "inter_dim": 4096,
  "mlp_scale": 12,
  "bias": false,
  "weight_tying": true,
  "rope_theta": 10000.0,
  "num_epochs": 1,
  "global_bsz": 524288,
  "bsz": 8,
  "warmup_steps": 1907,
  "eval_period": 50,
  "save_period": 1000,
  "max_lr": 4.0e-4,
  "min_lr": 4.0e-5,
  "max_norm": 1.0,
  "fsdp": true,
  "ddp": false,
  "reshard_after_forward_policy": "default",
  "mixed_precision": true,
  "torch_dtype": "bfloat16",
  "cpu_offload": false,
  "sharding_strategy": "full_shard",
  "state_dict_type": "full",
  "auto_wrap_policy": "partial",
  "backward_prefetch": "backward_pre",
  "forward_prefetch": false,
  "sync_module_states": true,
  "use_orig_params": true,
  "device_id": null,
  "precision": {
    "param": "bfloat16",
    "reduce": "bfloat16",
    "buffer": "bfloat16"
  },
  "fsdp_modules": [
    "AttentionLayer"
  ],
  "num_workers": 0,
  "snapshot_every_n_steps": 50,
  "use_activation_checkpointing": true,
  "torch_compile": true,
  "torch_compile_kwargs": {
    "mode": "default",
    "fullgraph": true
  },
  "enable_compiled_autograd": false
}