{
  "model": {
    "vocab_size": 50257,
    "max_seq_len": 2048,
    "dim": 1024,
    "n_layers": 16,
    "n_heads": 16,
    "hidden_dim": 2736,
    "dropout": 0.0
  },
  "training": {
    "batch_size": 1,
    "gradient_accumulation_steps": 32,
    "max_steps": 50000,
    "warmup_steps": 2000,
    "learning_rate": 0.0003,
    "weight_decay": 0.01,
    "grad_clip": 1.0,
    "mixed_precision": "bf16",
    "gradient_checkpointing": true
  },
  "data": {
    "seq_length": 1024,
    "data_path": "data/tokens/packed_1024.txt"
  },
  "hardware": {
    "device": "cuda",
    "compile_model": false
  },
  "logging": {
    "log_interval": 10,
    "save_interval": 2000,
    "output_dir": "checkpoints"
  }
}