```yaml
model:
  vocab_size: 50257
  d_model: 1024
  num_layers: 12
  num_heads: 16
  num_kv_heads: 4
  max_seq_len: 2048
architecture:
  positional: learned
  normalization: layer_norm
  norm_placement: post
  attention_variant: mha
  attention_impl: standard
  ffn_type: swiglu
  residual: standard
  embeddings: standard
  bias: false
  weight_tying: true
initialization:
  method: normal
  std: 0.02
  embedding_std: 0.02
  attention_std: 0.02
  mlp_std: 0.02
  residual_scale: 1.0
optimizer:
  type: adamw
  learning_rate: 3e-4
  beta1: 0.9
  beta2: 0.95
  eps: 1e-8
  weight_decay: 0.01
  gradient_clip: 1.0
scheduler:
  type: cosine
  warmup_steps: 200
  min_lr_ratio: 0.1
runtime:
  seq_len: 1024
  micro_batch_per_device: 8
  gradient_accumulation: 2
  total_tokens: 100000000
  eval_interval: 100
  log_interval: 10
  checkpoint_interval: 200
  checkpoint_max_to_keep: 3
  checkpoint_dir: checkpoints
data:
  sources: []
  max_seq_len: 1024
  packing: false
  eos_between_docs: true
  pad_to_multiple: 128
tokenizer:
  algorithm: bpe
  vocab_size: 50257
  pre_tokenizer: byte_level
  number_tokenization: single_digit
  output_format: huggingface_fast
hardware:
  accelerator: tpu
  type: v5e
  parallelism:
    data_parallel: 1
    model_parallel: 1
  compute_dtype: bfloat16
  param_dtype: float32
monitoring:
  tensorboard: false
  rich_terminal: true
```
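
In the architecture block, `num_heads: 16` over `d_model: 1024` gives a head dimension of 64, and `num_kv_heads: 4` means four query heads share each key/value head, i.e. grouped-query attention, which shrinks the KV cache fourfold. Note that `attention_variant: mha` would normally imply 16 KV heads; whether `num_kv_heads` is honored when the variant is `mha` depends on the trainer, so the grouped case in this minimal NumPy sketch is our reading, not something the config states:

```python
import numpy as np

d_model, num_heads, num_kv_heads = 1024, 16, 4
head_dim = d_model // num_heads        # 64
group = num_heads // num_kv_heads      # 4 query heads per KV head

def gqa_attention(q, k, v):
    """Causal attention. q: (T, num_heads, head_dim); k, v: (T, num_kv_heads, head_dim)."""
    k = np.repeat(k, group, axis=1)    # expand each KV head across its query group
    v = np.repeat(v, group, axis=1)
    scores = np.einsum("qhd,khd->hqk", q, k) / np.sqrt(head_dim)
    T = q.shape[0]
    causal = np.tril(np.ones((T, T), dtype=bool))
    scores = np.where(causal, scores, -np.inf)
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    return np.einsum("hqk,khd->qhd", probs, v)

out = gqa_attention(
    np.random.randn(8, num_heads, head_dim),
    np.random.randn(8, num_kv_heads, head_dim),
    np.random.randn(8, num_kv_heads, head_dim),
)  # out.shape == (8, 16, 64)
```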
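
`ffn_type: swiglu` with `bias: false` corresponds to the gated feed-forward below. The config does not state a hidden width, so the common roughly (8/3)·d_model sizing here is an assumption, as is tying the weight scale to the `std: 0.02` from the initialization block:

```python
import numpy as np

d_model = 1024
d_ff = int(8 * d_model / 3)  # hidden width is not in the config; a common SwiGLU choice

rng = np.random.default_rng(0)
w_gate = rng.normal(0.0, 0.02, (d_model, d_ff))  # std 0.02 per the initialization block
w_up   = rng.normal(0.0, 0.02, (d_model, d_ff))
w_down = rng.normal(0.0, 0.02, (d_ff, d_model))

def swiglu_ffn(x):
    """SwiGLU: SiLU-gated up-projection, no bias terms (bias: false)."""
    gate = x @ w_gate
    return (gate / (1.0 + np.exp(-gate)) * (x @ w_up)) @ w_down  # SiLU(g) = g * sigmoid(g)
```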
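
The runtime numbers pin down the optimizer-step budget. With `data_parallel: 1`, each step consumes `seq_len × micro_batch_per_device × gradient_accumulation` = 1024 × 8 × 2 = 16,384 tokens, so `total_tokens: 100000000` works out to roughly 6,100 steps. A sketch of that arithmetic (the helper name is ours, not part of the config schema):

```python
def steps_from_config(total_tokens, seq_len, micro_batch, grad_accum, data_parallel=1):
    """Tokens consumed per optimizer step, and the resulting step budget."""
    tokens_per_step = seq_len * micro_batch * grad_accum * data_parallel
    return tokens_per_step, total_tokens // tokens_per_step

tokens_per_step, total_steps = steps_from_config(
    total_tokens=100_000_000, seq_len=1024, micro_batch=8, grad_accum=2
)
print(tokens_per_step, total_steps)  # 16384 6103
```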
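
The scheduler block reads as linear warmup for 200 steps followed by cosine decay to a floor of `min_lr_ratio × learning_rate` = 3e-5. A minimal sketch under that assumption; tying the decay horizon to the ~6,103-step budget above is our inference, the config does not spell it out:

```python
import math

def lr_at(step, base_lr=3e-4, warmup_steps=200, total_steps=6103, min_lr_ratio=0.1):
    """Linear warmup, then cosine decay to min_lr_ratio * base_lr."""
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return base_lr * (min_lr_ratio + (1.0 - min_lr_ratio) * cosine)
```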
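
In the data block, `packing: false` with `pad_to_multiple: 128` suggests each document is padded out to a length multiple of 128 rather than packed back-to-back, with `eos_between_docs: true` still terminating each document with an EOS token. A sketch of that reading (`eos_id` and `pad_id` are placeholders, not values from the config):

```python
def prepare_doc(ids, eos_id, pad_id, multiple=128, max_len=1024):
    """Terminate with EOS, truncate to max_seq_len, then pad to a multiple of 128."""
    ids = (ids + [eos_id])[:max_len]          # eos_between_docs: true
    rem = len(ids) % multiple
    if rem:
        ids = ids + [pad_id] * (multiple - rem)
    return ids
```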
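
The tokenizer block maps fairly directly onto the Hugging Face `tokenizers` library. A sketch assuming that library; the training file path and the `<|endoftext|>` special token are placeholders (the GPT-2 convention matching the 50257 vocabulary), not values from the config:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import PreTrainedTokenizerFast

tok = Tokenizer(models.BPE())                       # algorithm: bpe
tok.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Digits(individual_digits=True),  # number_tokenization: single_digit
    pre_tokenizers.ByteLevel(),                     # pre_tokenizer: byte_level
])
trainer = trainers.BpeTrainer(vocab_size=50257, special_tokens=["<|endoftext|>"])
tok.train(files=["corpus.txt"], trainer=trainer)    # placeholder path

hf_tok = PreTrainedTokenizerFast(tokenizer_object=tok)  # output_format: huggingface_fast
```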