deqing
/

lstm-12layer-v5

Model card Files Files and versions

lstm-12layer-v5 / experiment_config.json

deqing's picture

Sync main at tokens-200M tokens

2e0af2f verified about 2 months ago

history blame contribute delete

1.15 kB

	{
	"embed_dim": 1024,
	"hidden_size": 1024,
	"num_layers": 12,
	"lstm_dropout": 0.1,
	"no_tie_embeddings": false,
	"base_model": "meta-llama/Llama-3.2-1B",
	"dataset_name": "HuggingFaceFW/fineweb-edu",
	"dataset_config": "sample-10BT",
	"local_data_path": null,
	"text_column": "text",
	"train_samples": null,
	"val_samples": 500,
	"mode": "window",
	"window_size": 4,
	"batch_size": 16,
	"grad_accum": 16,
	"epochs": 1,
	"lr": 0.001,
	"min_lr_ratio": 0.1,
	"warmup_steps": 500,
	"weight_decay": 0.01,
	"adam_beta2": 0.95,
	"eval_steps": 500,
	"save_steps": 1000,
	"context_length": 1024,
	"num_dataloader_workers": 0,
	"gradient_checkpointing": false,
	"chunk_data": true,
	"chunk_num_proc": null,
	"chunk_batch_size": 1000,
	"output_dir": "/scratch1/deqingfu/checkpoints/fourier_emergence_fineweb-edu_sample-10BT_300M_v5/lstm_12layer",
	"seed": 42,
	"push_to_hub": true,
	"no_push_to_hub": false,
	"hub_model_id": "deqing/lstm-12layer-v5",
	"push_every_n_tokens": 200000000,
	"no_wandb": false,
	"wandb_run_suffix": null,
	"resume": false,
	"no_compile": false,
	"optimizer": "adamw",
	"max_steps": -1
	}