lstm-12layer-v5 / experiment_config.json
deqing's picture
Sync main at tokens-200M tokens
2e0af2f verified
{
"embed_dim": 1024,
"hidden_size": 1024,
"num_layers": 12,
"lstm_dropout": 0.1,
"no_tie_embeddings": false,
"base_model": "meta-llama/Llama-3.2-1B",
"dataset_name": "HuggingFaceFW/fineweb-edu",
"dataset_config": "sample-10BT",
"local_data_path": null,
"text_column": "text",
"train_samples": null,
"val_samples": 500,
"mode": "window",
"window_size": 4,
"batch_size": 16,
"grad_accum": 16,
"epochs": 1,
"lr": 0.001,
"min_lr_ratio": 0.1,
"warmup_steps": 500,
"weight_decay": 0.01,
"adam_beta2": 0.95,
"eval_steps": 500,
"save_steps": 1000,
"context_length": 1024,
"num_dataloader_workers": 0,
"gradient_checkpointing": false,
"chunk_data": true,
"chunk_num_proc": null,
"chunk_batch_size": 1000,
"output_dir": "/scratch1/deqingfu/checkpoints/fourier_emergence_fineweb-edu_sample-10BT_300M_v5/lstm_12layer",
"seed": 42,
"push_to_hub": true,
"no_push_to_hub": false,
"hub_model_id": "deqing/lstm-12layer-v5",
"push_every_n_tokens": 200000000,
"no_wandb": false,
"wandb_run_suffix": null,
"resume": false,
"no_compile": false,
"optimizer": "adamw",
"max_steps": -1
}