{
  "embed_dim": 1024,
  "hidden_size": 1024,
  "num_layers": 12,
  "lstm_dropout": 0.1,
  "no_tie_embeddings": false,
  "base_model": "meta-llama/Llama-3.2-1B",
  "dataset_name": "HuggingFaceFW/fineweb-edu",
  "dataset_config": "sample-10BT",
  "local_data_path": null,
  "text_column": "text",
  "train_samples": null,
  "val_samples": 500,
  "mode": "window",
  "window_size": 4,
  "batch_size": 16,
  "grad_accum": 16,
  "epochs": 1,
  "lr": 0.001,
  "min_lr_ratio": 0.1,
  "warmup_steps": 500,
  "weight_decay": 0.01,
  "adam_beta2": 0.95,
  "eval_steps": 500,
  "save_steps": 1000,
  "context_length": 1024,
  "num_dataloader_workers": 0,
  "gradient_checkpointing": false,
  "chunk_data": true,
  "chunk_num_proc": null,
  "chunk_batch_size": 1000,
  "output_dir": "/scratch1/deqingfu/checkpoints/fourier_emergence_fineweb-edu_sample-10BT_300M_v5/lstm_12layer",
  "seed": 42,
  "push_to_hub": true,
  "no_push_to_hub": false,
  "hub_model_id": "deqing/lstm-12layer-v5",
  "push_every_n_tokens": 200000000,
  "no_wandb": false,
  "wandb_run_suffix": null,
  "resume": false,
  "no_compile": false,
  "optimizer": "adamw",
  "max_steps": -1
}