{
  "model_config": {
    "model_class": "LlamaForCausalLM",
    "vocab_size": 4096,
    "hidden_act": "silu",
    "max_position_embeddings": 512,
    "initializer_range": 0.02,
    "rms_norm_eps": 1e-06,
    "bos_token_id": 0,
    "eos_token_id": 1,
    "tie_word_embeddings": false,
    "rope_theta": 10000.0,
    "rope_scaling": null,
    "attention_bias": false,
    "attention_dropout": 0.0,
    "hidden_size": 12,
    "intermediate_size": 48,
    "num_attention_heads": 2,
    "num_hidden_layers": 1,
    "num_key_value_heads": 1
  },
  "max_seq_len": 512,
  "run_name": "2024_04_27_03_51_29",
  "output_dir": "/home/jai/.local/share/delphi/2024_04_27_03_51_29",
  "device": "auto",
  "checkpoint_interval": 400,
  "extra_checkpoint_iters": [
    1,
    2,
    4,
    8,
    16,
    32,
    64,
    128,
    256,
    512
  ],
  "log_interval": 40,
  "eval_iters": 10,
  "resume_from_path": null,
  "batch_size": 256,
  "max_epochs": 10,
  "grad_clip": 1.0,
  "gradient_accumulation_steps": 1,
  "adam": {
    "learning_rate": 0.0005,
    "weight_decay": 0.1,
    "beta1": 0.9,
    "beta2": 0.95,
    "decay_lr": true,
    "warmup_iters": 1000,
    "min_lr": 0.0
  },
  "batch_ordering_seed": 1337,
  "torch_seed": 42,
  "save_optimizer": true,
  "dataset": {
    "name": "delphi-suite/stories-tokenized",
    "feature": "tokens",
    "train_split": "train",
    "validation_split": "validation"
  },
  "wandb": null,
  "out_repo_id": "delphi-demo/llama-100k",
  "debug_config": {
    "no_training": false,
    "no_eval": false
  }
}