{
  "_hub_mixin_config": {
    "optimizer_class": "adamw",
    "scheduler_class": "cosine",
    "input_files": null,
    "dataset_subset_size": 1000000,
    "block_size": 128,
    "dataset_type": "fixed",
    "train_ratio": 0.98,
    "val_ratio": 0.01,
    "encoding_name": "gpt2",
    "n_heads": 12,
    "n_blocks": 12,
    "norm": "prenorm",
    "pos_embed_type": "absolute",
    "dropout": 0.1,
    "embed_size": 64,
    "hidden_size": 64,
    "k": 3,
    "num_inner_layers": 1,
    "embedding_type": "full",
    "checkpoint": null,
    "batch_size": 64,
    "num_epochs": 5,
    "learning_rate": 0.0003,
    "warmup_ratio": 0.1,
    "log_interval_steps": 250,
    "save_interval_steps": 5625,
    "save_dir": "./saved_models/lstm_tiny",
    "save_latest": true,
    "save_best": true,
    "loss_metric_for_best_model": "val",
    "prompt": "Once upon a",
    "max_new_tokens": 50,
    "top_p": 0.9,
    "use_wandb": true,
    "wandb_entity": "pico-llm",
    "wandb_project": "training",
    "wandb_name": "lstm-tiny",
    "upload_model_to_hub": true,
    "repo_id": "pico-llm/lstm-tiny",
    "device": "cuda:0",
    "seed": 42,
    "monosemantic_analysis": true,
    "num_steps": 76565
  },
  "hf_api": "<huggingface_hub.hf_api.HfApi object at 0x14f5796b5150>",
  "wandb_writer": "<wandb.sdk.wandb_run.Run object at 0x14f5796fd7f0>",
  "wandb_table": "<wandb.sdk.data_types.table.Table object at 0x14f5796fe7b0>",
  "optimizer": "AdamW (\nParameter Group 0\n    amsgrad: False\n    betas: (0.9, 0.999)\n    capturable: False\n    decoupled_weight_decay: True\n    differentiable: False\n    eps: 1e-08\n    foreach: None\n    fused: None\n    initial_lr: 0.0003\n    lr: 7.350326937918619e-06\n    maximize: False\n    weight_decay: 0.05\n)",
  "scheduler": "<torch.optim.lr_scheduler.SequentialLR object at 0x14f5798c2660>",
  "optimizer_class": "adamw",
  "scheduler_class": "cosine",
  "model": "LSTMSeqModel(\n  (embedding): Embedding(50257, 64)\n  (lstm): LSTM(64, 64)\n  (linear): Linear(in_features=64, out_features=50257, bias=True)\n)",
  "learning_rate": 0.0003,
  "_init_kwargs": {
    "input_files": null,
    "dataset_subset_size": 1000000,
    "block_size": 128,
    "dataset_type": "fixed",
    "train_ratio": 0.98,
    "val_ratio": 0.01,
    "encoding_name": "gpt2",
    "n_heads": 12,
    "n_blocks": 12,
    "norm": "prenorm",
    "pos_embed_type": "absolute",
    "dropout": 0.1,
    "embed_size": 64,
    "hidden_size": 64,
    "k": 3,
    "num_inner_layers": 1,
    "embedding_type": "full",
    "checkpoint": null,
    "batch_size": 64,
    "num_epochs": 5,
    "warmup_ratio": 0.1,
    "log_interval_steps": 250,
    "save_interval_steps": 5625,
    "save_dir": "./saved_models/lstm_tiny",
    "save_latest": true,
    "save_best": true,
    "loss_metric_for_best_model": "val",
    "prompt": "Once upon a",
    "max_new_tokens": 50,
    "top_p": 0.9,
    "use_wandb": true,
    "wandb_entity": "pico-llm",
    "wandb_project": "training",
    "wandb_name": "lstm-tiny",
    "upload_model_to_hub": true,
    "repo_id": "pico-llm/lstm-tiny",
    "device": "cuda:0",
    "seed": 42,
    "monosemantic_analysis": true,
    "num_steps": 76565
  }
}