File size: 1,267 Bytes
0945711
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
{
  "data": {
    "cache_dir": "data",
    "group": true,
    "train": "openwebtext",
    "train_name": null,
    "valid": "wikitext103",
    "valid_name": null
  },
  "eval": {
    "batch_size": 512,
    "perplexity": true,
    "perplexity_batch_size": 32
  },
  "graph": {
    "file": "data",
    "report_all": false,
    "type": "uniform"
  },
  "model": {
    "cond_dim": 128,
    "dropout": 0.1,
    "hidden_size": 768,
    "length": 1024,
    "n_blocks": 12,
    "n_heads": 12,
    "name": "small",
    "scale_by_sigma": false,
    "type": "ddit"
  },
  "ngpus": 8,
  "noise": {
    "sigma_max": 20,
    "sigma_min": 0.0001,
    "type": "loglinear"
  },
  "optim": {
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "grad_clip": 1.0,
    "lr": 0.0003,
    "optimizer": "AdamW",
    "warmup": 2500,
    "weight_decay": 0
  },
  "sampling": {
    "noise_removal": true,
    "predictor": "euler",
    "steps": 128
  },
  "tokens": 50257,
  "training": {
    "accum": 4,
    "batch_size": 512,
    "ema": 0.9999,
    "eval_freq": 100,
    "log_freq": 50,
    "n_iters": 400000,
    "snapshot_freq": 4000,
    "snapshot_freq_for_preemption": 1000,
    "snapshot_sampling": true,
    "weight": "standard"
  },
  "wandb_name": "m_small-g_uniform-pretrain"
}