attn_pdrop: 0.1
b1: 0.9
b2: 0.95
batch_size: 1024
blocks: 6
d_model: 128
embd_pdrop: 0.1
epochs: 500000
heads: 4
lr: 0.0003
model_type: gpt
num_workers: 4
resid_pdrop: 0.1
stoi:
  ' ': 1
  '''': 2
  '-': 3
  .: 0
  a: 4
  b: 5
  c: 6
  d: 7
  e: 8
  f: 9
  g: 10
  h: 11
  i: 12
  j: 13
  k: 14
  l: 15
  m: 16
  n: 17
  o: 18
  p: 19
  q: 20
  r: 21
  s: 22
  t: 23
  u: 24
  v: 25
  w: 26
  x: 27
  y: 28
  z: 29
vocab: 30
weight_decay: 0.1
window: 32