| attn_pdrop: 0.1 | |
| b1: 0.9 | |
| b2: 0.95 | |
| batch_size: 4096 | |
| blocks: 6 | |
| d_model: 128 | |
| embd_pdrop: 0.1 | |
| epochs: 50000 | |
| heads: 4 | |
| lr: 0.0003 | |
| model_type: gpt | |
| num_final_chars_in_dataset: 2 | |
| num_workers: 4 | |
| resid_pdrop: 0.1 | |
| stoi: | |
| .: 0 | |
| a: 1 | |
| b: 2 | |
| c: 3 | |
| d: 4 | |
| e: 5 | |
| f: 6 | |
| g: 7 | |
| h: 8 | |
| i: 9 | |
| j: 10 | |
| k: 11 | |
| l: 12 | |
| m: 13 | |
| n: 14 | |
| o: 15 | |
| p: 16 | |
| q: 17 | |
| r: 18 | |
| s: 19 | |
| t: 20 | |
| u: 21 | |
| v: 22 | |
| w: 23 | |
| x: 24 | |
| y: 25 | |
| z: 26 | |
| vocab: 27 | |
| weight_decay: 0.1 | |
| window: 32 | |