{
  "method": "downscaling_projection",
  "source_model": "LisaMegaWatts/JuliaSLM",
  "source_params": 5037312,
  "source_loss": 3.552281068317482,
  "best_config": "A-3M",
  "finetune_steps": 2000,
  "finetune_lr": 0.0006,
  "configs": {
    "A-3M": {
      "config": {
        "d_model": 192,
        "n_layers": 6,
        "n_heads": 3,
        "head_dim": 64,
        "ffn_inner": 480,
        "context_length": 256,
        "vocab_size": 2000
      },
      "params": 2930112,
      "reduction": 0.41831834121055034,
      "pre_finetune_loss": 6.9663840472007115,
      "post_finetune_loss": 3.9081690383949335,
      "post_finetune_ppl": 49.8076724983192
    },
    "B-2.5M": {
      "config": {
        "d_model": 192,
        "n_layers": 5,
        "n_heads": 3,
        "head_dim": 64,
        "ffn_inner": 480,
        "context_length": 256,
        "vocab_size": 2000
      },
      "params": 2505792,
      "reduction": 0.5025537429486202,
      "pre_finetune_loss": 7.5051396219378885,
      "post_finetune_loss": 3.9149324556950593,
      "post_finetune_ppl": 50.14568434125149
    },
    "C-2M": {
      "config": {
        "d_model": 192,
        "n_layers": 4,
        "n_heads": 3,
        "head_dim": 64,
        "ffn_inner": 480,
        "context_length": 256,
        "vocab_size": 2000
      },
      "params": 2081472,
      "reduction": 0.58678914468669,
      "pre_finetune_loss": 8.09140216928519,
      "post_finetune_loss": 3.9662209947702864,
      "post_finetune_ppl": 52.78467987729236
    }
  }
}