{
"method": "downscaling_projection",
"source_model": "LisaMegaWatts/JuliaSLM",
"source_params": 5037312,
"source_loss": 3.552281068317482,
"best_config": "A-3M",
"finetune_steps": 2000,
"finetune_lr": 0.0006,
"configs": {
"A-3M": {
"config": {
"d_model": 192,
"n_layers": 6,
"n_heads": 3,
"head_dim": 64,
"ffn_inner": 480,
"context_length": 256,
"vocab_size": 2000
},
"params": 2930112,
"reduction": 0.41831834121055034,
"pre_finetune_loss": 6.9663840472007115,
"post_finetune_loss": 3.9081690383949335,
"post_finetune_ppl": 49.8076724983192
},
"B-2.5M": {
"config": {
"d_model": 192,
"n_layers": 5,
"n_heads": 3,
"head_dim": 64,
"ffn_inner": 480,
"context_length": 256,
"vocab_size": 2000
},
"params": 2505792,
"reduction": 0.5025537429486202,
"pre_finetune_loss": 7.5051396219378885,
"post_finetune_loss": 3.9149324556950593,
"post_finetune_ppl": 50.14568434125149
},
"C-2M": {
"config": {
"d_model": 192,
"n_layers": 4,
"n_heads": 3,
"head_dim": 64,
"ffn_inner": 480,
"context_length": 256,
"vocab_size": 2000
},
"params": 2081472,
"reduction": 0.58678914468669,
"pre_finetune_loss": 8.09140216928519,
"post_finetune_loss": 3.9662209947702864,
"post_finetune_ppl": 52.78467987729236
}
}
}
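
A minimal sketch of how the derived fields in this file can be checked from the raw configs. It assumes the usual conventions, which the numbers bear out: `reduction` is `1 - params / source_params`, and `post_finetune_ppl` is `exp(post_finetune_loss)`. The `param_count` formula below is an inference from the listed values (tied input/output embeddings, no learned position table, two norms per layer, a gated 3-matrix FFN, no biases); it reproduces every `params` entry exactly, but the file itself does not state how the counts were produced. The filename `downscaling_projection.json` is hypothetical.

```python
import json
import math


def param_count(cfg: dict) -> int:
    """Inferred parameter-count formula; matches the 'params' fields exactly."""
    d, layers, f, v = cfg["d_model"], cfg["n_layers"], cfg["ffn_inner"], cfg["vocab_size"]
    per_layer = 4 * d * d + 3 * d * f + 2 * d  # attention QKV+O, gated FFN, two norms
    return v * d + layers * per_layer + d      # tied embedding + blocks + final norm


with open("downscaling_projection.json") as fh:  # hypothetical filename
    report = json.load(fh)

for name, entry in report["configs"].items():
    assert param_count(entry["config"]) == entry["params"]
    assert math.isclose(entry["reduction"],
                        1 - entry["params"] / report["source_params"])
    assert math.isclose(entry["post_finetune_ppl"],
                        math.exp(entry["post_finetune_loss"]))
    print(f"{name}: {entry['params']:,} params, "
          f"{entry['reduction']:.1%} reduction, ppl {entry['post_finetune_ppl']:.2f}")
```

Under these assumptions the three configs differ only in `n_layers` (6, 5, 4), so each dropped layer removes 424,320 parameters; `A-3M` is listed as `best_config` since it reaches the lowest post-finetune loss (3.908 vs. the 3.552 of the 5.0M-parameter source model).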