datasysdev commited on
Commit
c740ca3
·
verified ·
1 Parent(s): 92ec570

Upload search_step_2000.k_sweep.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. search_step_2000.k_sweep.json +90 -0
search_step_2000.k_sweep.json ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ppl_full": 9.958177526791891,
3
+ "by_K": {
4
+ "16": {
5
+ "recall_avg": 0.2488617408769063,
6
+ "recall_per_layer": {
7
+ "4": 0.06929636437908497,
8
+ "8": 0.209304789624183,
9
+ "12": 0.2291079452614379,
10
+ "16": 0.31758450776143793,
11
+ "20": 0.3559002246732026,
12
+ "24": 0.3119766135620915
13
+ },
14
+ "ppl_ann": 10.705916802088419,
15
+ "ppl_gap_relative": 0.0750879639657738
16
+ },
17
+ "32": {
18
+ "recall_avg": 0.22758947123797024,
19
+ "recall_per_layer": {
20
+ "4": 0.08549663713910761,
21
+ "8": 0.1827639281906168,
22
+ "12": 0.20015622436843833,
23
+ "16": 0.29272973568733596,
24
+ "20": 0.31894826012959315,
25
+ "24": 0.28544204191272965
26
+ },
27
+ "ppl_ann": 10.40695869922638,
28
+ "ppl_gap_relative": 0.04506659689758181
29
+ },
30
+ "64": {
31
+ "recall_avg": 0.2313686687059083,
32
+ "recall_per_layer": {
33
+ "4": 0.109822914083168,
34
+ "8": 0.18851337735615079,
35
+ "12": 0.2026925869088955,
36
+ "16": 0.29454920531580686,
37
+ "20": 0.3052448898396164,
38
+ "24": 0.28738903873181215
39
+ },
40
+ "ppl_ann": 10.19960351785024,
41
+ "ppl_gap_relative": 0.02424399348262342
42
+ },
43
+ "128": {
44
+ "recall_avg": 0.2596596885325661,
45
+ "recall_per_layer": {
46
+ "4": 0.15761951733660953,
47
+ "8": 0.22230808709257394,
48
+ "12": 0.23406030798471103,
49
+ "16": 0.3191429876512097,
50
+ "20": 0.31382029543640794,
51
+ "24": 0.31100693569388443
52
+ },
53
+ "ppl_ann": 10.039695183436075,
54
+ "ppl_gap_relative": 0.008186001547458431
55
+ },
56
+ "256": {
57
+ "recall_avg": 0.3158585866292318,
58
+ "recall_per_layer": {
59
+ "4": 0.23482767740885416,
60
+ "8": 0.28606397840711806,
61
+ "12": 0.2944536844889323,
62
+ "16": 0.36897023518880206,
63
+ "20": 0.35041291978624134,
64
+ "24": 0.3604230244954427
65
+ },
66
+ "ppl_ann": 9.879923025767008,
67
+ "ppl_gap_relative": -0.0078583155215243
68
+ },
69
+ "512": {
70
+ "recall_avg": 0.4077308518545968,
71
+ "recall_per_layer": {
72
+ "4": 0.34663236708868117,
73
+ "8": 0.3869971320742652,
74
+ "12": 0.3905042466663179,
75
+ "16": 0.45224675678071524,
76
+ "20": 0.42589560009184335,
77
+ "24": 0.444109008425758
78
+ },
79
+ "ppl_ann": 9.670466581980387,
80
+ "ppl_gap_relative": -0.028891927668233962
81
+ }
82
+ },
83
+ "model": "Qwen/Qwen3-4B-Instruct-2507",
84
+ "checkpoint": "search_step_2000.pt",
85
+ "trained_layers": [4, 8, 12, 16, 20, 24],
86
+ "d_search": 64,
87
+ "seq_len": 4096,
88
+ "num_eval_batches": 12,
89
+ "eval_dataset": "Salesforce/wikitext (wikitext-103-raw-v1, validation split)"
90
+ }