ann-sparseattention / search_step_2000.k_sweep.json
datasysdev's picture
Upload search_step_2000.k_sweep.json with huggingface_hub
c740ca3 verified
{
"ppl_full": 9.958177526791891,
"by_K": {
"16": {
"recall_avg": 0.2488617408769063,
"recall_per_layer": {
"4": 0.06929636437908497,
"8": 0.209304789624183,
"12": 0.2291079452614379,
"16": 0.31758450776143793,
"20": 0.3559002246732026,
"24": 0.3119766135620915
},
"ppl_ann": 10.705916802088419,
"ppl_gap_relative": 0.0750879639657738
},
"32": {
"recall_avg": 0.22758947123797024,
"recall_per_layer": {
"4": 0.08549663713910761,
"8": 0.1827639281906168,
"12": 0.20015622436843833,
"16": 0.29272973568733596,
"20": 0.31894826012959315,
"24": 0.28544204191272965
},
"ppl_ann": 10.40695869922638,
"ppl_gap_relative": 0.04506659689758181
},
"64": {
"recall_avg": 0.2313686687059083,
"recall_per_layer": {
"4": 0.109822914083168,
"8": 0.18851337735615079,
"12": 0.2026925869088955,
"16": 0.29454920531580686,
"20": 0.3052448898396164,
"24": 0.28738903873181215
},
"ppl_ann": 10.19960351785024,
"ppl_gap_relative": 0.02424399348262342
},
"128": {
"recall_avg": 0.2596596885325661,
"recall_per_layer": {
"4": 0.15761951733660953,
"8": 0.22230808709257394,
"12": 0.23406030798471103,
"16": 0.3191429876512097,
"20": 0.31382029543640794,
"24": 0.31100693569388443
},
"ppl_ann": 10.039695183436075,
"ppl_gap_relative": 0.008186001547458431
},
"256": {
"recall_avg": 0.3158585866292318,
"recall_per_layer": {
"4": 0.23482767740885416,
"8": 0.28606397840711806,
"12": 0.2944536844889323,
"16": 0.36897023518880206,
"20": 0.35041291978624134,
"24": 0.3604230244954427
},
"ppl_ann": 9.879923025767008,
"ppl_gap_relative": -0.0078583155215243
},
"512": {
"recall_avg": 0.4077308518545968,
"recall_per_layer": {
"4": 0.34663236708868117,
"8": 0.3869971320742652,
"12": 0.3905042466663179,
"16": 0.45224675678071524,
"20": 0.42589560009184335,
"24": 0.444109008425758
},
"ppl_ann": 9.670466581980387,
"ppl_gap_relative": -0.028891927668233962
}
},
"model": "Qwen/Qwen3-4B-Instruct-2507",
"checkpoint": "search_step_2000.pt",
"trained_layers": [4, 8, 12, 16, 20, 24],
"d_search": 64,
"seq_len": 4096,
"num_eval_batches": 12,
"eval_dataset": "Salesforce/wikitext (wikitext-103-raw-v1, validation split)"
}