File size: 14,850 Bytes
0b0ec56 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 | {
"base_model_class": "Gemma4TextCausalLMProxy",
"code_files": [
"/tmp/gemma4-hf-export-zxa_ushe/configuration_gemma4.py",
"/tmp/gemma4-hf-export-zxa_ushe/modeling_gemma4.py",
"/tmp/gemma4-hf-export-zxa_ushe/gemma4_optimization.py",
"/tmp/gemma4-hf-export-zxa_ushe/__init__.py"
],
"max_shard_size": "5GB",
"model_class": "OptimizedGemma4ForCausalLM",
"output_dir": "/tmp/gemma4-hf-export-zxa_ushe",
"repo_id": "haysonC/gemma4-zero-compute",
"router_checkpoint": {
"config": {
"metadata": {
"resume_summary": {
"config": {
"metadata": {
"resume_summary": {
"config": {
"metadata": {
"resume_summary": {
"loaded": false,
"reason": "resume disabled"
},
"step": 500,
"training_metrics": {
"current_lambda_zero_compute": 2.0,
"effective_batch_size": 16,
"entropy_loss": 4.538632531960806,
"entropy_term": 0.0,
"expert_usage_sample": [
1240.0,
2139.0,
1213.0,
793.0,
1992.0,
1957.0,
503.0,
1528.0,
1398.0,
2252.0,
1835.0,
1146.0,
1440.0,
1527.0,
1757.0,
803.0
],
"grad_norm": 70.5,
"gradient_accumulation_steps": 8,
"lambda_entropy": 0.0,
"lambda_router": 1.0,
"lambda_zero_compute": 2.0,
"loss": 2.327640622854233,
"micro_batch_size": 2,
"output_kl": 1.1904816403985023,
"output_kl_term": 1.1904816403985023,
"probe_output_kl": 1.1182771921157837,
"probe_router_entropy": 4.540449047088623,
"probe_router_kl": 0.051853783428668976,
"probe_same_expert_ratio": 0.8435872395833334,
"probe_zero_compute_loss": 0.5133160352706909,
"probe_zero_compute_margin_gap": 0.41453187317432216,
"probe_zero_compute_mass": 0.013335910812020301,
"probe_zero_compute_top1_ratio": 0.00654296875,
"probe_zero_compute_topk_ratio": 0.29026692708333335,
"router_entropy": 4.538632531960806,
"router_kl": 0.05338437343016267,
"router_kl_term": 0.05338437343016267,
"same_expert_ratio": 0.8424682617187501,
"step": 500,
"tokens_per_optimizer_step": 8192,
"zero_compute_loss": 0.5418873056769371,
"zero_compute_margin_gap": 0.4672557485134652,
"zero_compute_mass": 0.012517090452214084,
"zero_compute_ramp_steps": 50,
"zero_compute_term": 1.0837746113538742,
"zero_compute_top1_ratio": 0.00439453125,
"zero_compute_topk_margin": 0.0,
"zero_compute_topk_ratio": 0.24339599609374998,
"zero_compute_warmup_steps": 0,
"zero_expert_usage": 1080.0
}
},
"model_config": {
"add_zero_compute_expert": true,
"num_experts": 128,
"top_k_experts": 8,
"use_zero_compute_optimization": true
},
"num_router_keys": 90,
"router_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"source_model_id": ""
},
"config_path": "/cache/router_artifacts/router_config.json",
"loaded": true,
"loaded_key_count": 90,
"loaded_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"path": "/cache/router_artifacts/router_state_dict.pt"
},
"step": 100,
"training_metrics": {
"current_lambda_zero_compute": 3.0,
"effective_batch_size": 16,
"entropy_loss": 4.542559911807379,
"entropy_term": 0.0,
"expert_usage_sample": [
1239.0,
2070.0,
1500.0,
1201.0,
2009.0,
1821.0,
670.0,
1778.0,
1452.0,
2154.0,
2007.0,
980.0,
1320.0,
1568.0,
1522.0,
700.0
],
"grad_norm": 446.0,
"gradient_accumulation_steps": 8,
"lambda_entropy": 0.0,
"lambda_router": 1.0,
"lambda_zero_compute": 3.0,
"loss": 2.6639687418937683,
"micro_batch_size": 2,
"output_kl": 0.9797117039561272,
"output_kl_term": 0.9797117039561272,
"probe_output_kl": 1.0213916301727295,
"probe_router_entropy": 4.537958733240763,
"probe_router_kl": 0.05228007212281227,
"probe_same_expert_ratio": 0.8512044270833333,
"probe_zero_compute_loss": 0.5116603970527649,
"probe_zero_compute_margin_gap": 0.37900154244465134,
"probe_zero_compute_mass": 0.014502804105480513,
"probe_zero_compute_top1_ratio": 0.02294921875,
"probe_zero_compute_topk_ratio": 0.3021158854166667,
"router_entropy": 4.542559911807379,
"router_kl": 0.05281998496502638,
"router_kl_term": 0.05281998496502638,
"same_expert_ratio": 0.85335693359375,
"step": 100,
"tokens_per_optimizer_step": 8192,
"zero_compute_loss": 0.5438123419880867,
"zero_compute_margin_gap": 0.4289153911076331,
"zero_compute_mass": 0.013749805480862657,
"zero_compute_ramp_steps": 50,
"zero_compute_term": 1.6314370036125183,
"zero_compute_top1_ratio": 0.019120279947916666,
"zero_compute_topk_margin": 0.0,
"zero_compute_topk_ratio": 0.26689453125,
"zero_compute_warmup_steps": 0,
"zero_expert_usage": 4699.0
}
},
"model_config": {
"add_zero_compute_expert": true,
"num_experts": 128,
"top_k_experts": 8,
"use_zero_compute_optimization": true
},
"num_router_keys": 90,
"router_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"source_model_id": ""
},
"config_path": "/cache/router_artifacts/router_config.json",
"loaded": true,
"loaded_key_count": 90,
"loaded_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"path": "/cache/router_artifacts/router_state_dict.pt"
},
"step": 500,
"training_metrics": {
"current_lambda_zero_compute": 3.0,
"easy_token_ratio": 0.88720703125,
"effective_batch_size": 16,
"entropy_loss": 4.538989106814067,
"entropy_term": 0.009077978213628133,
"expert_usage_sample": [
1200.0,
2028.0,
1208.0,
783.0,
2040.0,
1847.0,
497.0,
1490.0,
1385.0,
2217.0,
1843.0,
1175.0,
1406.0,
1502.0,
1687.0,
771.0
],
"grad_norm": 204.0,
"gradient_accumulation_steps": 8,
"lambda_entropy": 0.002,
"lambda_router": 1.0,
"lambda_zero_compute": 3.0,
"loss": 2.76311457157135,
"micro_batch_size": 2,
"output_kl": 1.1192611530423164,
"output_kl_term": 1.1192611530423164,
"probe_easy_token_ratio": 0.9248046875,
"probe_output_kl": 1.1542232036590576,
"probe_router_entropy": 4.538172864913941,
"probe_router_kl": 0.0544092059135437,
"probe_same_expert_ratio": 0.8352864583333334,
"probe_teacher_confidence_mean": 0.6827144622802734,
"probe_zero_compute_loss": 0.5005475282669067,
"probe_zero_compute_margin_gap": 0.3712589807061819,
"probe_zero_compute_mass": 0.01432527024565543,
"probe_zero_compute_token_weight_mean": 0.6357069611549377,
"probe_zero_compute_top1_hits_actual": 568.0,
"probe_zero_compute_top1_ratio": 0.017643229166666666,
"probe_zero_compute_top1_ratio_actual": 0.018489583333333334,
"probe_zero_compute_topk_hits_actual": 10002.0,
"probe_zero_compute_topk_ratio": 0.31997760956028976,
"probe_zero_compute_topk_ratio_actual": 0.0406982421875,
"router_entropy": 4.538989106814067,
"router_kl": 0.054629013407975435,
"router_kl_term": 0.054629013407975435,
"same_expert_ratio": 0.8434204101562499,
"step": 500,
"teacher_confidence_mean": 0.6225322559475899,
"tokens_per_optimizer_step": 8192,
"zero_compute_loss": 0.5267154797911644,
"zero_compute_margin_gap": 0.418523011850672,
"zero_compute_mass": 0.01356517664706988,
"zero_compute_ramp_steps": 50,
"zero_compute_term": 1.5801464468240738,
"zero_compute_token_weight_mean": 0.5501668378710747,
"zero_compute_top1_hits_actual": 459.0,
"zero_compute_top1_ratio": 0.014103190104166665,
"zero_compute_top1_ratio_actual": 0.014941406249999997,
"zero_compute_topk_hits_actual": 8916.125,
"zero_compute_topk_margin": 0.0,
"zero_compute_topk_ratio": 0.27892842232367093,
"zero_compute_topk_ratio_actual": 0.03627980550130208,
"zero_compute_warmup_steps": 0,
"zero_expert_usage": 3466.0
}
},
"model_config": {
"add_zero_compute_expert": true,
"num_experts": 128,
"top_k_experts": 8,
"use_zero_compute_optimization": true
},
"num_router_keys": 90,
"router_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"source_model_id": ""
},
"config_path": "/cache/router_artifacts/router_config.json",
"loaded": true,
"loaded_key_count": 90,
"loaded_keys_sample": [
"model.layers.0.router.per_expert_scale",
"model.layers.0.router.proj.weight",
"model.layers.0.router.scale",
"model.layers.1.router.per_expert_scale",
"model.layers.1.router.proj.weight",
"model.layers.1.router.scale",
"model.layers.10.router.per_expert_scale",
"model.layers.10.router.proj.weight",
"model.layers.10.router.scale",
"model.layers.11.router.per_expert_scale",
"model.layers.11.router.proj.weight",
"model.layers.11.router.scale"
],
"path": "/cache/router_artifacts/router_state_dict.pt"
},
"source_model_id": "google/gemma-4-26B-A4B-it",
"torch_dtype": "torch.bfloat16"
} |