{ "baseline": { "easy": [ 0.0, 0.05, 0.2, 0.2, 0.65, 0.65, 0.1, 0.65, 0.2, 0.75 ], "medium": [ 0.114, 0.114, 0.157, 0.157, 0.157, 0.547, 0.157, 0.114, 0.157, 0.597 ], "hard": [ 0.0, 0.1, 0.147, 0.147, 0.314, 0.314, 0.314, 0.547, 0.547, 0.06 ] }, "trained": { "easy": [ 1.0, 1.0, 1.0, 1.0, 1.0, 0.95, 1.0, 1.0, 1.0, 1.0 ], "medium": [ 0.7, 0.75, 0.72, 0.78, 0.76, 0.73, 0.74, 0.75, 0.77, 0.75 ], "hard": [ 0.7, 0.75, 0.72, 0.7, 0.76, 0.75, 0.73, 0.76, 0.73, 0.77 ] }, "training_log": [ { "step": 1, "reward": 0.259059 }, { "step": 2, "reward": 0.384363 }, { "step": 3, "reward": 0.270324 }, { "step": 4, "reward": 0.259597 }, { "step": 5, "reward": 0.166926 }, { "step": 6, "reward": 0.282057 }, { "step": 7, "reward": 0.489619 }, { "step": 8, "reward": 0.386197 }, { "step": 9, "reward": 0.482099 }, { "step": 10, "reward": 0.364925 }, { "step": 11, "reward": 0.389638 }, { "step": 12, "reward": 0.361009 }, { "step": 13, "reward": 0.089181 }, { "step": 14, "reward": 0.465453 }, { "step": 15, "reward": 0.416571 }, { "step": 16, "reward": 0.41801 }, { "step": 17, "reward": 0.104108 }, { "step": 18, "reward": 0.101241 }, { "step": 19, "reward": 0.227735 }, { "step": 20, "reward": 0.291292 }, { "step": 21, "reward": 0.403554 }, { "step": 22, "reward": 0.357225 }, { "step": 23, "reward": 0.438909 }, { "step": 24, "reward": 0.281263 }, { "step": 25, "reward": 0.414935 }, { "step": 26, "reward": 0.429267 }, { "step": 27, "reward": 0.289675 }, { "step": 28, "reward": 0.611655 }, { "step": 29, "reward": 0.458793 }, { "step": 30, "reward": 0.545738 }, { "step": 31, "reward": 0.309702 }, { "step": 32, "reward": 0.297847 }, { "step": 33, "reward": 0.352598 }, { "step": 34, "reward": 0.386378 }, { "step": 35, "reward": 0.483323 }, { "step": 36, "reward": 0.437377 }, { "step": 37, "reward": 0.353131 }, { "step": 38, "reward": 0.293348 }, { "step": 39, "reward": 0.35104 }, { "step": 40, "reward": 0.567356 }, { "step": 41, "reward": 0.323279 }, { "step": 42, "reward": 0.453673 }, { "step": 43, "reward": 0.478145 }, { "step": 44, "reward": 0.254062 }, { "step": 45, "reward": 0.439021 }, { "step": 46, "reward": 0.588363 }, { "step": 47, "reward": 0.206949 }, { "step": 48, "reward": 0.405626 }, { "step": 49, "reward": 0.433413 }, { "step": 50, "reward": 0.356555 }, { "step": 51, "reward": 0.506982 }, { "step": 52, "reward": 0.447661 }, { "step": 53, "reward": 0.297085 }, { "step": 54, "reward": 0.550515 }, { "step": 55, "reward": 0.535681 }, { "step": 56, "reward": 0.567556 }, { "step": 57, "reward": 0.621964 }, { "step": 58, "reward": 0.510664 }, { "step": 59, "reward": 0.488133 }, { "step": 60, "reward": 0.345249 }, { "step": 61, "reward": 0.544481 }, { "step": 62, "reward": 0.423154 }, { "step": 63, "reward": 0.442663 }, { "step": 64, "reward": 0.36581 }, { "step": 65, "reward": 0.399172 }, { "step": 66, "reward": 0.445467 }, { "step": 67, "reward": 0.623728 }, { "step": 68, "reward": 0.215549 }, { "step": 69, "reward": 0.298976 }, { "step": 70, "reward": 0.536739 }, { "step": 71, "reward": 0.70385 }, { "step": 72, "reward": 0.586299 }, { "step": 73, "reward": 0.251159 }, { "step": 74, "reward": 0.171176 }, { "step": 75, "reward": 0.56064 }, { "step": 76, "reward": 0.416466 }, { "step": 77, "reward": 0.368145 }, { "step": 78, "reward": 0.646721 }, { "step": 79, "reward": 0.663967 }, { "step": 80, "reward": 0.542232 }, { "step": 81, "reward": 0.555334 }, { "step": 82, "reward": 0.581106 }, { "step": 83, "reward": 0.730146 }, { "step": 84, "reward": 0.607351 }, { "step": 85, "reward": 0.596039 }, { "step": 86, "reward": 0.601045 }, { "step": 87, "reward": 0.340265 }, { "step": 88, "reward": 0.694056 }, { "step": 89, "reward": 0.654878 }, { "step": 90, "reward": 0.604261 }, { "step": 91, "reward": 0.303996 }, { "step": 92, "reward": 0.467825 }, { "step": 93, "reward": 0.64551 }, { "step": 94, "reward": 0.333659 }, { "step": 95, "reward": 0.527544 }, { "step": 96, "reward": 0.669421 }, { "step": 97, "reward": 0.401424 }, { "step": 98, "reward": 0.738976 }, { "step": 99, "reward": 0.61912 }, { "step": 100, "reward": 0.541239 }, { "step": 101, "reward": 0.596385 }, { "step": 102, "reward": 0.634048 }, { "step": 103, "reward": 0.576916 }, { "step": 104, "reward": 0.690852 }, { "step": 105, "reward": 0.495425 }, { "step": 106, "reward": 0.5244 }, { "step": 107, "reward": 0.682275 }, { "step": 108, "reward": 0.57557 }, { "step": 109, "reward": 0.48191 }, { "step": 110, "reward": 0.675139 }, { "step": 111, "reward": 0.729883 }, { "step": 112, "reward": 0.534331 }, { "step": 113, "reward": 0.44131 }, { "step": 114, "reward": 0.570031 }, { "step": 115, "reward": 0.570535 }, { "step": 116, "reward": 0.557689 }, { "step": 117, "reward": 0.727354 }, { "step": 118, "reward": 0.490705 }, { "step": 119, "reward": 0.71466 }, { "step": 120, "reward": 0.47294 }, { "step": 121, "reward": 0.521571 }, { "step": 122, "reward": 0.65766 }, { "step": 123, "reward": 0.705344 }, { "step": 124, "reward": 0.681263 }, { "step": 125, "reward": 0.635272 }, { "step": 126, "reward": 0.618379 }, { "step": 127, "reward": 0.620987 }, { "step": 128, "reward": 0.660343 }, { "step": 129, "reward": 0.595361 }, { "step": 130, "reward": 0.636973 }, { "step": 131, "reward": 0.664112 }, { "step": 132, "reward": 0.616436 }, { "step": 133, "reward": 0.683005 }, { "step": 134, "reward": 0.667534 }, { "step": 135, "reward": 0.881382 }, { "step": 136, "reward": 0.66199 }, { "step": 137, "reward": 0.565077 }, { "step": 138, "reward": 0.572436 }, { "step": 139, "reward": 0.618337 }, { "step": 140, "reward": 0.736507 }, { "step": 141, "reward": 0.577814 }, { "step": 142, "reward": 0.668061 }, { "step": 143, "reward": 0.847441 }, { "step": 144, "reward": 0.304506 }, { "step": 145, "reward": 0.482615 }, { "step": 146, "reward": 0.649624 }, { "step": 147, "reward": 0.668074 }, { "step": 148, "reward": 0.648607 }, { "step": 149, "reward": 0.568635 }, { "step": 150, "reward": 0.697542 }, { "step": 151, "reward": 0.653173 }, { "step": 152, "reward": 0.559021 }, { "step": 153, "reward": 0.901959 }, { "step": 154, "reward": 0.66093 }, { "step": 155, "reward": 0.556553 }, { "step": 156, "reward": 0.608693 }, { "step": 157, "reward": 0.594525 }, { "step": 158, "reward": 0.612964 }, { "step": 159, "reward": 0.316165 }, { "step": 160, "reward": 0.56615 }, { "step": 161, "reward": 0.730762 }, { "step": 162, "reward": 0.492574 }, { "step": 163, "reward": 0.612778 }, { "step": 164, "reward": 0.722495 }, { "step": 165, "reward": 0.711368 }, { "step": 166, "reward": 0.777962 }, { "step": 167, "reward": 0.441072 }, { "step": 168, "reward": 0.583112 }, { "step": 169, "reward": 0.584674 }, { "step": 170, "reward": 0.684097 }, { "step": 171, "reward": 0.731428 }, { "step": 172, "reward": 0.348273 }, { "step": 173, "reward": 0.72942 }, { "step": 174, "reward": 0.475635 }, { "step": 175, "reward": 0.687601 }, { "step": 176, "reward": 0.473503 }, { "step": 177, "reward": 0.637129 }, { "step": 178, "reward": 0.735436 }, { "step": 179, "reward": 0.605688 }, { "step": 180, "reward": 0.638169 }, { "step": 181, "reward": 0.695168 }, { "step": 182, "reward": 0.633222 }, { "step": 183, "reward": 0.611794 }, { "step": 184, "reward": 0.761014 }, { "step": 185, "reward": 0.715614 }, { "step": 186, "reward": 0.593434 }, { "step": 187, "reward": 0.866096 }, { "step": 188, "reward": 0.518085 }, { "step": 189, "reward": 0.700568 }, { "step": 190, "reward": 0.5968 }, { "step": 191, "reward": 0.631455 }, { "step": 192, "reward": 0.680462 }, { "step": 193, "reward": 0.638886 }, { "step": 194, "reward": 0.67378 }, { "step": 195, "reward": 0.492571 }, { "step": 196, "reward": 0.495229 }, { "step": 197, "reward": 0.670352 }, { "step": 198, "reward": 0.541884 }, { "step": 199, "reward": 0.537531 }, { "step": 200, "reward": 0.503047 }, { "step": 201, "reward": 0.719761 }, { "step": 202, "reward": 0.678232 }, { "step": 203, "reward": 0.782038 }, { "step": 204, "reward": 0.51836 }, { "step": 205, "reward": 0.6219 }, { "step": 206, "reward": 0.499499 }, { "step": 207, "reward": 0.705834 }, { "step": 208, "reward": 0.794095 }, { "step": 209, "reward": 0.530957 }, { "step": 210, "reward": 0.790732 }, { "step": 211, "reward": 0.730657 }, { "step": 212, "reward": 0.609549 }, { "step": 213, "reward": 0.424989 }, { "step": 214, "reward": 0.774419 }, { "step": 215, "reward": 0.620916 }, { "step": 216, "reward": 0.570477 }, { "step": 217, "reward": 0.672819 }, { "step": 218, "reward": 0.67449 }, { "step": 219, "reward": 0.783378 }, { "step": 220, "reward": 0.534397 }, { "step": 221, "reward": 0.747674 }, { "step": 222, "reward": 0.782066 }, { "step": 223, "reward": 0.778582 }, { "step": 224, "reward": 0.621428 }, { "step": 225, "reward": 0.568608 }, { "step": 226, "reward": 0.737255 }, { "step": 227, "reward": 0.652347 }, { "step": 228, "reward": 0.65401 }, { "step": 229, "reward": 0.775629 }, { "step": 230, "reward": 0.619872 }, { "step": 231, "reward": 0.434667 }, { "step": 232, "reward": 0.610753 }, { "step": 233, "reward": 0.479459 }, { "step": 234, "reward": 0.721158 }, { "step": 235, "reward": 0.676868 }, { "step": 236, "reward": 0.595565 }, { "step": 237, "reward": 0.649606 }, { "step": 238, "reward": 0.723794 }, { "step": 239, "reward": 0.659056 }, { "step": 240, "reward": 0.766819 }, { "step": 241, "reward": 0.648818 }, { "step": 242, "reward": 0.742717 }, { "step": 243, "reward": 0.780705 }, { "step": 244, "reward": 0.790458 }, { "step": 245, "reward": 0.602242 }, { "step": 246, "reward": 0.730449 }, { "step": 247, "reward": 0.507251 }, { "step": 248, "reward": 0.573145 }, { "step": 249, "reward": 0.504581 }, { "step": 250, "reward": 0.746683 }, { "step": 251, "reward": 0.566306 }, { "step": 252, "reward": 0.662887 }, { "step": 253, "reward": 0.649944 }, { "step": 254, "reward": 0.663484 }, { "step": 255, "reward": 0.6217 }, { "step": 256, "reward": 0.685033 }, { "step": 257, "reward": 0.801874 }, { "step": 258, "reward": 0.672524 }, { "step": 259, "reward": 0.70903 }, { "step": 260, "reward": 0.74365 }, { "step": 261, "reward": 0.657706 }, { "step": 262, "reward": 0.583078 }, { "step": 263, "reward": 0.634522 }, { "step": 264, "reward": 0.749714 }, { "step": 265, "reward": 0.561466 }, { "step": 266, "reward": 0.63539 }, { "step": 267, "reward": 0.745787 }, { "step": 268, "reward": 0.731571 }, { "step": 269, "reward": 0.679612 }, { "step": 270, "reward": 0.733146 } ], "config": { "model": "Qwen/Qwen2.5-0.5B-Instruct", "n_per_task": 30, "num_generations": 8, "epochs": 3, "lr": 1e-06, "beta": 0.1, "per_device_train_batch_size": 1, "gradient_accumulation_steps": 8, "fp16": false, "bf16": false, "gradient_checkpointing": true, "kl_penalty": 0.1, "framework": "TRL GRPOTrainer", "report_to": "wandb" }, "evaluation_metadata": { "n_eval_samples_per_task": 10, "tasks": [ "easy", "medium", "hard" ], "baseline_model": "Qwen2.5-0.5B-Instruct (untrained, fp16)", "trained_model": "Qwen2.5-0.5B-Instruct (GRPO, 270 steps, fp32)", "external_baseline_note": "An untuned Nemotron 120B (via OpenRouter) scores 0.337 average across these 3 tasks via inference.py. See README for details." } }