Transformers
Safetensors
trl
grpo
arabic-poetry
classical-arabic
lora
AhmadAbbass committed on
Commit
e208bab
·
verified ·
1 Parent(s): acdd5c0

Training in progress, step 100

Browse files
.gitattributes CHANGED
@@ -41,3 +41,5 @@ all_generations.jsonl filter=lfs diff=lfs merge=lfs -text
41
  plots/kl_chain.png filter=lfs diff=lfs merge=lfs -text
42
  plots/kl_run.png filter=lfs diff=lfs merge=lfs -text
43
  reward_meter_debug.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
41
  plots/kl_chain.png filter=lfs diff=lfs merge=lfs -text
42
  plots/kl_run.png filter=lfs diff=lfs merge=lfs -text
43
  reward_meter_debug.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ plots/arabic_gate_chain.png filter=lfs diff=lfs merge=lfs -text
45
+ plots/arabic_gate_run.png filter=lfs diff=lfs merge=lfs -text
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b22cc3a110884e81d8a76ffa02ccc5b6d8f37c1bda6ffc0ee2fcbbb1777cf44b
3
  size 639691872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:922c3172bc557ef9c01ab300dd85b84b1d072528f1502cf5cd438a6c1b5e61ea
3
  size 639691872
all_generations.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76f5fab6a16239e3cf665d10bed35aff5ce762c2dd5e3e4acb32ba98fa3a116d
3
- size 3240950
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b44866c1e0361481748c43790c9643aaa1a60cb32760553d165756d5d164e856
3
+ size 6660768
checkpoint_events.jsonl CHANGED
@@ -1,2 +1,4 @@
1
  {"timestamp_utc": "2026-04-11T19:10:17Z", "event_type": "evaluation_completed", "global_step": 50, "metrics": {"eval_loss": NaN, "eval_runtime": 84.0885, "eval_samples_per_second": 1.237, "eval_steps_per_second": 0.155}}
2
  {"timestamp_utc": "2026-04-11T19:10:19Z", "event_type": "checkpoint_saved", "global_step": 50, "local_checkpoint_dir": "/root/workspace/Shaer/grpo/outputs/train/shaer_grpo_20260411_190212/checkpoint-50", "hub_model_id": "Shaer-AI/Shaer-adapters-grpo", "expected_hub_prefix": "last-checkpoint"}
 
 
 
1
  {"timestamp_utc": "2026-04-11T19:10:17Z", "event_type": "evaluation_completed", "global_step": 50, "metrics": {"eval_loss": NaN, "eval_runtime": 84.0885, "eval_samples_per_second": 1.237, "eval_steps_per_second": 0.155}}
2
  {"timestamp_utc": "2026-04-11T19:10:19Z", "event_type": "checkpoint_saved", "global_step": 50, "local_checkpoint_dir": "/root/workspace/Shaer/grpo/outputs/train/shaer_grpo_20260411_190212/checkpoint-50", "hub_model_id": "Shaer-AI/Shaer-adapters-grpo", "expected_hub_prefix": "last-checkpoint"}
3
+ {"timestamp_utc": "2026-04-11T19:16:42Z", "event_type": "evaluation_completed", "global_step": 100, "metrics": {"eval_loss": NaN, "eval_runtime": 82.4151, "eval_samples_per_second": 1.262, "eval_steps_per_second": 0.158}}
4
+ {"timestamp_utc": "2026-04-11T19:16:44Z", "event_type": "checkpoint_saved", "global_step": 100, "local_checkpoint_dir": "/root/workspace/Shaer/grpo/outputs/train/shaer_grpo_20260411_190212/checkpoint-100", "hub_model_id": "Shaer-AI/Shaer-adapters-grpo", "expected_hub_prefix": "last-checkpoint"}
metrics.csv CHANGED
@@ -51,3 +51,54 @@ clip_ratio/high_max,clip_ratio/high_mean,clip_ratio/low_mean,clip_ratio/low_min,
51
  0.15510659664869308,0.15510659664869308,0.07543548569083214,0.07543548569083214,0.23054208233952522,0.0,233.0,233.0,174.5,174.5,114.0,114.0,2.8749085068702698,0.0019308001235712078,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,50,8.299102783203125,1.147744707763195,9.851515151515151e-06,0.1036,train,101022.0,0.41803598403930664,0.41803598403930664,0.24869033694267273,0.24869033694267273,0.41803598403930664,0.41803598403930664,0.24869033694267273,2.0,1.0149977207183838,0.2503611147403717,1.3848509788513184,0.22161900997161865,2026-04-11T19:08:52Z
52
  ,,,,,,,,,,,,,0.0019308001235712078,0.0,0.0,0.0,0.0,0.0,0.04807692307692308,424.38461538461536,356.0769230769231,197.5096153846154,182.15934518667368,40.46153846153846,40.46153846153846,2.5429875483879676,0.0,1.2288624094082758,nan,101022.0,0.3135691468532269,0.3135691468532269,0.29070903016970706,nan,0.3135691468532269,0.3135691468532269,0.29070903016970706,84.0885,1.237,1.629158432667072,1.0422258835572462,0.3146973011585382,1.1673351434560924,0.15005172731784674,0.155,,50,,,,,eval,,,,,,,,,,,,,,2026-04-11T19:10:17Z
53
  0.13118881545960903,0.13118881545960903,0.0884981770068407,0.0884981770068407,0.21968699246644974,0.0,54.0,54.0,39.5,39.5,29.0,29.0,2.0421162992715836,0.001969416126042632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,51,20.66379165649414,2.125137120485306,9.84848484848485e-06,0.0023,train,102602.0,0.48928219079971313,0.48928219079971313,0.3961140215396881,0.3961139917373657,0.48928219079971313,0.48928219079971313,0.3961140215396881,2.0,1.0036693811416626,0.1804288774728775,1.712418556213379,0.24577820301055908,2026-04-11T19:10:23Z
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  0.15510659664869308,0.15510659664869308,0.07543548569083214,0.07543548569083214,0.23054208233952522,0.0,233.0,233.0,174.5,174.5,114.0,114.0,2.8749085068702698,0.0019308001235712078,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,50,8.299102783203125,1.147744707763195,9.851515151515151e-06,0.1036,train,101022.0,0.41803598403930664,0.41803598403930664,0.24869033694267273,0.24869033694267273,0.41803598403930664,0.41803598403930664,0.24869033694267273,2.0,1.0149977207183838,0.2503611147403717,1.3848509788513184,0.22161900997161865,2026-04-11T19:08:52Z
52
  ,,,,,,,,,,,,,0.0019308001235712078,0.0,0.0,0.0,0.0,0.0,0.04807692307692308,424.38461538461536,356.0769230769231,197.5096153846154,182.15934518667368,40.46153846153846,40.46153846153846,2.5429875483879676,0.0,1.2288624094082758,nan,101022.0,0.3135691468532269,0.3135691468532269,0.29070903016970706,nan,0.3135691468532269,0.3135691468532269,0.29070903016970706,84.0885,1.237,1.629158432667072,1.0422258835572462,0.3146973011585382,1.1673351434560924,0.15005172731784674,0.155,,50,,,,,eval,,,,,,,,,,,,,,2026-04-11T19:10:17Z
53
  0.13118881545960903,0.13118881545960903,0.0884981770068407,0.0884981770068407,0.21968699246644974,0.0,54.0,54.0,39.5,39.5,29.0,29.0,2.0421162992715836,0.001969416126042632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,51,20.66379165649414,2.125137120485306,9.84848484848485e-06,0.0023,train,102602.0,0.48928219079971313,0.48928219079971313,0.3961140215396881,0.3961139917373657,0.48928219079971313,0.48928219079971313,0.3961140215396881,2.0,1.0036693811416626,0.1804288774728775,1.712418556213379,0.24577820301055908,2026-04-11T19:10:23Z
54
+ 0.10031133238226175,0.10031133238226175,0.08187933266162872,0.08187933266162872,0.18219066504389048,0.0,79.0,79.0,62.875,62.875,38.0,38.0,2.290161669254303,0.002008032128514056,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,52,12.180355072021484,1.405107393860817,9.845454545454546e-06,-0.017,train,104497.0,0.5805736780166626,0.5805736780166626,0.484282523393631,0.4842824935913086,0.5805736780166626,0.5805736780166626,0.484282523393631,2.0,1.034999966621399,0.07935328036546707,2.5338454246520996,0.21816720068454742,2026-04-11T19:10:29Z
55
+ 0.11070484854280949,0.11070484854280949,0.09725878294557333,0.09725878294557333,0.20796363148838282,0.0,66.0,66.0,42.125,42.125,28.0,28.0,1.8133485913276672,0.0020466481309854806,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,53,16.569290161132812,1.4137683361768723,9.842424242424243e-06,0.2218,train,105978.0,0.42543840408325195,0.42543840408325195,0.4286118447780609,0.4286118149757385,0.42543840408325195,0.42543840408325195,0.4286118447780609,2.0,1.0270792245864868,0.16867610812187195,1.7797749042510986,0.19726911187171936,2026-04-11T19:10:34Z
56
+ 0.12182400096207857,0.12182400096207857,0.07627276703715324,0.07627276703715324,0.19809676799923182,0.0,101.0,101.0,60.25,60.25,37.0,37.0,2.285839468240738,0.0020852641334569047,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,54,14.2871675491333,2.2034710496664047,9.83939393939394e-06,0.123,train,107780.0,0.6305832266807556,0.6305832266807556,0.4164789617061615,0.4164789617061615,0.6305832266807556,0.6305832266807556,0.4164789617061615,2.0,1.0409408807754517,0.18927349150180817,1.6645622253417969,0.21253442764282227,2026-04-11T19:10:39Z
57
+ 0.09078771062195301,0.09078771062195301,0.09197395481169224,0.09197395481169224,0.18276166543364525,0.0,65.0,65.0,51.75,51.75,40.0,40.0,2.4415661692619324,0.002123880135928329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,55,17.327545166015625,1.748019978404045,9.836363636363637e-06,0.0258,train,109474.0,0.36311012506484985,0.36311012506484985,0.2977062463760376,0.2977062463760376,0.36311012506484985,0.36311012506484985,0.2977062463760376,2.0,1.0396475791931152,0.122948557138443,2.095989227294922,0.22551913559436798,2026-04-11T19:10:44Z
58
+ 0.08283982099965215,0.08283982099965215,0.06206474732607603,0.06206474732607603,0.14490456832572818,0.0,46.0,46.0,40.75,40.75,33.0,33.0,1.4147121086716652,0.002162496138399753,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,56,18.075702667236328,1.6087606996297836,9.833333333333333e-06,0.1289,train,111016.0,0.731018602848053,0.731018602848053,0.3394193649291992,0.3394193649291992,0.731018602848053,0.731018602848053,0.3394193649291992,2.0,1.0378867387771606,0.3471723794937134,1.0579338073730469,0.16314202547073364,2026-04-11T19:10:49Z
59
+ 0.1252468191087246,0.1252468191087246,0.061714982613921165,0.061714982613921165,0.18696180172264576,0.0,347.0,347.0,284.125,284.125,209.0,209.0,3.2829161882400513,0.002201112140871177,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,57,6.15261173248291,0.9276735782623291,9.830303030303032e-06,0.069,train,114977.0,0.6026915907859802,0.6026915907859802,0.22708161175251007,0.22708161175251007,0.6026915907859802,0.6026915907859802,0.22708161175251007,2.0,1.0440524816513062,0.20789609849452972,1.5707168579101562,0.2181180864572525,2026-04-11T19:10:57Z
60
+ 0.11394045688211918,0.11394045688211918,0.07173928804695606,0.07173928804695606,0.18567974492907524,0.0,76.0,76.0,65.125,65.125,38.0,38.0,2.45593498647213,0.002239728143342601,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,58,12.134157180786133,1.2836961075663567,9.827272727272729e-06,-0.0746,train,116850.0,0.6828622817993164,0.6828622817993164,0.41145387291908264,0.41145387291908264,0.6828622817993164,0.6828622817993164,0.41145387291908264,2.0,1.040724754333496,0.3234720528125763,1.1286425590515137,0.19039244949817657,2026-04-11T19:11:02Z
61
+ 0.11741098761558533,0.11741098761558533,0.10819577984511852,0.10819577984511852,0.22560676746070385,0.0,233.0,233.0,142.75,142.75,103.0,103.0,2.218117520213127,0.002278344145814025,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,59,13.39612102508545,1.6619391813874245,9.824242424242425e-06,0.0594,train,119592.0,0.20413683354854584,0.20413683354854584,0.1784268319606781,0.1784268468618393,0.20413683354854584,0.20413683354854584,0.1784268319606781,2.0,1.0229341983795166,0.0802595466375351,2.522489547729492,0.26621776819229126,2026-04-11T19:11:09Z
62
+ 0.08825728669762611,0.08825728669762611,0.07143263425678015,0.07143263425678015,0.15968992095440626,0.0,76.0,76.0,46.125,46.125,37.0,37.0,2.614510953426361,0.0023169601482854493,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,60,15.286205291748047,1.8500682413578033,9.821212121212122e-06,0.0463,train,121201.0,0.5946000814437866,0.5946000814437866,0.3260856568813324,0.3260856568813324,0.5946000814437866,0.5946000814437866,0.3260856568813324,2.0,1.0501973628997803,0.21369504928588867,1.5432052612304688,0.20776869356632233,2026-04-11T19:11:15Z
63
+ 0.1522508319467306,0.1522508319467306,0.05286107771098614,0.05286107771098614,0.20511190965771675,0.0,43.0,43.0,31.0,31.0,22.0,22.0,2.4280820190906525,0.002355576150756874,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,61,19.05546760559082,1.743053525686264,9.81818181818182e-06,0.245,train,122769.0,0.7235491275787354,0.7235491275787354,0.44676530361175537,0.44676533341407776,0.7235491275787354,0.7235491275787354,0.44676530361175537,2.0,1.039192795753479,0.2915133237838745,1.2326695919036865,0.19886323809623718,2026-04-11T19:11:20Z
64
+ 0.11933935433626175,0.11933935433626175,0.08893644995987415,0.08893644995987415,0.2082758042961359,0.0,153.0,153.0,105.875,105.875,62.0,62.0,2.715813457965851,0.002394192153228298,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,62,10.215119361877441,1.3760973513126373,9.815151515151516e-06,0.1557,train,124968.0,0.2949569523334503,0.2949569523334503,0.2673260271549225,0.2673259973526001,0.2949569523334503,0.2949569523334503,0.2673260271549225,2.0,1.0284371376037598,0.25811153650283813,1.3543634414672852,0.22320568561553955,2026-04-11T19:11:26Z
65
+ 0.11690851114690304,0.11690851114690304,0.04621848836541176,0.04621848836541176,0.1631269995123148,0.0,84.0,84.0,70.75,70.75,48.0,48.0,2.22160741686821,0.002432808155699722,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,63,10.939824104309082,1.6386639475822449,9.812121212121212e-06,0.072,train,126782.0,0.8419647216796875,0.8419647216796875,0.3233566880226135,0.32335665822029114,0.8419647216796875,0.8419647216796875,0.3233566880226135,2.0,1.0459727048873901,0.1381281018257141,1.9795737266540527,0.19959045946598053,2026-04-11T19:11:32Z
66
+ 0.11145513504743576,0.11145513504743576,0.05013222433626652,0.05013222433626652,0.16158735938370228,0.0,483.0,483.0,390.875,390.875,315.0,315.0,2.9058758020401,0.002471424158171146,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,64,4.516471862792969,0.7644509300589561,9.809090909090911e-06,0.1276,train,131629.0,0.43812355399131775,0.43812355399131775,0.37187716364860535,0.37187716364860535,0.43812355399131775,0.43812355399131775,0.37187716364860535,2.0,1.02934730052948,0.13702331483364105,1.9876041412353516,0.19917061924934387,2026-04-11T19:11:42Z
67
+ 0.09134615398943424,0.09134615398943424,0.15403079986572266,0.15403079986572266,0.2453769538551569,0.0,48.0,48.0,34.125,34.125,24.0,24.0,1.5038166865706444,0.0025100401606425703,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,65,20.379396438598633,2.063544064760208,9.806060606060607e-06,0.0846,train,133206.0,0.39298856258392334,0.39298856258392334,0.3957642912864685,0.39576423168182373,0.39298856258392334,0.39298856258392334,0.3957642912864685,2.0,0.9815999269485474,0.10472461581230164,2.2564210891723633,0.21870994567871094,2026-04-11T19:11:46Z
68
+ 0.16264715790748596,0.16264715790748596,0.04664522036910057,0.04664522036910057,0.20929237827658653,0.0,96.0,96.0,69.625,69.625,49.0,49.0,1.3359143435955048,0.0025486561631139944,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,66,17.633543014526367,1.893242433667183,9.803030303030304e-06,0.1542,train,134995.0,0.8001365661621094,0.8001365661621094,0.3378126919269562,0.3378126621246338,0.8001365661621094,0.8001365661621094,0.3378126919269562,2.0,1.0259366035461426,0.1593431979417801,1.8366949558258057,0.20599590241909027,2026-04-11T19:11:52Z
69
+ 0.107527912594378,0.107527912594378,0.07886030618101358,0.07886030618101358,0.18638821877539158,0.0,57.0,57.0,46.5,46.5,37.0,37.0,2.648691624403,0.0025872721655854185,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,67,16.88397979736328,1.985504299402237,9.800000000000001e-06,0.0055,train,136783.0,0.6752415895462036,0.6752415895462036,0.3951883316040039,0.3951883316040039,0.6752415895462036,0.6752415895462036,0.3951883316040039,2.0,1.0569463968276978,0.32941487431526184,1.1104373931884766,0.21076610684394836,2026-04-11T19:11:56Z
70
+ 0.16749201156198978,0.16749201156198978,0.05944700539112091,0.05944700539112091,0.2269390169531107,0.0,62.0,62.0,44.75,44.75,30.0,30.0,1.8082955032587051,0.0026258881680568426,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,68,22.709701538085938,1.9188628941774368,9.796969696969698e-06,0.2069,train,138421.0,0.8447393178939819,0.8447393178939819,0.22370560467243195,0.22370561957359314,0.8447393178939819,0.8447393178939819,0.22370560467243195,2.0,1.0341558456420898,0.17428529262542725,1.7470617294311523,0.2022874504327774,2026-04-11T19:12:01Z
71
+ 0.12308589741587639,0.12308589741587639,0.08012999780476093,0.08012999780476093,0.20321589522063732,0.0,177.0,177.0,140.625,140.625,100.0,100.0,3.014527380466461,0.002664504170528267,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,69,7.776491641998291,1.129339560866356,9.793939393939394e-06,0.065,train,140882.0,0.8124464750289917,0.8124464750289917,0.22712665796279907,0.22712665796279907,0.8124464750289917,0.8124464750289917,0.22712665796279907,2.0,1.0490548610687256,0.21826361119747162,1.5220518112182617,0.2103135734796524,2026-04-11T19:12:07Z
72
+ 0.081504937261343,0.081504937261343,0.12618878670036793,0.12618878670036793,0.20769372396171093,0.0,155.0,155.0,98.375,98.375,56.0,56.0,2.8568870425224304,0.0027031201729996912,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,70,9.610224723815918,1.29872664809227,9.790909090909093e-06,0.2738,train,143117.0,0.27704793214797974,0.27704793214797974,0.31074026226997375,0.31074026226997375,0.27704793214797974,0.27704793214797974,0.31074026226997375,2.0,1.0380938053131104,0.21515779197216034,1.5363836288452148,0.24906866252422333,2026-04-11T19:12:14Z
73
+ 0.1307387314736843,0.1307387314736843,0.10119702108204365,0.10119702108204365,0.23193575255572796,0.0,126.0,126.0,81.25,81.25,45.0,45.0,2.6634568572044373,0.0027417361754711153,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,71,12.76672077178955,2.2731183916330338,9.787878787878788e-06,0.0602,train,145135.0,0.44717690348625183,0.44717690348625183,0.36459431052207947,0.36459431052207947,0.44717690348625183,0.44717690348625183,0.36459431052207947,2.0,1.0183560848236084,0.2028498500585556,1.5952892303466797,0.2269318401813507,2026-04-11T19:12:19Z
74
+ 0.08511919900774956,0.08511919900774956,0.09952043555676937,0.09952043555676937,0.18463963456451893,0.0,198.0,198.0,171.5,171.5,145.0,145.0,2.4433897733688354,0.0027803521779425394,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,72,6.893131256103516,1.1274225786328316,9.784848484848486e-06,0.0558,train,147931.0,0.34930744767189026,0.34930744767189026,0.34667089581489563,0.34667089581489563,0.34930744767189026,0.34930744767189026,0.34667089581489563,2.0,1.0105934143066406,0.21333575248718262,1.5448880195617676,0.21298885345458984,2026-04-11T19:12:26Z
75
+ 0.0748168514110148,0.0748168514110148,0.08047989010810852,0.08047989010810852,0.15529674151912332,0.0,95.0,95.0,57.5,57.5,39.0,39.0,2.515232354402542,0.0028189681804139635,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,73,13.334155082702637,1.4627206772565842,9.781818181818183e-06,0.0954,train,149855.0,0.42032575607299805,0.42032575607299805,0.4467563033103943,0.4467563033103943,0.42032575607299805,0.42032575607299805,0.4467563033103943,2.0,1.0420591831207275,0.17917320132255554,1.7194023132324219,0.2024940848350525,2026-04-11T19:12:31Z
76
+ 0.0245535708963871,0.0245535708963871,0.01854838617146015,0.01854838617146015,0.04310195706784725,0.75,512.0,504.0,466.375,329.5,155.0,155.0,0.8815881013870239,0.0028575841828853876,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,74,1.396609902381897,0.2103242427110672,9.77878787878788e-06,0.0603,train,152386.0,0.6015316843986511,0.6015316843986511,0.2646358013153076,0.2646358013153076,0.6015316843986511,0.6015316843986511,0.2646358013153076,2.0,1.0675560235977173,0.2493986040353775,1.3887028694152832,0.2191452980041504,2026-04-11T19:12:42Z
77
+ 0.07322514988481998,0.07322514988481998,0.12214752100408077,0.12214752100408077,0.19537267088890076,0.0,137.0,137.0,123.375,123.375,104.0,104.0,2.449110597372055,0.0028962001853568117,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,75,9.12006950378418,1.3578874319791794,9.775757575757576e-06,0.1183,train,154741.0,0.3497368097305298,0.3497368097305298,0.26128703355789185,0.26128703355789185,0.3497368097305298,0.3497368097305298,0.26128703355789185,2.0,1.0463789701461792,0.12505382299423218,2.0790109634399414,0.20589499175548553,2026-04-11T19:12:48Z
78
+ 0.1170717179775238,0.1170717179775238,0.09841924253851175,0.09841924253851175,0.21549096051603556,0.0,73.0,73.0,54.75,54.75,34.0,34.0,2.940678298473358,0.002934816187828236,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,76,13.480757713317871,1.7342924177646637,9.772727272727273e-06,0.2798,train,156595.0,0.5932719707489014,0.5932719707489014,0.3900352716445923,0.3900352716445923,0.5932719707489014,0.5932719707489014,0.3900352716445923,2.0,1.0449117422103882,0.13657011091709137,1.9909172058105469,0.23030723631381989,2026-04-11T19:12:53Z
79
+ 0.10580789390951395,0.10580789390951395,0.08706128969788551,0.08706128969788551,0.19286918360739946,0.0,98.0,98.0,49.75,49.75,33.0,33.0,2.5642327964305878,0.0029734321902996604,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,77,15.090164184570312,2.0099042803049088,9.76969696969697e-06,0.2751,train,158209.0,0.4592646062374115,0.4592646062374115,0.43493443727493286,0.4349344074726105,0.4592646062374115,0.4592646062374115,0.43493443727493286,2.0,1.048494815826416,0.21546848118305206,1.5349407196044922,0.22933514416217804,2026-04-11T19:12:59Z
80
+ 0.08520248159766197,0.08520248159766197,0.11678846552968025,0.11678846552968025,0.20199094712734222,0.0,205.0,205.0,164.875,164.875,98.0,98.0,2.451413109898567,0.0030120481927710845,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,78,7.737454891204834,1.39060677587986,9.766666666666667e-06,0.04,train,161112.0,0.18029195070266724,0.18029195070266724,0.1389833390712738,0.1389833241701126,0.18029195070266724,0.18029195070266724,0.1389833390712738,2.0,1.0424808263778687,0.18028594553470612,1.7132110595703125,0.2152501344680786,2026-04-11T19:13:05Z
81
+ 0.09161586128175259,0.09161586128175259,0.09096036478877068,0.09096036478877068,0.18257622607052326,0.0,439.0,439.0,342.75,342.75,227.0,227.0,2.836574971675873,0.0030506641952425086,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,79,4.973855018615723,0.8531102910637856,9.763636363636365e-06,0.0211,train,165606.0,0.2517043650150299,0.2517043650150299,0.2103721797466278,0.2103721648454666,0.2517043650150299,0.2517043650150299,0.2103721797466278,2.0,1.0435837507247925,0.16377666592597961,1.8092515468597412,0.20066173374652863,2026-04-11T19:13:15Z
82
+ 0.12067882902920246,0.12067882902920246,0.07962200790643692,0.07962200790643692,0.20030083693563938,0.0,101.0,101.0,62.625,62.625,33.0,33.0,2.800754815340042,0.0030892801977139327,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,80,12.971203804016113,1.7127311304211617,9.760606060606062e-06,0.2298,train,167379.0,0.5897529721260071,0.5897529721260071,0.3398434519767761,0.3398434519767761,0.5897529721260071,0.5897529721260071,0.3398434519767761,2.0,1.0488100051879883,0.22434468567371368,1.4945716857910156,0.20138375461101532,2026-04-11T19:13:20Z
83
+ 0.04627659637480974,0.04627659637480974,0.13005737029016018,0.13005737029016018,0.17633396666496992,0.0,47.0,47.0,43.25,43.25,36.0,36.0,0.8857106305658817,0.003127896200185357,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,81,16.626371383666992,1.9840531200170517,9.757575757575758e-06,0.0364,train,168941.0,0.32984912395477295,0.32984912395477295,0.36702674627304077,0.36702677607536316,0.32984912395477295,0.32984912395477295,0.36702674627304077,2.0,0.9905309081077576,0.08176939934492111,2.503852128982544,0.18993134796619415,2026-04-11T19:13:25Z
84
+ 0.18430931121110916,0.18430931121110916,0.030181385576725006,0.030181385576725006,0.21449069678783417,0.0,71.0,71.0,61.75,61.75,52.0,52.0,2.6517202258110046,0.003166512202656781,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,82,13.673704147338867,1.7028498351573944,9.754545454545455e-06,0.0514,train,170843.0,0.6860445737838745,0.6860445737838745,0.4127279818058014,0.412727952003479,0.6860445737838745,0.6860445737838745,0.4127279818058014,2.0,1.0191963911056519,0.3170091509819031,1.148824691772461,0.19877685606479645,2026-04-11T19:13:31Z
85
+ 0.1242379229515791,0.1242379229515791,0.0888951700180769,0.0888951700180769,0.213133092969656,0.0,274.0,274.0,193.375,193.375,119.0,119.0,3.7798104882240295,0.003205128205128205,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,83,6.804693222045898,1.5210926830768585,9.751515151515152e-06,-0.0638,train,173838.0,0.1995052993297577,0.1995052993297577,0.20155183970928192,0.20155183970928192,0.1995052993297577,0.1995052993297577,0.20155183970928192,2.0,1.0506726503372192,0.16469140350818634,1.8036818504333496,0.2313334345817566,2026-04-11T19:13:39Z
86
+ 0.044027093797922134,0.044027093797922134,0.14447841420769691,0.14447841420769691,0.18850550800561905,0.0,63.0,63.0,42.75,42.75,29.0,29.0,2.1612997502088547,0.003243744207599629,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,84,13.247566223144531,1.7388735264539719,9.74848484848485e-06,0.1179,train,175356.0,0.2839965224266052,0.2839965224266052,0.25705069303512573,0.25705069303512573,0.2839965224266052,0.2839965224266052,0.25705069303512573,2.0,1.056425929069519,0.2340237945318222,1.4523324966430664,0.20496825873851776,2026-04-11T19:13:44Z
87
+ 0.11511588096618652,0.11511588096618652,0.07980264164507389,0.07980264164507389,0.19491852261126041,0.0,145.0,145.0,94.25,94.25,56.0,56.0,2.768698900938034,0.0032823602100710537,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,85,8.849252700805664,1.4220501631498337,9.745454545454547e-06,0.1389,train,177342.0,0.5080384612083435,0.5080384612083435,0.5007855892181396,0.5007855296134949,0.5080384612083435,0.5080384612083435,0.5007855892181396,2.0,1.0272923707962036,0.1795433610677719,1.7173385620117188,0.20362532138824463,2026-04-11T19:13:50Z
88
+ 0.09607911668717861,0.09607911668717861,0.09196549654006958,0.09196549654006958,0.1880446132272482,0.0,323.0,323.0,250.875,250.875,138.0,138.0,3.1971679627895355,0.0033209762125424778,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,86,5.789488315582275,0.9879649505019188,9.742424242424244e-06,0.1116,train,181005.0,0.2234499454498291,0.2234499454498291,0.21007955074310303,0.21007955074310303,0.2234499454498291,0.2234499454498291,0.21007955074310303,2.0,1.03697669506073,0.19536978006362915,1.6328611373901367,0.2088625580072403,2026-04-11T19:13:58Z
89
+ 0.05969125218689442,0.05969125218689442,0.11843920033425093,0.11843920033425093,0.17813045252114534,0.0,55.0,55.0,45.625,45.625,33.0,33.0,2.4275936484336853,0.003359592215013902,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,87,18.224882125854492,1.5478705316781998,9.739393939393941e-06,0.0123,train,182602.0,0.3372150957584381,0.3372150957584381,0.4128407835960388,0.41284075379371643,0.3372150957584381,0.3372150957584381,0.4128407835960388,2.0,1.031768798828125,0.23187333345413208,1.461564064025879,0.21434830129146576,2026-04-11T19:14:03Z
90
+ 0.10577460564672947,0.10577460564672947,0.09458548203110695,0.09458548203110695,0.20036008767783642,0.0,203.0,203.0,179.625,179.625,154.0,154.0,2.465492472052574,0.003398208217485326,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,88,7.239674091339111,1.2833295166492462,9.736363636363637e-06,0.0868,train,185727.0,0.395793080329895,0.395793080329895,0.26035410165786743,0.26035410165786743,0.395793080329895,0.395793080329895,0.26035410165786743,2.0,1.035853385925293,0.20229095220565796,1.598048210144043,0.19813987612724304,2026-04-11T19:14:09Z
91
+ 0.12049593590199947,0.12049593590199947,0.07508473843336105,0.07508473843336105,0.19558067433536053,0.0,134.0,134.0,98.0,98.0,65.0,65.0,2.9546821117401123,0.00343682421995675,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,89,10.298208236694336,1.4282651841640472,9.733333333333334e-06,0.2009,train,187879.0,0.6336480379104614,0.6336480379104614,0.3132372200489044,0.31323719024658203,0.6336480379104614,0.6336480379104614,0.3132372200489044,2.0,1.050196647644043,0.18850484490394592,1.6686315536499023,0.20529155433177948,2026-04-11T19:14:15Z
92
+ 0.1241906825453043,0.1241906825453043,0.04326923098415136,0.04326923098415136,0.16745991352945566,0.0,64.0,64.0,48.0,48.0,33.0,33.0,2.464683413505554,0.003475440222428174,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,90,14.30321216583252,1.7402038723230362,9.730303030303031e-06,0.004,train,189559.0,0.6122487187385559,0.6122487187385559,0.3927072584629059,0.3927072584629059,0.6122487187385559,0.6122487187385559,0.3927072584629059,1.9598164558410645,1.037034511566162,0.23949864506721497,1.4292075634002686,0.20874950289726257,2026-04-11T19:14:20Z
93
+ 0.07963917776942253,0.07963917776942253,0.10331365652382374,0.10331365652382374,0.18295283429324627,0.0,262.0,262.0,174.5,174.5,121.0,121.0,3.7719518542289734,0.0035140562248995983,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,91,6.84236478805542,1.1118000373244286,9.727272727272728e-06,0.0544,train,192563.0,0.603217363357544,0.603217363357544,0.20575962960720062,0.20575964450836182,0.603217363357544,0.603217363357544,0.20575962960720062,2.0,1.0390546321868896,0.2694108486175537,1.3115177154541016,0.21506188809871674,2026-04-11T19:14:27Z
94
+ 0.07082884386181831,0.07082884386181831,0.1532957497984171,0.1532957497984171,0.2241245936602354,0.0,92.0,92.0,57.75,57.75,34.0,34.0,2.162596195936203,0.0035526722273710224,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,92,16.550352096557617,1.8161827772855759,9.724242424242426e-06,0.1559,train,194233.0,0.41938501596450806,0.41938501596450806,0.4255473017692566,0.4255472719669342,0.41938501596450806,0.41938501596450806,0.4255473017692566,2.0,1.057788372039795,0.15289942920207977,1.8779749870300293,0.2523427903652191,2026-04-11T19:14:32Z
95
+ 0.0941340047866106,0.0941340047866106,0.07444824650883675,0.07444824650883675,0.16858225129544735,0.0,344.0,344.0,312.125,312.125,268.0,268.0,3.2054056525230408,0.0035912882298424465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,93,4.504820823669434,0.8844465836882591,9.721212121212123e-06,0.0583,train,198586.0,0.1713373363018036,0.1713373363018036,0.16757051646709442,0.16757051646709442,0.1713373363018036,0.1713373363018036,0.16757051646709442,2.0,1.0430541038513184,0.19445516169071198,1.6375536918640137,0.20828686654567719,2026-04-11T19:14:41Z
96
+ 0.10831169039011002,0.10831169039011002,0.07458104286342859,0.07458104286342859,0.1828927332535386,0.0,161.0,161.0,111.625,111.625,66.0,66.0,3.28840634226799,0.003629904232313871,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,94,28.54985809326172,2.49543359875679,9.718181818181818e-06,-0.0215,train,200807.0,0.37486404180526733,0.37486404180526733,0.35751017928123474,0.35751017928123474,0.37486404180526733,0.37486404180526733,0.35751017928123474,2.0,1.0396778583526611,0.22457008063793182,1.4935674667358398,0.2191978245973587,2026-04-11T19:14:47Z
97
+ 0.03553921589627862,0.03553921589627862,0.011904762126505375,0.011904762126505375,0.047443978022783995,0.0,21.0,21.0,18.25,18.25,17.0,17.0,0.21160555630922318,0.003668520234785295,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,95,99.09620666503906,1.8823280930519104,9.715151515151516e-06,0.165,train,202121.0,0.9949363470077515,0.9949363470077515,0.0023499121889472008,0.002349905204027891,0.9949363470077515,0.9949363470077515,0.0023499121889472008,2.0,1.008124828338623,0.5384017825126648,0.8389389514923096,0.04272828623652458,2026-04-11T19:14:52Z
98
+ 0.08833824284374714,0.08833824284374714,0.1025804141536355,0.1025804141536355,0.19091865699738264,0.0,150.0,150.0,98.625,98.625,70.0,70.0,3.8312510550022125,0.0037071362372567192,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,96,8.879090309143066,1.5962010324001312,9.712121212121213e-06,0.1136,train,204182.0,0.3730781078338623,0.3730781078338623,0.2818770110607147,0.2818770110607147,0.3730781078338623,0.3730781078338623,0.2818770110607147,2.0,1.0496914386749268,0.18399666249752045,1.6928377151489258,0.2376914918422699,2026-04-11T19:14:57Z
99
+ 0.06360195018351078,0.06360195018351078,0.12544687278568745,0.12544687278568745,0.18904882296919823,0.0,144.0,144.0,94.625,94.625,60.0,60.0,2.747112363576889,0.0037457522397281433,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,97,17.37091636657715,2.371904134750366,9.70909090909091e-06,0.1295,train,206299.0,0.38106584548950195,0.38106584548950195,0.3250954747200012,0.3250954747200012,0.38106584548950195,0.38106584548950195,0.3250954747200012,2.0,1.0636258125305176,0.17890633642673492,1.7208929061889648,0.21154189109802246,2026-04-11T19:15:03Z
100
+ 0.08637197501957417,0.08637197501957417,0.10318870935589075,0.10318870935589075,0.18956068437546492,0.0,113.0,113.0,84.375,84.375,47.0,47.0,3.3423550128936768,0.0037843682421995675,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,98,9.53862190246582,1.4000272899866104,9.706060606060606e-06,0.0945,train,208238.0,0.3455486297607422,0.3455486297607422,0.4493013024330139,0.4493013024330139,0.3455486297607422,0.3455486297607422,0.4493013024330139,2.0,1.053009271621704,0.2011144906282425,1.6038808822631836,0.22088798880577087,2026-04-11T19:15:09Z
101
+ 0.03570783603936434,0.03570783603936434,0.13323841989040375,0.13323841989040375,0.16894625592976809,0.0,100.0,100.0,70.625,70.625,52.0,52.0,2.1659949123859406,0.0038229842446709916,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,99,12.195768356323242,1.4821452051401138,9.703030303030305e-06,0.0352,train,210203.0,0.18750828504562378,0.18750828504562378,0.18086771667003632,0.1808677315711975,0.18750828504562378,0.18750828504562378,0.18086771667003632,2.0,1.0437159538269043,0.24219049513339996,1.4180307388305664,0.19442309439182281,2026-04-11T19:15:14Z
102
+ 0.08358811773359776,0.08358811773359776,0.08866940345615149,0.08866940345615149,0.17225752118974924,0.0,58.0,58.0,45.0,45.0,29.0,29.0,2.016321301460266,0.0038616002471424157,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,100,14.672683715820312,1.7658196240663528,9.7e-06,0.0128,train,211811.0,0.10674573481082916,0.10674573481082916,0.08364997804164886,0.08364997804164886,0.10674573481082916,0.10674573481082916,0.08364997804164886,2.0,1.036841869354248,0.26094141602516174,1.3434593677520752,0.18656399846076965,2026-04-11T19:15:19Z
103
+ ,,,,,,,,,,,,,0.0038616002471424157,0.0,0.0,0.0,0.0,0.0,0.038461538461538464,414.7692307692308,383.84615384615387,193.94230769230768,179.54166823167068,40.61538461538461,40.61538461538461,3.318188025401189,0.0,1.2026036427571223,nan,211811.0,0.23895036027981684,0.23895036027981684,0.26200672411001646,nan,0.23895036027981684,0.23895036027981684,0.26200672411001646,82.4151,1.262,1.7262064585318933,1.0581457339800322,0.2759172205741589,1.3034063485952525,0.1760244621680333,0.158,,100,,,,,eval,,,,,,,,,,,,,,2026-04-11T19:16:42Z
104
+ 0.05549853295087814,0.05549853295087814,0.14081653021275997,0.14081653021275997,0.19631506316363811,0.0,70.0,70.0,61.375,61.375,49.0,49.0,2.9051956236362457,0.0039002162496138398,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,101,11.835779190063477,1.615099623799324,9.696969696969698e-06,0.065,train,213598.0,0.2918604016304016,0.2918604016304016,0.22347311675548553,0.22347311675548553,0.2918604016304016,0.2918604016304016,0.22347311675548553,2.0,1.06128990650177,0.32408758997917175,1.1267414093017578,0.22550015151500702,2026-04-11T19:16:49Z
metrics.jsonl CHANGED
@@ -50,3 +50,54 @@
50
  {"timestamp_utc": "2026-04-11T19:08:52Z", "mode": "train", "global_step": 50, "epoch": 0.0019308001235712078, "loss": 0.1036, "grad_norm": 8.299102783203125, "learning_rate": 9.851515151515151e-06, "num_tokens": 101022.0, "completions/mean_length": 174.5, "completions/min_length": 114.0, "completions/max_length": 233.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 174.5, "completions/min_terminated_length": 114.0, "completions/max_terminated_length": 233.0, "rewards/meter_count_clean/mean": 0.41803598403930664, "rewards/meter_count_clean/std": 0.24869033694267273, "reward": 0.41803598403930664, "reward_std": 0.24869033694267273, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22161900997161865, "sampling/sampling_logp_difference/max": 1.3848509788513184, "sampling/importance_sampling_ratio/min": 0.2503611147403717, "sampling/importance_sampling_ratio/mean": 1.0149977207183838, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.147744707763195, "entropy": 2.8749085068702698, "clip_ratio/low_mean": 0.07543548569083214, "clip_ratio/low_min": 0.07543548569083214, "clip_ratio/high_mean": 0.15510659664869308, "clip_ratio/high_max": 0.15510659664869308, "clip_ratio/region_mean": 0.23054208233952522, "reward_total_mean": 0.41803598403930664, "reward_meter_count_clean_mean": 0.41803598403930664, "reward_meter_count_clean_std": 0.24869033694267273}
51
  {"timestamp_utc": "2026-04-11T19:10:17Z", "mode": "eval", "global_step": 50, "epoch": 0.0019308001235712078, "eval_loss": NaN, "eval_runtime": 84.0885, "eval_samples_per_second": 1.237, "eval_steps_per_second": 0.155, "eval_num_tokens": 101022.0, "eval_completions/mean_length": 197.5096153846154, "eval_completions/min_length": 40.46153846153846, "eval_completions/max_length": 424.38461538461536, "eval_completions/clipped_ratio": 0.04807692307692308, "eval_completions/mean_terminated_length": 182.15934518667368, "eval_completions/min_terminated_length": 40.46153846153846, "eval_completions/max_terminated_length": 356.0769230769231, "eval_rewards/meter_count_clean/mean": 0.3135691468532269, "eval_rewards/meter_count_clean/std": 0.29070903016970706, "eval_reward": 0.3135691468532269, "eval_reward_std": NaN, "eval_frac_reward_zero_std": 0.0, "eval_sampling/sampling_logp_difference/mean": 0.15005172731784674, "eval_sampling/sampling_logp_difference/max": 1.1673351434560924, "eval_sampling/importance_sampling_ratio/min": 0.3146973011585382, "eval_sampling/importance_sampling_ratio/mean": 1.0422258835572462, "eval_sampling/importance_sampling_ratio/max": 1.629158432667072, "eval_kl": 1.2288624094082758, "eval_entropy": 2.5429875483879676, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_reward_total_mean": 0.3135691468532269, "eval_reward_meter_count_clean_mean": 0.3135691468532269, "eval_reward_meter_count_clean_std": 0.29070903016970706}
52
  {"timestamp_utc": "2026-04-11T19:10:23Z", "mode": "train", "global_step": 51, "epoch": 0.001969416126042632, "loss": 0.0023, "grad_norm": 20.66379165649414, "learning_rate": 9.84848484848485e-06, "num_tokens": 102602.0, "completions/mean_length": 39.5, "completions/min_length": 29.0, "completions/max_length": 54.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 39.5, "completions/min_terminated_length": 29.0, "completions/max_terminated_length": 54.0, "rewards/meter_count_clean/mean": 0.48928219079971313, "rewards/meter_count_clean/std": 0.3961140215396881, "reward": 0.48928219079971313, "reward_std": 0.3961139917373657, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.24577820301055908, "sampling/sampling_logp_difference/max": 1.712418556213379, "sampling/importance_sampling_ratio/min": 0.1804288774728775, "sampling/importance_sampling_ratio/mean": 1.0036693811416626, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.125137120485306, "entropy": 2.0421162992715836, "clip_ratio/low_mean": 0.0884981770068407, "clip_ratio/low_min": 0.0884981770068407, "clip_ratio/high_mean": 0.13118881545960903, "clip_ratio/high_max": 0.13118881545960903, "clip_ratio/region_mean": 0.21968699246644974, "reward_total_mean": 0.48928219079971313, "reward_meter_count_clean_mean": 0.48928219079971313, "reward_meter_count_clean_std": 0.3961140215396881}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  {"timestamp_utc": "2026-04-11T19:08:52Z", "mode": "train", "global_step": 50, "epoch": 0.0019308001235712078, "loss": 0.1036, "grad_norm": 8.299102783203125, "learning_rate": 9.851515151515151e-06, "num_tokens": 101022.0, "completions/mean_length": 174.5, "completions/min_length": 114.0, "completions/max_length": 233.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 174.5, "completions/min_terminated_length": 114.0, "completions/max_terminated_length": 233.0, "rewards/meter_count_clean/mean": 0.41803598403930664, "rewards/meter_count_clean/std": 0.24869033694267273, "reward": 0.41803598403930664, "reward_std": 0.24869033694267273, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22161900997161865, "sampling/sampling_logp_difference/max": 1.3848509788513184, "sampling/importance_sampling_ratio/min": 0.2503611147403717, "sampling/importance_sampling_ratio/mean": 1.0149977207183838, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.147744707763195, "entropy": 2.8749085068702698, "clip_ratio/low_mean": 0.07543548569083214, "clip_ratio/low_min": 0.07543548569083214, "clip_ratio/high_mean": 0.15510659664869308, "clip_ratio/high_max": 0.15510659664869308, "clip_ratio/region_mean": 0.23054208233952522, "reward_total_mean": 0.41803598403930664, "reward_meter_count_clean_mean": 0.41803598403930664, "reward_meter_count_clean_std": 0.24869033694267273}
51
  {"timestamp_utc": "2026-04-11T19:10:17Z", "mode": "eval", "global_step": 50, "epoch": 0.0019308001235712078, "eval_loss": NaN, "eval_runtime": 84.0885, "eval_samples_per_second": 1.237, "eval_steps_per_second": 0.155, "eval_num_tokens": 101022.0, "eval_completions/mean_length": 197.5096153846154, "eval_completions/min_length": 40.46153846153846, "eval_completions/max_length": 424.38461538461536, "eval_completions/clipped_ratio": 0.04807692307692308, "eval_completions/mean_terminated_length": 182.15934518667368, "eval_completions/min_terminated_length": 40.46153846153846, "eval_completions/max_terminated_length": 356.0769230769231, "eval_rewards/meter_count_clean/mean": 0.3135691468532269, "eval_rewards/meter_count_clean/std": 0.29070903016970706, "eval_reward": 0.3135691468532269, "eval_reward_std": NaN, "eval_frac_reward_zero_std": 0.0, "eval_sampling/sampling_logp_difference/mean": 0.15005172731784674, "eval_sampling/sampling_logp_difference/max": 1.1673351434560924, "eval_sampling/importance_sampling_ratio/min": 0.3146973011585382, "eval_sampling/importance_sampling_ratio/mean": 1.0422258835572462, "eval_sampling/importance_sampling_ratio/max": 1.629158432667072, "eval_kl": 1.2288624094082758, "eval_entropy": 2.5429875483879676, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_reward_total_mean": 0.3135691468532269, "eval_reward_meter_count_clean_mean": 0.3135691468532269, "eval_reward_meter_count_clean_std": 0.29070903016970706}
52
  {"timestamp_utc": "2026-04-11T19:10:23Z", "mode": "train", "global_step": 51, "epoch": 0.001969416126042632, "loss": 0.0023, "grad_norm": 20.66379165649414, "learning_rate": 9.84848484848485e-06, "num_tokens": 102602.0, "completions/mean_length": 39.5, "completions/min_length": 29.0, "completions/max_length": 54.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 39.5, "completions/min_terminated_length": 29.0, "completions/max_terminated_length": 54.0, "rewards/meter_count_clean/mean": 0.48928219079971313, "rewards/meter_count_clean/std": 0.3961140215396881, "reward": 0.48928219079971313, "reward_std": 0.3961139917373657, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.24577820301055908, "sampling/sampling_logp_difference/max": 1.712418556213379, "sampling/importance_sampling_ratio/min": 0.1804288774728775, "sampling/importance_sampling_ratio/mean": 1.0036693811416626, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.125137120485306, "entropy": 2.0421162992715836, "clip_ratio/low_mean": 0.0884981770068407, "clip_ratio/low_min": 0.0884981770068407, "clip_ratio/high_mean": 0.13118881545960903, "clip_ratio/high_max": 0.13118881545960903, "clip_ratio/region_mean": 0.21968699246644974, "reward_total_mean": 0.48928219079971313, "reward_meter_count_clean_mean": 0.48928219079971313, "reward_meter_count_clean_std": 0.3961140215396881}
53
+ {"timestamp_utc": "2026-04-11T19:10:29Z", "mode": "train", "global_step": 52, "epoch": 0.002008032128514056, "loss": -0.017, "grad_norm": 12.180355072021484, "learning_rate": 9.845454545454546e-06, "num_tokens": 104497.0, "completions/mean_length": 62.875, "completions/min_length": 38.0, "completions/max_length": 79.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 62.875, "completions/min_terminated_length": 38.0, "completions/max_terminated_length": 79.0, "rewards/meter_count_clean/mean": 0.5805736780166626, "rewards/meter_count_clean/std": 0.484282523393631, "reward": 0.5805736780166626, "reward_std": 0.4842824935913086, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21816720068454742, "sampling/sampling_logp_difference/max": 2.5338454246520996, "sampling/importance_sampling_ratio/min": 0.07935328036546707, "sampling/importance_sampling_ratio/mean": 1.034999966621399, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.405107393860817, "entropy": 2.290161669254303, "clip_ratio/low_mean": 0.08187933266162872, "clip_ratio/low_min": 0.08187933266162872, "clip_ratio/high_mean": 0.10031133238226175, "clip_ratio/high_max": 0.10031133238226175, "clip_ratio/region_mean": 0.18219066504389048, "reward_total_mean": 0.5805736780166626, "reward_meter_count_clean_mean": 0.5805736780166626, "reward_meter_count_clean_std": 0.484282523393631}
54
+ {"timestamp_utc": "2026-04-11T19:10:34Z", "mode": "train", "global_step": 53, "epoch": 0.0020466481309854806, "loss": 0.2218, "grad_norm": 16.569290161132812, "learning_rate": 9.842424242424243e-06, "num_tokens": 105978.0, "completions/mean_length": 42.125, "completions/min_length": 28.0, "completions/max_length": 66.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 42.125, "completions/min_terminated_length": 28.0, "completions/max_terminated_length": 66.0, "rewards/meter_count_clean/mean": 0.42543840408325195, "rewards/meter_count_clean/std": 0.4286118447780609, "reward": 0.42543840408325195, "reward_std": 0.4286118149757385, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19726911187171936, "sampling/sampling_logp_difference/max": 1.7797749042510986, "sampling/importance_sampling_ratio/min": 0.16867610812187195, "sampling/importance_sampling_ratio/mean": 1.0270792245864868, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4137683361768723, "entropy": 1.8133485913276672, "clip_ratio/low_mean": 0.09725878294557333, "clip_ratio/low_min": 0.09725878294557333, "clip_ratio/high_mean": 0.11070484854280949, "clip_ratio/high_max": 0.11070484854280949, "clip_ratio/region_mean": 0.20796363148838282, "reward_total_mean": 0.42543840408325195, "reward_meter_count_clean_mean": 0.42543840408325195, "reward_meter_count_clean_std": 0.4286118447780609}
55
+ {"timestamp_utc": "2026-04-11T19:10:39Z", "mode": "train", "global_step": 54, "epoch": 0.0020852641334569047, "loss": 0.123, "grad_norm": 14.2871675491333, "learning_rate": 9.83939393939394e-06, "num_tokens": 107780.0, "completions/mean_length": 60.25, "completions/min_length": 37.0, "completions/max_length": 101.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 60.25, "completions/min_terminated_length": 37.0, "completions/max_terminated_length": 101.0, "rewards/meter_count_clean/mean": 0.6305832266807556, "rewards/meter_count_clean/std": 0.4164789617061615, "reward": 0.6305832266807556, "reward_std": 0.4164789617061615, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21253442764282227, "sampling/sampling_logp_difference/max": 1.6645622253417969, "sampling/importance_sampling_ratio/min": 0.18927349150180817, "sampling/importance_sampling_ratio/mean": 1.0409408807754517, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.2034710496664047, "entropy": 2.285839468240738, "clip_ratio/low_mean": 0.07627276703715324, "clip_ratio/low_min": 0.07627276703715324, "clip_ratio/high_mean": 0.12182400096207857, "clip_ratio/high_max": 0.12182400096207857, "clip_ratio/region_mean": 0.19809676799923182, "reward_total_mean": 0.6305832266807556, "reward_meter_count_clean_mean": 0.6305832266807556, "reward_meter_count_clean_std": 0.4164789617061615}
56
+ {"timestamp_utc": "2026-04-11T19:10:44Z", "mode": "train", "global_step": 55, "epoch": 0.002123880135928329, "loss": 0.0258, "grad_norm": 17.327545166015625, "learning_rate": 9.836363636363637e-06, "num_tokens": 109474.0, "completions/mean_length": 51.75, "completions/min_length": 40.0, "completions/max_length": 65.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 51.75, "completions/min_terminated_length": 40.0, "completions/max_terminated_length": 65.0, "rewards/meter_count_clean/mean": 0.36311012506484985, "rewards/meter_count_clean/std": 0.2977062463760376, "reward": 0.36311012506484985, "reward_std": 0.2977062463760376, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22551913559436798, "sampling/sampling_logp_difference/max": 2.095989227294922, "sampling/importance_sampling_ratio/min": 0.122948557138443, "sampling/importance_sampling_ratio/mean": 1.0396475791931152, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.748019978404045, "entropy": 2.4415661692619324, "clip_ratio/low_mean": 0.09197395481169224, "clip_ratio/low_min": 0.09197395481169224, "clip_ratio/high_mean": 0.09078771062195301, "clip_ratio/high_max": 0.09078771062195301, "clip_ratio/region_mean": 0.18276166543364525, "reward_total_mean": 0.36311012506484985, "reward_meter_count_clean_mean": 0.36311012506484985, "reward_meter_count_clean_std": 0.2977062463760376}
57
+ {"timestamp_utc": "2026-04-11T19:10:49Z", "mode": "train", "global_step": 56, "epoch": 0.002162496138399753, "loss": 0.1289, "grad_norm": 18.075702667236328, "learning_rate": 9.833333333333333e-06, "num_tokens": 111016.0, "completions/mean_length": 40.75, "completions/min_length": 33.0, "completions/max_length": 46.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 40.75, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 46.0, "rewards/meter_count_clean/mean": 0.731018602848053, "rewards/meter_count_clean/std": 0.3394193649291992, "reward": 0.731018602848053, "reward_std": 0.3394193649291992, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.16314202547073364, "sampling/sampling_logp_difference/max": 1.0579338073730469, "sampling/importance_sampling_ratio/min": 0.3471723794937134, "sampling/importance_sampling_ratio/mean": 1.0378867387771606, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6087606996297836, "entropy": 1.4147121086716652, "clip_ratio/low_mean": 0.06206474732607603, "clip_ratio/low_min": 0.06206474732607603, "clip_ratio/high_mean": 0.08283982099965215, "clip_ratio/high_max": 0.08283982099965215, "clip_ratio/region_mean": 0.14490456832572818, "reward_total_mean": 0.731018602848053, "reward_meter_count_clean_mean": 0.731018602848053, "reward_meter_count_clean_std": 0.3394193649291992}
58
+ {"timestamp_utc": "2026-04-11T19:10:57Z", "mode": "train", "global_step": 57, "epoch": 0.002201112140871177, "loss": 0.069, "grad_norm": 6.15261173248291, "learning_rate": 9.830303030303032e-06, "num_tokens": 114977.0, "completions/mean_length": 284.125, "completions/min_length": 209.0, "completions/max_length": 347.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 284.125, "completions/min_terminated_length": 209.0, "completions/max_terminated_length": 347.0, "rewards/meter_count_clean/mean": 0.6026915907859802, "rewards/meter_count_clean/std": 0.22708161175251007, "reward": 0.6026915907859802, "reward_std": 0.22708161175251007, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2181180864572525, "sampling/sampling_logp_difference/max": 1.5707168579101562, "sampling/importance_sampling_ratio/min": 0.20789609849452972, "sampling/importance_sampling_ratio/mean": 1.0440524816513062, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.9276735782623291, "entropy": 3.2829161882400513, "clip_ratio/low_mean": 0.061714982613921165, "clip_ratio/low_min": 0.061714982613921165, "clip_ratio/high_mean": 0.1252468191087246, "clip_ratio/high_max": 0.1252468191087246, "clip_ratio/region_mean": 0.18696180172264576, "reward_total_mean": 0.6026915907859802, "reward_meter_count_clean_mean": 0.6026915907859802, "reward_meter_count_clean_std": 0.22708161175251007}
59
+ {"timestamp_utc": "2026-04-11T19:11:02Z", "mode": "train", "global_step": 58, "epoch": 0.002239728143342601, "loss": -0.0746, "grad_norm": 12.134157180786133, "learning_rate": 9.827272727272729e-06, "num_tokens": 116850.0, "completions/mean_length": 65.125, "completions/min_length": 38.0, "completions/max_length": 76.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 65.125, "completions/min_terminated_length": 38.0, "completions/max_terminated_length": 76.0, "rewards/meter_count_clean/mean": 0.6828622817993164, "rewards/meter_count_clean/std": 0.41145387291908264, "reward": 0.6828622817993164, "reward_std": 0.41145387291908264, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19039244949817657, "sampling/sampling_logp_difference/max": 1.1286425590515137, "sampling/importance_sampling_ratio/min": 0.3234720528125763, "sampling/importance_sampling_ratio/mean": 1.040724754333496, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.2836961075663567, "entropy": 2.45593498647213, "clip_ratio/low_mean": 0.07173928804695606, "clip_ratio/low_min": 0.07173928804695606, "clip_ratio/high_mean": 0.11394045688211918, "clip_ratio/high_max": 0.11394045688211918, "clip_ratio/region_mean": 0.18567974492907524, "reward_total_mean": 0.6828622817993164, "reward_meter_count_clean_mean": 0.6828622817993164, "reward_meter_count_clean_std": 0.41145387291908264}
60
+ {"timestamp_utc": "2026-04-11T19:11:09Z", "mode": "train", "global_step": 59, "epoch": 0.002278344145814025, "loss": 0.0594, "grad_norm": 13.39612102508545, "learning_rate": 9.824242424242425e-06, "num_tokens": 119592.0, "completions/mean_length": 142.75, "completions/min_length": 103.0, "completions/max_length": 233.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 142.75, "completions/min_terminated_length": 103.0, "completions/max_terminated_length": 233.0, "rewards/meter_count_clean/mean": 0.20413683354854584, "rewards/meter_count_clean/std": 0.1784268319606781, "reward": 0.20413683354854584, "reward_std": 0.1784268468618393, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.26621776819229126, "sampling/sampling_logp_difference/max": 2.522489547729492, "sampling/importance_sampling_ratio/min": 0.0802595466375351, "sampling/importance_sampling_ratio/mean": 1.0229341983795166, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6619391813874245, "entropy": 2.218117520213127, "clip_ratio/low_mean": 0.10819577984511852, "clip_ratio/low_min": 0.10819577984511852, "clip_ratio/high_mean": 0.11741098761558533, "clip_ratio/high_max": 0.11741098761558533, "clip_ratio/region_mean": 0.22560676746070385, "reward_total_mean": 0.20413683354854584, "reward_meter_count_clean_mean": 0.20413683354854584, "reward_meter_count_clean_std": 0.1784268319606781}
61
+ {"timestamp_utc": "2026-04-11T19:11:15Z", "mode": "train", "global_step": 60, "epoch": 0.0023169601482854493, "loss": 0.0463, "grad_norm": 15.286205291748047, "learning_rate": 9.821212121212122e-06, "num_tokens": 121201.0, "completions/mean_length": 46.125, "completions/min_length": 37.0, "completions/max_length": 76.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 46.125, "completions/min_terminated_length": 37.0, "completions/max_terminated_length": 76.0, "rewards/meter_count_clean/mean": 0.5946000814437866, "rewards/meter_count_clean/std": 0.3260856568813324, "reward": 0.5946000814437866, "reward_std": 0.3260856568813324, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20776869356632233, "sampling/sampling_logp_difference/max": 1.5432052612304688, "sampling/importance_sampling_ratio/min": 0.21369504928588867, "sampling/importance_sampling_ratio/mean": 1.0501973628997803, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.8500682413578033, "entropy": 2.614510953426361, "clip_ratio/low_mean": 0.07143263425678015, "clip_ratio/low_min": 0.07143263425678015, "clip_ratio/high_mean": 0.08825728669762611, "clip_ratio/high_max": 0.08825728669762611, "clip_ratio/region_mean": 0.15968992095440626, "reward_total_mean": 0.5946000814437866, "reward_meter_count_clean_mean": 0.5946000814437866, "reward_meter_count_clean_std": 0.3260856568813324}
62
+ {"timestamp_utc": "2026-04-11T19:11:20Z", "mode": "train", "global_step": 61, "epoch": 0.002355576150756874, "loss": 0.245, "grad_norm": 19.05546760559082, "learning_rate": 9.81818181818182e-06, "num_tokens": 122769.0, "completions/mean_length": 31.0, "completions/min_length": 22.0, "completions/max_length": 43.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 31.0, "completions/min_terminated_length": 22.0, "completions/max_terminated_length": 43.0, "rewards/meter_count_clean/mean": 0.7235491275787354, "rewards/meter_count_clean/std": 0.44676530361175537, "reward": 0.7235491275787354, "reward_std": 0.44676533341407776, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19886323809623718, "sampling/sampling_logp_difference/max": 1.2326695919036865, "sampling/importance_sampling_ratio/min": 0.2915133237838745, "sampling/importance_sampling_ratio/mean": 1.039192795753479, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.743053525686264, "entropy": 2.4280820190906525, "clip_ratio/low_mean": 0.05286107771098614, "clip_ratio/low_min": 0.05286107771098614, "clip_ratio/high_mean": 0.1522508319467306, "clip_ratio/high_max": 0.1522508319467306, "clip_ratio/region_mean": 0.20511190965771675, "reward_total_mean": 0.7235491275787354, "reward_meter_count_clean_mean": 0.7235491275787354, "reward_meter_count_clean_std": 0.44676530361175537}
63
+ {"timestamp_utc": "2026-04-11T19:11:26Z", "mode": "train", "global_step": 62, "epoch": 0.002394192153228298, "loss": 0.1557, "grad_norm": 10.215119361877441, "learning_rate": 9.815151515151516e-06, "num_tokens": 124968.0, "completions/mean_length": 105.875, "completions/min_length": 62.0, "completions/max_length": 153.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 105.875, "completions/min_terminated_length": 62.0, "completions/max_terminated_length": 153.0, "rewards/meter_count_clean/mean": 0.2949569523334503, "rewards/meter_count_clean/std": 0.2673260271549225, "reward": 0.2949569523334503, "reward_std": 0.2673259973526001, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22320568561553955, "sampling/sampling_logp_difference/max": 1.3543634414672852, "sampling/importance_sampling_ratio/min": 0.25811153650283813, "sampling/importance_sampling_ratio/mean": 1.0284371376037598, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.3760973513126373, "entropy": 2.715813457965851, "clip_ratio/low_mean": 0.08893644995987415, "clip_ratio/low_min": 0.08893644995987415, "clip_ratio/high_mean": 0.11933935433626175, "clip_ratio/high_max": 0.11933935433626175, "clip_ratio/region_mean": 0.2082758042961359, "reward_total_mean": 0.2949569523334503, "reward_meter_count_clean_mean": 0.2949569523334503, "reward_meter_count_clean_std": 0.2673260271549225}
64
+ {"timestamp_utc": "2026-04-11T19:11:32Z", "mode": "train", "global_step": 63, "epoch": 0.002432808155699722, "loss": 0.072, "grad_norm": 10.939824104309082, "learning_rate": 9.812121212121212e-06, "num_tokens": 126782.0, "completions/mean_length": 70.75, "completions/min_length": 48.0, "completions/max_length": 84.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 70.75, "completions/min_terminated_length": 48.0, "completions/max_terminated_length": 84.0, "rewards/meter_count_clean/mean": 0.8419647216796875, "rewards/meter_count_clean/std": 0.3233566880226135, "reward": 0.8419647216796875, "reward_std": 0.32335665822029114, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19959045946598053, "sampling/sampling_logp_difference/max": 1.9795737266540527, "sampling/importance_sampling_ratio/min": 0.1381281018257141, "sampling/importance_sampling_ratio/mean": 1.0459727048873901, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6386639475822449, "entropy": 2.22160741686821, "clip_ratio/low_mean": 0.04621848836541176, "clip_ratio/low_min": 0.04621848836541176, "clip_ratio/high_mean": 0.11690851114690304, "clip_ratio/high_max": 0.11690851114690304, "clip_ratio/region_mean": 0.1631269995123148, "reward_total_mean": 0.8419647216796875, "reward_meter_count_clean_mean": 0.8419647216796875, "reward_meter_count_clean_std": 0.3233566880226135}
65
+ {"timestamp_utc": "2026-04-11T19:11:42Z", "mode": "train", "global_step": 64, "epoch": 0.002471424158171146, "loss": 0.1276, "grad_norm": 4.516471862792969, "learning_rate": 9.809090909090911e-06, "num_tokens": 131629.0, "completions/mean_length": 390.875, "completions/min_length": 315.0, "completions/max_length": 483.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 390.875, "completions/min_terminated_length": 315.0, "completions/max_terminated_length": 483.0, "rewards/meter_count_clean/mean": 0.43812355399131775, "rewards/meter_count_clean/std": 0.37187716364860535, "reward": 0.43812355399131775, "reward_std": 0.37187716364860535, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19917061924934387, "sampling/sampling_logp_difference/max": 1.9876041412353516, "sampling/importance_sampling_ratio/min": 0.13702331483364105, "sampling/importance_sampling_ratio/mean": 1.02934730052948, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.7644509300589561, "entropy": 2.9058758020401, "clip_ratio/low_mean": 0.05013222433626652, "clip_ratio/low_min": 0.05013222433626652, "clip_ratio/high_mean": 0.11145513504743576, "clip_ratio/high_max": 0.11145513504743576, "clip_ratio/region_mean": 0.16158735938370228, "reward_total_mean": 0.43812355399131775, "reward_meter_count_clean_mean": 0.43812355399131775, "reward_meter_count_clean_std": 0.37187716364860535}
66
+ {"timestamp_utc": "2026-04-11T19:11:46Z", "mode": "train", "global_step": 65, "epoch": 0.0025100401606425703, "loss": 0.0846, "grad_norm": 20.379396438598633, "learning_rate": 9.806060606060607e-06, "num_tokens": 133206.0, "completions/mean_length": 34.125, "completions/min_length": 24.0, "completions/max_length": 48.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 34.125, "completions/min_terminated_length": 24.0, "completions/max_terminated_length": 48.0, "rewards/meter_count_clean/mean": 0.39298856258392334, "rewards/meter_count_clean/std": 0.3957642912864685, "reward": 0.39298856258392334, "reward_std": 0.39576423168182373, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21870994567871094, "sampling/sampling_logp_difference/max": 2.2564210891723633, "sampling/importance_sampling_ratio/min": 0.10472461581230164, "sampling/importance_sampling_ratio/mean": 0.9815999269485474, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.063544064760208, "entropy": 1.5038166865706444, "clip_ratio/low_mean": 0.15403079986572266, "clip_ratio/low_min": 0.15403079986572266, "clip_ratio/high_mean": 0.09134615398943424, "clip_ratio/high_max": 0.09134615398943424, "clip_ratio/region_mean": 0.2453769538551569, "reward_total_mean": 0.39298856258392334, "reward_meter_count_clean_mean": 0.39298856258392334, "reward_meter_count_clean_std": 0.3957642912864685}
67
+ {"timestamp_utc": "2026-04-11T19:11:52Z", "mode": "train", "global_step": 66, "epoch": 0.0025486561631139944, "loss": 0.1542, "grad_norm": 17.633543014526367, "learning_rate": 9.803030303030304e-06, "num_tokens": 134995.0, "completions/mean_length": 69.625, "completions/min_length": 49.0, "completions/max_length": 96.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 69.625, "completions/min_terminated_length": 49.0, "completions/max_terminated_length": 96.0, "rewards/meter_count_clean/mean": 0.8001365661621094, "rewards/meter_count_clean/std": 0.3378126919269562, "reward": 0.8001365661621094, "reward_std": 0.3378126621246338, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20599590241909027, "sampling/sampling_logp_difference/max": 1.8366949558258057, "sampling/importance_sampling_ratio/min": 0.1593431979417801, "sampling/importance_sampling_ratio/mean": 1.0259366035461426, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.893242433667183, "entropy": 1.3359143435955048, "clip_ratio/low_mean": 0.04664522036910057, "clip_ratio/low_min": 0.04664522036910057, "clip_ratio/high_mean": 0.16264715790748596, "clip_ratio/high_max": 0.16264715790748596, "clip_ratio/region_mean": 0.20929237827658653, "reward_total_mean": 0.8001365661621094, "reward_meter_count_clean_mean": 0.8001365661621094, "reward_meter_count_clean_std": 0.3378126919269562}
68
+ {"timestamp_utc": "2026-04-11T19:11:56Z", "mode": "train", "global_step": 67, "epoch": 0.0025872721655854185, "loss": 0.0055, "grad_norm": 16.88397979736328, "learning_rate": 9.800000000000001e-06, "num_tokens": 136783.0, "completions/mean_length": 46.5, "completions/min_length": 37.0, "completions/max_length": 57.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 46.5, "completions/min_terminated_length": 37.0, "completions/max_terminated_length": 57.0, "rewards/meter_count_clean/mean": 0.6752415895462036, "rewards/meter_count_clean/std": 0.3951883316040039, "reward": 0.6752415895462036, "reward_std": 0.3951883316040039, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21076610684394836, "sampling/sampling_logp_difference/max": 1.1104373931884766, "sampling/importance_sampling_ratio/min": 0.32941487431526184, "sampling/importance_sampling_ratio/mean": 1.0569463968276978, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.985504299402237, "entropy": 2.648691624403, "clip_ratio/low_mean": 0.07886030618101358, "clip_ratio/low_min": 0.07886030618101358, "clip_ratio/high_mean": 0.107527912594378, "clip_ratio/high_max": 0.107527912594378, "clip_ratio/region_mean": 0.18638821877539158, "reward_total_mean": 0.6752415895462036, "reward_meter_count_clean_mean": 0.6752415895462036, "reward_meter_count_clean_std": 0.3951883316040039}
69
+ {"timestamp_utc": "2026-04-11T19:12:01Z", "mode": "train", "global_step": 68, "epoch": 0.0026258881680568426, "loss": 0.2069, "grad_norm": 22.709701538085938, "learning_rate": 9.796969696969698e-06, "num_tokens": 138421.0, "completions/mean_length": 44.75, "completions/min_length": 30.0, "completions/max_length": 62.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 44.75, "completions/min_terminated_length": 30.0, "completions/max_terminated_length": 62.0, "rewards/meter_count_clean/mean": 0.8447393178939819, "rewards/meter_count_clean/std": 0.22370560467243195, "reward": 0.8447393178939819, "reward_std": 0.22370561957359314, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2022874504327774, "sampling/sampling_logp_difference/max": 1.7470617294311523, "sampling/importance_sampling_ratio/min": 0.17428529262542725, "sampling/importance_sampling_ratio/mean": 1.0341558456420898, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.9188628941774368, "entropy": 1.8082955032587051, "clip_ratio/low_mean": 0.05944700539112091, "clip_ratio/low_min": 0.05944700539112091, "clip_ratio/high_mean": 0.16749201156198978, "clip_ratio/high_max": 0.16749201156198978, "clip_ratio/region_mean": 0.2269390169531107, "reward_total_mean": 0.8447393178939819, "reward_meter_count_clean_mean": 0.8447393178939819, "reward_meter_count_clean_std": 0.22370560467243195}
70
+ {"timestamp_utc": "2026-04-11T19:12:07Z", "mode": "train", "global_step": 69, "epoch": 0.002664504170528267, "loss": 0.065, "grad_norm": 7.776491641998291, "learning_rate": 9.793939393939394e-06, "num_tokens": 140882.0, "completions/mean_length": 140.625, "completions/min_length": 100.0, "completions/max_length": 177.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 140.625, "completions/min_terminated_length": 100.0, "completions/max_terminated_length": 177.0, "rewards/meter_count_clean/mean": 0.8124464750289917, "rewards/meter_count_clean/std": 0.22712665796279907, "reward": 0.8124464750289917, "reward_std": 0.22712665796279907, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2103135734796524, "sampling/sampling_logp_difference/max": 1.5220518112182617, "sampling/importance_sampling_ratio/min": 0.21826361119747162, "sampling/importance_sampling_ratio/mean": 1.0490548610687256, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.129339560866356, "entropy": 3.014527380466461, "clip_ratio/low_mean": 0.08012999780476093, "clip_ratio/low_min": 0.08012999780476093, "clip_ratio/high_mean": 0.12308589741587639, "clip_ratio/high_max": 0.12308589741587639, "clip_ratio/region_mean": 0.20321589522063732, "reward_total_mean": 0.8124464750289917, "reward_meter_count_clean_mean": 0.8124464750289917, "reward_meter_count_clean_std": 0.22712665796279907}
71
+ {"timestamp_utc": "2026-04-11T19:12:14Z", "mode": "train", "global_step": 70, "epoch": 0.0027031201729996912, "loss": 0.2738, "grad_norm": 9.610224723815918, "learning_rate": 9.790909090909093e-06, "num_tokens": 143117.0, "completions/mean_length": 98.375, "completions/min_length": 56.0, "completions/max_length": 155.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 98.375, "completions/min_terminated_length": 56.0, "completions/max_terminated_length": 155.0, "rewards/meter_count_clean/mean": 0.27704793214797974, "rewards/meter_count_clean/std": 0.31074026226997375, "reward": 0.27704793214797974, "reward_std": 0.31074026226997375, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.24906866252422333, "sampling/sampling_logp_difference/max": 1.5363836288452148, "sampling/importance_sampling_ratio/min": 0.21515779197216034, "sampling/importance_sampling_ratio/mean": 1.0380938053131104, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.29872664809227, "entropy": 2.8568870425224304, "clip_ratio/low_mean": 0.12618878670036793, "clip_ratio/low_min": 0.12618878670036793, "clip_ratio/high_mean": 0.081504937261343, "clip_ratio/high_max": 0.081504937261343, "clip_ratio/region_mean": 0.20769372396171093, "reward_total_mean": 0.27704793214797974, "reward_meter_count_clean_mean": 0.27704793214797974, "reward_meter_count_clean_std": 0.31074026226997375}
72
+ {"timestamp_utc": "2026-04-11T19:12:19Z", "mode": "train", "global_step": 71, "epoch": 0.0027417361754711153, "loss": 0.0602, "grad_norm": 12.76672077178955, "learning_rate": 9.787878787878788e-06, "num_tokens": 145135.0, "completions/mean_length": 81.25, "completions/min_length": 45.0, "completions/max_length": 126.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 81.25, "completions/min_terminated_length": 45.0, "completions/max_terminated_length": 126.0, "rewards/meter_count_clean/mean": 0.44717690348625183, "rewards/meter_count_clean/std": 0.36459431052207947, "reward": 0.44717690348625183, "reward_std": 0.36459431052207947, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2269318401813507, "sampling/sampling_logp_difference/max": 1.5952892303466797, "sampling/importance_sampling_ratio/min": 0.2028498500585556, "sampling/importance_sampling_ratio/mean": 1.0183560848236084, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.2731183916330338, "entropy": 2.6634568572044373, "clip_ratio/low_mean": 0.10119702108204365, "clip_ratio/low_min": 0.10119702108204365, "clip_ratio/high_mean": 0.1307387314736843, "clip_ratio/high_max": 0.1307387314736843, "clip_ratio/region_mean": 0.23193575255572796, "reward_total_mean": 0.44717690348625183, "reward_meter_count_clean_mean": 0.44717690348625183, "reward_meter_count_clean_std": 0.36459431052207947}
73
+ {"timestamp_utc": "2026-04-11T19:12:26Z", "mode": "train", "global_step": 72, "epoch": 0.0027803521779425394, "loss": 0.0558, "grad_norm": 6.893131256103516, "learning_rate": 9.784848484848486e-06, "num_tokens": 147931.0, "completions/mean_length": 171.5, "completions/min_length": 145.0, "completions/max_length": 198.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 171.5, "completions/min_terminated_length": 145.0, "completions/max_terminated_length": 198.0, "rewards/meter_count_clean/mean": 0.34930744767189026, "rewards/meter_count_clean/std": 0.34667089581489563, "reward": 0.34930744767189026, "reward_std": 0.34667089581489563, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21298885345458984, "sampling/sampling_logp_difference/max": 1.5448880195617676, "sampling/importance_sampling_ratio/min": 0.21333575248718262, "sampling/importance_sampling_ratio/mean": 1.0105934143066406, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.1274225786328316, "entropy": 2.4433897733688354, "clip_ratio/low_mean": 0.09952043555676937, "clip_ratio/low_min": 0.09952043555676937, "clip_ratio/high_mean": 0.08511919900774956, "clip_ratio/high_max": 0.08511919900774956, "clip_ratio/region_mean": 0.18463963456451893, "reward_total_mean": 0.34930744767189026, "reward_meter_count_clean_mean": 0.34930744767189026, "reward_meter_count_clean_std": 0.34667089581489563}
74
+ {"timestamp_utc": "2026-04-11T19:12:31Z", "mode": "train", "global_step": 73, "epoch": 0.0028189681804139635, "loss": 0.0954, "grad_norm": 13.334155082702637, "learning_rate": 9.781818181818183e-06, "num_tokens": 149855.0, "completions/mean_length": 57.5, "completions/min_length": 39.0, "completions/max_length": 95.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 57.5, "completions/min_terminated_length": 39.0, "completions/max_terminated_length": 95.0, "rewards/meter_count_clean/mean": 0.42032575607299805, "rewards/meter_count_clean/std": 0.4467563033103943, "reward": 0.42032575607299805, "reward_std": 0.4467563033103943, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2024940848350525, "sampling/sampling_logp_difference/max": 1.7194023132324219, "sampling/importance_sampling_ratio/min": 0.17917320132255554, "sampling/importance_sampling_ratio/mean": 1.0420591831207275, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4627206772565842, "entropy": 2.515232354402542, "clip_ratio/low_mean": 0.08047989010810852, "clip_ratio/low_min": 0.08047989010810852, "clip_ratio/high_mean": 0.0748168514110148, "clip_ratio/high_max": 0.0748168514110148, "clip_ratio/region_mean": 0.15529674151912332, "reward_total_mean": 0.42032575607299805, "reward_meter_count_clean_mean": 0.42032575607299805, "reward_meter_count_clean_std": 0.4467563033103943}
75
+ {"timestamp_utc": "2026-04-11T19:12:42Z", "mode": "train", "global_step": 74, "epoch": 0.0028575841828853876, "loss": 0.0603, "grad_norm": 1.396609902381897, "learning_rate": 9.77878787878788e-06, "num_tokens": 152386.0, "completions/mean_length": 466.375, "completions/min_length": 155.0, "completions/max_length": 512.0, "completions/clipped_ratio": 0.75, "completions/mean_terminated_length": 329.5, "completions/min_terminated_length": 155.0, "completions/max_terminated_length": 504.0, "rewards/meter_count_clean/mean": 0.6015316843986511, "rewards/meter_count_clean/std": 0.2646358013153076, "reward": 0.6015316843986511, "reward_std": 0.2646358013153076, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2191452980041504, "sampling/sampling_logp_difference/max": 1.3887028694152832, "sampling/importance_sampling_ratio/min": 0.2493986040353775, "sampling/importance_sampling_ratio/mean": 1.0675560235977173, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.2103242427110672, "entropy": 0.8815881013870239, "clip_ratio/low_mean": 0.01854838617146015, "clip_ratio/low_min": 0.01854838617146015, "clip_ratio/high_mean": 0.0245535708963871, "clip_ratio/high_max": 0.0245535708963871, "clip_ratio/region_mean": 0.04310195706784725, "reward_total_mean": 0.6015316843986511, "reward_meter_count_clean_mean": 0.6015316843986511, "reward_meter_count_clean_std": 0.2646358013153076}
76
+ {"timestamp_utc": "2026-04-11T19:12:48Z", "mode": "train", "global_step": 75, "epoch": 0.0028962001853568117, "loss": 0.1183, "grad_norm": 9.12006950378418, "learning_rate": 9.775757575757576e-06, "num_tokens": 154741.0, "completions/mean_length": 123.375, "completions/min_length": 104.0, "completions/max_length": 137.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 123.375, "completions/min_terminated_length": 104.0, "completions/max_terminated_length": 137.0, "rewards/meter_count_clean/mean": 0.3497368097305298, "rewards/meter_count_clean/std": 0.26128703355789185, "reward": 0.3497368097305298, "reward_std": 0.26128703355789185, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20589499175548553, "sampling/sampling_logp_difference/max": 2.0790109634399414, "sampling/importance_sampling_ratio/min": 0.12505382299423218, "sampling/importance_sampling_ratio/mean": 1.0463789701461792, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.3578874319791794, "entropy": 2.449110597372055, "clip_ratio/low_mean": 0.12214752100408077, "clip_ratio/low_min": 0.12214752100408077, "clip_ratio/high_mean": 0.07322514988481998, "clip_ratio/high_max": 0.07322514988481998, "clip_ratio/region_mean": 0.19537267088890076, "reward_total_mean": 0.3497368097305298, "reward_meter_count_clean_mean": 0.3497368097305298, "reward_meter_count_clean_std": 0.26128703355789185}
77
+ {"timestamp_utc": "2026-04-11T19:12:53Z", "mode": "train", "global_step": 76, "epoch": 0.002934816187828236, "loss": 0.2798, "grad_norm": 13.480757713317871, "learning_rate": 9.772727272727273e-06, "num_tokens": 156595.0, "completions/mean_length": 54.75, "completions/min_length": 34.0, "completions/max_length": 73.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 54.75, "completions/min_terminated_length": 34.0, "completions/max_terminated_length": 73.0, "rewards/meter_count_clean/mean": 0.5932719707489014, "rewards/meter_count_clean/std": 0.3900352716445923, "reward": 0.5932719707489014, "reward_std": 0.3900352716445923, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.23030723631381989, "sampling/sampling_logp_difference/max": 1.9909172058105469, "sampling/importance_sampling_ratio/min": 0.13657011091709137, "sampling/importance_sampling_ratio/mean": 1.0449117422103882, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7342924177646637, "entropy": 2.940678298473358, "clip_ratio/low_mean": 0.09841924253851175, "clip_ratio/low_min": 0.09841924253851175, "clip_ratio/high_mean": 0.1170717179775238, "clip_ratio/high_max": 0.1170717179775238, "clip_ratio/region_mean": 0.21549096051603556, "reward_total_mean": 0.5932719707489014, "reward_meter_count_clean_mean": 0.5932719707489014, "reward_meter_count_clean_std": 0.3900352716445923}
78
+ {"timestamp_utc": "2026-04-11T19:12:59Z", "mode": "train", "global_step": 77, "epoch": 0.0029734321902996604, "loss": 0.2751, "grad_norm": 15.090164184570312, "learning_rate": 9.76969696969697e-06, "num_tokens": 158209.0, "completions/mean_length": 49.75, "completions/min_length": 33.0, "completions/max_length": 98.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 49.75, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 98.0, "rewards/meter_count_clean/mean": 0.4592646062374115, "rewards/meter_count_clean/std": 0.43493443727493286, "reward": 0.4592646062374115, "reward_std": 0.4349344074726105, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22933514416217804, "sampling/sampling_logp_difference/max": 1.5349407196044922, "sampling/importance_sampling_ratio/min": 0.21546848118305206, "sampling/importance_sampling_ratio/mean": 1.048494815826416, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.0099042803049088, "entropy": 2.5642327964305878, "clip_ratio/low_mean": 0.08706128969788551, "clip_ratio/low_min": 0.08706128969788551, "clip_ratio/high_mean": 0.10580789390951395, "clip_ratio/high_max": 0.10580789390951395, "clip_ratio/region_mean": 0.19286918360739946, "reward_total_mean": 0.4592646062374115, "reward_meter_count_clean_mean": 0.4592646062374115, "reward_meter_count_clean_std": 0.43493443727493286}
79
+ {"timestamp_utc": "2026-04-11T19:13:05Z", "mode": "train", "global_step": 78, "epoch": 0.0030120481927710845, "loss": 0.04, "grad_norm": 7.737454891204834, "learning_rate": 9.766666666666667e-06, "num_tokens": 161112.0, "completions/mean_length": 164.875, "completions/min_length": 98.0, "completions/max_length": 205.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 164.875, "completions/min_terminated_length": 98.0, "completions/max_terminated_length": 205.0, "rewards/meter_count_clean/mean": 0.18029195070266724, "rewards/meter_count_clean/std": 0.1389833390712738, "reward": 0.18029195070266724, "reward_std": 0.1389833241701126, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2152501344680786, "sampling/sampling_logp_difference/max": 1.7132110595703125, "sampling/importance_sampling_ratio/min": 0.18028594553470612, "sampling/importance_sampling_ratio/mean": 1.0424808263778687, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.39060677587986, "entropy": 2.451413109898567, "clip_ratio/low_mean": 0.11678846552968025, "clip_ratio/low_min": 0.11678846552968025, "clip_ratio/high_mean": 0.08520248159766197, "clip_ratio/high_max": 0.08520248159766197, "clip_ratio/region_mean": 0.20199094712734222, "reward_total_mean": 0.18029195070266724, "reward_meter_count_clean_mean": 0.18029195070266724, "reward_meter_count_clean_std": 0.1389833390712738}
80
+ {"timestamp_utc": "2026-04-11T19:13:15Z", "mode": "train", "global_step": 79, "epoch": 0.0030506641952425086, "loss": 0.0211, "grad_norm": 4.973855018615723, "learning_rate": 9.763636363636365e-06, "num_tokens": 165606.0, "completions/mean_length": 342.75, "completions/min_length": 227.0, "completions/max_length": 439.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 342.75, "completions/min_terminated_length": 227.0, "completions/max_terminated_length": 439.0, "rewards/meter_count_clean/mean": 0.2517043650150299, "rewards/meter_count_clean/std": 0.2103721797466278, "reward": 0.2517043650150299, "reward_std": 0.2103721648454666, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20066173374652863, "sampling/sampling_logp_difference/max": 1.8092515468597412, "sampling/importance_sampling_ratio/min": 0.16377666592597961, "sampling/importance_sampling_ratio/mean": 1.0435837507247925, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.8531102910637856, "entropy": 2.836574971675873, "clip_ratio/low_mean": 0.09096036478877068, "clip_ratio/low_min": 0.09096036478877068, "clip_ratio/high_mean": 0.09161586128175259, "clip_ratio/high_max": 0.09161586128175259, "clip_ratio/region_mean": 0.18257622607052326, "reward_total_mean": 0.2517043650150299, "reward_meter_count_clean_mean": 0.2517043650150299, "reward_meter_count_clean_std": 0.2103721797466278}
81
+ {"timestamp_utc": "2026-04-11T19:13:20Z", "mode": "train", "global_step": 80, "epoch": 0.0030892801977139327, "loss": 0.2298, "grad_norm": 12.971203804016113, "learning_rate": 9.760606060606062e-06, "num_tokens": 167379.0, "completions/mean_length": 62.625, "completions/min_length": 33.0, "completions/max_length": 101.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 62.625, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 101.0, "rewards/meter_count_clean/mean": 0.5897529721260071, "rewards/meter_count_clean/std": 0.3398434519767761, "reward": 0.5897529721260071, "reward_std": 0.3398434519767761, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20138375461101532, "sampling/sampling_logp_difference/max": 1.4945716857910156, "sampling/importance_sampling_ratio/min": 0.22434468567371368, "sampling/importance_sampling_ratio/mean": 1.0488100051879883, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7127311304211617, "entropy": 2.800754815340042, "clip_ratio/low_mean": 0.07962200790643692, "clip_ratio/low_min": 0.07962200790643692, "clip_ratio/high_mean": 0.12067882902920246, "clip_ratio/high_max": 0.12067882902920246, "clip_ratio/region_mean": 0.20030083693563938, "reward_total_mean": 0.5897529721260071, "reward_meter_count_clean_mean": 0.5897529721260071, "reward_meter_count_clean_std": 0.3398434519767761}
82
+ {"timestamp_utc": "2026-04-11T19:13:25Z", "mode": "train", "global_step": 81, "epoch": 0.003127896200185357, "loss": 0.0364, "grad_norm": 16.626371383666992, "learning_rate": 9.757575757575758e-06, "num_tokens": 168941.0, "completions/mean_length": 43.25, "completions/min_length": 36.0, "completions/max_length": 47.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 43.25, "completions/min_terminated_length": 36.0, "completions/max_terminated_length": 47.0, "rewards/meter_count_clean/mean": 0.32984912395477295, "rewards/meter_count_clean/std": 0.36702674627304077, "reward": 0.32984912395477295, "reward_std": 0.36702677607536316, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.18993134796619415, "sampling/sampling_logp_difference/max": 2.503852128982544, "sampling/importance_sampling_ratio/min": 0.08176939934492111, "sampling/importance_sampling_ratio/mean": 0.9905309081077576, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.9840531200170517, "entropy": 0.8857106305658817, "clip_ratio/low_mean": 0.13005737029016018, "clip_ratio/low_min": 0.13005737029016018, "clip_ratio/high_mean": 0.04627659637480974, "clip_ratio/high_max": 0.04627659637480974, "clip_ratio/region_mean": 0.17633396666496992, "reward_total_mean": 0.32984912395477295, "reward_meter_count_clean_mean": 0.32984912395477295, "reward_meter_count_clean_std": 0.36702674627304077}
83
+ {"timestamp_utc": "2026-04-11T19:13:31Z", "mode": "train", "global_step": 82, "epoch": 0.003166512202656781, "loss": 0.0514, "grad_norm": 13.673704147338867, "learning_rate": 9.754545454545455e-06, "num_tokens": 170843.0, "completions/mean_length": 61.75, "completions/min_length": 52.0, "completions/max_length": 71.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 61.75, "completions/min_terminated_length": 52.0, "completions/max_terminated_length": 71.0, "rewards/meter_count_clean/mean": 0.6860445737838745, "rewards/meter_count_clean/std": 0.4127279818058014, "reward": 0.6860445737838745, "reward_std": 0.412727952003479, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19877685606479645, "sampling/sampling_logp_difference/max": 1.148824691772461, "sampling/importance_sampling_ratio/min": 0.3170091509819031, "sampling/importance_sampling_ratio/mean": 1.0191963911056519, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7028498351573944, "entropy": 2.6517202258110046, "clip_ratio/low_mean": 0.030181385576725006, "clip_ratio/low_min": 0.030181385576725006, "clip_ratio/high_mean": 0.18430931121110916, "clip_ratio/high_max": 0.18430931121110916, "clip_ratio/region_mean": 0.21449069678783417, "reward_total_mean": 0.6860445737838745, "reward_meter_count_clean_mean": 0.6860445737838745, "reward_meter_count_clean_std": 0.4127279818058014}
84
+ {"timestamp_utc": "2026-04-11T19:13:39Z", "mode": "train", "global_step": 83, "epoch": 0.003205128205128205, "loss": -0.0638, "grad_norm": 6.804693222045898, "learning_rate": 9.751515151515152e-06, "num_tokens": 173838.0, "completions/mean_length": 193.375, "completions/min_length": 119.0, "completions/max_length": 274.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 193.375, "completions/min_terminated_length": 119.0, "completions/max_terminated_length": 274.0, "rewards/meter_count_clean/mean": 0.1995052993297577, "rewards/meter_count_clean/std": 0.20155183970928192, "reward": 0.1995052993297577, "reward_std": 0.20155183970928192, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2313334345817566, "sampling/sampling_logp_difference/max": 1.8036818504333496, "sampling/importance_sampling_ratio/min": 0.16469140350818634, "sampling/importance_sampling_ratio/mean": 1.0506726503372192, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.5210926830768585, "entropy": 3.7798104882240295, "clip_ratio/low_mean": 0.0888951700180769, "clip_ratio/low_min": 0.0888951700180769, "clip_ratio/high_mean": 0.1242379229515791, "clip_ratio/high_max": 0.1242379229515791, "clip_ratio/region_mean": 0.213133092969656, "reward_total_mean": 0.1995052993297577, "reward_meter_count_clean_mean": 0.1995052993297577, "reward_meter_count_clean_std": 0.20155183970928192}
85
+ {"timestamp_utc": "2026-04-11T19:13:44Z", "mode": "train", "global_step": 84, "epoch": 0.003243744207599629, "loss": 0.1179, "grad_norm": 13.247566223144531, "learning_rate": 9.74848484848485e-06, "num_tokens": 175356.0, "completions/mean_length": 42.75, "completions/min_length": 29.0, "completions/max_length": 63.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 42.75, "completions/min_terminated_length": 29.0, "completions/max_terminated_length": 63.0, "rewards/meter_count_clean/mean": 0.2839965224266052, "rewards/meter_count_clean/std": 0.25705069303512573, "reward": 0.2839965224266052, "reward_std": 0.25705069303512573, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20496825873851776, "sampling/sampling_logp_difference/max": 1.4523324966430664, "sampling/importance_sampling_ratio/min": 0.2340237945318222, "sampling/importance_sampling_ratio/mean": 1.056425929069519, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7388735264539719, "entropy": 2.1612997502088547, "clip_ratio/low_mean": 0.14447841420769691, "clip_ratio/low_min": 0.14447841420769691, "clip_ratio/high_mean": 0.044027093797922134, "clip_ratio/high_max": 0.044027093797922134, "clip_ratio/region_mean": 0.18850550800561905, "reward_total_mean": 0.2839965224266052, "reward_meter_count_clean_mean": 0.2839965224266052, "reward_meter_count_clean_std": 0.25705069303512573}
86
+ {"timestamp_utc": "2026-04-11T19:13:50Z", "mode": "train", "global_step": 85, "epoch": 0.0032823602100710537, "loss": 0.1389, "grad_norm": 8.849252700805664, "learning_rate": 9.745454545454547e-06, "num_tokens": 177342.0, "completions/mean_length": 94.25, "completions/min_length": 56.0, "completions/max_length": 145.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 94.25, "completions/min_terminated_length": 56.0, "completions/max_terminated_length": 145.0, "rewards/meter_count_clean/mean": 0.5080384612083435, "rewards/meter_count_clean/std": 0.5007855892181396, "reward": 0.5080384612083435, "reward_std": 0.5007855296134949, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20362532138824463, "sampling/sampling_logp_difference/max": 1.7173385620117188, "sampling/importance_sampling_ratio/min": 0.1795433610677719, "sampling/importance_sampling_ratio/mean": 1.0272923707962036, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4220501631498337, "entropy": 2.768698900938034, "clip_ratio/low_mean": 0.07980264164507389, "clip_ratio/low_min": 0.07980264164507389, "clip_ratio/high_mean": 0.11511588096618652, "clip_ratio/high_max": 0.11511588096618652, "clip_ratio/region_mean": 0.19491852261126041, "reward_total_mean": 0.5080384612083435, "reward_meter_count_clean_mean": 0.5080384612083435, "reward_meter_count_clean_std": 0.5007855892181396}
87
+ {"timestamp_utc": "2026-04-11T19:13:58Z", "mode": "train", "global_step": 86, "epoch": 0.0033209762125424778, "loss": 0.1116, "grad_norm": 5.789488315582275, "learning_rate": 9.742424242424244e-06, "num_tokens": 181005.0, "completions/mean_length": 250.875, "completions/min_length": 138.0, "completions/max_length": 323.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 250.875, "completions/min_terminated_length": 138.0, "completions/max_terminated_length": 323.0, "rewards/meter_count_clean/mean": 0.2234499454498291, "rewards/meter_count_clean/std": 0.21007955074310303, "reward": 0.2234499454498291, "reward_std": 0.21007955074310303, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2088625580072403, "sampling/sampling_logp_difference/max": 1.6328611373901367, "sampling/importance_sampling_ratio/min": 0.19536978006362915, "sampling/importance_sampling_ratio/mean": 1.03697669506073, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.9879649505019188, "entropy": 3.1971679627895355, "clip_ratio/low_mean": 0.09196549654006958, "clip_ratio/low_min": 0.09196549654006958, "clip_ratio/high_mean": 0.09607911668717861, "clip_ratio/high_max": 0.09607911668717861, "clip_ratio/region_mean": 0.1880446132272482, "reward_total_mean": 0.2234499454498291, "reward_meter_count_clean_mean": 0.2234499454498291, "reward_meter_count_clean_std": 0.21007955074310303}
88
+ {"timestamp_utc": "2026-04-11T19:14:03Z", "mode": "train", "global_step": 87, "epoch": 0.003359592215013902, "loss": 0.0123, "grad_norm": 18.224882125854492, "learning_rate": 9.739393939393941e-06, "num_tokens": 182602.0, "completions/mean_length": 45.625, "completions/min_length": 33.0, "completions/max_length": 55.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 45.625, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 55.0, "rewards/meter_count_clean/mean": 0.3372150957584381, "rewards/meter_count_clean/std": 0.4128407835960388, "reward": 0.3372150957584381, "reward_std": 0.41284075379371643, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21434830129146576, "sampling/sampling_logp_difference/max": 1.461564064025879, "sampling/importance_sampling_ratio/min": 0.23187333345413208, "sampling/importance_sampling_ratio/mean": 1.031768798828125, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.5478705316781998, "entropy": 2.4275936484336853, "clip_ratio/low_mean": 0.11843920033425093, "clip_ratio/low_min": 0.11843920033425093, "clip_ratio/high_mean": 0.05969125218689442, "clip_ratio/high_max": 0.05969125218689442, "clip_ratio/region_mean": 0.17813045252114534, "reward_total_mean": 0.3372150957584381, "reward_meter_count_clean_mean": 0.3372150957584381, "reward_meter_count_clean_std": 0.4128407835960388}
89
+ {"timestamp_utc": "2026-04-11T19:14:09Z", "mode": "train", "global_step": 88, "epoch": 0.003398208217485326, "loss": 0.0868, "grad_norm": 7.239674091339111, "learning_rate": 9.736363636363637e-06, "num_tokens": 185727.0, "completions/mean_length": 179.625, "completions/min_length": 154.0, "completions/max_length": 203.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 179.625, "completions/min_terminated_length": 154.0, "completions/max_terminated_length": 203.0, "rewards/meter_count_clean/mean": 0.395793080329895, "rewards/meter_count_clean/std": 0.26035410165786743, "reward": 0.395793080329895, "reward_std": 0.26035410165786743, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19813987612724304, "sampling/sampling_logp_difference/max": 1.598048210144043, "sampling/importance_sampling_ratio/min": 0.20229095220565796, "sampling/importance_sampling_ratio/mean": 1.035853385925293, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.2833295166492462, "entropy": 2.465492472052574, "clip_ratio/low_mean": 0.09458548203110695, "clip_ratio/low_min": 0.09458548203110695, "clip_ratio/high_mean": 0.10577460564672947, "clip_ratio/high_max": 0.10577460564672947, "clip_ratio/region_mean": 0.20036008767783642, "reward_total_mean": 0.395793080329895, "reward_meter_count_clean_mean": 0.395793080329895, "reward_meter_count_clean_std": 0.26035410165786743}
90
+ {"timestamp_utc": "2026-04-11T19:14:15Z", "mode": "train", "global_step": 89, "epoch": 0.00343682421995675, "loss": 0.2009, "grad_norm": 10.298208236694336, "learning_rate": 9.733333333333334e-06, "num_tokens": 187879.0, "completions/mean_length": 98.0, "completions/min_length": 65.0, "completions/max_length": 134.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 98.0, "completions/min_terminated_length": 65.0, "completions/max_terminated_length": 134.0, "rewards/meter_count_clean/mean": 0.6336480379104614, "rewards/meter_count_clean/std": 0.3132372200489044, "reward": 0.6336480379104614, "reward_std": 0.31323719024658203, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20529155433177948, "sampling/sampling_logp_difference/max": 1.6686315536499023, "sampling/importance_sampling_ratio/min": 0.18850484490394592, "sampling/importance_sampling_ratio/mean": 1.050196647644043, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4282651841640472, "entropy": 2.9546821117401123, "clip_ratio/low_mean": 0.07508473843336105, "clip_ratio/low_min": 0.07508473843336105, "clip_ratio/high_mean": 0.12049593590199947, "clip_ratio/high_max": 0.12049593590199947, "clip_ratio/region_mean": 0.19558067433536053, "reward_total_mean": 0.6336480379104614, "reward_meter_count_clean_mean": 0.6336480379104614, "reward_meter_count_clean_std": 0.3132372200489044}
91
+ {"timestamp_utc": "2026-04-11T19:14:20Z", "mode": "train", "global_step": 90, "epoch": 0.003475440222428174, "loss": 0.004, "grad_norm": 14.30321216583252, "learning_rate": 9.730303030303031e-06, "num_tokens": 189559.0, "completions/mean_length": 48.0, "completions/min_length": 33.0, "completions/max_length": 64.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 48.0, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 64.0, "rewards/meter_count_clean/mean": 0.6122487187385559, "rewards/meter_count_clean/std": 0.3927072584629059, "reward": 0.6122487187385559, "reward_std": 0.3927072584629059, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20874950289726257, "sampling/sampling_logp_difference/max": 1.4292075634002686, "sampling/importance_sampling_ratio/min": 0.23949864506721497, "sampling/importance_sampling_ratio/mean": 1.037034511566162, "sampling/importance_sampling_ratio/max": 1.9598164558410645, "kl": 1.7402038723230362, "entropy": 2.464683413505554, "clip_ratio/low_mean": 0.04326923098415136, "clip_ratio/low_min": 0.04326923098415136, "clip_ratio/high_mean": 0.1241906825453043, "clip_ratio/high_max": 0.1241906825453043, "clip_ratio/region_mean": 0.16745991352945566, "reward_total_mean": 0.6122487187385559, "reward_meter_count_clean_mean": 0.6122487187385559, "reward_meter_count_clean_std": 0.3927072584629059}
92
+ {"timestamp_utc": "2026-04-11T19:14:27Z", "mode": "train", "global_step": 91, "epoch": 0.0035140562248995983, "loss": 0.0544, "grad_norm": 6.84236478805542, "learning_rate": 9.727272727272728e-06, "num_tokens": 192563.0, "completions/mean_length": 174.5, "completions/min_length": 121.0, "completions/max_length": 262.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 174.5, "completions/min_terminated_length": 121.0, "completions/max_terminated_length": 262.0, "rewards/meter_count_clean/mean": 0.603217363357544, "rewards/meter_count_clean/std": 0.20575962960720062, "reward": 0.603217363357544, "reward_std": 0.20575964450836182, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21506188809871674, "sampling/sampling_logp_difference/max": 1.3115177154541016, "sampling/importance_sampling_ratio/min": 0.2694108486175537, "sampling/importance_sampling_ratio/mean": 1.0390546321868896, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.1118000373244286, "entropy": 3.7719518542289734, "clip_ratio/low_mean": 0.10331365652382374, "clip_ratio/low_min": 0.10331365652382374, "clip_ratio/high_mean": 0.07963917776942253, "clip_ratio/high_max": 0.07963917776942253, "clip_ratio/region_mean": 0.18295283429324627, "reward_total_mean": 0.603217363357544, "reward_meter_count_clean_mean": 0.603217363357544, "reward_meter_count_clean_std": 0.20575962960720062}
93
+ {"timestamp_utc": "2026-04-11T19:14:32Z", "mode": "train", "global_step": 92, "epoch": 0.0035526722273710224, "loss": 0.1559, "grad_norm": 16.550352096557617, "learning_rate": 9.724242424242426e-06, "num_tokens": 194233.0, "completions/mean_length": 57.75, "completions/min_length": 34.0, "completions/max_length": 92.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 57.75, "completions/min_terminated_length": 34.0, "completions/max_terminated_length": 92.0, "rewards/meter_count_clean/mean": 0.41938501596450806, "rewards/meter_count_clean/std": 0.4255473017692566, "reward": 0.41938501596450806, "reward_std": 0.4255472719669342, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2523427903652191, "sampling/sampling_logp_difference/max": 1.8779749870300293, "sampling/importance_sampling_ratio/min": 0.15289942920207977, "sampling/importance_sampling_ratio/mean": 1.057788372039795, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.8161827772855759, "entropy": 2.162596195936203, "clip_ratio/low_mean": 0.1532957497984171, "clip_ratio/low_min": 0.1532957497984171, "clip_ratio/high_mean": 0.07082884386181831, "clip_ratio/high_max": 0.07082884386181831, "clip_ratio/region_mean": 0.2241245936602354, "reward_total_mean": 0.41938501596450806, "reward_meter_count_clean_mean": 0.41938501596450806, "reward_meter_count_clean_std": 0.4255473017692566}
94
+ {"timestamp_utc": "2026-04-11T19:14:41Z", "mode": "train", "global_step": 93, "epoch": 0.0035912882298424465, "loss": 0.0583, "grad_norm": 4.504820823669434, "learning_rate": 9.721212121212123e-06, "num_tokens": 198586.0, "completions/mean_length": 312.125, "completions/min_length": 268.0, "completions/max_length": 344.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 312.125, "completions/min_terminated_length": 268.0, "completions/max_terminated_length": 344.0, "rewards/meter_count_clean/mean": 0.1713373363018036, "rewards/meter_count_clean/std": 0.16757051646709442, "reward": 0.1713373363018036, "reward_std": 0.16757051646709442, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20828686654567719, "sampling/sampling_logp_difference/max": 1.6375536918640137, "sampling/importance_sampling_ratio/min": 0.19445516169071198, "sampling/importance_sampling_ratio/mean": 1.0430541038513184, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.8844465836882591, "entropy": 3.2054056525230408, "clip_ratio/low_mean": 0.07444824650883675, "clip_ratio/low_min": 0.07444824650883675, "clip_ratio/high_mean": 0.0941340047866106, "clip_ratio/high_max": 0.0941340047866106, "clip_ratio/region_mean": 0.16858225129544735, "reward_total_mean": 0.1713373363018036, "reward_meter_count_clean_mean": 0.1713373363018036, "reward_meter_count_clean_std": 0.16757051646709442}
95
+ {"timestamp_utc": "2026-04-11T19:14:47Z", "mode": "train", "global_step": 94, "epoch": 0.003629904232313871, "loss": -0.0215, "grad_norm": 28.54985809326172, "learning_rate": 9.718181818181818e-06, "num_tokens": 200807.0, "completions/mean_length": 111.625, "completions/min_length": 66.0, "completions/max_length": 161.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 111.625, "completions/min_terminated_length": 66.0, "completions/max_terminated_length": 161.0, "rewards/meter_count_clean/mean": 0.37486404180526733, "rewards/meter_count_clean/std": 0.35751017928123474, "reward": 0.37486404180526733, "reward_std": 0.35751017928123474, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2191978245973587, "sampling/sampling_logp_difference/max": 1.4935674667358398, "sampling/importance_sampling_ratio/min": 0.22457008063793182, "sampling/importance_sampling_ratio/mean": 1.0396778583526611, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.49543359875679, "entropy": 3.28840634226799, "clip_ratio/low_mean": 0.07458104286342859, "clip_ratio/low_min": 0.07458104286342859, "clip_ratio/high_mean": 0.10831169039011002, "clip_ratio/high_max": 0.10831169039011002, "clip_ratio/region_mean": 0.1828927332535386, "reward_total_mean": 0.37486404180526733, "reward_meter_count_clean_mean": 0.37486404180526733, "reward_meter_count_clean_std": 0.35751017928123474}
96
+ {"timestamp_utc": "2026-04-11T19:14:52Z", "mode": "train", "global_step": 95, "epoch": 0.003668520234785295, "loss": 0.165, "grad_norm": 99.09620666503906, "learning_rate": 9.715151515151516e-06, "num_tokens": 202121.0, "completions/mean_length": 18.25, "completions/min_length": 17.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.25, "completions/min_terminated_length": 17.0, "completions/max_terminated_length": 21.0, "rewards/meter_count_clean/mean": 0.9949363470077515, "rewards/meter_count_clean/std": 0.0023499121889472008, "reward": 0.9949363470077515, "reward_std": 0.002349905204027891, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.04272828623652458, "sampling/sampling_logp_difference/max": 0.8389389514923096, "sampling/importance_sampling_ratio/min": 0.5384017825126648, "sampling/importance_sampling_ratio/mean": 1.008124828338623, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.8823280930519104, "entropy": 0.21160555630922318, "clip_ratio/low_mean": 0.011904762126505375, "clip_ratio/low_min": 0.011904762126505375, "clip_ratio/high_mean": 0.03553921589627862, "clip_ratio/high_max": 0.03553921589627862, "clip_ratio/region_mean": 0.047443978022783995, "reward_total_mean": 0.9949363470077515, "reward_meter_count_clean_mean": 0.9949363470077515, "reward_meter_count_clean_std": 0.0023499121889472008}
97
+ {"timestamp_utc": "2026-04-11T19:14:57Z", "mode": "train", "global_step": 96, "epoch": 0.0037071362372567192, "loss": 0.1136, "grad_norm": 8.879090309143066, "learning_rate": 9.712121212121213e-06, "num_tokens": 204182.0, "completions/mean_length": 98.625, "completions/min_length": 70.0, "completions/max_length": 150.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 98.625, "completions/min_terminated_length": 70.0, "completions/max_terminated_length": 150.0, "rewards/meter_count_clean/mean": 0.3730781078338623, "rewards/meter_count_clean/std": 0.2818770110607147, "reward": 0.3730781078338623, "reward_std": 0.2818770110607147, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2376914918422699, "sampling/sampling_logp_difference/max": 1.6928377151489258, "sampling/importance_sampling_ratio/min": 0.18399666249752045, "sampling/importance_sampling_ratio/mean": 1.0496914386749268, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.5962010324001312, "entropy": 3.8312510550022125, "clip_ratio/low_mean": 0.1025804141536355, "clip_ratio/low_min": 0.1025804141536355, "clip_ratio/high_mean": 0.08833824284374714, "clip_ratio/high_max": 0.08833824284374714, "clip_ratio/region_mean": 0.19091865699738264, "reward_total_mean": 0.3730781078338623, "reward_meter_count_clean_mean": 0.3730781078338623, "reward_meter_count_clean_std": 0.2818770110607147}
98
+ {"timestamp_utc": "2026-04-11T19:15:03Z", "mode": "train", "global_step": 97, "epoch": 0.0037457522397281433, "loss": 0.1295, "grad_norm": 17.37091636657715, "learning_rate": 9.70909090909091e-06, "num_tokens": 206299.0, "completions/mean_length": 94.625, "completions/min_length": 60.0, "completions/max_length": 144.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 94.625, "completions/min_terminated_length": 60.0, "completions/max_terminated_length": 144.0, "rewards/meter_count_clean/mean": 0.38106584548950195, "rewards/meter_count_clean/std": 0.3250954747200012, "reward": 0.38106584548950195, "reward_std": 0.3250954747200012, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21154189109802246, "sampling/sampling_logp_difference/max": 1.7208929061889648, "sampling/importance_sampling_ratio/min": 0.17890633642673492, "sampling/importance_sampling_ratio/mean": 1.0636258125305176, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.371904134750366, "entropy": 2.747112363576889, "clip_ratio/low_mean": 0.12544687278568745, "clip_ratio/low_min": 0.12544687278568745, "clip_ratio/high_mean": 0.06360195018351078, "clip_ratio/high_max": 0.06360195018351078, "clip_ratio/region_mean": 0.18904882296919823, "reward_total_mean": 0.38106584548950195, "reward_meter_count_clean_mean": 0.38106584548950195, "reward_meter_count_clean_std": 0.3250954747200012}
99
+ {"timestamp_utc": "2026-04-11T19:15:09Z", "mode": "train", "global_step": 98, "epoch": 0.0037843682421995675, "loss": 0.0945, "grad_norm": 9.53862190246582, "learning_rate": 9.706060606060606e-06, "num_tokens": 208238.0, "completions/mean_length": 84.375, "completions/min_length": 47.0, "completions/max_length": 113.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 84.375, "completions/min_terminated_length": 47.0, "completions/max_terminated_length": 113.0, "rewards/meter_count_clean/mean": 0.3455486297607422, "rewards/meter_count_clean/std": 0.4493013024330139, "reward": 0.3455486297607422, "reward_std": 0.4493013024330139, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22088798880577087, "sampling/sampling_logp_difference/max": 1.6038808822631836, "sampling/importance_sampling_ratio/min": 0.2011144906282425, "sampling/importance_sampling_ratio/mean": 1.053009271621704, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4000272899866104, "entropy": 3.3423550128936768, "clip_ratio/low_mean": 0.10318870935589075, "clip_ratio/low_min": 0.10318870935589075, "clip_ratio/high_mean": 0.08637197501957417, "clip_ratio/high_max": 0.08637197501957417, "clip_ratio/region_mean": 0.18956068437546492, "reward_total_mean": 0.3455486297607422, "reward_meter_count_clean_mean": 0.3455486297607422, "reward_meter_count_clean_std": 0.4493013024330139}
100
+ {"timestamp_utc": "2026-04-11T19:15:14Z", "mode": "train", "global_step": 99, "epoch": 0.0038229842446709916, "loss": 0.0352, "grad_norm": 12.195768356323242, "learning_rate": 9.703030303030305e-06, "num_tokens": 210203.0, "completions/mean_length": 70.625, "completions/min_length": 52.0, "completions/max_length": 100.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 70.625, "completions/min_terminated_length": 52.0, "completions/max_terminated_length": 100.0, "rewards/meter_count_clean/mean": 0.18750828504562378, "rewards/meter_count_clean/std": 0.18086771667003632, "reward": 0.18750828504562378, "reward_std": 0.1808677315711975, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19442309439182281, "sampling/sampling_logp_difference/max": 1.4180307388305664, "sampling/importance_sampling_ratio/min": 0.24219049513339996, "sampling/importance_sampling_ratio/mean": 1.0437159538269043, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4821452051401138, "entropy": 2.1659949123859406, "clip_ratio/low_mean": 0.13323841989040375, "clip_ratio/low_min": 0.13323841989040375, "clip_ratio/high_mean": 0.03570783603936434, "clip_ratio/high_max": 0.03570783603936434, "clip_ratio/region_mean": 0.16894625592976809, "reward_total_mean": 0.18750828504562378, "reward_meter_count_clean_mean": 0.18750828504562378, "reward_meter_count_clean_std": 0.18086771667003632}
101
+ {"timestamp_utc": "2026-04-11T19:15:19Z", "mode": "train", "global_step": 100, "epoch": 0.0038616002471424157, "loss": 0.0128, "grad_norm": 14.672683715820312, "learning_rate": 9.7e-06, "num_tokens": 211811.0, "completions/mean_length": 45.0, "completions/min_length": 29.0, "completions/max_length": 58.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 45.0, "completions/min_terminated_length": 29.0, "completions/max_terminated_length": 58.0, "rewards/meter_count_clean/mean": 0.10674573481082916, "rewards/meter_count_clean/std": 0.08364997804164886, "reward": 0.10674573481082916, "reward_std": 0.08364997804164886, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.18656399846076965, "sampling/sampling_logp_difference/max": 1.3434593677520752, "sampling/importance_sampling_ratio/min": 0.26094141602516174, "sampling/importance_sampling_ratio/mean": 1.036841869354248, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7658196240663528, "entropy": 2.016321301460266, "clip_ratio/low_mean": 0.08866940345615149, "clip_ratio/low_min": 0.08866940345615149, "clip_ratio/high_mean": 0.08358811773359776, "clip_ratio/high_max": 0.08358811773359776, "clip_ratio/region_mean": 0.17225752118974924, "reward_total_mean": 0.10674573481082916, "reward_meter_count_clean_mean": 0.10674573481082916, "reward_meter_count_clean_std": 0.08364997804164886}
102
+ {"timestamp_utc": "2026-04-11T19:16:42Z", "mode": "eval", "global_step": 100, "epoch": 0.0038616002471424157, "eval_loss": NaN, "eval_runtime": 82.4151, "eval_samples_per_second": 1.262, "eval_steps_per_second": 0.158, "eval_num_tokens": 211811.0, "eval_completions/mean_length": 193.94230769230768, "eval_completions/min_length": 40.61538461538461, "eval_completions/max_length": 414.7692307692308, "eval_completions/clipped_ratio": 0.038461538461538464, "eval_completions/mean_terminated_length": 179.54166823167068, "eval_completions/min_terminated_length": 40.61538461538461, "eval_completions/max_terminated_length": 383.84615384615387, "eval_rewards/meter_count_clean/mean": 0.23895036027981684, "eval_rewards/meter_count_clean/std": 0.26200672411001646, "eval_reward": 0.23895036027981684, "eval_reward_std": NaN, "eval_frac_reward_zero_std": 0.0, "eval_sampling/sampling_logp_difference/mean": 0.1760244621680333, "eval_sampling/sampling_logp_difference/max": 1.3034063485952525, "eval_sampling/importance_sampling_ratio/min": 0.2759172205741589, "eval_sampling/importance_sampling_ratio/mean": 1.0581457339800322, "eval_sampling/importance_sampling_ratio/max": 1.7262064585318933, "eval_kl": 1.2026036427571223, "eval_entropy": 3.318188025401189, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_reward_total_mean": 0.23895036027981684, "eval_reward_meter_count_clean_mean": 0.23895036027981684, "eval_reward_meter_count_clean_std": 0.26200672411001646}
103
+ {"timestamp_utc": "2026-04-11T19:16:49Z", "mode": "train", "global_step": 101, "epoch": 0.0039002162496138398, "loss": 0.065, "grad_norm": 11.835779190063477, "learning_rate": 9.696969696969698e-06, "num_tokens": 213598.0, "completions/mean_length": 61.375, "completions/min_length": 49.0, "completions/max_length": 70.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 61.375, "completions/min_terminated_length": 49.0, "completions/max_terminated_length": 70.0, "rewards/meter_count_clean/mean": 0.2918604016304016, "rewards/meter_count_clean/std": 0.22347311675548553, "reward": 0.2918604016304016, "reward_std": 0.22347311675548553, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22550015151500702, "sampling/sampling_logp_difference/max": 1.1267414093017578, "sampling/importance_sampling_ratio/min": 0.32408758997917175, "sampling/importance_sampling_ratio/mean": 1.06128990650177, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.615099623799324, "entropy": 2.9051956236362457, "clip_ratio/low_mean": 0.14081653021275997, "clip_ratio/low_min": 0.14081653021275997, "clip_ratio/high_mean": 0.05549853295087814, "clip_ratio/high_max": 0.05549853295087814, "clip_ratio/region_mean": 0.19631506316363811, "reward_total_mean": 0.2918604016304016, "reward_meter_count_clean_mean": 0.2918604016304016, "reward_meter_count_clean_std": 0.22347311675548553}
plots/arabic_gate_chain.png ADDED

Git LFS Details

  • SHA256: 5b6390dc43b452d931977de97e1999d93c8e6fdf1bfe7a333536614962ba0224
  • Pointer size: 131 Bytes
  • Size of remote file: 105 kB
plots/arabic_gate_run.png ADDED

Git LFS Details

  • SHA256: 5289e870c0dd3bd550c853e518f0e12c85bc866826b1977244c6f6ec38771b66
  • Pointer size: 131 Bytes
  • Size of remote file: 105 kB
plots/chain_metrics.jsonl CHANGED
@@ -48,3 +48,54 @@
48
  {"timestamp_utc": "2026-04-11T19:08:41Z", "mode": "train", "global_step": 48, "epoch": 0.0018535681186283596, "loss": 0.0709, "grad_norm": 12.341552734375, "learning_rate": 9.857575757575758e-06, "num_tokens": 96454.0, "completions/mean_length": 108.875, "completions/min_length": 80.0, "completions/max_length": 136.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 108.875, "completions/min_terminated_length": 80.0, "completions/max_terminated_length": 136.0, "rewards/meter_count_clean/mean": 0.24804890155792236, "rewards/meter_count_clean/std": 0.3092200458049774, "reward": 0.24804890155792236, "reward_std": 0.3092200756072998, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.24694077670574188, "sampling/sampling_logp_difference/max": 2.129354476928711, "sampling/importance_sampling_ratio/min": 0.11891403049230576, "sampling/importance_sampling_ratio/mean": 1.0283019542694092, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.707380473613739, "entropy": 2.3587165027856827, "clip_ratio/low_mean": 0.1601727306842804, "clip_ratio/low_min": 0.1601727306842804, "clip_ratio/high_mean": 0.051593588665127754, "clip_ratio/high_max": 0.051593588665127754, "clip_ratio/region_mean": 0.21176631934940815, "reward_total_mean": 0.24804890155792236, "reward_meter_count_clean_mean": 0.24804890155792236, "reward_meter_count_clean_std": 0.3092200458049774, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
49
  {"timestamp_utc": "2026-04-11T19:08:46Z", "mode": "train", "global_step": 49, "epoch": 0.0018921841210997837, "loss": 0.01, "grad_norm": 16.725326538085938, "learning_rate": 9.854545454545456e-06, "num_tokens": 98210.0, "completions/mean_length": 43.5, "completions/min_length": 32.0, "completions/max_length": 53.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 43.5, "completions/min_terminated_length": 32.0, "completions/max_terminated_length": 53.0, "rewards/meter_count_clean/mean": 0.730195164680481, "rewards/meter_count_clean/std": 0.38917961716651917, "reward": 0.730195164680481, "reward_std": 0.38917961716651917, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2018795609474182, "sampling/sampling_logp_difference/max": 1.1117219924926758, "sampling/importance_sampling_ratio/min": 0.32899194955825806, "sampling/importance_sampling_ratio/mean": 1.0350096225738525, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6390934735536575, "entropy": 2.2960819751024246, "clip_ratio/low_mean": 0.05414438620209694, "clip_ratio/low_min": 0.05414438620209694, "clip_ratio/high_mean": 0.1455232799053192, "clip_ratio/high_max": 0.1455232799053192, "clip_ratio/region_mean": 0.19966766610741615, "reward_total_mean": 0.730195164680481, "reward_meter_count_clean_mean": 0.730195164680481, "reward_meter_count_clean_std": 0.38917961716651917, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
50
  {"timestamp_utc": "2026-04-11T19:08:52Z", "mode": "train", "global_step": 50, "epoch": 0.0019308001235712078, "loss": 0.1036, "grad_norm": 8.299102783203125, "learning_rate": 9.851515151515151e-06, "num_tokens": 101022.0, "completions/mean_length": 174.5, "completions/min_length": 114.0, "completions/max_length": 233.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 174.5, "completions/min_terminated_length": 114.0, "completions/max_terminated_length": 233.0, "rewards/meter_count_clean/mean": 0.41803598403930664, "rewards/meter_count_clean/std": 0.24869033694267273, "reward": 0.41803598403930664, "reward_std": 0.24869033694267273, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22161900997161865, "sampling/sampling_logp_difference/max": 1.3848509788513184, "sampling/importance_sampling_ratio/min": 0.2503611147403717, "sampling/importance_sampling_ratio/mean": 1.0149977207183838, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.147744707763195, "entropy": 2.8749085068702698, "clip_ratio/low_mean": 0.07543548569083214, "clip_ratio/low_min": 0.07543548569083214, "clip_ratio/high_mean": 0.15510659664869308, "clip_ratio/high_max": 0.15510659664869308, "clip_ratio/region_mean": 0.23054208233952522, "reward_total_mean": 0.41803598403930664, "reward_meter_count_clean_mean": 0.41803598403930664, "reward_meter_count_clean_std": 0.24869033694267273, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  {"timestamp_utc": "2026-04-11T19:08:41Z", "mode": "train", "global_step": 48, "epoch": 0.0018535681186283596, "loss": 0.0709, "grad_norm": 12.341552734375, "learning_rate": 9.857575757575758e-06, "num_tokens": 96454.0, "completions/mean_length": 108.875, "completions/min_length": 80.0, "completions/max_length": 136.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 108.875, "completions/min_terminated_length": 80.0, "completions/max_terminated_length": 136.0, "rewards/meter_count_clean/mean": 0.24804890155792236, "rewards/meter_count_clean/std": 0.3092200458049774, "reward": 0.24804890155792236, "reward_std": 0.3092200756072998, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.24694077670574188, "sampling/sampling_logp_difference/max": 2.129354476928711, "sampling/importance_sampling_ratio/min": 0.11891403049230576, "sampling/importance_sampling_ratio/mean": 1.0283019542694092, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.707380473613739, "entropy": 2.3587165027856827, "clip_ratio/low_mean": 0.1601727306842804, "clip_ratio/low_min": 0.1601727306842804, "clip_ratio/high_mean": 0.051593588665127754, "clip_ratio/high_max": 0.051593588665127754, "clip_ratio/region_mean": 0.21176631934940815, "reward_total_mean": 0.24804890155792236, "reward_meter_count_clean_mean": 0.24804890155792236, "reward_meter_count_clean_std": 0.3092200458049774, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
49
  {"timestamp_utc": "2026-04-11T19:08:46Z", "mode": "train", "global_step": 49, "epoch": 0.0018921841210997837, "loss": 0.01, "grad_norm": 16.725326538085938, "learning_rate": 9.854545454545456e-06, "num_tokens": 98210.0, "completions/mean_length": 43.5, "completions/min_length": 32.0, "completions/max_length": 53.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 43.5, "completions/min_terminated_length": 32.0, "completions/max_terminated_length": 53.0, "rewards/meter_count_clean/mean": 0.730195164680481, "rewards/meter_count_clean/std": 0.38917961716651917, "reward": 0.730195164680481, "reward_std": 0.38917961716651917, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2018795609474182, "sampling/sampling_logp_difference/max": 1.1117219924926758, "sampling/importance_sampling_ratio/min": 0.32899194955825806, "sampling/importance_sampling_ratio/mean": 1.0350096225738525, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6390934735536575, "entropy": 2.2960819751024246, "clip_ratio/low_mean": 0.05414438620209694, "clip_ratio/low_min": 0.05414438620209694, "clip_ratio/high_mean": 0.1455232799053192, "clip_ratio/high_max": 0.1455232799053192, "clip_ratio/region_mean": 0.19966766610741615, "reward_total_mean": 0.730195164680481, "reward_meter_count_clean_mean": 0.730195164680481, "reward_meter_count_clean_std": 0.38917961716651917, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
50
  {"timestamp_utc": "2026-04-11T19:08:52Z", "mode": "train", "global_step": 50, "epoch": 0.0019308001235712078, "loss": 0.1036, "grad_norm": 8.299102783203125, "learning_rate": 9.851515151515151e-06, "num_tokens": 101022.0, "completions/mean_length": 174.5, "completions/min_length": 114.0, "completions/max_length": 233.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 174.5, "completions/min_terminated_length": 114.0, "completions/max_terminated_length": 233.0, "rewards/meter_count_clean/mean": 0.41803598403930664, "rewards/meter_count_clean/std": 0.24869033694267273, "reward": 0.41803598403930664, "reward_std": 0.24869033694267273, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22161900997161865, "sampling/sampling_logp_difference/max": 1.3848509788513184, "sampling/importance_sampling_ratio/min": 0.2503611147403717, "sampling/importance_sampling_ratio/mean": 1.0149977207183838, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.147744707763195, "entropy": 2.8749085068702698, "clip_ratio/low_mean": 0.07543548569083214, "clip_ratio/low_min": 0.07543548569083214, "clip_ratio/high_mean": 0.15510659664869308, "clip_ratio/high_max": 0.15510659664869308, "clip_ratio/region_mean": 0.23054208233952522, "reward_total_mean": 0.41803598403930664, "reward_meter_count_clean_mean": 0.41803598403930664, "reward_meter_count_clean_std": 0.24869033694267273, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
51
+ {"timestamp_utc": "2026-04-11T19:10:17Z", "mode": "eval", "global_step": 50, "epoch": 0.0019308001235712078, "eval_loss": NaN, "eval_runtime": 84.0885, "eval_samples_per_second": 1.237, "eval_steps_per_second": 0.155, "eval_num_tokens": 101022.0, "eval_completions/mean_length": 197.5096153846154, "eval_completions/min_length": 40.46153846153846, "eval_completions/max_length": 424.38461538461536, "eval_completions/clipped_ratio": 0.04807692307692308, "eval_completions/mean_terminated_length": 182.15934518667368, "eval_completions/min_terminated_length": 40.46153846153846, "eval_completions/max_terminated_length": 356.0769230769231, "eval_rewards/meter_count_clean/mean": 0.3135691468532269, "eval_rewards/meter_count_clean/std": 0.29070903016970706, "eval_reward": 0.3135691468532269, "eval_reward_std": NaN, "eval_frac_reward_zero_std": 0.0, "eval_sampling/sampling_logp_difference/mean": 0.15005172731784674, "eval_sampling/sampling_logp_difference/max": 1.1673351434560924, "eval_sampling/importance_sampling_ratio/min": 0.3146973011585382, "eval_sampling/importance_sampling_ratio/mean": 1.0422258835572462, "eval_sampling/importance_sampling_ratio/max": 1.629158432667072, "eval_kl": 1.2288624094082758, "eval_entropy": 2.5429875483879676, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_reward_total_mean": 0.3135691468532269, "eval_reward_meter_count_clean_mean": 0.3135691468532269, "eval_reward_meter_count_clean_std": 0.29070903016970706, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
52
+ {"timestamp_utc": "2026-04-11T19:10:23Z", "mode": "train", "global_step": 51, "epoch": 0.001969416126042632, "loss": 0.0023, "grad_norm": 20.66379165649414, "learning_rate": 9.84848484848485e-06, "num_tokens": 102602.0, "completions/mean_length": 39.5, "completions/min_length": 29.0, "completions/max_length": 54.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 39.5, "completions/min_terminated_length": 29.0, "completions/max_terminated_length": 54.0, "rewards/meter_count_clean/mean": 0.48928219079971313, "rewards/meter_count_clean/std": 0.3961140215396881, "reward": 0.48928219079971313, "reward_std": 0.3961139917373657, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.24577820301055908, "sampling/sampling_logp_difference/max": 1.712418556213379, "sampling/importance_sampling_ratio/min": 0.1804288774728775, "sampling/importance_sampling_ratio/mean": 1.0036693811416626, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.125137120485306, "entropy": 2.0421162992715836, "clip_ratio/low_mean": 0.0884981770068407, "clip_ratio/low_min": 0.0884981770068407, "clip_ratio/high_mean": 0.13118881545960903, "clip_ratio/high_max": 0.13118881545960903, "clip_ratio/region_mean": 0.21968699246644974, "reward_total_mean": 0.48928219079971313, "reward_meter_count_clean_mean": 0.48928219079971313, "reward_meter_count_clean_std": 0.3961140215396881, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
53
+ {"timestamp_utc": "2026-04-11T19:10:29Z", "mode": "train", "global_step": 52, "epoch": 0.002008032128514056, "loss": -0.017, "grad_norm": 12.180355072021484, "learning_rate": 9.845454545454546e-06, "num_tokens": 104497.0, "completions/mean_length": 62.875, "completions/min_length": 38.0, "completions/max_length": 79.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 62.875, "completions/min_terminated_length": 38.0, "completions/max_terminated_length": 79.0, "rewards/meter_count_clean/mean": 0.5805736780166626, "rewards/meter_count_clean/std": 0.484282523393631, "reward": 0.5805736780166626, "reward_std": 0.4842824935913086, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21816720068454742, "sampling/sampling_logp_difference/max": 2.5338454246520996, "sampling/importance_sampling_ratio/min": 0.07935328036546707, "sampling/importance_sampling_ratio/mean": 1.034999966621399, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.405107393860817, "entropy": 2.290161669254303, "clip_ratio/low_mean": 0.08187933266162872, "clip_ratio/low_min": 0.08187933266162872, "clip_ratio/high_mean": 0.10031133238226175, "clip_ratio/high_max": 0.10031133238226175, "clip_ratio/region_mean": 0.18219066504389048, "reward_total_mean": 0.5805736780166626, "reward_meter_count_clean_mean": 0.5805736780166626, "reward_meter_count_clean_std": 0.484282523393631, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
54
+ {"timestamp_utc": "2026-04-11T19:10:34Z", "mode": "train", "global_step": 53, "epoch": 0.0020466481309854806, "loss": 0.2218, "grad_norm": 16.569290161132812, "learning_rate": 9.842424242424243e-06, "num_tokens": 105978.0, "completions/mean_length": 42.125, "completions/min_length": 28.0, "completions/max_length": 66.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 42.125, "completions/min_terminated_length": 28.0, "completions/max_terminated_length": 66.0, "rewards/meter_count_clean/mean": 0.42543840408325195, "rewards/meter_count_clean/std": 0.4286118447780609, "reward": 0.42543840408325195, "reward_std": 0.4286118149757385, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19726911187171936, "sampling/sampling_logp_difference/max": 1.7797749042510986, "sampling/importance_sampling_ratio/min": 0.16867610812187195, "sampling/importance_sampling_ratio/mean": 1.0270792245864868, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4137683361768723, "entropy": 1.8133485913276672, "clip_ratio/low_mean": 0.09725878294557333, "clip_ratio/low_min": 0.09725878294557333, "clip_ratio/high_mean": 0.11070484854280949, "clip_ratio/high_max": 0.11070484854280949, "clip_ratio/region_mean": 0.20796363148838282, "reward_total_mean": 0.42543840408325195, "reward_meter_count_clean_mean": 0.42543840408325195, "reward_meter_count_clean_std": 0.4286118447780609, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
55
+ {"timestamp_utc": "2026-04-11T19:10:39Z", "mode": "train", "global_step": 54, "epoch": 0.0020852641334569047, "loss": 0.123, "grad_norm": 14.2871675491333, "learning_rate": 9.83939393939394e-06, "num_tokens": 107780.0, "completions/mean_length": 60.25, "completions/min_length": 37.0, "completions/max_length": 101.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 60.25, "completions/min_terminated_length": 37.0, "completions/max_terminated_length": 101.0, "rewards/meter_count_clean/mean": 0.6305832266807556, "rewards/meter_count_clean/std": 0.4164789617061615, "reward": 0.6305832266807556, "reward_std": 0.4164789617061615, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21253442764282227, "sampling/sampling_logp_difference/max": 1.6645622253417969, "sampling/importance_sampling_ratio/min": 0.18927349150180817, "sampling/importance_sampling_ratio/mean": 1.0409408807754517, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.2034710496664047, "entropy": 2.285839468240738, "clip_ratio/low_mean": 0.07627276703715324, "clip_ratio/low_min": 0.07627276703715324, "clip_ratio/high_mean": 0.12182400096207857, "clip_ratio/high_max": 0.12182400096207857, "clip_ratio/region_mean": 0.19809676799923182, "reward_total_mean": 0.6305832266807556, "reward_meter_count_clean_mean": 0.6305832266807556, "reward_meter_count_clean_std": 0.4164789617061615, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
56
+ {"timestamp_utc": "2026-04-11T19:10:44Z", "mode": "train", "global_step": 55, "epoch": 0.002123880135928329, "loss": 0.0258, "grad_norm": 17.327545166015625, "learning_rate": 9.836363636363637e-06, "num_tokens": 109474.0, "completions/mean_length": 51.75, "completions/min_length": 40.0, "completions/max_length": 65.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 51.75, "completions/min_terminated_length": 40.0, "completions/max_terminated_length": 65.0, "rewards/meter_count_clean/mean": 0.36311012506484985, "rewards/meter_count_clean/std": 0.2977062463760376, "reward": 0.36311012506484985, "reward_std": 0.2977062463760376, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22551913559436798, "sampling/sampling_logp_difference/max": 2.095989227294922, "sampling/importance_sampling_ratio/min": 0.122948557138443, "sampling/importance_sampling_ratio/mean": 1.0396475791931152, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.748019978404045, "entropy": 2.4415661692619324, "clip_ratio/low_mean": 0.09197395481169224, "clip_ratio/low_min": 0.09197395481169224, "clip_ratio/high_mean": 0.09078771062195301, "clip_ratio/high_max": 0.09078771062195301, "clip_ratio/region_mean": 0.18276166543364525, "reward_total_mean": 0.36311012506484985, "reward_meter_count_clean_mean": 0.36311012506484985, "reward_meter_count_clean_std": 0.2977062463760376, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
57
+ {"timestamp_utc": "2026-04-11T19:10:49Z", "mode": "train", "global_step": 56, "epoch": 0.002162496138399753, "loss": 0.1289, "grad_norm": 18.075702667236328, "learning_rate": 9.833333333333333e-06, "num_tokens": 111016.0, "completions/mean_length": 40.75, "completions/min_length": 33.0, "completions/max_length": 46.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 40.75, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 46.0, "rewards/meter_count_clean/mean": 0.731018602848053, "rewards/meter_count_clean/std": 0.3394193649291992, "reward": 0.731018602848053, "reward_std": 0.3394193649291992, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.16314202547073364, "sampling/sampling_logp_difference/max": 1.0579338073730469, "sampling/importance_sampling_ratio/min": 0.3471723794937134, "sampling/importance_sampling_ratio/mean": 1.0378867387771606, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6087606996297836, "entropy": 1.4147121086716652, "clip_ratio/low_mean": 0.06206474732607603, "clip_ratio/low_min": 0.06206474732607603, "clip_ratio/high_mean": 0.08283982099965215, "clip_ratio/high_max": 0.08283982099965215, "clip_ratio/region_mean": 0.14490456832572818, "reward_total_mean": 0.731018602848053, "reward_meter_count_clean_mean": 0.731018602848053, "reward_meter_count_clean_std": 0.3394193649291992, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
58
+ {"timestamp_utc": "2026-04-11T19:10:57Z", "mode": "train", "global_step": 57, "epoch": 0.002201112140871177, "loss": 0.069, "grad_norm": 6.15261173248291, "learning_rate": 9.830303030303032e-06, "num_tokens": 114977.0, "completions/mean_length": 284.125, "completions/min_length": 209.0, "completions/max_length": 347.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 284.125, "completions/min_terminated_length": 209.0, "completions/max_terminated_length": 347.0, "rewards/meter_count_clean/mean": 0.6026915907859802, "rewards/meter_count_clean/std": 0.22708161175251007, "reward": 0.6026915907859802, "reward_std": 0.22708161175251007, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2181180864572525, "sampling/sampling_logp_difference/max": 1.5707168579101562, "sampling/importance_sampling_ratio/min": 0.20789609849452972, "sampling/importance_sampling_ratio/mean": 1.0440524816513062, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.9276735782623291, "entropy": 3.2829161882400513, "clip_ratio/low_mean": 0.061714982613921165, "clip_ratio/low_min": 0.061714982613921165, "clip_ratio/high_mean": 0.1252468191087246, "clip_ratio/high_max": 0.1252468191087246, "clip_ratio/region_mean": 0.18696180172264576, "reward_total_mean": 0.6026915907859802, "reward_meter_count_clean_mean": 0.6026915907859802, "reward_meter_count_clean_std": 0.22708161175251007, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
59
+ {"timestamp_utc": "2026-04-11T19:11:02Z", "mode": "train", "global_step": 58, "epoch": 0.002239728143342601, "loss": -0.0746, "grad_norm": 12.134157180786133, "learning_rate": 9.827272727272729e-06, "num_tokens": 116850.0, "completions/mean_length": 65.125, "completions/min_length": 38.0, "completions/max_length": 76.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 65.125, "completions/min_terminated_length": 38.0, "completions/max_terminated_length": 76.0, "rewards/meter_count_clean/mean": 0.6828622817993164, "rewards/meter_count_clean/std": 0.41145387291908264, "reward": 0.6828622817993164, "reward_std": 0.41145387291908264, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19039244949817657, "sampling/sampling_logp_difference/max": 1.1286425590515137, "sampling/importance_sampling_ratio/min": 0.3234720528125763, "sampling/importance_sampling_ratio/mean": 1.040724754333496, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.2836961075663567, "entropy": 2.45593498647213, "clip_ratio/low_mean": 0.07173928804695606, "clip_ratio/low_min": 0.07173928804695606, "clip_ratio/high_mean": 0.11394045688211918, "clip_ratio/high_max": 0.11394045688211918, "clip_ratio/region_mean": 0.18567974492907524, "reward_total_mean": 0.6828622817993164, "reward_meter_count_clean_mean": 0.6828622817993164, "reward_meter_count_clean_std": 0.41145387291908264, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
60
+ {"timestamp_utc": "2026-04-11T19:11:09Z", "mode": "train", "global_step": 59, "epoch": 0.002278344145814025, "loss": 0.0594, "grad_norm": 13.39612102508545, "learning_rate": 9.824242424242425e-06, "num_tokens": 119592.0, "completions/mean_length": 142.75, "completions/min_length": 103.0, "completions/max_length": 233.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 142.75, "completions/min_terminated_length": 103.0, "completions/max_terminated_length": 233.0, "rewards/meter_count_clean/mean": 0.20413683354854584, "rewards/meter_count_clean/std": 0.1784268319606781, "reward": 0.20413683354854584, "reward_std": 0.1784268468618393, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.26621776819229126, "sampling/sampling_logp_difference/max": 2.522489547729492, "sampling/importance_sampling_ratio/min": 0.0802595466375351, "sampling/importance_sampling_ratio/mean": 1.0229341983795166, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6619391813874245, "entropy": 2.218117520213127, "clip_ratio/low_mean": 0.10819577984511852, "clip_ratio/low_min": 0.10819577984511852, "clip_ratio/high_mean": 0.11741098761558533, "clip_ratio/high_max": 0.11741098761558533, "clip_ratio/region_mean": 0.22560676746070385, "reward_total_mean": 0.20413683354854584, "reward_meter_count_clean_mean": 0.20413683354854584, "reward_meter_count_clean_std": 0.1784268319606781, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
61
+ {"timestamp_utc": "2026-04-11T19:11:15Z", "mode": "train", "global_step": 60, "epoch": 0.0023169601482854493, "loss": 0.0463, "grad_norm": 15.286205291748047, "learning_rate": 9.821212121212122e-06, "num_tokens": 121201.0, "completions/mean_length": 46.125, "completions/min_length": 37.0, "completions/max_length": 76.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 46.125, "completions/min_terminated_length": 37.0, "completions/max_terminated_length": 76.0, "rewards/meter_count_clean/mean": 0.5946000814437866, "rewards/meter_count_clean/std": 0.3260856568813324, "reward": 0.5946000814437866, "reward_std": 0.3260856568813324, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20776869356632233, "sampling/sampling_logp_difference/max": 1.5432052612304688, "sampling/importance_sampling_ratio/min": 0.21369504928588867, "sampling/importance_sampling_ratio/mean": 1.0501973628997803, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.8500682413578033, "entropy": 2.614510953426361, "clip_ratio/low_mean": 0.07143263425678015, "clip_ratio/low_min": 0.07143263425678015, "clip_ratio/high_mean": 0.08825728669762611, "clip_ratio/high_max": 0.08825728669762611, "clip_ratio/region_mean": 0.15968992095440626, "reward_total_mean": 0.5946000814437866, "reward_meter_count_clean_mean": 0.5946000814437866, "reward_meter_count_clean_std": 0.3260856568813324, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
62
+ {"timestamp_utc": "2026-04-11T19:11:20Z", "mode": "train", "global_step": 61, "epoch": 0.002355576150756874, "loss": 0.245, "grad_norm": 19.05546760559082, "learning_rate": 9.81818181818182e-06, "num_tokens": 122769.0, "completions/mean_length": 31.0, "completions/min_length": 22.0, "completions/max_length": 43.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 31.0, "completions/min_terminated_length": 22.0, "completions/max_terminated_length": 43.0, "rewards/meter_count_clean/mean": 0.7235491275787354, "rewards/meter_count_clean/std": 0.44676530361175537, "reward": 0.7235491275787354, "reward_std": 0.44676533341407776, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19886323809623718, "sampling/sampling_logp_difference/max": 1.2326695919036865, "sampling/importance_sampling_ratio/min": 0.2915133237838745, "sampling/importance_sampling_ratio/mean": 1.039192795753479, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.743053525686264, "entropy": 2.4280820190906525, "clip_ratio/low_mean": 0.05286107771098614, "clip_ratio/low_min": 0.05286107771098614, "clip_ratio/high_mean": 0.1522508319467306, "clip_ratio/high_max": 0.1522508319467306, "clip_ratio/region_mean": 0.20511190965771675, "reward_total_mean": 0.7235491275787354, "reward_meter_count_clean_mean": 0.7235491275787354, "reward_meter_count_clean_std": 0.44676530361175537, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
63
+ {"timestamp_utc": "2026-04-11T19:11:26Z", "mode": "train", "global_step": 62, "epoch": 0.002394192153228298, "loss": 0.1557, "grad_norm": 10.215119361877441, "learning_rate": 9.815151515151516e-06, "num_tokens": 124968.0, "completions/mean_length": 105.875, "completions/min_length": 62.0, "completions/max_length": 153.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 105.875, "completions/min_terminated_length": 62.0, "completions/max_terminated_length": 153.0, "rewards/meter_count_clean/mean": 0.2949569523334503, "rewards/meter_count_clean/std": 0.2673260271549225, "reward": 0.2949569523334503, "reward_std": 0.2673259973526001, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22320568561553955, "sampling/sampling_logp_difference/max": 1.3543634414672852, "sampling/importance_sampling_ratio/min": 0.25811153650283813, "sampling/importance_sampling_ratio/mean": 1.0284371376037598, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.3760973513126373, "entropy": 2.715813457965851, "clip_ratio/low_mean": 0.08893644995987415, "clip_ratio/low_min": 0.08893644995987415, "clip_ratio/high_mean": 0.11933935433626175, "clip_ratio/high_max": 0.11933935433626175, "clip_ratio/region_mean": 0.2082758042961359, "reward_total_mean": 0.2949569523334503, "reward_meter_count_clean_mean": 0.2949569523334503, "reward_meter_count_clean_std": 0.2673260271549225, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
64
+ {"timestamp_utc": "2026-04-11T19:11:32Z", "mode": "train", "global_step": 63, "epoch": 0.002432808155699722, "loss": 0.072, "grad_norm": 10.939824104309082, "learning_rate": 9.812121212121212e-06, "num_tokens": 126782.0, "completions/mean_length": 70.75, "completions/min_length": 48.0, "completions/max_length": 84.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 70.75, "completions/min_terminated_length": 48.0, "completions/max_terminated_length": 84.0, "rewards/meter_count_clean/mean": 0.8419647216796875, "rewards/meter_count_clean/std": 0.3233566880226135, "reward": 0.8419647216796875, "reward_std": 0.32335665822029114, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19959045946598053, "sampling/sampling_logp_difference/max": 1.9795737266540527, "sampling/importance_sampling_ratio/min": 0.1381281018257141, "sampling/importance_sampling_ratio/mean": 1.0459727048873901, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.6386639475822449, "entropy": 2.22160741686821, "clip_ratio/low_mean": 0.04621848836541176, "clip_ratio/low_min": 0.04621848836541176, "clip_ratio/high_mean": 0.11690851114690304, "clip_ratio/high_max": 0.11690851114690304, "clip_ratio/region_mean": 0.1631269995123148, "reward_total_mean": 0.8419647216796875, "reward_meter_count_clean_mean": 0.8419647216796875, "reward_meter_count_clean_std": 0.3233566880226135, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
65
+ {"timestamp_utc": "2026-04-11T19:11:42Z", "mode": "train", "global_step": 64, "epoch": 0.002471424158171146, "loss": 0.1276, "grad_norm": 4.516471862792969, "learning_rate": 9.809090909090911e-06, "num_tokens": 131629.0, "completions/mean_length": 390.875, "completions/min_length": 315.0, "completions/max_length": 483.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 390.875, "completions/min_terminated_length": 315.0, "completions/max_terminated_length": 483.0, "rewards/meter_count_clean/mean": 0.43812355399131775, "rewards/meter_count_clean/std": 0.37187716364860535, "reward": 0.43812355399131775, "reward_std": 0.37187716364860535, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19917061924934387, "sampling/sampling_logp_difference/max": 1.9876041412353516, "sampling/importance_sampling_ratio/min": 0.13702331483364105, "sampling/importance_sampling_ratio/mean": 1.02934730052948, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.7644509300589561, "entropy": 2.9058758020401, "clip_ratio/low_mean": 0.05013222433626652, "clip_ratio/low_min": 0.05013222433626652, "clip_ratio/high_mean": 0.11145513504743576, "clip_ratio/high_max": 0.11145513504743576, "clip_ratio/region_mean": 0.16158735938370228, "reward_total_mean": 0.43812355399131775, "reward_meter_count_clean_mean": 0.43812355399131775, "reward_meter_count_clean_std": 0.37187716364860535, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
66
+ {"timestamp_utc": "2026-04-11T19:11:46Z", "mode": "train", "global_step": 65, "epoch": 0.0025100401606425703, "loss": 0.0846, "grad_norm": 20.379396438598633, "learning_rate": 9.806060606060607e-06, "num_tokens": 133206.0, "completions/mean_length": 34.125, "completions/min_length": 24.0, "completions/max_length": 48.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 34.125, "completions/min_terminated_length": 24.0, "completions/max_terminated_length": 48.0, "rewards/meter_count_clean/mean": 0.39298856258392334, "rewards/meter_count_clean/std": 0.3957642912864685, "reward": 0.39298856258392334, "reward_std": 0.39576423168182373, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21870994567871094, "sampling/sampling_logp_difference/max": 2.2564210891723633, "sampling/importance_sampling_ratio/min": 0.10472461581230164, "sampling/importance_sampling_ratio/mean": 0.9815999269485474, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.063544064760208, "entropy": 1.5038166865706444, "clip_ratio/low_mean": 0.15403079986572266, "clip_ratio/low_min": 0.15403079986572266, "clip_ratio/high_mean": 0.09134615398943424, "clip_ratio/high_max": 0.09134615398943424, "clip_ratio/region_mean": 0.2453769538551569, "reward_total_mean": 0.39298856258392334, "reward_meter_count_clean_mean": 0.39298856258392334, "reward_meter_count_clean_std": 0.3957642912864685, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
67
+ {"timestamp_utc": "2026-04-11T19:11:52Z", "mode": "train", "global_step": 66, "epoch": 0.0025486561631139944, "loss": 0.1542, "grad_norm": 17.633543014526367, "learning_rate": 9.803030303030304e-06, "num_tokens": 134995.0, "completions/mean_length": 69.625, "completions/min_length": 49.0, "completions/max_length": 96.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 69.625, "completions/min_terminated_length": 49.0, "completions/max_terminated_length": 96.0, "rewards/meter_count_clean/mean": 0.8001365661621094, "rewards/meter_count_clean/std": 0.3378126919269562, "reward": 0.8001365661621094, "reward_std": 0.3378126621246338, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20599590241909027, "sampling/sampling_logp_difference/max": 1.8366949558258057, "sampling/importance_sampling_ratio/min": 0.1593431979417801, "sampling/importance_sampling_ratio/mean": 1.0259366035461426, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.893242433667183, "entropy": 1.3359143435955048, "clip_ratio/low_mean": 0.04664522036910057, "clip_ratio/low_min": 0.04664522036910057, "clip_ratio/high_mean": 0.16264715790748596, "clip_ratio/high_max": 0.16264715790748596, "clip_ratio/region_mean": 0.20929237827658653, "reward_total_mean": 0.8001365661621094, "reward_meter_count_clean_mean": 0.8001365661621094, "reward_meter_count_clean_std": 0.3378126919269562, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
68
+ {"timestamp_utc": "2026-04-11T19:11:56Z", "mode": "train", "global_step": 67, "epoch": 0.0025872721655854185, "loss": 0.0055, "grad_norm": 16.88397979736328, "learning_rate": 9.800000000000001e-06, "num_tokens": 136783.0, "completions/mean_length": 46.5, "completions/min_length": 37.0, "completions/max_length": 57.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 46.5, "completions/min_terminated_length": 37.0, "completions/max_terminated_length": 57.0, "rewards/meter_count_clean/mean": 0.6752415895462036, "rewards/meter_count_clean/std": 0.3951883316040039, "reward": 0.6752415895462036, "reward_std": 0.3951883316040039, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21076610684394836, "sampling/sampling_logp_difference/max": 1.1104373931884766, "sampling/importance_sampling_ratio/min": 0.32941487431526184, "sampling/importance_sampling_ratio/mean": 1.0569463968276978, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.985504299402237, "entropy": 2.648691624403, "clip_ratio/low_mean": 0.07886030618101358, "clip_ratio/low_min": 0.07886030618101358, "clip_ratio/high_mean": 0.107527912594378, "clip_ratio/high_max": 0.107527912594378, "clip_ratio/region_mean": 0.18638821877539158, "reward_total_mean": 0.6752415895462036, "reward_meter_count_clean_mean": 0.6752415895462036, "reward_meter_count_clean_std": 0.3951883316040039, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
69
+ {"timestamp_utc": "2026-04-11T19:12:01Z", "mode": "train", "global_step": 68, "epoch": 0.0026258881680568426, "loss": 0.2069, "grad_norm": 22.709701538085938, "learning_rate": 9.796969696969698e-06, "num_tokens": 138421.0, "completions/mean_length": 44.75, "completions/min_length": 30.0, "completions/max_length": 62.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 44.75, "completions/min_terminated_length": 30.0, "completions/max_terminated_length": 62.0, "rewards/meter_count_clean/mean": 0.8447393178939819, "rewards/meter_count_clean/std": 0.22370560467243195, "reward": 0.8447393178939819, "reward_std": 0.22370561957359314, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2022874504327774, "sampling/sampling_logp_difference/max": 1.7470617294311523, "sampling/importance_sampling_ratio/min": 0.17428529262542725, "sampling/importance_sampling_ratio/mean": 1.0341558456420898, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.9188628941774368, "entropy": 1.8082955032587051, "clip_ratio/low_mean": 0.05944700539112091, "clip_ratio/low_min": 0.05944700539112091, "clip_ratio/high_mean": 0.16749201156198978, "clip_ratio/high_max": 0.16749201156198978, "clip_ratio/region_mean": 0.2269390169531107, "reward_total_mean": 0.8447393178939819, "reward_meter_count_clean_mean": 0.8447393178939819, "reward_meter_count_clean_std": 0.22370560467243195, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
70
+ {"timestamp_utc": "2026-04-11T19:12:07Z", "mode": "train", "global_step": 69, "epoch": 0.002664504170528267, "loss": 0.065, "grad_norm": 7.776491641998291, "learning_rate": 9.793939393939394e-06, "num_tokens": 140882.0, "completions/mean_length": 140.625, "completions/min_length": 100.0, "completions/max_length": 177.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 140.625, "completions/min_terminated_length": 100.0, "completions/max_terminated_length": 177.0, "rewards/meter_count_clean/mean": 0.8124464750289917, "rewards/meter_count_clean/std": 0.22712665796279907, "reward": 0.8124464750289917, "reward_std": 0.22712665796279907, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2103135734796524, "sampling/sampling_logp_difference/max": 1.5220518112182617, "sampling/importance_sampling_ratio/min": 0.21826361119747162, "sampling/importance_sampling_ratio/mean": 1.0490548610687256, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.129339560866356, "entropy": 3.014527380466461, "clip_ratio/low_mean": 0.08012999780476093, "clip_ratio/low_min": 0.08012999780476093, "clip_ratio/high_mean": 0.12308589741587639, "clip_ratio/high_max": 0.12308589741587639, "clip_ratio/region_mean": 0.20321589522063732, "reward_total_mean": 0.8124464750289917, "reward_meter_count_clean_mean": 0.8124464750289917, "reward_meter_count_clean_std": 0.22712665796279907, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
71
+ {"timestamp_utc": "2026-04-11T19:12:14Z", "mode": "train", "global_step": 70, "epoch": 0.0027031201729996912, "loss": 0.2738, "grad_norm": 9.610224723815918, "learning_rate": 9.790909090909093e-06, "num_tokens": 143117.0, "completions/mean_length": 98.375, "completions/min_length": 56.0, "completions/max_length": 155.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 98.375, "completions/min_terminated_length": 56.0, "completions/max_terminated_length": 155.0, "rewards/meter_count_clean/mean": 0.27704793214797974, "rewards/meter_count_clean/std": 0.31074026226997375, "reward": 0.27704793214797974, "reward_std": 0.31074026226997375, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.24906866252422333, "sampling/sampling_logp_difference/max": 1.5363836288452148, "sampling/importance_sampling_ratio/min": 0.21515779197216034, "sampling/importance_sampling_ratio/mean": 1.0380938053131104, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.29872664809227, "entropy": 2.8568870425224304, "clip_ratio/low_mean": 0.12618878670036793, "clip_ratio/low_min": 0.12618878670036793, "clip_ratio/high_mean": 0.081504937261343, "clip_ratio/high_max": 0.081504937261343, "clip_ratio/region_mean": 0.20769372396171093, "reward_total_mean": 0.27704793214797974, "reward_meter_count_clean_mean": 0.27704793214797974, "reward_meter_count_clean_std": 0.31074026226997375, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
72
+ {"timestamp_utc": "2026-04-11T19:12:19Z", "mode": "train", "global_step": 71, "epoch": 0.0027417361754711153, "loss": 0.0602, "grad_norm": 12.76672077178955, "learning_rate": 9.787878787878788e-06, "num_tokens": 145135.0, "completions/mean_length": 81.25, "completions/min_length": 45.0, "completions/max_length": 126.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 81.25, "completions/min_terminated_length": 45.0, "completions/max_terminated_length": 126.0, "rewards/meter_count_clean/mean": 0.44717690348625183, "rewards/meter_count_clean/std": 0.36459431052207947, "reward": 0.44717690348625183, "reward_std": 0.36459431052207947, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2269318401813507, "sampling/sampling_logp_difference/max": 1.5952892303466797, "sampling/importance_sampling_ratio/min": 0.2028498500585556, "sampling/importance_sampling_ratio/mean": 1.0183560848236084, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.2731183916330338, "entropy": 2.6634568572044373, "clip_ratio/low_mean": 0.10119702108204365, "clip_ratio/low_min": 0.10119702108204365, "clip_ratio/high_mean": 0.1307387314736843, "clip_ratio/high_max": 0.1307387314736843, "clip_ratio/region_mean": 0.23193575255572796, "reward_total_mean": 0.44717690348625183, "reward_meter_count_clean_mean": 0.44717690348625183, "reward_meter_count_clean_std": 0.36459431052207947, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
73
+ {"timestamp_utc": "2026-04-11T19:12:26Z", "mode": "train", "global_step": 72, "epoch": 0.0027803521779425394, "loss": 0.0558, "grad_norm": 6.893131256103516, "learning_rate": 9.784848484848486e-06, "num_tokens": 147931.0, "completions/mean_length": 171.5, "completions/min_length": 145.0, "completions/max_length": 198.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 171.5, "completions/min_terminated_length": 145.0, "completions/max_terminated_length": 198.0, "rewards/meter_count_clean/mean": 0.34930744767189026, "rewards/meter_count_clean/std": 0.34667089581489563, "reward": 0.34930744767189026, "reward_std": 0.34667089581489563, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21298885345458984, "sampling/sampling_logp_difference/max": 1.5448880195617676, "sampling/importance_sampling_ratio/min": 0.21333575248718262, "sampling/importance_sampling_ratio/mean": 1.0105934143066406, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.1274225786328316, "entropy": 2.4433897733688354, "clip_ratio/low_mean": 0.09952043555676937, "clip_ratio/low_min": 0.09952043555676937, "clip_ratio/high_mean": 0.08511919900774956, "clip_ratio/high_max": 0.08511919900774956, "clip_ratio/region_mean": 0.18463963456451893, "reward_total_mean": 0.34930744767189026, "reward_meter_count_clean_mean": 0.34930744767189026, "reward_meter_count_clean_std": 0.34667089581489563, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
74
+ {"timestamp_utc": "2026-04-11T19:12:31Z", "mode": "train", "global_step": 73, "epoch": 0.0028189681804139635, "loss": 0.0954, "grad_norm": 13.334155082702637, "learning_rate": 9.781818181818183e-06, "num_tokens": 149855.0, "completions/mean_length": 57.5, "completions/min_length": 39.0, "completions/max_length": 95.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 57.5, "completions/min_terminated_length": 39.0, "completions/max_terminated_length": 95.0, "rewards/meter_count_clean/mean": 0.42032575607299805, "rewards/meter_count_clean/std": 0.4467563033103943, "reward": 0.42032575607299805, "reward_std": 0.4467563033103943, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2024940848350525, "sampling/sampling_logp_difference/max": 1.7194023132324219, "sampling/importance_sampling_ratio/min": 0.17917320132255554, "sampling/importance_sampling_ratio/mean": 1.0420591831207275, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4627206772565842, "entropy": 2.515232354402542, "clip_ratio/low_mean": 0.08047989010810852, "clip_ratio/low_min": 0.08047989010810852, "clip_ratio/high_mean": 0.0748168514110148, "clip_ratio/high_max": 0.0748168514110148, "clip_ratio/region_mean": 0.15529674151912332, "reward_total_mean": 0.42032575607299805, "reward_meter_count_clean_mean": 0.42032575607299805, "reward_meter_count_clean_std": 0.4467563033103943, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
75
+ {"timestamp_utc": "2026-04-11T19:12:42Z", "mode": "train", "global_step": 74, "epoch": 0.0028575841828853876, "loss": 0.0603, "grad_norm": 1.396609902381897, "learning_rate": 9.77878787878788e-06, "num_tokens": 152386.0, "completions/mean_length": 466.375, "completions/min_length": 155.0, "completions/max_length": 512.0, "completions/clipped_ratio": 0.75, "completions/mean_terminated_length": 329.5, "completions/min_terminated_length": 155.0, "completions/max_terminated_length": 504.0, "rewards/meter_count_clean/mean": 0.6015316843986511, "rewards/meter_count_clean/std": 0.2646358013153076, "reward": 0.6015316843986511, "reward_std": 0.2646358013153076, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2191452980041504, "sampling/sampling_logp_difference/max": 1.3887028694152832, "sampling/importance_sampling_ratio/min": 0.2493986040353775, "sampling/importance_sampling_ratio/mean": 1.0675560235977173, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.2103242427110672, "entropy": 0.8815881013870239, "clip_ratio/low_mean": 0.01854838617146015, "clip_ratio/low_min": 0.01854838617146015, "clip_ratio/high_mean": 0.0245535708963871, "clip_ratio/high_max": 0.0245535708963871, "clip_ratio/region_mean": 0.04310195706784725, "reward_total_mean": 0.6015316843986511, "reward_meter_count_clean_mean": 0.6015316843986511, "reward_meter_count_clean_std": 0.2646358013153076, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
76
+ {"timestamp_utc": "2026-04-11T19:12:48Z", "mode": "train", "global_step": 75, "epoch": 0.0028962001853568117, "loss": 0.1183, "grad_norm": 9.12006950378418, "learning_rate": 9.775757575757576e-06, "num_tokens": 154741.0, "completions/mean_length": 123.375, "completions/min_length": 104.0, "completions/max_length": 137.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 123.375, "completions/min_terminated_length": 104.0, "completions/max_terminated_length": 137.0, "rewards/meter_count_clean/mean": 0.3497368097305298, "rewards/meter_count_clean/std": 0.26128703355789185, "reward": 0.3497368097305298, "reward_std": 0.26128703355789185, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20589499175548553, "sampling/sampling_logp_difference/max": 2.0790109634399414, "sampling/importance_sampling_ratio/min": 0.12505382299423218, "sampling/importance_sampling_ratio/mean": 1.0463789701461792, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.3578874319791794, "entropy": 2.449110597372055, "clip_ratio/low_mean": 0.12214752100408077, "clip_ratio/low_min": 0.12214752100408077, "clip_ratio/high_mean": 0.07322514988481998, "clip_ratio/high_max": 0.07322514988481998, "clip_ratio/region_mean": 0.19537267088890076, "reward_total_mean": 0.3497368097305298, "reward_meter_count_clean_mean": 0.3497368097305298, "reward_meter_count_clean_std": 0.26128703355789185, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
77
+ {"timestamp_utc": "2026-04-11T19:12:53Z", "mode": "train", "global_step": 76, "epoch": 0.002934816187828236, "loss": 0.2798, "grad_norm": 13.480757713317871, "learning_rate": 9.772727272727273e-06, "num_tokens": 156595.0, "completions/mean_length": 54.75, "completions/min_length": 34.0, "completions/max_length": 73.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 54.75, "completions/min_terminated_length": 34.0, "completions/max_terminated_length": 73.0, "rewards/meter_count_clean/mean": 0.5932719707489014, "rewards/meter_count_clean/std": 0.3900352716445923, "reward": 0.5932719707489014, "reward_std": 0.3900352716445923, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.23030723631381989, "sampling/sampling_logp_difference/max": 1.9909172058105469, "sampling/importance_sampling_ratio/min": 0.13657011091709137, "sampling/importance_sampling_ratio/mean": 1.0449117422103882, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7342924177646637, "entropy": 2.940678298473358, "clip_ratio/low_mean": 0.09841924253851175, "clip_ratio/low_min": 0.09841924253851175, "clip_ratio/high_mean": 0.1170717179775238, "clip_ratio/high_max": 0.1170717179775238, "clip_ratio/region_mean": 0.21549096051603556, "reward_total_mean": 0.5932719707489014, "reward_meter_count_clean_mean": 0.5932719707489014, "reward_meter_count_clean_std": 0.3900352716445923, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
78
+ {"timestamp_utc": "2026-04-11T19:12:59Z", "mode": "train", "global_step": 77, "epoch": 0.0029734321902996604, "loss": 0.2751, "grad_norm": 15.090164184570312, "learning_rate": 9.76969696969697e-06, "num_tokens": 158209.0, "completions/mean_length": 49.75, "completions/min_length": 33.0, "completions/max_length": 98.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 49.75, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 98.0, "rewards/meter_count_clean/mean": 0.4592646062374115, "rewards/meter_count_clean/std": 0.43493443727493286, "reward": 0.4592646062374115, "reward_std": 0.4349344074726105, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22933514416217804, "sampling/sampling_logp_difference/max": 1.5349407196044922, "sampling/importance_sampling_ratio/min": 0.21546848118305206, "sampling/importance_sampling_ratio/mean": 1.048494815826416, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.0099042803049088, "entropy": 2.5642327964305878, "clip_ratio/low_mean": 0.08706128969788551, "clip_ratio/low_min": 0.08706128969788551, "clip_ratio/high_mean": 0.10580789390951395, "clip_ratio/high_max": 0.10580789390951395, "clip_ratio/region_mean": 0.19286918360739946, "reward_total_mean": 0.4592646062374115, "reward_meter_count_clean_mean": 0.4592646062374115, "reward_meter_count_clean_std": 0.43493443727493286, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
79
+ {"timestamp_utc": "2026-04-11T19:13:05Z", "mode": "train", "global_step": 78, "epoch": 0.0030120481927710845, "loss": 0.04, "grad_norm": 7.737454891204834, "learning_rate": 9.766666666666667e-06, "num_tokens": 161112.0, "completions/mean_length": 164.875, "completions/min_length": 98.0, "completions/max_length": 205.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 164.875, "completions/min_terminated_length": 98.0, "completions/max_terminated_length": 205.0, "rewards/meter_count_clean/mean": 0.18029195070266724, "rewards/meter_count_clean/std": 0.1389833390712738, "reward": 0.18029195070266724, "reward_std": 0.1389833241701126, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2152501344680786, "sampling/sampling_logp_difference/max": 1.7132110595703125, "sampling/importance_sampling_ratio/min": 0.18028594553470612, "sampling/importance_sampling_ratio/mean": 1.0424808263778687, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.39060677587986, "entropy": 2.451413109898567, "clip_ratio/low_mean": 0.11678846552968025, "clip_ratio/low_min": 0.11678846552968025, "clip_ratio/high_mean": 0.08520248159766197, "clip_ratio/high_max": 0.08520248159766197, "clip_ratio/region_mean": 0.20199094712734222, "reward_total_mean": 0.18029195070266724, "reward_meter_count_clean_mean": 0.18029195070266724, "reward_meter_count_clean_std": 0.1389833390712738, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
80
+ {"timestamp_utc": "2026-04-11T19:13:15Z", "mode": "train", "global_step": 79, "epoch": 0.0030506641952425086, "loss": 0.0211, "grad_norm": 4.973855018615723, "learning_rate": 9.763636363636365e-06, "num_tokens": 165606.0, "completions/mean_length": 342.75, "completions/min_length": 227.0, "completions/max_length": 439.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 342.75, "completions/min_terminated_length": 227.0, "completions/max_terminated_length": 439.0, "rewards/meter_count_clean/mean": 0.2517043650150299, "rewards/meter_count_clean/std": 0.2103721797466278, "reward": 0.2517043650150299, "reward_std": 0.2103721648454666, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20066173374652863, "sampling/sampling_logp_difference/max": 1.8092515468597412, "sampling/importance_sampling_ratio/min": 0.16377666592597961, "sampling/importance_sampling_ratio/mean": 1.0435837507247925, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.8531102910637856, "entropy": 2.836574971675873, "clip_ratio/low_mean": 0.09096036478877068, "clip_ratio/low_min": 0.09096036478877068, "clip_ratio/high_mean": 0.09161586128175259, "clip_ratio/high_max": 0.09161586128175259, "clip_ratio/region_mean": 0.18257622607052326, "reward_total_mean": 0.2517043650150299, "reward_meter_count_clean_mean": 0.2517043650150299, "reward_meter_count_clean_std": 0.2103721797466278, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
81
+ {"timestamp_utc": "2026-04-11T19:13:20Z", "mode": "train", "global_step": 80, "epoch": 0.0030892801977139327, "loss": 0.2298, "grad_norm": 12.971203804016113, "learning_rate": 9.760606060606062e-06, "num_tokens": 167379.0, "completions/mean_length": 62.625, "completions/min_length": 33.0, "completions/max_length": 101.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 62.625, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 101.0, "rewards/meter_count_clean/mean": 0.5897529721260071, "rewards/meter_count_clean/std": 0.3398434519767761, "reward": 0.5897529721260071, "reward_std": 0.3398434519767761, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20138375461101532, "sampling/sampling_logp_difference/max": 1.4945716857910156, "sampling/importance_sampling_ratio/min": 0.22434468567371368, "sampling/importance_sampling_ratio/mean": 1.0488100051879883, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7127311304211617, "entropy": 2.800754815340042, "clip_ratio/low_mean": 0.07962200790643692, "clip_ratio/low_min": 0.07962200790643692, "clip_ratio/high_mean": 0.12067882902920246, "clip_ratio/high_max": 0.12067882902920246, "clip_ratio/region_mean": 0.20030083693563938, "reward_total_mean": 0.5897529721260071, "reward_meter_count_clean_mean": 0.5897529721260071, "reward_meter_count_clean_std": 0.3398434519767761, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
82
+ {"timestamp_utc": "2026-04-11T19:13:25Z", "mode": "train", "global_step": 81, "epoch": 0.003127896200185357, "loss": 0.0364, "grad_norm": 16.626371383666992, "learning_rate": 9.757575757575758e-06, "num_tokens": 168941.0, "completions/mean_length": 43.25, "completions/min_length": 36.0, "completions/max_length": 47.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 43.25, "completions/min_terminated_length": 36.0, "completions/max_terminated_length": 47.0, "rewards/meter_count_clean/mean": 0.32984912395477295, "rewards/meter_count_clean/std": 0.36702674627304077, "reward": 0.32984912395477295, "reward_std": 0.36702677607536316, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.18993134796619415, "sampling/sampling_logp_difference/max": 2.503852128982544, "sampling/importance_sampling_ratio/min": 0.08176939934492111, "sampling/importance_sampling_ratio/mean": 0.9905309081077576, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.9840531200170517, "entropy": 0.8857106305658817, "clip_ratio/low_mean": 0.13005737029016018, "clip_ratio/low_min": 0.13005737029016018, "clip_ratio/high_mean": 0.04627659637480974, "clip_ratio/high_max": 0.04627659637480974, "clip_ratio/region_mean": 0.17633396666496992, "reward_total_mean": 0.32984912395477295, "reward_meter_count_clean_mean": 0.32984912395477295, "reward_meter_count_clean_std": 0.36702674627304077, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
83
+ {"timestamp_utc": "2026-04-11T19:13:31Z", "mode": "train", "global_step": 82, "epoch": 0.003166512202656781, "loss": 0.0514, "grad_norm": 13.673704147338867, "learning_rate": 9.754545454545455e-06, "num_tokens": 170843.0, "completions/mean_length": 61.75, "completions/min_length": 52.0, "completions/max_length": 71.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 61.75, "completions/min_terminated_length": 52.0, "completions/max_terminated_length": 71.0, "rewards/meter_count_clean/mean": 0.6860445737838745, "rewards/meter_count_clean/std": 0.4127279818058014, "reward": 0.6860445737838745, "reward_std": 0.412727952003479, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19877685606479645, "sampling/sampling_logp_difference/max": 1.148824691772461, "sampling/importance_sampling_ratio/min": 0.3170091509819031, "sampling/importance_sampling_ratio/mean": 1.0191963911056519, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7028498351573944, "entropy": 2.6517202258110046, "clip_ratio/low_mean": 0.030181385576725006, "clip_ratio/low_min": 0.030181385576725006, "clip_ratio/high_mean": 0.18430931121110916, "clip_ratio/high_max": 0.18430931121110916, "clip_ratio/region_mean": 0.21449069678783417, "reward_total_mean": 0.6860445737838745, "reward_meter_count_clean_mean": 0.6860445737838745, "reward_meter_count_clean_std": 0.4127279818058014, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
84
+ {"timestamp_utc": "2026-04-11T19:13:39Z", "mode": "train", "global_step": 83, "epoch": 0.003205128205128205, "loss": -0.0638, "grad_norm": 6.804693222045898, "learning_rate": 9.751515151515152e-06, "num_tokens": 173838.0, "completions/mean_length": 193.375, "completions/min_length": 119.0, "completions/max_length": 274.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 193.375, "completions/min_terminated_length": 119.0, "completions/max_terminated_length": 274.0, "rewards/meter_count_clean/mean": 0.1995052993297577, "rewards/meter_count_clean/std": 0.20155183970928192, "reward": 0.1995052993297577, "reward_std": 0.20155183970928192, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2313334345817566, "sampling/sampling_logp_difference/max": 1.8036818504333496, "sampling/importance_sampling_ratio/min": 0.16469140350818634, "sampling/importance_sampling_ratio/mean": 1.0506726503372192, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.5210926830768585, "entropy": 3.7798104882240295, "clip_ratio/low_mean": 0.0888951700180769, "clip_ratio/low_min": 0.0888951700180769, "clip_ratio/high_mean": 0.1242379229515791, "clip_ratio/high_max": 0.1242379229515791, "clip_ratio/region_mean": 0.213133092969656, "reward_total_mean": 0.1995052993297577, "reward_meter_count_clean_mean": 0.1995052993297577, "reward_meter_count_clean_std": 0.20155183970928192, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
85
+ {"timestamp_utc": "2026-04-11T19:13:44Z", "mode": "train", "global_step": 84, "epoch": 0.003243744207599629, "loss": 0.1179, "grad_norm": 13.247566223144531, "learning_rate": 9.74848484848485e-06, "num_tokens": 175356.0, "completions/mean_length": 42.75, "completions/min_length": 29.0, "completions/max_length": 63.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 42.75, "completions/min_terminated_length": 29.0, "completions/max_terminated_length": 63.0, "rewards/meter_count_clean/mean": 0.2839965224266052, "rewards/meter_count_clean/std": 0.25705069303512573, "reward": 0.2839965224266052, "reward_std": 0.25705069303512573, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20496825873851776, "sampling/sampling_logp_difference/max": 1.4523324966430664, "sampling/importance_sampling_ratio/min": 0.2340237945318222, "sampling/importance_sampling_ratio/mean": 1.056425929069519, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7388735264539719, "entropy": 2.1612997502088547, "clip_ratio/low_mean": 0.14447841420769691, "clip_ratio/low_min": 0.14447841420769691, "clip_ratio/high_mean": 0.044027093797922134, "clip_ratio/high_max": 0.044027093797922134, "clip_ratio/region_mean": 0.18850550800561905, "reward_total_mean": 0.2839965224266052, "reward_meter_count_clean_mean": 0.2839965224266052, "reward_meter_count_clean_std": 0.25705069303512573, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
86
+ {"timestamp_utc": "2026-04-11T19:13:50Z", "mode": "train", "global_step": 85, "epoch": 0.0032823602100710537, "loss": 0.1389, "grad_norm": 8.849252700805664, "learning_rate": 9.745454545454547e-06, "num_tokens": 177342.0, "completions/mean_length": 94.25, "completions/min_length": 56.0, "completions/max_length": 145.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 94.25, "completions/min_terminated_length": 56.0, "completions/max_terminated_length": 145.0, "rewards/meter_count_clean/mean": 0.5080384612083435, "rewards/meter_count_clean/std": 0.5007855892181396, "reward": 0.5080384612083435, "reward_std": 0.5007855296134949, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20362532138824463, "sampling/sampling_logp_difference/max": 1.7173385620117188, "sampling/importance_sampling_ratio/min": 0.1795433610677719, "sampling/importance_sampling_ratio/mean": 1.0272923707962036, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4220501631498337, "entropy": 2.768698900938034, "clip_ratio/low_mean": 0.07980264164507389, "clip_ratio/low_min": 0.07980264164507389, "clip_ratio/high_mean": 0.11511588096618652, "clip_ratio/high_max": 0.11511588096618652, "clip_ratio/region_mean": 0.19491852261126041, "reward_total_mean": 0.5080384612083435, "reward_meter_count_clean_mean": 0.5080384612083435, "reward_meter_count_clean_std": 0.5007855892181396, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
87
+ {"timestamp_utc": "2026-04-11T19:13:58Z", "mode": "train", "global_step": 86, "epoch": 0.0033209762125424778, "loss": 0.1116, "grad_norm": 5.789488315582275, "learning_rate": 9.742424242424244e-06, "num_tokens": 181005.0, "completions/mean_length": 250.875, "completions/min_length": 138.0, "completions/max_length": 323.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 250.875, "completions/min_terminated_length": 138.0, "completions/max_terminated_length": 323.0, "rewards/meter_count_clean/mean": 0.2234499454498291, "rewards/meter_count_clean/std": 0.21007955074310303, "reward": 0.2234499454498291, "reward_std": 0.21007955074310303, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2088625580072403, "sampling/sampling_logp_difference/max": 1.6328611373901367, "sampling/importance_sampling_ratio/min": 0.19536978006362915, "sampling/importance_sampling_ratio/mean": 1.03697669506073, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.9879649505019188, "entropy": 3.1971679627895355, "clip_ratio/low_mean": 0.09196549654006958, "clip_ratio/low_min": 0.09196549654006958, "clip_ratio/high_mean": 0.09607911668717861, "clip_ratio/high_max": 0.09607911668717861, "clip_ratio/region_mean": 0.1880446132272482, "reward_total_mean": 0.2234499454498291, "reward_meter_count_clean_mean": 0.2234499454498291, "reward_meter_count_clean_std": 0.21007955074310303, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
88
+ {"timestamp_utc": "2026-04-11T19:14:03Z", "mode": "train", "global_step": 87, "epoch": 0.003359592215013902, "loss": 0.0123, "grad_norm": 18.224882125854492, "learning_rate": 9.739393939393941e-06, "num_tokens": 182602.0, "completions/mean_length": 45.625, "completions/min_length": 33.0, "completions/max_length": 55.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 45.625, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 55.0, "rewards/meter_count_clean/mean": 0.3372150957584381, "rewards/meter_count_clean/std": 0.4128407835960388, "reward": 0.3372150957584381, "reward_std": 0.41284075379371643, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21434830129146576, "sampling/sampling_logp_difference/max": 1.461564064025879, "sampling/importance_sampling_ratio/min": 0.23187333345413208, "sampling/importance_sampling_ratio/mean": 1.031768798828125, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.5478705316781998, "entropy": 2.4275936484336853, "clip_ratio/low_mean": 0.11843920033425093, "clip_ratio/low_min": 0.11843920033425093, "clip_ratio/high_mean": 0.05969125218689442, "clip_ratio/high_max": 0.05969125218689442, "clip_ratio/region_mean": 0.17813045252114534, "reward_total_mean": 0.3372150957584381, "reward_meter_count_clean_mean": 0.3372150957584381, "reward_meter_count_clean_std": 0.4128407835960388, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
89
+ {"timestamp_utc": "2026-04-11T19:14:09Z", "mode": "train", "global_step": 88, "epoch": 0.003398208217485326, "loss": 0.0868, "grad_norm": 7.239674091339111, "learning_rate": 9.736363636363637e-06, "num_tokens": 185727.0, "completions/mean_length": 179.625, "completions/min_length": 154.0, "completions/max_length": 203.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 179.625, "completions/min_terminated_length": 154.0, "completions/max_terminated_length": 203.0, "rewards/meter_count_clean/mean": 0.395793080329895, "rewards/meter_count_clean/std": 0.26035410165786743, "reward": 0.395793080329895, "reward_std": 0.26035410165786743, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19813987612724304, "sampling/sampling_logp_difference/max": 1.598048210144043, "sampling/importance_sampling_ratio/min": 0.20229095220565796, "sampling/importance_sampling_ratio/mean": 1.035853385925293, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.2833295166492462, "entropy": 2.465492472052574, "clip_ratio/low_mean": 0.09458548203110695, "clip_ratio/low_min": 0.09458548203110695, "clip_ratio/high_mean": 0.10577460564672947, "clip_ratio/high_max": 0.10577460564672947, "clip_ratio/region_mean": 0.20036008767783642, "reward_total_mean": 0.395793080329895, "reward_meter_count_clean_mean": 0.395793080329895, "reward_meter_count_clean_std": 0.26035410165786743, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
90
+ {"timestamp_utc": "2026-04-11T19:14:15Z", "mode": "train", "global_step": 89, "epoch": 0.00343682421995675, "loss": 0.2009, "grad_norm": 10.298208236694336, "learning_rate": 9.733333333333334e-06, "num_tokens": 187879.0, "completions/mean_length": 98.0, "completions/min_length": 65.0, "completions/max_length": 134.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 98.0, "completions/min_terminated_length": 65.0, "completions/max_terminated_length": 134.0, "rewards/meter_count_clean/mean": 0.6336480379104614, "rewards/meter_count_clean/std": 0.3132372200489044, "reward": 0.6336480379104614, "reward_std": 0.31323719024658203, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20529155433177948, "sampling/sampling_logp_difference/max": 1.6686315536499023, "sampling/importance_sampling_ratio/min": 0.18850484490394592, "sampling/importance_sampling_ratio/mean": 1.050196647644043, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4282651841640472, "entropy": 2.9546821117401123, "clip_ratio/low_mean": 0.07508473843336105, "clip_ratio/low_min": 0.07508473843336105, "clip_ratio/high_mean": 0.12049593590199947, "clip_ratio/high_max": 0.12049593590199947, "clip_ratio/region_mean": 0.19558067433536053, "reward_total_mean": 0.6336480379104614, "reward_meter_count_clean_mean": 0.6336480379104614, "reward_meter_count_clean_std": 0.3132372200489044, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
91
+ {"timestamp_utc": "2026-04-11T19:14:20Z", "mode": "train", "global_step": 90, "epoch": 0.003475440222428174, "loss": 0.004, "grad_norm": 14.30321216583252, "learning_rate": 9.730303030303031e-06, "num_tokens": 189559.0, "completions/mean_length": 48.0, "completions/min_length": 33.0, "completions/max_length": 64.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 48.0, "completions/min_terminated_length": 33.0, "completions/max_terminated_length": 64.0, "rewards/meter_count_clean/mean": 0.6122487187385559, "rewards/meter_count_clean/std": 0.3927072584629059, "reward": 0.6122487187385559, "reward_std": 0.3927072584629059, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20874950289726257, "sampling/sampling_logp_difference/max": 1.4292075634002686, "sampling/importance_sampling_ratio/min": 0.23949864506721497, "sampling/importance_sampling_ratio/mean": 1.037034511566162, "sampling/importance_sampling_ratio/max": 1.9598164558410645, "kl": 1.7402038723230362, "entropy": 2.464683413505554, "clip_ratio/low_mean": 0.04326923098415136, "clip_ratio/low_min": 0.04326923098415136, "clip_ratio/high_mean": 0.1241906825453043, "clip_ratio/high_max": 0.1241906825453043, "clip_ratio/region_mean": 0.16745991352945566, "reward_total_mean": 0.6122487187385559, "reward_meter_count_clean_mean": 0.6122487187385559, "reward_meter_count_clean_std": 0.3927072584629059, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
92
+ {"timestamp_utc": "2026-04-11T19:14:27Z", "mode": "train", "global_step": 91, "epoch": 0.0035140562248995983, "loss": 0.0544, "grad_norm": 6.84236478805542, "learning_rate": 9.727272727272728e-06, "num_tokens": 192563.0, "completions/mean_length": 174.5, "completions/min_length": 121.0, "completions/max_length": 262.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 174.5, "completions/min_terminated_length": 121.0, "completions/max_terminated_length": 262.0, "rewards/meter_count_clean/mean": 0.603217363357544, "rewards/meter_count_clean/std": 0.20575962960720062, "reward": 0.603217363357544, "reward_std": 0.20575964450836182, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21506188809871674, "sampling/sampling_logp_difference/max": 1.3115177154541016, "sampling/importance_sampling_ratio/min": 0.2694108486175537, "sampling/importance_sampling_ratio/mean": 1.0390546321868896, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.1118000373244286, "entropy": 3.7719518542289734, "clip_ratio/low_mean": 0.10331365652382374, "clip_ratio/low_min": 0.10331365652382374, "clip_ratio/high_mean": 0.07963917776942253, "clip_ratio/high_max": 0.07963917776942253, "clip_ratio/region_mean": 0.18295283429324627, "reward_total_mean": 0.603217363357544, "reward_meter_count_clean_mean": 0.603217363357544, "reward_meter_count_clean_std": 0.20575962960720062, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
93
+ {"timestamp_utc": "2026-04-11T19:14:32Z", "mode": "train", "global_step": 92, "epoch": 0.0035526722273710224, "loss": 0.1559, "grad_norm": 16.550352096557617, "learning_rate": 9.724242424242426e-06, "num_tokens": 194233.0, "completions/mean_length": 57.75, "completions/min_length": 34.0, "completions/max_length": 92.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 57.75, "completions/min_terminated_length": 34.0, "completions/max_terminated_length": 92.0, "rewards/meter_count_clean/mean": 0.41938501596450806, "rewards/meter_count_clean/std": 0.4255473017692566, "reward": 0.41938501596450806, "reward_std": 0.4255472719669342, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2523427903652191, "sampling/sampling_logp_difference/max": 1.8779749870300293, "sampling/importance_sampling_ratio/min": 0.15289942920207977, "sampling/importance_sampling_ratio/mean": 1.057788372039795, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.8161827772855759, "entropy": 2.162596195936203, "clip_ratio/low_mean": 0.1532957497984171, "clip_ratio/low_min": 0.1532957497984171, "clip_ratio/high_mean": 0.07082884386181831, "clip_ratio/high_max": 0.07082884386181831, "clip_ratio/region_mean": 0.2241245936602354, "reward_total_mean": 0.41938501596450806, "reward_meter_count_clean_mean": 0.41938501596450806, "reward_meter_count_clean_std": 0.4255473017692566, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
94
+ {"timestamp_utc": "2026-04-11T19:14:41Z", "mode": "train", "global_step": 93, "epoch": 0.0035912882298424465, "loss": 0.0583, "grad_norm": 4.504820823669434, "learning_rate": 9.721212121212123e-06, "num_tokens": 198586.0, "completions/mean_length": 312.125, "completions/min_length": 268.0, "completions/max_length": 344.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 312.125, "completions/min_terminated_length": 268.0, "completions/max_terminated_length": 344.0, "rewards/meter_count_clean/mean": 0.1713373363018036, "rewards/meter_count_clean/std": 0.16757051646709442, "reward": 0.1713373363018036, "reward_std": 0.16757051646709442, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.20828686654567719, "sampling/sampling_logp_difference/max": 1.6375536918640137, "sampling/importance_sampling_ratio/min": 0.19445516169071198, "sampling/importance_sampling_ratio/mean": 1.0430541038513184, "sampling/importance_sampling_ratio/max": 2.0, "kl": 0.8844465836882591, "entropy": 3.2054056525230408, "clip_ratio/low_mean": 0.07444824650883675, "clip_ratio/low_min": 0.07444824650883675, "clip_ratio/high_mean": 0.0941340047866106, "clip_ratio/high_max": 0.0941340047866106, "clip_ratio/region_mean": 0.16858225129544735, "reward_total_mean": 0.1713373363018036, "reward_meter_count_clean_mean": 0.1713373363018036, "reward_meter_count_clean_std": 0.16757051646709442, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
95
+ {"timestamp_utc": "2026-04-11T19:14:47Z", "mode": "train", "global_step": 94, "epoch": 0.003629904232313871, "loss": -0.0215, "grad_norm": 28.54985809326172, "learning_rate": 9.718181818181818e-06, "num_tokens": 200807.0, "completions/mean_length": 111.625, "completions/min_length": 66.0, "completions/max_length": 161.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 111.625, "completions/min_terminated_length": 66.0, "completions/max_terminated_length": 161.0, "rewards/meter_count_clean/mean": 0.37486404180526733, "rewards/meter_count_clean/std": 0.35751017928123474, "reward": 0.37486404180526733, "reward_std": 0.35751017928123474, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2191978245973587, "sampling/sampling_logp_difference/max": 1.4935674667358398, "sampling/importance_sampling_ratio/min": 0.22457008063793182, "sampling/importance_sampling_ratio/mean": 1.0396778583526611, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.49543359875679, "entropy": 3.28840634226799, "clip_ratio/low_mean": 0.07458104286342859, "clip_ratio/low_min": 0.07458104286342859, "clip_ratio/high_mean": 0.10831169039011002, "clip_ratio/high_max": 0.10831169039011002, "clip_ratio/region_mean": 0.1828927332535386, "reward_total_mean": 0.37486404180526733, "reward_meter_count_clean_mean": 0.37486404180526733, "reward_meter_count_clean_std": 0.35751017928123474, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
96
+ {"timestamp_utc": "2026-04-11T19:14:52Z", "mode": "train", "global_step": 95, "epoch": 0.003668520234785295, "loss": 0.165, "grad_norm": 99.09620666503906, "learning_rate": 9.715151515151516e-06, "num_tokens": 202121.0, "completions/mean_length": 18.25, "completions/min_length": 17.0, "completions/max_length": 21.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.25, "completions/min_terminated_length": 17.0, "completions/max_terminated_length": 21.0, "rewards/meter_count_clean/mean": 0.9949363470077515, "rewards/meter_count_clean/std": 0.0023499121889472008, "reward": 0.9949363470077515, "reward_std": 0.002349905204027891, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.04272828623652458, "sampling/sampling_logp_difference/max": 0.8389389514923096, "sampling/importance_sampling_ratio/min": 0.5384017825126648, "sampling/importance_sampling_ratio/mean": 1.008124828338623, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.8823280930519104, "entropy": 0.21160555630922318, "clip_ratio/low_mean": 0.011904762126505375, "clip_ratio/low_min": 0.011904762126505375, "clip_ratio/high_mean": 0.03553921589627862, "clip_ratio/high_max": 0.03553921589627862, "clip_ratio/region_mean": 0.047443978022783995, "reward_total_mean": 0.9949363470077515, "reward_meter_count_clean_mean": 0.9949363470077515, "reward_meter_count_clean_std": 0.0023499121889472008, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
97
+ {"timestamp_utc": "2026-04-11T19:14:57Z", "mode": "train", "global_step": 96, "epoch": 0.0037071362372567192, "loss": 0.1136, "grad_norm": 8.879090309143066, "learning_rate": 9.712121212121213e-06, "num_tokens": 204182.0, "completions/mean_length": 98.625, "completions/min_length": 70.0, "completions/max_length": 150.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 98.625, "completions/min_terminated_length": 70.0, "completions/max_terminated_length": 150.0, "rewards/meter_count_clean/mean": 0.3730781078338623, "rewards/meter_count_clean/std": 0.2818770110607147, "reward": 0.3730781078338623, "reward_std": 0.2818770110607147, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.2376914918422699, "sampling/sampling_logp_difference/max": 1.6928377151489258, "sampling/importance_sampling_ratio/min": 0.18399666249752045, "sampling/importance_sampling_ratio/mean": 1.0496914386749268, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.5962010324001312, "entropy": 3.8312510550022125, "clip_ratio/low_mean": 0.1025804141536355, "clip_ratio/low_min": 0.1025804141536355, "clip_ratio/high_mean": 0.08833824284374714, "clip_ratio/high_max": 0.08833824284374714, "clip_ratio/region_mean": 0.19091865699738264, "reward_total_mean": 0.3730781078338623, "reward_meter_count_clean_mean": 0.3730781078338623, "reward_meter_count_clean_std": 0.2818770110607147, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
98
+ {"timestamp_utc": "2026-04-11T19:15:03Z", "mode": "train", "global_step": 97, "epoch": 0.0037457522397281433, "loss": 0.1295, "grad_norm": 17.37091636657715, "learning_rate": 9.70909090909091e-06, "num_tokens": 206299.0, "completions/mean_length": 94.625, "completions/min_length": 60.0, "completions/max_length": 144.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 94.625, "completions/min_terminated_length": 60.0, "completions/max_terminated_length": 144.0, "rewards/meter_count_clean/mean": 0.38106584548950195, "rewards/meter_count_clean/std": 0.3250954747200012, "reward": 0.38106584548950195, "reward_std": 0.3250954747200012, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.21154189109802246, "sampling/sampling_logp_difference/max": 1.7208929061889648, "sampling/importance_sampling_ratio/min": 0.17890633642673492, "sampling/importance_sampling_ratio/mean": 1.0636258125305176, "sampling/importance_sampling_ratio/max": 2.0, "kl": 2.371904134750366, "entropy": 2.747112363576889, "clip_ratio/low_mean": 0.12544687278568745, "clip_ratio/low_min": 0.12544687278568745, "clip_ratio/high_mean": 0.06360195018351078, "clip_ratio/high_max": 0.06360195018351078, "clip_ratio/region_mean": 0.18904882296919823, "reward_total_mean": 0.38106584548950195, "reward_meter_count_clean_mean": 0.38106584548950195, "reward_meter_count_clean_std": 0.3250954747200012, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
99
+ {"timestamp_utc": "2026-04-11T19:15:09Z", "mode": "train", "global_step": 98, "epoch": 0.0037843682421995675, "loss": 0.0945, "grad_norm": 9.53862190246582, "learning_rate": 9.706060606060606e-06, "num_tokens": 208238.0, "completions/mean_length": 84.375, "completions/min_length": 47.0, "completions/max_length": 113.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 84.375, "completions/min_terminated_length": 47.0, "completions/max_terminated_length": 113.0, "rewards/meter_count_clean/mean": 0.3455486297607422, "rewards/meter_count_clean/std": 0.4493013024330139, "reward": 0.3455486297607422, "reward_std": 0.4493013024330139, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.22088798880577087, "sampling/sampling_logp_difference/max": 1.6038808822631836, "sampling/importance_sampling_ratio/min": 0.2011144906282425, "sampling/importance_sampling_ratio/mean": 1.053009271621704, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4000272899866104, "entropy": 3.3423550128936768, "clip_ratio/low_mean": 0.10318870935589075, "clip_ratio/low_min": 0.10318870935589075, "clip_ratio/high_mean": 0.08637197501957417, "clip_ratio/high_max": 0.08637197501957417, "clip_ratio/region_mean": 0.18956068437546492, "reward_total_mean": 0.3455486297607422, "reward_meter_count_clean_mean": 0.3455486297607422, "reward_meter_count_clean_std": 0.4493013024330139, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
100
+ {"timestamp_utc": "2026-04-11T19:15:14Z", "mode": "train", "global_step": 99, "epoch": 0.0038229842446709916, "loss": 0.0352, "grad_norm": 12.195768356323242, "learning_rate": 9.703030303030305e-06, "num_tokens": 210203.0, "completions/mean_length": 70.625, "completions/min_length": 52.0, "completions/max_length": 100.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 70.625, "completions/min_terminated_length": 52.0, "completions/max_terminated_length": 100.0, "rewards/meter_count_clean/mean": 0.18750828504562378, "rewards/meter_count_clean/std": 0.18086771667003632, "reward": 0.18750828504562378, "reward_std": 0.1808677315711975, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.19442309439182281, "sampling/sampling_logp_difference/max": 1.4180307388305664, "sampling/importance_sampling_ratio/min": 0.24219049513339996, "sampling/importance_sampling_ratio/mean": 1.0437159538269043, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.4821452051401138, "entropy": 2.1659949123859406, "clip_ratio/low_mean": 0.13323841989040375, "clip_ratio/low_min": 0.13323841989040375, "clip_ratio/high_mean": 0.03570783603936434, "clip_ratio/high_max": 0.03570783603936434, "clip_ratio/region_mean": 0.16894625592976809, "reward_total_mean": 0.18750828504562378, "reward_meter_count_clean_mean": 0.18750828504562378, "reward_meter_count_clean_std": 0.18086771667003632, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
101
+ {"timestamp_utc": "2026-04-11T19:15:19Z", "mode": "train", "global_step": 100, "epoch": 0.0038616002471424157, "loss": 0.0128, "grad_norm": 14.672683715820312, "learning_rate": 9.7e-06, "num_tokens": 211811.0, "completions/mean_length": 45.0, "completions/min_length": 29.0, "completions/max_length": 58.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 45.0, "completions/min_terminated_length": 29.0, "completions/max_terminated_length": 58.0, "rewards/meter_count_clean/mean": 0.10674573481082916, "rewards/meter_count_clean/std": 0.08364997804164886, "reward": 0.10674573481082916, "reward_std": 0.08364997804164886, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 0.18656399846076965, "sampling/sampling_logp_difference/max": 1.3434593677520752, "sampling/importance_sampling_ratio/min": 0.26094141602516174, "sampling/importance_sampling_ratio/mean": 1.036841869354248, "sampling/importance_sampling_ratio/max": 2.0, "kl": 1.7658196240663528, "entropy": 2.016321301460266, "clip_ratio/low_mean": 0.08866940345615149, "clip_ratio/low_min": 0.08866940345615149, "clip_ratio/high_mean": 0.08358811773359776, "clip_ratio/high_max": 0.08358811773359776, "clip_ratio/region_mean": 0.17225752118974924, "reward_total_mean": 0.10674573481082916, "reward_meter_count_clean_mean": 0.10674573481082916, "reward_meter_count_clean_std": 0.08364997804164886, "run_id": "shaer_grpo_20260411_190212", "run_sequence_index": 0}
plots/kl_chain.png CHANGED

Git LFS Details

  • SHA256: 663569993e44212edaaf4660e4bfbb88720477150102491d87778f8841fe2795
  • Pointer size: 130 Bytes
  • Size of remote file: 86.9 kB

Git LFS Details

  • SHA256: 6ed17b94815f6eba5e8230e929e18a1c40789ca4e30bc98a593b091dbe03e57a
  • Pointer size: 131 Bytes
  • Size of remote file: 111 kB
plots/kl_run.png CHANGED

Git LFS Details

  • SHA256: e080c987bf3779ebf7dd2d10813eb4036e91cfa5453d0439dcf4cc5fca01cc33
  • Pointer size: 130 Bytes
  • Size of remote file: 86.3 kB

Git LFS Details

  • SHA256: f77bf3577b009bcc5960221787a35ce3eb4370b82b35b53b26529a02a252b924
  • Pointer size: 131 Bytes
  • Size of remote file: 111 kB
plots/reward_panels_eval_chain.png CHANGED

Git LFS Details

  • SHA256: a048e52986adeeed5485a5844c6f928e03b21468f3cb5238cd37c3f488f5e4f7
  • Pointer size: 130 Bytes
  • Size of remote file: 82.9 kB

Git LFS Details

  • SHA256: d1cbc94f4f1fef186744eb9bd83da65c0caaaee0a9160fad61d7abafe58e6492
  • Pointer size: 130 Bytes
  • Size of remote file: 73.2 kB
plots/reward_panels_eval_run.png CHANGED

Git LFS Details

  • SHA256: 2cf732ac79cb55e36fb260ac424e24c43cb487dc78bfccaa4de753cb4ed48a54
  • Pointer size: 130 Bytes
  • Size of remote file: 82.3 kB

Git LFS Details

  • SHA256: 64ed4b003bfd01b1d23c280e9e6b9596b5850cad9811146ca27772a007c10f9f
  • Pointer size: 130 Bytes
  • Size of remote file: 72.6 kB
plots/reward_panels_train_chain.png CHANGED

Git LFS Details

  • SHA256: 02abc6b41fa9afe0b31c6b355caf0e377e442ae7714292696b1217293a9de1ef
  • Pointer size: 131 Bytes
  • Size of remote file: 295 kB

Git LFS Details

  • SHA256: bacbb76ed876de655ce0da67cdf93bfad70c03743d770c194d6b615de3a88fa2
  • Pointer size: 131 Bytes
  • Size of remote file: 553 kB
plots/reward_panels_train_run.png CHANGED

Git LFS Details

  • SHA256: e189f7dabf8c536781dbcc16698d6908a10e5e176ef144eb40a3299de3681843
  • Pointer size: 131 Bytes
  • Size of remote file: 294 kB

Git LFS Details

  • SHA256: 4100656badb1664104167089fa7ca70022af3923081db3c26f0a09c0b79a8ae1
  • Pointer size: 131 Bytes
  • Size of remote file: 553 kB
plotter.pid CHANGED
@@ -1 +1 @@
1
- 23120
 
1
+ 23560
reward_meter_count_clean_debug.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
train.log CHANGED
@@ -61,3 +61,54 @@
61
  2026-04-11 19:08:52,995 | INFO | train_grpo_train | metrics_logged mode=train step=50
62
  2026-04-11 19:10:17,089 | INFO | train_grpo_train | metrics_logged mode=eval step=50
63
  2026-04-11 19:10:23,904 | INFO | train_grpo_train | metrics_logged mode=train step=51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  2026-04-11 19:08:52,995 | INFO | train_grpo_train | metrics_logged mode=train step=50
62
  2026-04-11 19:10:17,089 | INFO | train_grpo_train | metrics_logged mode=eval step=50
63
  2026-04-11 19:10:23,904 | INFO | train_grpo_train | metrics_logged mode=train step=51
64
+ 2026-04-11 19:10:29,199 | INFO | train_grpo_train | metrics_logged mode=train step=52
65
+ 2026-04-11 19:10:34,105 | INFO | train_grpo_train | metrics_logged mode=train step=53
66
+ 2026-04-11 19:10:39,672 | INFO | train_grpo_train | metrics_logged mode=train step=54
67
+ 2026-04-11 19:10:44,657 | INFO | train_grpo_train | metrics_logged mode=train step=55
68
+ 2026-04-11 19:10:49,475 | INFO | train_grpo_train | metrics_logged mode=train step=56
69
+ 2026-04-11 19:10:57,882 | INFO | train_grpo_train | metrics_logged mode=train step=57
70
+ 2026-04-11 19:11:02,879 | INFO | train_grpo_train | metrics_logged mode=train step=58
71
+ 2026-04-11 19:11:09,833 | INFO | train_grpo_train | metrics_logged mode=train step=59
72
+ 2026-04-11 19:11:15,428 | INFO | train_grpo_train | metrics_logged mode=train step=60
73
+ 2026-04-11 19:11:20,440 | INFO | train_grpo_train | metrics_logged mode=train step=61
74
+ 2026-04-11 19:11:26,714 | INFO | train_grpo_train | metrics_logged mode=train step=62
75
+ 2026-04-11 19:11:32,043 | INFO | train_grpo_train | metrics_logged mode=train step=63
76
+ 2026-04-11 19:11:42,133 | INFO | train_grpo_train | metrics_logged mode=train step=64
77
+ 2026-04-11 19:11:46,918 | INFO | train_grpo_train | metrics_logged mode=train step=65
78
+ 2026-04-11 19:11:52,158 | INFO | train_grpo_train | metrics_logged mode=train step=66
79
+ 2026-04-11 19:11:56,963 | INFO | train_grpo_train | metrics_logged mode=train step=67
80
+ 2026-04-11 19:12:01,791 | INFO | train_grpo_train | metrics_logged mode=train step=68
81
+ 2026-04-11 19:12:07,990 | INFO | train_grpo_train | metrics_logged mode=train step=69
82
+ 2026-04-11 19:12:14,515 | INFO | train_grpo_train | metrics_logged mode=train step=70
83
+ 2026-04-11 19:12:19,990 | INFO | train_grpo_train | metrics_logged mode=train step=71
84
+ 2026-04-11 19:12:26,357 | INFO | train_grpo_train | metrics_logged mode=train step=72
85
+ 2026-04-11 19:12:31,521 | INFO | train_grpo_train | metrics_logged mode=train step=73
86
+ 2026-04-11 19:12:42,519 | INFO | train_grpo_train | metrics_logged mode=train step=74
87
+ 2026-04-11 19:12:48,566 | INFO | train_grpo_train | metrics_logged mode=train step=75
88
+ 2026-04-11 19:12:53,758 | INFO | train_grpo_train | metrics_logged mode=train step=76
89
+ 2026-04-11 19:12:59,034 | INFO | train_grpo_train | metrics_logged mode=train step=77
90
+ 2026-04-11 19:13:05,682 | INFO | train_grpo_train | metrics_logged mode=train step=78
91
+ 2026-04-11 19:13:15,357 | INFO | train_grpo_train | metrics_logged mode=train step=79
92
+ 2026-04-11 19:13:20,877 | INFO | train_grpo_train | metrics_logged mode=train step=80
93
+ 2026-04-11 19:13:25,820 | INFO | train_grpo_train | metrics_logged mode=train step=81
94
+ 2026-04-11 19:13:31,640 | INFO | train_grpo_train | metrics_logged mode=train step=82
95
+ 2026-04-11 19:13:39,673 | INFO | train_grpo_train | metrics_logged mode=train step=83
96
+ 2026-04-11 19:13:44,597 | INFO | train_grpo_train | metrics_logged mode=train step=84
97
+ 2026-04-11 19:13:50,338 | INFO | train_grpo_train | metrics_logged mode=train step=85
98
+ 2026-04-11 19:13:58,375 | INFO | train_grpo_train | metrics_logged mode=train step=86
99
+ 2026-04-11 19:14:03,092 | INFO | train_grpo_train | metrics_logged mode=train step=87
100
+ 2026-04-11 19:14:09,745 | INFO | train_grpo_train | metrics_logged mode=train step=88
101
+ 2026-04-11 19:14:15,595 | INFO | train_grpo_train | metrics_logged mode=train step=89
102
+ 2026-04-11 19:14:20,429 | INFO | train_grpo_train | metrics_logged mode=train step=90
103
+ 2026-04-11 19:14:27,719 | INFO | train_grpo_train | metrics_logged mode=train step=91
104
+ 2026-04-11 19:14:32,975 | INFO | train_grpo_train | metrics_logged mode=train step=92
105
+ 2026-04-11 19:14:41,404 | INFO | train_grpo_train | metrics_logged mode=train step=93
106
+ 2026-04-11 19:14:47,558 | INFO | train_grpo_train | metrics_logged mode=train step=94
107
+ 2026-04-11 19:14:52,014 | INFO | train_grpo_train | metrics_logged mode=train step=95
108
+ 2026-04-11 19:14:57,807 | INFO | train_grpo_train | metrics_logged mode=train step=96
109
+ 2026-04-11 19:15:03,591 | INFO | train_grpo_train | metrics_logged mode=train step=97
110
+ 2026-04-11 19:15:09,523 | INFO | train_grpo_train | metrics_logged mode=train step=98
111
+ 2026-04-11 19:15:14,869 | INFO | train_grpo_train | metrics_logged mode=train step=99
112
+ 2026-04-11 19:15:19,694 | INFO | train_grpo_train | metrics_logged mode=train step=100
113
+ 2026-04-11 19:16:42,117 | INFO | train_grpo_train | metrics_logged mode=eval step=100
114
+ 2026-04-11 19:16:49,358 | INFO | train_grpo_train | metrics_logged mode=train step=101
train_stdout.log CHANGED
@@ -232,3 +232,175 @@ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and g
232
 
233
  2%|▏ | 51/3300 [06:03<28:08:33, 31.18s/it]2026-04-11 19:10:23,904 | INFO | train_grpo_train | metrics_logged mode=train step=51
234
 
235
 
236
 
237
  2%|▏ | 51/3300 [06:03<28:08:33, 31.18s/it]INFO 04-11 19:10:24 [block_pool.py:378] Successfully reset prefix cache
 
238
  2%|▏ | 52/3300 [06:08<21:07:35, 23.42s/it]2026-04-11 19:10:29,199 | INFO | train_grpo_train | metrics_logged mode=train step=52
 
239
 
 
240
  2%|▏ | 52/3300 [06:08<21:07:35, 23.42s/it]INFO 04-11 19:10:29 [block_pool.py:378] Successfully reset prefix cache
 
241
  2%|▏ | 53/3300 [06:13<16:06:42, 17.86s/it]2026-04-11 19:10:34,105 | INFO | train_grpo_train | metrics_logged mode=train step=53
 
242
 
 
243
  2%|▏ | 53/3300 [06:13<16:06:42, 17.86s/it]INFO 04-11 19:10:34 [block_pool.py:378] Successfully reset prefix cache
 
244
  2%|▏ | 54/3300 [06:19<12:46:49, 14.17s/it]2026-04-11 19:10:39,672 | INFO | train_grpo_train | metrics_logged mode=train step=54
 
245
 
 
246
  2%|▏ | 54/3300 [06:19<12:46:49, 14.17s/it]INFO 04-11 19:10:39 [block_pool.py:378] Successfully reset prefix cache
 
247
  2%|▏ | 55/3300 [06:24<10:17:20, 11.41s/it]2026-04-11 19:10:44,657 | INFO | train_grpo_train | metrics_logged mode=train step=55
 
248
 
 
249
  2%|▏ | 55/3300 [06:24<10:17:20, 11.41s/it]INFO 04-11 19:10:44 [block_pool.py:378] Successfully reset prefix cache
 
250
  2%|▏ | 56/3300 [06:28<8:30:19, 9.44s/it] 2026-04-11 19:10:49,475 | INFO | train_grpo_train | metrics_logged mode=train step=56
 
251
 
 
252
  2%|▏ | 56/3300 [06:28<8:30:19, 9.44s/it]INFO 04-11 19:10:49 [block_pool.py:378] Successfully reset prefix cache
 
253
  2%|▏ | 57/3300 [06:37<8:13:24, 9.13s/it]2026-04-11 19:10:57,882 | INFO | train_grpo_train | metrics_logged mode=train step=57
 
254
 
 
255
  2%|▏ | 57/3300 [06:37<8:13:24, 9.13s/it]INFO 04-11 19:10:58 [block_pool.py:378] Successfully reset prefix cache
 
256
  2%|▏ | 58/3300 [06:42<7:06:17, 7.89s/it]2026-04-11 19:11:02,879 | INFO | train_grpo_train | metrics_logged mode=train step=58
 
257
 
 
258
  2%|▏ | 58/3300 [06:42<7:06:17, 7.89s/it]INFO 04-11 19:11:03 [block_pool.py:378] Successfully reset prefix cache
 
259
  2%|▏ | 59/3300 [06:49<6:51:00, 7.61s/it]2026-04-11 19:11:09,833 | INFO | train_grpo_train | metrics_logged mode=train step=59
 
260
 
 
261
  2%|▏ | 59/3300 [06:49<6:51:00, 7.61s/it]INFO 04-11 19:11:10 [block_pool.py:378] Successfully reset prefix cache
 
262
  2%|▏ | 60/3300 [06:54<6:18:14, 7.00s/it]2026-04-11 19:11:15,428 | INFO | train_grpo_train | metrics_logged mode=train step=60
 
263
 
 
264
  2%|▏ | 60/3300 [06:54<6:18:14, 7.00s/it]INFO 04-11 19:11:15 [block_pool.py:378] Successfully reset prefix cache
 
265
  2%|▏ | 61/3300 [06:59<5:45:50, 6.41s/it]2026-04-11 19:11:20,440 | INFO | train_grpo_train | metrics_logged mode=train step=61
 
266
 
 
267
  2%|▏ | 61/3300 [06:59<5:45:50, 6.41s/it]INFO 04-11 19:11:20 [block_pool.py:378] Successfully reset prefix cache
 
268
  2%|▏ | 62/3300 [07:06<5:43:37, 6.37s/it]2026-04-11 19:11:26,714 | INFO | train_grpo_train | metrics_logged mode=train step=62
 
269
 
 
270
  2%|▏ | 62/3300 [07:06<5:43:37, 6.37s/it]INFO 04-11 19:11:26 [block_pool.py:378] Successfully reset prefix cache
 
271
  2%|▏ | 63/3300 [07:11<5:26:41, 6.06s/it]2026-04-11 19:11:32,043 | INFO | train_grpo_train | metrics_logged mode=train step=63
 
272
 
 
273
  2%|▏ | 63/3300 [07:11<5:26:41, 6.06s/it]INFO 04-11 19:11:32 [block_pool.py:378] Successfully reset prefix cache
 
274
  2%|▏ | 64/3300 [07:21<6:31:43, 7.26s/it]2026-04-11 19:11:42,133 | INFO | train_grpo_train | metrics_logged mode=train step=64
 
275
 
 
276
  2%|▏ | 64/3300 [07:21<6:31:43, 7.26s/it]INFO 04-11 19:11:42 [block_pool.py:378] Successfully reset prefix cache
 
277
  2%|▏ | 65/3300 [07:26<5:51:28, 6.52s/it]2026-04-11 19:11:46,918 | INFO | train_grpo_train | metrics_logged mode=train step=65
 
278
 
 
279
  2%|▏ | 65/3300 [07:26<5:51:28, 6.52s/it]INFO 04-11 19:11:47 [block_pool.py:378] Successfully reset prefix cache
 
280
  2%|▏ | 66/3300 [07:31<5:30:46, 6.14s/it]2026-04-11 19:11:52,158 | INFO | train_grpo_train | metrics_logged mode=train step=66
 
281
 
 
282
  2%|▏ | 66/3300 [07:31<5:30:46, 6.14s/it]INFO 04-11 19:11:52 [block_pool.py:378] Successfully reset prefix cache
 
283
  2%|▏ | 67/3300 [07:36<5:09:13, 5.74s/it]2026-04-11 19:11:56,963 | INFO | train_grpo_train | metrics_logged mode=train step=67
 
284
 
 
285
  2%|▏ | 67/3300 [07:36<5:09:13, 5.74s/it]INFO 04-11 19:11:57 [block_pool.py:378] Successfully reset prefix cache
 
286
  2%|▏ | 68/3300 [07:41<4:54:23, 5.47s/it]2026-04-11 19:12:01,791 | INFO | train_grpo_train | metrics_logged mode=train step=68
 
287
 
 
288
  2%|▏ | 68/3300 [07:41<4:54:23, 5.47s/it]INFO 04-11 19:12:02 [block_pool.py:378] Successfully reset prefix cache
 
289
  2%|▏ | 69/3300 [07:47<5:05:53, 5.68s/it]2026-04-11 19:12:07,990 | INFO | train_grpo_train | metrics_logged mode=train step=69
 
290
 
 
291
  2%|▏ | 69/3300 [07:47<5:05:53, 5.68s/it]INFO 04-11 19:12:08 [block_pool.py:378] Successfully reset prefix cache
 
292
  2%|▏ | 70/3300 [07:53<5:19:45, 5.94s/it]2026-04-11 19:12:14,515 | INFO | train_grpo_train | metrics_logged mode=train step=70
 
293
 
 
294
  2%|▏ | 70/3300 [07:53<5:19:45, 5.94s/it]INFO 04-11 19:12:14 [block_pool.py:378] Successfully reset prefix cache
 
295
  2%|▏ | 71/3300 [07:59<5:12:08, 5.80s/it]2026-04-11 19:12:19,990 | INFO | train_grpo_train | metrics_logged mode=train step=71
 
296
 
 
297
  2%|▏ | 71/3300 [07:59<5:12:08, 5.80s/it]INFO 04-11 19:12:20 [block_pool.py:378] Successfully reset prefix cache
 
298
  2%|▏ | 72/3300 [08:05<5:21:03, 5.97s/it]2026-04-11 19:12:26,357 | INFO | train_grpo_train | metrics_logged mode=train step=72
 
299
 
 
300
  2%|▏ | 72/3300 [08:05<5:21:03, 5.97s/it]INFO 04-11 19:12:26 [block_pool.py:378] Successfully reset prefix cache
 
301
  2%|▏ | 73/3300 [08:10<5:08:07, 5.73s/it]2026-04-11 19:12:31,521 | INFO | train_grpo_train | metrics_logged mode=train step=73
 
302
 
 
303
  2%|▏ | 73/3300 [08:10<5:08:07, 5.73s/it]INFO 04-11 19:12:31 [block_pool.py:378] Successfully reset prefix cache
 
304
  2%|▏ | 74/3300 [08:21<6:30:51, 7.27s/it]2026-04-11 19:12:42,519 | INFO | train_grpo_train | metrics_logged mode=train step=74
 
305
 
 
306
  2%|▏ | 74/3300 [08:21<6:30:51, 7.27s/it]INFO 04-11 19:12:42 [block_pool.py:378] Successfully reset prefix cache
 
307
  2%|▏ | 75/3300 [08:27<6:12:32, 6.93s/it]2026-04-11 19:12:48,566 | INFO | train_grpo_train | metrics_logged mode=train step=75
 
308
 
 
309
  2%|▏ | 75/3300 [08:27<6:12:32, 6.93s/it]INFO 04-11 19:12:48 [block_pool.py:378] Successfully reset prefix cache
 
310
  2%|▏ | 76/3300 [08:33<5:44:53, 6.42s/it]2026-04-11 19:12:53,758 | INFO | train_grpo_train | metrics_logged mode=train step=76
 
311
 
 
312
  2%|▏ | 76/3300 [08:33<5:44:53, 6.42s/it]INFO 04-11 19:12:54 [block_pool.py:378] Successfully reset prefix cache
 
313
  2%|▏ | 77/3300 [08:38<5:26:30, 6.08s/it]2026-04-11 19:12:59,034 | INFO | train_grpo_train | metrics_logged mode=train step=77
 
314
 
 
315
  2%|▏ | 77/3300 [08:38<5:26:30, 6.08s/it]INFO 04-11 19:12:59 [block_pool.py:378] Successfully reset prefix cache
 
316
  2%|▏ | 78/3300 [08:45<5:35:30, 6.25s/it]2026-04-11 19:13:05,682 | INFO | train_grpo_train | metrics_logged mode=train step=78
 
317
 
 
318
  2%|▏ | 78/3300 [08:45<5:35:30, 6.25s/it]INFO 04-11 19:13:05 [block_pool.py:378] Successfully reset prefix cache
 
319
  2%|▏ | 79/3300 [08:54<6:30:34, 7.28s/it]2026-04-11 19:13:15,357 | INFO | train_grpo_train | metrics_logged mode=train step=79
 
320
 
 
321
  2%|▏ | 79/3300 [08:54<6:30:34, 7.28s/it]INFO 04-11 19:13:15 [block_pool.py:378] Successfully reset prefix cache
 
322
  2%|▏ | 80/3300 [09:00<6:02:17, 6.75s/it]2026-04-11 19:13:20,877 | INFO | train_grpo_train | metrics_logged mode=train step=80
 
323
 
 
324
  2%|▏ | 80/3300 [09:00<6:02:17, 6.75s/it]INFO 04-11 19:13:21 [block_pool.py:378] Successfully reset prefix cache
 
325
  2%|▏ | 81/3300 [09:05<5:33:03, 6.21s/it]2026-04-11 19:13:25,820 | INFO | train_grpo_train | metrics_logged mode=train step=81
 
326
 
 
327
  2%|▏ | 81/3300 [09:05<5:33:03, 6.21s/it]INFO 04-11 19:13:26 [block_pool.py:378] Successfully reset prefix cache
 
328
  2%|▏ | 82/3300 [09:11<5:26:34, 6.09s/it]2026-04-11 19:13:31,640 | INFO | train_grpo_train | metrics_logged mode=train step=82
 
329
 
 
330
  2%|▏ | 82/3300 [09:11<5:26:34, 6.09s/it]INFO 04-11 19:13:31 [block_pool.py:378] Successfully reset prefix cache
 
331
  3%|β–Ž | 83/3300 [09:19<5:57:54, 6.68s/it]2026-04-11 19:13:39,673 | INFO | train_grpo_train | metrics_logged mode=train step=83
 
332
 
 
333
  3%|β–Ž | 83/3300 [09:19<5:57:54, 6.68s/it]INFO 04-11 19:13:39 [block_pool.py:378] Successfully reset prefix cache
 
334
  3%|β–Ž | 84/3300 [09:23<5:29:37, 6.15s/it]2026-04-11 19:13:44,597 | INFO | train_grpo_train | metrics_logged mode=train step=84
 
335
 
 
336
  3%|β–Ž | 84/3300 [09:23<5:29:37, 6.15s/it]INFO 04-11 19:13:44 [block_pool.py:378] Successfully reset prefix cache
 
337
  3%|β–Ž | 85/3300 [09:29<5:22:57, 6.03s/it]2026-04-11 19:13:50,338 | INFO | train_grpo_train | metrics_logged mode=train step=85
 
338
 
 
339
  3%|β–Ž | 85/3300 [09:29<5:22:57, 6.03s/it]INFO 04-11 19:13:50 [block_pool.py:378] Successfully reset prefix cache
 
340
  3%|β–Ž | 86/3300 [09:37<5:55:09, 6.63s/it]2026-04-11 19:13:58,375 | INFO | train_grpo_train | metrics_logged mode=train step=86
 
341
 
 
342
  3%|β–Ž | 86/3300 [09:37<5:55:09, 6.63s/it]INFO 04-11 19:13:58 [block_pool.py:378] Successfully reset prefix cache
 
343
  3%|β–Ž | 87/3300 [09:42<5:24:18, 6.06s/it]2026-04-11 19:14:03,092 | INFO | train_grpo_train | metrics_logged mode=train step=87
 
344
 
 
345
  3%|β–Ž | 87/3300 [09:42<5:24:18, 6.06s/it]INFO 04-11 19:14:03 [block_pool.py:378] Successfully reset prefix cache
 
346
  3%|β–Ž | 88/3300 [09:49<5:33:47, 6.24s/it]2026-04-11 19:14:09,745 | INFO | train_grpo_train | metrics_logged mode=train step=88
 
347
 
 
348
  3%|β–Ž | 88/3300 [09:49<5:33:47, 6.24s/it]INFO 04-11 19:14:10 [block_pool.py:378] Successfully reset prefix cache
 
349
  3%|β–Ž | 89/3300 [09:54<5:27:29, 6.12s/it]2026-04-11 19:14:15,595 | INFO | train_grpo_train | metrics_logged mode=train step=89
 
350
 
 
351
  3%|β–Ž | 89/3300 [09:54<5:27:29, 6.12s/it]INFO 04-11 19:14:15 [block_pool.py:378] Successfully reset prefix cache
 
352
  3%|β–Ž | 90/3300 [09:59<5:06:46, 5.73s/it]2026-04-11 19:14:20,429 | INFO | train_grpo_train | metrics_logged mode=train step=90
 
353
 
 
354
  3%|β–Ž | 90/3300 [09:59<5:06:46, 5.73s/it]INFO 04-11 19:14:20 [block_pool.py:378] Successfully reset prefix cache
 
355
  3%|β–Ž | 91/3300 [10:07<5:31:38, 6.20s/it]2026-04-11 19:14:27,719 | INFO | train_grpo_train | metrics_logged mode=train step=91
 
356
 
 
357
  3%|β–Ž | 91/3300 [10:07<5:31:38, 6.20s/it]INFO 04-11 19:14:27 [block_pool.py:378] Successfully reset prefix cache
 
358
  3%|β–Ž | 92/3300 [10:12<5:16:22, 5.92s/it]2026-04-11 19:14:32,975 | INFO | train_grpo_train | metrics_logged mode=train step=92
 
359
 
 
360
  3%|β–Ž | 92/3300 [10:12<5:16:22, 5.92s/it]INFO 04-11 19:14:33 [block_pool.py:378] Successfully reset prefix cache
 
361
  3%|β–Ž | 93/3300 [10:20<5:56:25, 6.67s/it]2026-04-11 19:14:41,404 | INFO | train_grpo_train | metrics_logged mode=train step=93
 
362
 
 
363
  3%|β–Ž | 93/3300 [10:20<5:56:25, 6.67s/it]INFO 04-11 19:14:41 [block_pool.py:378] Successfully reset prefix cache
 
364
  3%|β–Ž | 94/3300 [10:26<5:48:05, 6.51s/it]2026-04-11 19:14:47,558 | INFO | train_grpo_train | metrics_logged mode=train step=94
 
365
 
 
366
  3%|β–Ž | 94/3300 [10:26<5:48:05, 6.51s/it]INFO 04-11 19:14:47 [block_pool.py:378] Successfully reset prefix cache
 
367
  3%|β–Ž | 95/3300 [10:31<5:15:05, 5.90s/it]2026-04-11 19:14:52,014 | INFO | train_grpo_train | metrics_logged mode=train step=95
 
368
 
 
369
  3%|β–Ž | 95/3300 [10:31<5:15:05, 5.90s/it]INFO 04-11 19:14:52 [block_pool.py:378] Successfully reset prefix cache
 
370
  3%|β–Ž | 96/3300 [10:37<5:13:17, 5.87s/it]2026-04-11 19:14:57,807 | INFO | train_grpo_train | metrics_logged mode=train step=96
 
371
 
 
372
  3%|β–Ž | 96/3300 [10:37<5:13:17, 5.87s/it]INFO 04-11 19:14:58 [block_pool.py:378] Successfully reset prefix cache
 
373
  3%|β–Ž | 97/3300 [10:42<5:11:51, 5.84s/it]2026-04-11 19:15:03,591 | INFO | train_grpo_train | metrics_logged mode=train step=97
 
374
 
 
375
  3%|β–Ž | 97/3300 [10:42<5:11:51, 5.84s/it]INFO 04-11 19:15:03 [block_pool.py:378] Successfully reset prefix cache
 
376
  3%|β–Ž | 98/3300 [10:48<5:13:12, 5.87s/it]2026-04-11 19:15:09,523 | INFO | train_grpo_train | metrics_logged mode=train step=98
 
377
 
 
378
  3%|β–Ž | 98/3300 [10:48<5:13:12, 5.87s/it]INFO 04-11 19:15:09 [block_pool.py:378] Successfully reset prefix cache
 
379
  3%|β–Ž | 99/3300 [10:54<5:04:39, 5.71s/it]2026-04-11 19:15:14,869 | INFO | train_grpo_train | metrics_logged mode=train step=99
 
380
 
 
381
  3%|β–Ž | 99/3300 [10:54<5:04:39, 5.71s/it]INFO 04-11 19:15:15 [block_pool.py:378] Successfully reset prefix cache
 
382
  3%|β–Ž | 100/3300 [10:59<4:49:44, 5.43s/it]2026-04-11 19:15:19,694 | INFO | train_grpo_train | metrics_logged mode=train step=100
 
383
 
 
384
  3%|β–Ž | 100/3300 [10:59<4:49:44, 5.43s/it]INFO 04-11 19:15:19 [block_pool.py:378] Successfully reset prefix cache
 
 
 
 
385
  0%| | 0/13 [00:00<?, ?it/s]
 
386
  15%|β–ˆβ–Œ | 2/13 [00:05<00:29, 2.64s/it]
 
387
  23%|β–ˆβ–ˆβ–Ž | 3/13 [00:09<00:31, 3.12s/it]
 
388
  31%|β–ˆβ–ˆβ–ˆ | 4/13 [00:16<00:41, 4.57s/it]
 
389
  38%|β–ˆβ–ˆβ–ˆβ–Š | 5/13 [00:23<00:45, 5.63s/it]
 
390
  46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 6/13 [00:31<00:43, 6.19s/it]
 
391
  54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/13 [00:37<00:36, 6.14s/it]
 
392
  62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/13 [00:42<00:30, 6.02s/it]
 
393
  69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 9/13 [00:50<00:26, 6.59s/it]
 
394
  77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 10/13 [00:54<00:17, 5.82s/it]
 
395
  85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/13 [01:00<00:11, 5.86s/it]
 
396
  92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 12/13 [01:08<00:06, 6.42s/it]
 
 
397
 
 
398
 
 
399
  3%|β–Ž | 100/3300 [12:21<4:49:44, 5.43s/it]
 
 
400
  /root/workspace/Shaer/grpo/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
 
 
401
  3%|β–Ž | 101/3300 [12:28<27:17:36, 30.71s/it]2026-04-11 19:16:49,358 | INFO | train_grpo_train | metrics_logged mode=train step=101
 
402
 
 
403
  3%|β–Ž | 101/3300 [12:28<27:17:36, 30.71s/it]INFO 04-11 19:16:49 [block_pool.py:378] Successfully reset prefix cache
 
232
 
233
  2%|▏ | 51/3300 [06:03<28:08:33, 31.18s/it]2026-04-11 19:10:23,904 | INFO | train_grpo_train | metrics_logged mode=train step=51
234
 
235
 
236
 
237
  2%|▏ | 51/3300 [06:03<28:08:33, 31.18s/it]INFO 04-11 19:10:24 [block_pool.py:378] Successfully reset prefix cache
238
+
239
  2%|▏ | 52/3300 [06:08<21:07:35, 23.42s/it]2026-04-11 19:10:29,199 | INFO | train_grpo_train | metrics_logged mode=train step=52
240
+
241
 
242
+
243
  2%|▏ | 52/3300 [06:08<21:07:35, 23.42s/it]INFO 04-11 19:10:29 [block_pool.py:378] Successfully reset prefix cache
244
+
245
  2%|▏ | 53/3300 [06:13<16:06:42, 17.86s/it]2026-04-11 19:10:34,105 | INFO | train_grpo_train | metrics_logged mode=train step=53
246
+
247
 
248
+
249
  2%|▏ | 53/3300 [06:13<16:06:42, 17.86s/it]INFO 04-11 19:10:34 [block_pool.py:378] Successfully reset prefix cache
250
+
251
  2%|▏ | 54/3300 [06:19<12:46:49, 14.17s/it]2026-04-11 19:10:39,672 | INFO | train_grpo_train | metrics_logged mode=train step=54
252
+
253
 
254
+
255
  2%|▏ | 54/3300 [06:19<12:46:49, 14.17s/it]INFO 04-11 19:10:39 [block_pool.py:378] Successfully reset prefix cache
256
+
257
  2%|▏ | 55/3300 [06:24<10:17:20, 11.41s/it]2026-04-11 19:10:44,657 | INFO | train_grpo_train | metrics_logged mode=train step=55
258
+
259
 
260
+
261
  2%|▏ | 55/3300 [06:24<10:17:20, 11.41s/it]INFO 04-11 19:10:44 [block_pool.py:378] Successfully reset prefix cache
262
+
263
  2%|▏ | 56/3300 [06:28<8:30:19, 9.44s/it] 2026-04-11 19:10:49,475 | INFO | train_grpo_train | metrics_logged mode=train step=56
264
+
265
 
266
+
267
  2%|▏ | 56/3300 [06:28<8:30:19, 9.44s/it]INFO 04-11 19:10:49 [block_pool.py:378] Successfully reset prefix cache
268
+
269
  2%|▏ | 57/3300 [06:37<8:13:24, 9.13s/it]2026-04-11 19:10:57,882 | INFO | train_grpo_train | metrics_logged mode=train step=57
270
+
271
 
272
+
273
  2%|▏ | 57/3300 [06:37<8:13:24, 9.13s/it]INFO 04-11 19:10:58 [block_pool.py:378] Successfully reset prefix cache
274
+
275
  2%|▏ | 58/3300 [06:42<7:06:17, 7.89s/it]2026-04-11 19:11:02,879 | INFO | train_grpo_train | metrics_logged mode=train step=58
276
+
277
 
278
+
279
  2%|▏ | 58/3300 [06:42<7:06:17, 7.89s/it]INFO 04-11 19:11:03 [block_pool.py:378] Successfully reset prefix cache
280
+
281
  2%|▏ | 59/3300 [06:49<6:51:00, 7.61s/it]2026-04-11 19:11:09,833 | INFO | train_grpo_train | metrics_logged mode=train step=59
282
+
283
 
284
+
285
  2%|▏ | 59/3300 [06:49<6:51:00, 7.61s/it]INFO 04-11 19:11:10 [block_pool.py:378] Successfully reset prefix cache
286
+
287
  2%|▏ | 60/3300 [06:54<6:18:14, 7.00s/it]2026-04-11 19:11:15,428 | INFO | train_grpo_train | metrics_logged mode=train step=60
288
+
289
 
290
+
291
  2%|▏ | 60/3300 [06:54<6:18:14, 7.00s/it]INFO 04-11 19:11:15 [block_pool.py:378] Successfully reset prefix cache
292
+
293
  2%|▏ | 61/3300 [06:59<5:45:50, 6.41s/it]2026-04-11 19:11:20,440 | INFO | train_grpo_train | metrics_logged mode=train step=61
294
+
295
 
296
+
297
  2%|▏ | 61/3300 [06:59<5:45:50, 6.41s/it]INFO 04-11 19:11:20 [block_pool.py:378] Successfully reset prefix cache
298
+
299
  2%|▏ | 62/3300 [07:06<5:43:37, 6.37s/it]2026-04-11 19:11:26,714 | INFO | train_grpo_train | metrics_logged mode=train step=62
300
+
301
 
302
+
303
  2%|▏ | 62/3300 [07:06<5:43:37, 6.37s/it]INFO 04-11 19:11:26 [block_pool.py:378] Successfully reset prefix cache
304
+
305
  2%|▏ | 63/3300 [07:11<5:26:41, 6.06s/it]2026-04-11 19:11:32,043 | INFO | train_grpo_train | metrics_logged mode=train step=63
306
+
307
 
308
+
309
  2%|▏ | 63/3300 [07:11<5:26:41, 6.06s/it]INFO 04-11 19:11:32 [block_pool.py:378] Successfully reset prefix cache
310
+
311
  2%|▏ | 64/3300 [07:21<6:31:43, 7.26s/it]2026-04-11 19:11:42,133 | INFO | train_grpo_train | metrics_logged mode=train step=64
312
+
313
 
314
+
315
  2%|▏ | 64/3300 [07:21<6:31:43, 7.26s/it]INFO 04-11 19:11:42 [block_pool.py:378] Successfully reset prefix cache
316
+
317
  2%|▏ | 65/3300 [07:26<5:51:28, 6.52s/it]2026-04-11 19:11:46,918 | INFO | train_grpo_train | metrics_logged mode=train step=65
318
+
319
 
320
+
321
  2%|▏ | 65/3300 [07:26<5:51:28, 6.52s/it]INFO 04-11 19:11:47 [block_pool.py:378] Successfully reset prefix cache
322
+
323
  2%|▏ | 66/3300 [07:31<5:30:46, 6.14s/it]2026-04-11 19:11:52,158 | INFO | train_grpo_train | metrics_logged mode=train step=66
324
+
325
 
326
+
327
  2%|▏ | 66/3300 [07:31<5:30:46, 6.14s/it]INFO 04-11 19:11:52 [block_pool.py:378] Successfully reset prefix cache
328
+
329
  2%|▏ | 67/3300 [07:36<5:09:13, 5.74s/it]2026-04-11 19:11:56,963 | INFO | train_grpo_train | metrics_logged mode=train step=67
330
+
331
 
332
+
333
  2%|▏ | 67/3300 [07:36<5:09:13, 5.74s/it]INFO 04-11 19:11:57 [block_pool.py:378] Successfully reset prefix cache
334
+
335
  2%|▏ | 68/3300 [07:41<4:54:23, 5.47s/it]2026-04-11 19:12:01,791 | INFO | train_grpo_train | metrics_logged mode=train step=68
336
+
337
 
338
+
339
  2%|▏ | 68/3300 [07:41<4:54:23, 5.47s/it]INFO 04-11 19:12:02 [block_pool.py:378] Successfully reset prefix cache
340
+
341
  2%|▏ | 69/3300 [07:47<5:05:53, 5.68s/it]2026-04-11 19:12:07,990 | INFO | train_grpo_train | metrics_logged mode=train step=69
342
+
343
 
344
+
345
  2%|▏ | 69/3300 [07:47<5:05:53, 5.68s/it]INFO 04-11 19:12:08 [block_pool.py:378] Successfully reset prefix cache
346
+
347
  2%|▏ | 70/3300 [07:53<5:19:45, 5.94s/it]2026-04-11 19:12:14,515 | INFO | train_grpo_train | metrics_logged mode=train step=70
348
+
349
 
350
+
351
  2%|▏ | 70/3300 [07:53<5:19:45, 5.94s/it]INFO 04-11 19:12:14 [block_pool.py:378] Successfully reset prefix cache
352
+
353
  2%|▏ | 71/3300 [07:59<5:12:08, 5.80s/it]2026-04-11 19:12:19,990 | INFO | train_grpo_train | metrics_logged mode=train step=71
354
+
355
 
356
+
357
  2%|▏ | 71/3300 [07:59<5:12:08, 5.80s/it]INFO 04-11 19:12:20 [block_pool.py:378] Successfully reset prefix cache
358
+
359
  2%|▏ | 72/3300 [08:05<5:21:03, 5.97s/it]2026-04-11 19:12:26,357 | INFO | train_grpo_train | metrics_logged mode=train step=72
360
+
361
 
362
+
363
  2%|▏ | 72/3300 [08:05<5:21:03, 5.97s/it]INFO 04-11 19:12:26 [block_pool.py:378] Successfully reset prefix cache
364
+
365
  2%|▏ | 73/3300 [08:10<5:08:07, 5.73s/it]2026-04-11 19:12:31,521 | INFO | train_grpo_train | metrics_logged mode=train step=73
366
+
367
 
368
+
369
  2%|▏ | 73/3300 [08:10<5:08:07, 5.73s/it]INFO 04-11 19:12:31 [block_pool.py:378] Successfully reset prefix cache
370
+
371
  2%|▏ | 74/3300 [08:21<6:30:51, 7.27s/it]2026-04-11 19:12:42,519 | INFO | train_grpo_train | metrics_logged mode=train step=74
372
+
373
 
374
+
375
  2%|▏ | 74/3300 [08:21<6:30:51, 7.27s/it]INFO 04-11 19:12:42 [block_pool.py:378] Successfully reset prefix cache
376
+
377
  2%|▏ | 75/3300 [08:27<6:12:32, 6.93s/it]2026-04-11 19:12:48,566 | INFO | train_grpo_train | metrics_logged mode=train step=75
378
+
379
 
380
+
381
  2%|▏ | 75/3300 [08:27<6:12:32, 6.93s/it]INFO 04-11 19:12:48 [block_pool.py:378] Successfully reset prefix cache
382
+
383
  2%|▏ | 76/3300 [08:33<5:44:53, 6.42s/it]2026-04-11 19:12:53,758 | INFO | train_grpo_train | metrics_logged mode=train step=76
384
+
385
 
386
+
387
  2%|▏ | 76/3300 [08:33<5:44:53, 6.42s/it]INFO 04-11 19:12:54 [block_pool.py:378] Successfully reset prefix cache
388
+
389
  2%|▏ | 77/3300 [08:38<5:26:30, 6.08s/it]2026-04-11 19:12:59,034 | INFO | train_grpo_train | metrics_logged mode=train step=77
390
+
391
 
392
+
393
  2%|▏ | 77/3300 [08:38<5:26:30, 6.08s/it]INFO 04-11 19:12:59 [block_pool.py:378] Successfully reset prefix cache
394
+
395
  2%|▏ | 78/3300 [08:45<5:35:30, 6.25s/it]2026-04-11 19:13:05,682 | INFO | train_grpo_train | metrics_logged mode=train step=78
396
+
397
 
398
+
399
  2%|▏ | 78/3300 [08:45<5:35:30, 6.25s/it]INFO 04-11 19:13:05 [block_pool.py:378] Successfully reset prefix cache
400
+
401
  2%|▏ | 79/3300 [08:54<6:30:34, 7.28s/it]2026-04-11 19:13:15,357 | INFO | train_grpo_train | metrics_logged mode=train step=79
402
+
403
 
404
+
405
  2%|▏ | 79/3300 [08:54<6:30:34, 7.28s/it]INFO 04-11 19:13:15 [block_pool.py:378] Successfully reset prefix cache
406
+
407
  2%|▏ | 80/3300 [09:00<6:02:17, 6.75s/it]2026-04-11 19:13:20,877 | INFO | train_grpo_train | metrics_logged mode=train step=80
408
+
409
 
410
+
411
  2%|▏ | 80/3300 [09:00<6:02:17, 6.75s/it]INFO 04-11 19:13:21 [block_pool.py:378] Successfully reset prefix cache
412
+
413
  2%|▏ | 81/3300 [09:05<5:33:03, 6.21s/it]2026-04-11 19:13:25,820 | INFO | train_grpo_train | metrics_logged mode=train step=81
414
+
415
 
416
+
417
  2%|▏ | 81/3300 [09:05<5:33:03, 6.21s/it]INFO 04-11 19:13:26 [block_pool.py:378] Successfully reset prefix cache
418
+
419
  2%|▏ | 82/3300 [09:11<5:26:34, 6.09s/it]2026-04-11 19:13:31,640 | INFO | train_grpo_train | metrics_logged mode=train step=82
420
+
421
 
422
+
423
  2%|▏ | 82/3300 [09:11<5:26:34, 6.09s/it]INFO 04-11 19:13:31 [block_pool.py:378] Successfully reset prefix cache
424
+
425
  3%|β–Ž | 83/3300 [09:19<5:57:54, 6.68s/it]2026-04-11 19:13:39,673 | INFO | train_grpo_train | metrics_logged mode=train step=83
426
+
427
 
428
+
429
  3%|β–Ž | 83/3300 [09:19<5:57:54, 6.68s/it]INFO 04-11 19:13:39 [block_pool.py:378] Successfully reset prefix cache
430
+
431
  3%|β–Ž | 84/3300 [09:23<5:29:37, 6.15s/it]2026-04-11 19:13:44,597 | INFO | train_grpo_train | metrics_logged mode=train step=84
432
+
433
 
434
+
435
  3%|β–Ž | 84/3300 [09:23<5:29:37, 6.15s/it]INFO 04-11 19:13:44 [block_pool.py:378] Successfully reset prefix cache
436
+
437
  3%|β–Ž | 85/3300 [09:29<5:22:57, 6.03s/it]2026-04-11 19:13:50,338 | INFO | train_grpo_train | metrics_logged mode=train step=85
438
+
439
 
440
+
441
  3%|β–Ž | 85/3300 [09:29<5:22:57, 6.03s/it]INFO 04-11 19:13:50 [block_pool.py:378] Successfully reset prefix cache
442
+
443
  3%|β–Ž | 86/3300 [09:37<5:55:09, 6.63s/it]2026-04-11 19:13:58,375 | INFO | train_grpo_train | metrics_logged mode=train step=86
444
+
445
 
446
+
447
  3%|β–Ž | 86/3300 [09:37<5:55:09, 6.63s/it]INFO 04-11 19:13:58 [block_pool.py:378] Successfully reset prefix cache
448
+
449
  3%|β–Ž | 87/3300 [09:42<5:24:18, 6.06s/it]2026-04-11 19:14:03,092 | INFO | train_grpo_train | metrics_logged mode=train step=87
450
+
451
 
452
+
453
  3%|β–Ž | 87/3300 [09:42<5:24:18, 6.06s/it]INFO 04-11 19:14:03 [block_pool.py:378] Successfully reset prefix cache
454
+
455
  3%|β–Ž | 88/3300 [09:49<5:33:47, 6.24s/it]2026-04-11 19:14:09,745 | INFO | train_grpo_train | metrics_logged mode=train step=88
456
+
457
 
458
+
459
  3%|β–Ž | 88/3300 [09:49<5:33:47, 6.24s/it]INFO 04-11 19:14:10 [block_pool.py:378] Successfully reset prefix cache
460
+
461
  3%|β–Ž | 89/3300 [09:54<5:27:29, 6.12s/it]2026-04-11 19:14:15,595 | INFO | train_grpo_train | metrics_logged mode=train step=89
462
+
463
 
464
+
465
  3%|β–Ž | 89/3300 [09:54<5:27:29, 6.12s/it]INFO 04-11 19:14:15 [block_pool.py:378] Successfully reset prefix cache
466
+
467
  3%|β–Ž | 90/3300 [09:59<5:06:46, 5.73s/it]2026-04-11 19:14:20,429 | INFO | train_grpo_train | metrics_logged mode=train step=90
468
+
469
 
470
+
471
  3%|β–Ž | 90/3300 [09:59<5:06:46, 5.73s/it]INFO 04-11 19:14:20 [block_pool.py:378] Successfully reset prefix cache
472
+
473
  3%|β–Ž | 91/3300 [10:07<5:31:38, 6.20s/it]2026-04-11 19:14:27,719 | INFO | train_grpo_train | metrics_logged mode=train step=91
474
+
475
 
476
+
477
  3%|β–Ž | 91/3300 [10:07<5:31:38, 6.20s/it]INFO 04-11 19:14:27 [block_pool.py:378] Successfully reset prefix cache
478
+
479
  3%|β–Ž | 92/3300 [10:12<5:16:22, 5.92s/it]2026-04-11 19:14:32,975 | INFO | train_grpo_train | metrics_logged mode=train step=92
480
+
481
 
482
+
483
  3%|β–Ž | 92/3300 [10:12<5:16:22, 5.92s/it]INFO 04-11 19:14:33 [block_pool.py:378] Successfully reset prefix cache
484
+
485
  3%|β–Ž | 93/3300 [10:20<5:56:25, 6.67s/it]2026-04-11 19:14:41,404 | INFO | train_grpo_train | metrics_logged mode=train step=93
486
+
487
 
488
+
489
  3%|β–Ž | 93/3300 [10:20<5:56:25, 6.67s/it]INFO 04-11 19:14:41 [block_pool.py:378] Successfully reset prefix cache
490
+
491
  3%|β–Ž | 94/3300 [10:26<5:48:05, 6.51s/it]2026-04-11 19:14:47,558 | INFO | train_grpo_train | metrics_logged mode=train step=94
492
+
493
 
494
+
495
  3%|β–Ž | 94/3300 [10:26<5:48:05, 6.51s/it]INFO 04-11 19:14:47 [block_pool.py:378] Successfully reset prefix cache
496
+
497
  3%|β–Ž | 95/3300 [10:31<5:15:05, 5.90s/it]2026-04-11 19:14:52,014 | INFO | train_grpo_train | metrics_logged mode=train step=95
498
+
499
 
500
+
501
  3%|β–Ž | 95/3300 [10:31<5:15:05, 5.90s/it]INFO 04-11 19:14:52 [block_pool.py:378] Successfully reset prefix cache
502
+
503
  3%|β–Ž | 96/3300 [10:37<5:13:17, 5.87s/it]2026-04-11 19:14:57,807 | INFO | train_grpo_train | metrics_logged mode=train step=96
504
+
505
 
506
+
507
  3%|β–Ž | 96/3300 [10:37<5:13:17, 5.87s/it]INFO 04-11 19:14:58 [block_pool.py:378] Successfully reset prefix cache
508
+
509
  3%|β–Ž | 97/3300 [10:42<5:11:51, 5.84s/it]2026-04-11 19:15:03,591 | INFO | train_grpo_train | metrics_logged mode=train step=97
510
+
511
 
512
+
513
  3%|β–Ž | 97/3300 [10:42<5:11:51, 5.84s/it]INFO 04-11 19:15:03 [block_pool.py:378] Successfully reset prefix cache
514
+
515
  3%|β–Ž | 98/3300 [10:48<5:13:12, 5.87s/it]2026-04-11 19:15:09,523 | INFO | train_grpo_train | metrics_logged mode=train step=98
516
+
517
 
518
+
519
  3%|β–Ž | 98/3300 [10:48<5:13:12, 5.87s/it]INFO 04-11 19:15:09 [block_pool.py:378] Successfully reset prefix cache
520
+
521
  3%|β–Ž | 99/3300 [10:54<5:04:39, 5.71s/it]2026-04-11 19:15:14,869 | INFO | train_grpo_train | metrics_logged mode=train step=99
522
+
523
 
524
+
525
  3%|β–Ž | 99/3300 [10:54<5:04:39, 5.71s/it]INFO 04-11 19:15:15 [block_pool.py:378] Successfully reset prefix cache
526
+
527
  3%|β–Ž | 100/3300 [10:59<4:49:44, 5.43s/it]2026-04-11 19:15:19,694 | INFO | train_grpo_train | metrics_logged mode=train step=100
528
+
529
 
530
+
531
  3%|β–Ž | 100/3300 [10:59<4:49:44, 5.43s/it]INFO 04-11 19:15:19 [block_pool.py:378] Successfully reset prefix cache
532
+ /root/workspace/Shaer/grpo/.venv/lib/python3.11/site-packages/trl/trainer/grpo_trainer.py:1450: UserWarning: std(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /pytorch/aten/src/ATen/native/ReduceOps.cpp:1839.)
533
+ std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
534
+
535
+
536
  0%| | 0/13 [00:00<?, ?it/s]
537
+
538
  15%|β–ˆβ–Œ | 2/13 [00:05<00:29, 2.64s/it]
539
+
540
  23%|β–ˆβ–ˆβ–Ž | 3/13 [00:09<00:31, 3.12s/it]
541
+
542
  31%|β–ˆβ–ˆβ–ˆ | 4/13 [00:16<00:41, 4.57s/it]
543
+
544
  38%|β–ˆβ–ˆβ–ˆβ–Š | 5/13 [00:23<00:45, 5.63s/it]
545
+
546
  46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 6/13 [00:31<00:43, 6.19s/it]
547
+
548
  54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 7/13 [00:37<00:36, 6.14s/it]
549
+
550
  62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 8/13 [00:42<00:30, 6.02s/it]
551
+
552
  69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 9/13 [00:50<00:26, 6.59s/it]
553
+
554
  77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 10/13 [00:54<00:17, 5.82s/it]
555
+
556
  85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 11/13 [01:00<00:11, 5.86s/it]
557
+
558
  92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 12/13 [01:08<00:06, 6.42s/it]
559
+
560
+
561
 
562
+
563
 
564
+
565
  3%|β–Ž | 100/3300 [12:21<4:49:44, 5.43s/it]
566
+
567
+
568
  /root/workspace/Shaer/grpo/.venv/lib/python3.11/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
569
+ warnings.warn(
570
+
571
  3%|β–Ž | 101/3300 [12:28<27:17:36, 30.71s/it]2026-04-11 19:16:49,358 | INFO | train_grpo_train | metrics_logged mode=train step=101
572
+
573
 
574
+
575
  3%|β–Ž | 101/3300 [12:28<27:17:36, 30.71s/it]INFO 04-11 19:16:49 [block_pool.py:378] Successfully reset prefix cache