Spaces:

Jayant2304
/

commitment-os

Sleeping

App Files Files Community

jayantaggarwal-sketch committed on Apr 25

Commit

af8810b

1 Parent(s): 9318eea

Sync latest code and non-binary artifacts

Browse files

Files changed (7) hide show

artifacts/training_metrics.json +821 -0
artifacts/training_summary.csv +2 -0
inference.py +3 -0
server/app.py +51 -7
server/environment.py +24 -13
server/graders.py +1 -1
training/env_factory.py +3 -1

artifacts/training_metrics.json ADDED Viewed

	@@ -0,0 +1,821 @@

+[
+  {
+    "loss": 0.6357966065406799,
+    "grad_norm": 0.5020767450332642,
+    "learning_rate": 0.0,
+    "num_tokens": 2584.0,
+    "completions/mean_length": 200.0,
+    "completions/min_length": 20.0,
+    "completions/max_length": 380.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 200.0,
+    "completions/min_terminated_length": 20.0,
+    "completions/max_terminated_length": 380.0,
+    "rewards/reward_function/mean": 0.574999988079071,
+    "rewards/reward_function/std": 0.10606600344181061,
+    "reward": 0.574999988079071,
+    "reward_std": 0.10606600344181061,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.26839178800582886,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 61.83920078600022,
+    "epoch": 0.06666666666666667,
+    "step": 1
+  },
+  {
+    "loss": -0.24778124690055847,
+    "grad_norm": 1.877001166343689,
+    "learning_rate": 1.6666666666666667e-06,
+    "num_tokens": 4297.0,
+    "completions/mean_length": 35.5,
+    "completions/min_length": 23.0,
+    "completions/max_length": 48.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 35.5,
+    "completions/min_terminated_length": 23.0,
+    "completions/max_terminated_length": 48.0,
+    "rewards/reward_function/mean": 0.40209999680519104,
+    "rewards/reward_function/std": 0.020647529512643814,
+    "reward": 0.40209999680519104,
+    "reward_std": 0.020647529512643814,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.29453833028674126,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 14.531932316999928,
+    "epoch": 0.13333333333333333,
+    "step": 2
+  },
+  {
+    "loss": -0.07843422889709473,
+    "grad_norm": 1.543445110321045,
+    "learning_rate": 3.3333333333333333e-06,
+    "num_tokens": 5871.0,
+    "completions/mean_length": 18.0,
+    "completions/min_length": 16.0,
+    "completions/max_length": 20.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 18.0,
+    "completions/min_terminated_length": 16.0,
+    "completions/max_terminated_length": 20.0,
+    "rewards/reward_function/mean": 0.4583500027656555,
+    "rewards/reward_function/std": 0.058901991695165634,
+    "reward": 0.4583500027656555,
+    "reward_std": 0.058901991695165634,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.1914939135313034,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 10.524165547000166,
+    "epoch": 0.2,
+    "step": 3
+  },
+  {
+    "loss": 0.07059085369110107,
+    "grad_norm": 1.5465283393859863,
+    "learning_rate": 5e-06,
+    "num_tokens": 7485.0,
+    "completions/mean_length": 20.0,
+    "completions/min_length": 18.0,
+    "completions/max_length": 22.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 20.0,
+    "completions/min_terminated_length": 18.0,
+    "completions/max_terminated_length": 22.0,
+    "rewards/reward_function/mean": 0.4583500027656555,
+    "rewards/reward_function/std": 0.058901991695165634,
+    "reward": 0.4583500027656555,
+    "reward_std": 0.058901991695165634,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.1294238492846489,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 10.990043340000057,
+    "epoch": 0.26666666666666666,
+    "step": 4
+  },
+  {
+    "loss": -0.008725225925445557,
+    "grad_norm": 1.369038462638855,
+    "learning_rate": 4.814814814814815e-06,
+    "num_tokens": 8692.0,
+    "completions/mean_length": 40.5,
+    "completions/min_length": 40.0,
+    "completions/max_length": 41.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 40.5,
+    "completions/min_terminated_length": 40.0,
+    "completions/max_terminated_length": 41.0,
+    "rewards/reward_function/mean": 0.5187499523162842,
+    "rewards/reward_function/std": 0.18561552464962006,
+    "reward": 0.5187499523162842,
+    "reward_std": 0.18561550974845886,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.19958198070526123,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.131567607000306,
+    "epoch": 0.3333333333333333,
+    "step": 5
+  },
+  {
+    "loss": -0.07062190771102905,
+    "grad_norm": 0.8509910702705383,
+    "learning_rate": 4.62962962962963e-06,
+    "num_tokens": 9862.0,
+    "completions/mean_length": 40.0,
+    "completions/min_length": 36.0,
+    "completions/max_length": 44.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 40.0,
+    "completions/min_terminated_length": 36.0,
+    "completions/max_terminated_length": 44.0,
+    "rewards/reward_function/mean": 0.4437499940395355,
+    "rewards/reward_function/std": 0.07954952120780945,
+    "reward": 0.4437499940395355,
+    "reward_std": 0.07954952120780945,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.1297583170235157,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.613226033000046,
+    "epoch": 0.4,
+    "step": 6
+  },
+  {
+    "loss": -5.960464477539062e-07,
+    "grad_norm": 0.1141546443104744,
+    "learning_rate": 4.444444444444444e-06,
+    "num_tokens": 11000.0,
+    "completions/mean_length": 19.0,
+    "completions/min_length": 19.0,
+    "completions/max_length": 19.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 19.0,
+    "completions/min_terminated_length": 19.0,
+    "completions/max_terminated_length": 19.0,
+    "rewards/reward_function/mean": 0.5349999666213989,
+    "rewards/reward_function/std": 0.04949747025966644,
+    "reward": 0.5349999666213989,
+    "reward_std": 0.04949747025966644,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.07411494851112366,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 8.378681117999804,
+    "epoch": 0.4666666666666667,
+    "step": 7
+  },
+  {
+    "loss": -0.5384407639503479,
+    "grad_norm": 0.575139045715332,
+    "learning_rate": 4.2592592592592596e-06,
+    "num_tokens": 12808.0,
+    "completions/mean_length": 119.0,
+    "completions/min_length": 28.0,
+    "completions/max_length": 210.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 119.0,
+    "completions/min_terminated_length": 28.0,
+    "completions/max_terminated_length": 210.0,
+    "rewards/reward_function/mean": 0.6333500146865845,
+    "rewards/reward_function/std": 0.023546643555164337,
+    "reward": 0.6333500146865845,
+    "reward_std": 0.023546643555164337,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.2676837705075741,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 37.1004951179998,
+    "epoch": 0.5333333333333333,
+    "step": 8
+  },
+  {
+    "loss": 0.0,
+    "grad_norm": 0.14882154762744904,
+    "learning_rate": 4.074074074074074e-06,
+    "num_tokens": 14288.0,
+    "completions/mean_length": 22.0,
+    "completions/min_length": 22.0,
+    "completions/max_length": 22.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 22.0,
+    "completions/min_terminated_length": 22.0,
+    "completions/max_terminated_length": 22.0,
+    "rewards/reward_function/mean": 0.4583500027656555,
+    "rewards/reward_function/std": 0.058901991695165634,
+    "reward": 0.4583500027656555,
+    "reward_std": 0.058901991695165634,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.0809866338968277,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 10.457592102000262,
+    "epoch": 0.6,
+    "step": 9
+  },
+  {
+    "loss": -0.07851982116699219,
+    "grad_norm": 1.6114908456802368,
+    "learning_rate": 3.88888888888889e-06,
+    "num_tokens": 16384.0,
+    "completions/mean_length": 27.0,
+    "completions/min_length": 24.0,
+    "completions/max_length": 30.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 27.0,
+    "completions/min_terminated_length": 24.0,
+    "completions/max_terminated_length": 30.0,
+    "rewards/reward_function/mean": 0.5333499908447266,
+    "rewards/reward_function/std": 0.16496798396110535,
+    "reward": 0.5333499908447266,
+    "reward_std": 0.16496798396110535,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.4823211133480072,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 14.966705318999857,
+    "epoch": 0.6666666666666666,
+    "step": 10
+  },
+  {
+    "loss": 0.0,
+    "grad_norm": 0.0,
+    "learning_rate": 3.7037037037037037e-06,
+    "num_tokens": 17617.0,
+    "completions/mean_length": 35.5,
+    "completions/min_length": 16.0,
+    "completions/max_length": 55.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 35.5,
+    "completions/min_terminated_length": 16.0,
+    "completions/max_terminated_length": 55.0,
+    "rewards/reward_function/mean": 0.41670000553131104,
+    "rewards/reward_function/std": 0.0,
+    "reward": 0.41670000553131104,
+    "reward_std": 0.0,
+    "frac_reward_zero_std": 1.0,
+    "entropy": 0.9841015487909317,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 13.133867920999819,
+    "epoch": 0.7333333333333333,
+    "step": 11
+  },
+  {
+    "loss": -0.05432739853858948,
+    "grad_norm": 1.1030373573303223,
+    "learning_rate": 3.5185185185185187e-06,
+    "num_tokens": 19429.0,
+    "completions/mean_length": 39.0,
+    "completions/min_length": 36.0,
+    "completions/max_length": 42.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 39.0,
+    "completions/min_terminated_length": 36.0,
+    "completions/max_terminated_length": 42.0,
+    "rewards/reward_function/mean": 0.5583499670028687,
+    "rewards/reward_function/std": 0.08251935988664627,
+    "reward": 0.5583499670028687,
+    "reward_std": 0.08251935988664627,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.12238830700516701,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 15.016316874000267,
+    "epoch": 0.8,
+    "step": 12
+  },
+  {
+    "loss": 0.149135559797287,
+    "grad_norm": 1.254807472229004,
+    "learning_rate": 3.3333333333333333e-06,
+    "num_tokens": 20626.0,
+    "completions/mean_length": 35.5,
+    "completions/min_length": 28.0,
+    "completions/max_length": 43.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 35.5,
+    "completions/min_terminated_length": 28.0,
+    "completions/max_terminated_length": 43.0,
+    "rewards/reward_function/mean": 0.4583500027656555,
+    "rewards/reward_function/std": 0.058901991695165634,
+    "reward": 0.4583500027656555,
+    "reward_std": 0.058901991695165634,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.2583826147019863,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.519657740999946,
+    "epoch": 0.8666666666666667,
+    "step": 13
+  },
+  {
+    "loss": 0.27955153584480286,
+    "grad_norm": 1.167170763015747,
+    "learning_rate": 3.1481481481481483e-06,
+    "num_tokens": 22067.0,
+    "completions/mean_length": 36.5,
+    "completions/min_length": 22.0,
+    "completions/max_length": 51.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 36.5,
+    "completions/min_terminated_length": 22.0,
+    "completions/max_terminated_length": 51.0,
+    "rewards/reward_function/mean": 0.40209999680519104,
+    "rewards/reward_function/std": 0.020647529512643814,
+    "reward": 0.40209999680519104,
+    "reward_std": 0.020647529512643814,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.24101658910512924,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 14.053339626000025,
+    "epoch": 0.9333333333333333,
+    "step": 14
+  },
+  {
+    "loss": 0.06053304672241211,
+    "grad_norm": 2.425391435623169,
+    "learning_rate": 2.962962962962963e-06,
+    "num_tokens": 23691.0,
+    "completions/mean_length": 35.0,
+    "completions/min_length": 32.0,
+    "completions/max_length": 38.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 35.0,
+    "completions/min_terminated_length": 32.0,
+    "completions/max_terminated_length": 38.0,
+    "rewards/reward_function/mean": 0.4437499940395355,
+    "rewards/reward_function/std": 0.07954952120780945,
+    "reward": 0.4437499940395355,
+    "reward_std": 0.07954952120780945,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.25765860080718994,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 13.23964212199985,
+    "epoch": 1.0,
+    "step": 15
+  },
+  {
+    "loss": 0.0,
+    "grad_norm": 0.0,
+    "learning_rate": 2.7777777777777783e-06,
+    "num_tokens": 24930.0,
+    "completions/mean_length": 56.5,
+    "completions/min_length": 55.0,
+    "completions/max_length": 58.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 56.5,
+    "completions/min_terminated_length": 55.0,
+    "completions/max_terminated_length": 58.0,
+    "rewards/reward_function/mean": 0.5,
+    "rewards/reward_function/std": 0.0,
+    "reward": 0.5,
+    "reward_std": 0.0,
+    "frac_reward_zero_std": 1.0,
+    "entropy": 0.4330967664718628,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 13.377671128000202,
+    "epoch": 1.0666666666666667,
+    "step": 16
+  },
+  {
+    "loss": 0.0672960877418518,
+    "grad_norm": 1.6053359508514404,
+    "learning_rate": 2.5925925925925925e-06,
+    "num_tokens": 26072.0,
+    "completions/mean_length": 21.0,
+    "completions/min_length": 19.0,
+    "completions/max_length": 23.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 21.0,
+    "completions/min_terminated_length": 19.0,
+    "completions/max_terminated_length": 23.0,
+    "rewards/reward_function/mean": 0.516700029373169,
+    "rewards/reward_function/std": 0.1414213478565216,
+    "reward": 0.516700029373169,
+    "reward_std": 0.1414213478565216,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.06669686548411846,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 8.82481953599995,
+    "epoch": 1.1333333333333333,
+    "step": 17
+  },
+  {
+    "loss": -0.13629436492919922,
+    "grad_norm": 1.381037950515747,
+    "learning_rate": 2.4074074074074075e-06,
+    "num_tokens": 27771.0,
+    "completions/mean_length": 28.5,
+    "completions/min_length": 23.0,
+    "completions/max_length": 34.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 28.5,
+    "completions/min_terminated_length": 23.0,
+    "completions/max_terminated_length": 34.0,
+    "rewards/reward_function/mean": 0.5583499670028687,
+    "rewards/reward_function/std": 0.08251935988664627,
+    "reward": 0.5583499670028687,
+    "reward_std": 0.08251935988664627,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.12161804735660553,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 13.361655292000023,
+    "epoch": 1.2,
+    "step": 18
+  },
+  {
+    "loss": -0.1279437243938446,
+    "grad_norm": 2.226921558380127,
+    "learning_rate": 2.222222222222222e-06,
+    "num_tokens": 29251.0,
+    "completions/mean_length": 22.0,
+    "completions/min_length": 18.0,
+    "completions/max_length": 26.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 22.0,
+    "completions/min_terminated_length": 18.0,
+    "completions/max_terminated_length": 26.0,
+    "rewards/reward_function/mean": 0.6021000146865845,
+    "rewards/reward_function/std": 0.020647529512643814,
+    "reward": 0.6021000146865845,
+    "reward_std": 0.020647529512643814,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.08564786985516548,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.087925465999888,
+    "epoch": 1.2666666666666666,
+    "step": 19
+  },
+  {
+    "loss": 0.06728780269622803,
+    "grad_norm": 1.0975555181503296,
+    "learning_rate": 2.037037037037037e-06,
+    "num_tokens": 30425.0,
+    "completions/mean_length": 42.0,
+    "completions/min_length": 38.0,
+    "completions/max_length": 46.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 42.0,
+    "completions/min_terminated_length": 38.0,
+    "completions/max_terminated_length": 46.0,
+    "rewards/reward_function/mean": 0.5020999908447266,
+    "rewards/reward_function/std": 0.1207738146185875,
+    "reward": 0.5020999908447266,
+    "reward_std": 0.1207738146185875,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.16879020631313324,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.843729876999987,
+    "epoch": 1.3333333333333333,
+    "step": 20
+  },
+  {
+    "loss": 0.054305046796798706,
+    "grad_norm": 1.0439469814300537,
+    "learning_rate": 1.8518518518518519e-06,
+    "num_tokens": 32047.0,
+    "completions/mean_length": 26.0,
+    "completions/min_length": 24.0,
+    "completions/max_length": 28.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 26.0,
+    "completions/min_terminated_length": 24.0,
+    "completions/max_terminated_length": 28.0,
+    "rewards/reward_function/mean": 0.543749988079071,
+    "rewards/reward_function/std": 0.06187182664871216,
+    "reward": 0.543749988079071,
+    "reward_std": 0.06187182664871216,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.09812109172344208,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 12.01839622600005,
+    "epoch": 1.4,
+    "step": 21
+  },
+  {
+    "loss": -0.43347233533859253,
+    "grad_norm": 0.6157990097999573,
+    "learning_rate": 1.6666666666666667e-06,
+    "num_tokens": 33354.0,
+    "completions/mean_length": 72.5,
+    "completions/min_length": 28.0,
+    "completions/max_length": 117.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 72.5,
+    "completions/min_terminated_length": 28.0,
+    "completions/max_terminated_length": 117.0,
+    "rewards/reward_function/mean": 0.4437499940395355,
+    "rewards/reward_function/std": 0.07954952120780945,
+    "reward": 0.4437499940395355,
+    "reward_std": 0.07954952120780945,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.16432299464941025,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 21.621784166999987,
+    "epoch": 1.4666666666666668,
+    "step": 22
+  },
+  {
+    "loss": 0.21844279766082764,
+    "grad_norm": 1.18323814868927,
+    "learning_rate": 1.4814814814814815e-06,
+    "num_tokens": 35143.0,
+    "completions/mean_length": 27.5,
+    "completions/min_length": 19.0,
+    "completions/max_length": 36.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 27.5,
+    "completions/min_terminated_length": 19.0,
+    "completions/max_terminated_length": 36.0,
+    "rewards/reward_function/mean": 0.6312500238418579,
+    "rewards/reward_function/std": 0.18561552464962006,
+    "reward": 0.6312500238418579,
+    "reward_std": 0.18561550974845886,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.10744666680693626,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 14.225728247999996,
+    "epoch": 1.5333333333333332,
+    "step": 23
+  },
+  {
+    "loss": -0.6148233413696289,
+    "grad_norm": 0.8197247982025146,
+    "learning_rate": 1.2962962962962962e-06,
+    "num_tokens": 36929.0,
+    "completions/mean_length": 124.0,
+    "completions/min_length": 16.0,
+    "completions/max_length": 232.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 124.0,
+    "completions/min_terminated_length": 16.0,
+    "completions/max_terminated_length": 232.0,
+    "rewards/reward_function/mean": 0.4583500027656555,
+    "rewards/reward_function/std": 0.058901991695165634,
+    "reward": 0.4583500027656555,
+    "reward_std": 0.058901991695165634,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.6331216096878052,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 39.883540922000066,
+    "epoch": 1.6,
+    "step": 24
+  },
+  {
+    "loss": -0.054349154233932495,
+    "grad_norm": 1.030266284942627,
+    "learning_rate": 1.111111111111111e-06,
+    "num_tokens": 39023.0,
+    "completions/mean_length": 26.0,
+    "completions/min_length": 24.0,
+    "completions/max_length": 28.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 26.0,
+    "completions/min_terminated_length": 24.0,
+    "completions/max_terminated_length": 28.0,
+    "rewards/reward_function/mean": 0.6749999523162842,
+    "rewards/reward_function/std": 0.1237436980009079,
+    "reward": 0.6749999523162842,
+    "reward_std": 0.1237436980009079,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.07906700298190117,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 14.697501995000039,
+    "epoch": 1.6666666666666665,
+    "step": 25
+  },
+  {
+    "loss": 0.0,
+    "grad_norm": 0.0,
+    "learning_rate": 9.259259259259259e-07,
+    "num_tokens": 40211.0,
+    "completions/mean_length": 31.0,
+    "completions/min_length": 22.0,
+    "completions/max_length": 40.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 31.0,
+    "completions/min_terminated_length": 22.0,
+    "completions/max_terminated_length": 40.0,
+    "rewards/reward_function/mean": 0.6499999761581421,
+    "rewards/reward_function/std": 0.0,
+    "reward": 0.6499999761581421,
+    "reward_std": 0.0,
+    "frac_reward_zero_std": 1.0,
+    "entropy": 0.14391540735960007,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.100019781000128,
+    "epoch": 1.7333333333333334,
+    "step": 26
+  },
+  {
+    "loss": 0.11774009466171265,
+    "grad_norm": 1.699737310409546,
+    "learning_rate": 7.407407407407407e-07,
+    "num_tokens": 42443.0,
+    "completions/mean_length": 24.0,
+    "completions/min_length": 20.0,
+    "completions/max_length": 28.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 24.0,
+    "completions/min_terminated_length": 20.0,
+    "completions/max_terminated_length": 28.0,
+    "rewards/reward_function/mean": 0.574999988079071,
+    "rewards/reward_function/std": 0.10606600344181061,
+    "reward": 0.574999988079071,
+    "reward_std": 0.10606600344181061,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.14994759857654572,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 15.477631969999948,
+    "epoch": 1.8,
+    "step": 27
+  },
+  {
+    "loss": -0.05434012413024902,
+    "grad_norm": 0.8750740885734558,
+    "learning_rate": 5.555555555555555e-07,
+    "num_tokens": 44062.0,
+    "completions/mean_length": 32.5,
+    "completions/min_length": 30.0,
+    "completions/max_length": 35.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 32.5,
+    "completions/min_terminated_length": 30.0,
+    "completions/max_terminated_length": 35.0,
+    "rewards/reward_function/mean": 0.6895999908447266,
+    "rewards/reward_function/std": 0.10309616476297379,
+    "reward": 0.6895999908447266,
+    "reward_std": 0.10309616476297379,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.142868272960186,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 12.90737977799995,
+    "epoch": 1.8666666666666667,
+    "step": 28
+  },
+  {
+    "loss": 0.15356993675231934,
+    "grad_norm": 1.543925404548645,
+    "learning_rate": 3.7037037037037036e-07,
+    "num_tokens": 45476.0,
+    "completions/mean_length": 23.0,
+    "completions/min_length": 18.0,
+    "completions/max_length": 28.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 23.0,
+    "completions/min_terminated_length": 18.0,
+    "completions/max_terminated_length": 28.0,
+    "rewards/reward_function/mean": 0.6895999908447266,
+    "rewards/reward_function/std": 0.10309616476297379,
+    "reward": 0.6895999908447266,
+    "reward_std": 0.10309616476297379,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.17455117404460907,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.09434006399988,
+    "epoch": 1.9333333333333333,
+    "step": 29
+  },
+  {
+    "loss": -0.030694186687469482,
+    "grad_norm": 1.561765432357788,
+    "learning_rate": 1.8518518518518518e-07,
+    "num_tokens": 47096.0,
+    "completions/mean_length": 23.0,
+    "completions/min_length": 22.0,
+    "completions/max_length": 24.0,
+    "completions/clipped_ratio": 0.0,
+    "completions/mean_terminated_length": 23.0,
+    "completions/min_terminated_length": 22.0,
+    "completions/max_terminated_length": 24.0,
+    "rewards/reward_function/mean": 0.543749988079071,
+    "rewards/reward_function/std": 0.06187182664871216,
+    "reward": 0.543749988079071,
+    "reward_std": 0.06187182664871216,
+    "frac_reward_zero_std": 0.0,
+    "entropy": 0.13204744830727577,
+    "clip_ratio/low_mean": 0.0,
+    "clip_ratio/low_min": 0.0,
+    "clip_ratio/high_mean": 0.0,
+    "clip_ratio/high_max": 0.0,
+    "clip_ratio/region_mean": 0.0,
+    "step_time": 11.530309271000078,
+    "epoch": 2.0,
+    "step": 30
+  },
+  {
+    "train_runtime": 507.6102,
+    "train_samples_per_second": 0.059,
+    "train_steps_per_second": 0.059,
+    "total_flos": 0.0,
+    "train_loss": -0.021817301710446674,
+    "epoch": 2.0,
+    "step": 30
+  }
+]

artifacts/training_summary.csv ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ train_runtime_sec,train_steps,epochs,train_loss_final,reward_min,reward_max,reward_last
2	+ 507.6,30,2,-0.02182,0.40209999680519104,0.6895999908447266,0.543749988079071

inference.py CHANGED Viewed

@@ -20,11 +20,14 @@ from typing import Any, Dict, List
 import requests
 from openai import OpenAI
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
 API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
 API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or ""

 import requests
 from openai import OpenAI
+from dotenv import load_dotenv
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
+load_dotenv()
 API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
 API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or ""

server/app.py CHANGED Viewed

@@ -3,9 +3,11 @@
 from __future__ import annotations
 import os
 from openenv.core.env_server import create_fastapi_app
 from fastapi import Query
 from constants import PROJECT_DESCRIPTION, VERSION
 from models import CommitmentAction, CommitmentObservation, CommitmentState
@@ -13,10 +15,32 @@ from server.environment import CommitmentEnvironment
 from server.mcp import router as mcp_router
 from server.tasks import get_scenario_ids_grouped
-_shared_env = CommitmentEnvironment()
 app = create_fastapi_app(
-    env=lambda: _shared_env,
     action_cls=CommitmentAction,
     observation_cls=CommitmentObservation,
 )
@@ -27,7 +51,7 @@ app.version = VERSION
 app.routes[:] = [
     r for r in app.routes
-    if not (hasattr(r, "path") and r.path in ("/state", "/mcp", "/reset"))
 ]
@@ -44,9 +68,11 @@ def reset_episode(
     query params in this deployment setup, which made scenario selection
     non-deterministic for demos/evaluations.
     """
-    obs = _shared_env.reset(
         seed=seed,
-        episode_id=episode_id,
         task_id=task_id,
         difficulty=difficulty,
     )
@@ -54,12 +80,30 @@ def reset_episode(
         "observation": obs.model_dump(),
         "reward": float(obs.reward),
         "done": bool(obs.done),
     }
 @app.get("/state", response_model=CommitmentState)
-def get_state() -> CommitmentState:
-    return _shared_env.state
 @app.get("/tasks")

 from __future__ import annotations
 import os
+from threading import Lock
 from openenv.core.env_server import create_fastapi_app
 from fastapi import Query
+from pydantic import BaseModel
 from constants import PROJECT_DESCRIPTION, VERSION
 from models import CommitmentAction, CommitmentObservation, CommitmentState
 from server.mcp import router as mcp_router
 from server.tasks import get_scenario_ids_grouped
+# Session id used whenever a request does not carry an ``episode_id``.
+_DEFAULT_SESSION_ID = "default"
+# Maps session id -> environment instance; created with the default session already present.
+# NOTE(review): no eviction is visible in this file view, so every new session id
+# appears to add an entry that is never removed — confirm whether cleanup exists elsewhere.
+_env_store: dict[str, CommitmentEnvironment] = {
+    _DEFAULT_SESSION_ID: CommitmentEnvironment(),
+}
+# Held by _get_env while it reads from or inserts into _env_store.
+_env_store_lock = Lock()
+def _get_env(session_id: str) -> CommitmentEnvironment:
+    """Return a per-session environment instance.
+    This avoids cross-user state bleed from a single shared mutable environment.
+    Clients can pass ``episode_id`` query param to isolate sessions.
+    Args:
+        session_id: Key under which the environment is stored in ``_env_store``.
+    Returns:
+        The environment already stored for ``session_id``; if none exists, a new
+        ``CommitmentEnvironment`` is created, stored under that key, and returned.
+    """
+    # The whole get-or-create sequence runs while holding _env_store_lock.
+    with _env_store_lock:
+        env = _env_store.get(session_id)
+        if env is None:
+            # NOTE(review): the new environment is handed back as constructed;
+            # reset() is not called here — callers must do that themselves.
+            env = CommitmentEnvironment()
+            _env_store[session_id] = env
+        return env
+class StepPayload(BaseModel):
+    # Body model for the POST /step handler: the action travels under the "action" key.
+    action: CommitmentAction
 app = create_fastapi_app(
+    env=lambda: _get_env(_DEFAULT_SESSION_ID),
     action_cls=CommitmentAction,
     observation_cls=CommitmentObservation,
 )
 app.routes[:] = [
     r for r in app.routes
+    if not (hasattr(r, "path") and r.path in ("/state", "/mcp", "/reset", "/step"))
 ]
     query params in this deployment setup, which made scenario selection
     non-deterministic for demos/evaluations.
     """
+    session_id = episode_id or _DEFAULT_SESSION_ID
+    env = _get_env(session_id)
+    obs = env.reset(
         seed=seed,
+        episode_id=session_id,
         task_id=task_id,
         difficulty=difficulty,
     )
         "observation": obs.model_dump(),
         "reward": float(obs.reward),
         "done": bool(obs.done),
+        "episode_id": session_id,
+    }
+@app.post("/step")
+def step_episode(
+    payload: StepPayload,
+    episode_id: str | None = Query(default=None),
+) -> dict[str, object]:
+    # Apply one action to the environment of the selected session and return
+    # the observation, reward, done flag and the session id that was used.
+    #
+    # payload:    request body carrying the action to execute.
+    # episode_id: optional query parameter selecting the session; a missing or
+    #             empty value falls back to _DEFAULT_SESSION_ID (note the ``or``).
+    session_id = episode_id or _DEFAULT_SESSION_ID
+    # NOTE(review): _get_env creates an environment for ids it has not seen, so a
+    # step sent with an id that was never reset reaches env.step() on a freshly
+    # constructed environment — confirm step() handles that case.
+    env = _get_env(session_id)
+    obs = env.step(payload.action)
+    return {
+        "observation": obs.model_dump(),
+        "reward": float(obs.reward),
+        "done": bool(obs.done),
+        # Echoed back so the client knows which session handled the call.
+        "episode_id": session_id,
     }
 @app.get("/state", response_model=CommitmentState)
+def get_state(episode_id: str | None = Query(default=None)) -> CommitmentState:
+    # Return the ``state`` attribute of the selected session's environment.
+    # A missing or empty ``episode_id`` selects _DEFAULT_SESSION_ID.
+    # NOTE(review): because _get_env creates on miss, reading state with an
+    # unseen id also stores a new environment as a side effect — confirm intended.
+    session_id = episode_id or _DEFAULT_SESSION_ID
+    return _get_env(session_id).state
 @app.get("/tasks")

server/environment.py CHANGED Viewed

@@ -109,12 +109,12 @@ class CommitmentEnvironment(
             return self._finish_episode()
         step_reward = 0.0
-        tool_result = self._dispatch_tool(action, at)
         self._last_tool_result = tool_result
-        if "CONFLICT" in tool_result:
             step_reward = -0.05
-        elif at in ("schedule_meeting", "reschedule_event", "send_email", "book_restaurant"):
             step_reward = 0.05
         self._cumulative_reward += step_reward
@@ -144,14 +144,14 @@ class CommitmentEnvironment(
     # Tool dispatch
     # ------------------------------------------------------------------
-    def _dispatch_tool(self, action: CommitmentAction, at: str) -> str:
         assert self._world is not None
         turn = self._step_count
         if at == "view_calendar":
-            return self._world.view_calendar(action.date)
         elif at == "check_availability":
-            return self._world.check_availability(action.person)
         elif at == "search_restaurants":
             return self._world.search_restaurants(
                 cuisine=action.cuisine,
@@ -159,9 +159,9 @@ class CommitmentEnvironment(
                 dietary=action.dietary,
                 max_distance_miles=action.max_distance_miles,
                 near_airport=action.near_airport,
-            )
         elif at == "schedule_meeting":
-            return self._world.schedule_meeting(
                 title=action.title,
                 date=action.date,
                 time=action.time,
@@ -170,25 +170,36 @@ class CommitmentEnvironment(
                 location=action.location,
                 turn=turn,
             )
         elif at == "reschedule_event":
-            return self._world.reschedule_event(
                 event_id=action.event_id,
                 new_time=action.new_time,
                 turn=turn,
             )
         elif at == "cancel_event":
-            return self._world.cancel_event(action.event_id, turn=turn)
         elif at == "send_email":
             return self._world.send_email(
                 to=action.to,
                 subject=action.subject,
                 body=action.body,
                 turn=turn,
-            )
         elif at == "book_restaurant":
-            return self._world.book_restaurant(action.restaurant_name, turn=turn)
         else:
-            return f"Unknown action_type: '{at}'. Valid types: view_calendar, check_availability, search_restaurants, schedule_meeting, reschedule_event, cancel_event, send_email, book_restaurant, submit_plan"
     # ------------------------------------------------------------------
     # Observation builder

             return self._finish_episode()
         step_reward = 0.0
+        tool_result, dispatch_status = self._dispatch_tool(action, at)
         self._last_tool_result = tool_result
+        if dispatch_status == "conflict":
             step_reward = -0.05
+        elif dispatch_status == "success" and at in ("schedule_meeting", "reschedule_event", "send_email", "book_restaurant"):
             step_reward = 0.05
         self._cumulative_reward += step_reward
     # Tool dispatch
     # ------------------------------------------------------------------
+    def _dispatch_tool(self, action: CommitmentAction, at: str) -> tuple[str, str]:
         assert self._world is not None
         turn = self._step_count
         if at == "view_calendar":
+            return self._world.view_calendar(action.date), "info"
         elif at == "check_availability":
+            return self._world.check_availability(action.person), "info"
         elif at == "search_restaurants":
             return self._world.search_restaurants(
                 cuisine=action.cuisine,
                 dietary=action.dietary,
                 max_distance_miles=action.max_distance_miles,
                 near_airport=action.near_airport,
+            ), "info"
         elif at == "schedule_meeting":
+            result = self._world.schedule_meeting(
                 title=action.title,
                 date=action.date,
                 time=action.time,
                 location=action.location,
                 turn=turn,
             )
+            status = "conflict" if result.startswith("CONFLICT:") else "success"
+            return result, status
         elif at == "reschedule_event":
+            result = self._world.reschedule_event(
                 event_id=action.event_id,
                 new_time=action.new_time,
                 turn=turn,
             )
+            status = "conflict" if result.startswith("CONFLICT:") else ("error" if "not found" in result.lower() else "success")
+            return result, status
         elif at == "cancel_event":
+            result = self._world.cancel_event(action.event_id, turn=turn)
+            status = "error" if "not found" in result.lower() else "success"
+            return result, status
         elif at == "send_email":
             return self._world.send_email(
                 to=action.to,
                 subject=action.subject,
                 body=action.body,
                 turn=turn,
+            ), "success"
         elif at == "book_restaurant":
+            result = self._world.book_restaurant(action.restaurant_name, turn=turn)
+            status = "error" if "not found" in result.lower() else "success"
+            return result, status
         else:
+            return (
+                f"Unknown action_type: '{at}'. Valid types: view_calendar, check_availability, search_restaurants, schedule_meeting, reschedule_event, cancel_event, send_email, book_restaurant, submit_plan",
+                "error",
+            )
     # ------------------------------------------------------------------
     # Observation builder

server/graders.py CHANGED Viewed

@@ -98,7 +98,7 @@ def _check_constraint(constraint, world: WorldState) -> bool:
             em.get("to", "").lower() == lower or lower in em.get("body", "").lower()
             for em in world.emails_sent
         )
-        return higher_kept
     return False

             em.get("to", "").lower() == lower or lower in em.get("body", "").lower()
             for em in world.emails_sent
         )
+        return higher_kept and lower_moved
     return False

training/env_factory.py CHANGED Viewed

@@ -143,7 +143,9 @@ class CommitmentOSEnvFactory:
                 if obs.done:
                     break
             except Exception:
-                continue
         if not env._done:
             obs = env.step(CommitmentAction(action_type="submit_plan"))

                 if obs.done:
                     break
             except Exception:
+                # Invalid action payloads should be penalized, not silently ignored.
+                last_reward = 0.01
+                break
         if not env._done:
             obs = env.step(CommitmentAction(action_type="submit_plan"))