pyre-ppo-agent / training.log
Krooz's picture
Add training log
ba6800e verified
ep=0001 [easy ] steps=030 reward= +17.290 evac=1 hp=100.0 s30=1.00 t=0s
ep=0002 [easy ] steps=200 reward= -9.295 evac=0 hp= 98.5 s30=0.50 t=1s
ep=0003 [easy ] steps=200 reward= -9.090 evac=0 hp=100.0 s30=0.33 t=2s
ep=0004 [easy ] steps=200 reward= -16.130 evac=0 hp= 62.0 s30=0.25 t=3s
ep=0005 [easy ] steps=200 reward= -12.080 evac=0 hp=100.0 s30=0.20 t=3s
ep=0006 [easy ] steps=200 reward= -12.320 evac=0 hp=100.0 s30=0.17 t=4s
ep=0007 [easy ] steps=200 reward= -14.560 evac=0 hp=100.0 s30=0.14 t=6s
ep=0008 [easy ] steps=200 reward= -9.890 evac=0 hp=100.0 s30=0.12 t=7s
>> PPO update pi_loss=-0.0065 v_loss=4.3169 entropy=2.0778 kl=0.0066 lr=2.96e-04
ep=0009 [easy ] steps=141 reward= +10.370 evac=1 hp= 98.0 s30=0.22 t=8s
ep=0010 [easy ] steps=200 reward= -17.620 evac=0 hp=100.0 s30=0.20 t=9s
ep=0011 [easy ] steps=049 reward= +15.740 evac=1 hp=100.0 s30=0.27 t=9s
ep=0012 [easy ] steps=200 reward= -9.980 evac=0 hp=100.0 s30=0.25 t=10s
ep=0013 [easy ] steps=200 reward= -3.220 evac=0 hp=100.0 s30=0.23 t=11s
ep=0014 [easy ] steps=123 reward= +11.430 evac=1 hp=100.0 s30=0.29 t=12s
ep=0015 [easy ] steps=200 reward= -5.360 evac=0 hp=100.0 s30=0.27 t=13s
ep=0016 [easy ] steps=127 reward= +10.970 evac=1 hp=100.0 s30=0.31 t=13s
>> PPO update pi_loss=-0.0035 v_loss=6.9670 entropy=2.0211 kl=0.0010 lr=2.93e-04
ep=0017 [easy ] steps=156 reward= +10.050 evac=1 hp= 98.0 s30=0.35 t=15s
ep=0018 [easy ] steps=088 reward= +14.830 evac=1 hp=100.0 s30=0.39 t=15s
ep=0019 [easy ] steps=200 reward= -17.770 evac=0 hp=100.0 s30=0.37 t=16s
ep=0020 [easy ] steps=176 reward= +9.470 evac=1 hp=100.0 s30=0.40 t=17s
ep=0021 [easy ] steps=200 reward= -6.850 evac=0 hp= 99.0 s30=0.38 t=18s
ep=0022 [easy ] steps=021 reward= +17.650 evac=1 hp=100.0 s30=0.41 t=18s
ep=0023 [easy ] steps=096 reward= +11.280 evac=1 hp=100.0 s30=0.43 t=18s
ep=0024 [easy ] steps=200 reward= -5.890 evac=0 hp= 95.0 s30=0.42 t=19s
>> PPO update pi_loss=-0.0139 v_loss=8.8112 entropy=2.1823 kl=0.0050 lr=2.89e-04
ep=0025 [easy ] steps=139 reward= +11.360 evac=1 hp=100.0 s30=0.44 t=21s
** EVAL [hard] reward=-10.124 success=0.00
ep=0026 [easy ] steps=200 reward= -19.000 evac=0 hp=100.0 s30=0.42 t=23s
ep=0027 [easy ] steps=063 reward= +14.240 evac=1 hp=100.0 s30=0.44 t=23s
ep=0028 [easy ] steps=200 reward= -15.310 evac=0 hp=100.0 s30=0.43 t=24s
ep=0029 [easy ] steps=200 reward= -11.940 evac=0 hp=100.0 s30=0.41 t=25s
ep=0030 [easy ] steps=200 reward= -12.810 evac=0 hp=100.0 s30=0.40 t=26s
ep=0031 [easy ] steps=031 reward= +16.630 evac=1 hp=100.0 s30=0.40 t=26s
ep=0032 [easy ] steps=200 reward= -9.350 evac=0 hp=100.0 s30=0.40 t=27s
>> PPO update pi_loss=-0.0033 v_loss=6.9677 entropy=1.7971 kl=0.0012 lr=2.86e-04
ep=0033 [easy ] steps=200 reward= -12.530 evac=0 hp=100.0 s30=0.40 t=29s
ep=0034 [easy ] steps=043 reward= +15.460 evac=1 hp=100.0 s30=0.43 t=29s
ep=0035 [easy ] steps=009 reward= +17.210 evac=1 hp=100.0 s30=0.47 t=29s
ep=0036 [easy ] steps=200 reward= -9.600 evac=0 hp=100.0 s30=0.47 t=30s
ep=0037 [easy ] steps=200 reward= -13.100 evac=0 hp=100.0 s30=0.47 t=31s
ep=0038 [easy ] steps=088 reward= +14.520 evac=1 hp=100.0 s30=0.50 t=31s
ep=0039 [easy ] steps=200 reward= -8.090 evac=0 hp=100.0 s30=0.47 t=32s
ep=0040 [easy ] steps=058 reward= +15.970 evac=1 hp=100.0 s30=0.50 t=32s
>> PPO update pi_loss=-0.0045 v_loss=10.7413 entropy=2.1999 kl=0.0039 lr=2.82e-04
ep=0041 [easy ] steps=047 reward= +15.940 evac=1 hp=100.0 s30=0.50 t=33s
ep=0042 [easy ] steps=200 reward= -8.200 evac=0 hp=100.0 s30=0.50 t=34s
ep=0043 [easy ] steps=187 reward= +9.560 evac=1 hp=100.0 s30=0.53 t=35s
ep=0044 [easy ] steps=130 reward= +10.540 evac=1 hp=100.0 s30=0.53 t=36s
ep=0045 [easy ] steps=200 reward= -16.440 evac=0 hp=100.0 s30=0.53 t=36s
ep=0046 [easy ] steps=048 reward= +16.420 evac=1 hp=100.0 s30=0.53 t=37s
ep=0047 [easy ] steps=064 reward= +14.910 evac=1 hp=100.0 s30=0.53 t=37s
ep=0048 [easy ] steps=048 reward= -14.320 evac=0 hp= 0.0 s30=0.50 t=37s
>> PPO update pi_loss=-0.0083 v_loss=11.0590 entropy=2.0080 kl=0.0078 lr=2.78e-04
ep=0049 [easy ] steps=140 reward= +11.540 evac=1 hp=100.0 s30=0.53 t=38s
ep=0050 [easy ] steps=013 reward= +19.200 evac=1 hp=100.0 s30=0.53 t=38s
** EVAL [hard] reward=-11.184 success=0.00
ep=0051 [easy ] steps=200 reward= -11.910 evac=0 hp=100.0 s30=0.53 t=41s
ep=0052 [easy ] steps=080 reward= +15.090 evac=1 hp=100.0 s30=0.53 t=41s
ep=0053 [easy ] steps=088 reward= +14.720 evac=1 hp=100.0 s30=0.53 t=42s
ep=0054 [easy ] steps=004 reward= +17.580 evac=1 hp=100.0 s30=0.57 t=42s
ep=0055 [easy ] steps=200 reward= -13.970 evac=0 hp=100.0 s30=0.53 t=43s
ep=0056 [easy ] steps=062 reward= +15.320 evac=1 hp=100.0 s30=0.57 t=43s
>> PPO update pi_loss=-0.0230 v_loss=13.0751 entropy=2.0431 kl=0.0081 lr=2.75e-04
ep=0057 [easy ] steps=021 reward= +18.980 evac=1 hp=100.0 s30=0.57 t=44s
ep=0058 [easy ] steps=019 reward= +17.800 evac=1 hp=100.0 s30=0.60 t=44s
ep=0059 [easy ] steps=012 reward= +18.630 evac=1 hp=100.0 s30=0.63 t=44s
ep=0060 [easy ] steps=067 reward= +14.700 evac=1 hp=100.0 s30=0.67 t=44s
ep=0061 [easy ] steps=129 reward= +11.070 evac=1 hp=100.0 s30=0.67 t=45s
ep=0062 [easy ] steps=045 reward= +17.620 evac=1 hp=100.0 s30=0.70 t=45s
ep=0063 [easy ] steps=040 reward= +14.960 evac=1 hp=100.0 s30=0.73 t=45s
ep=0064 [easy ] steps=041 reward= +16.660 evac=1 hp=100.0 s30=0.73 t=45s
>> PPO update pi_loss=-0.0191 v_loss=44.0687 entropy=1.8650 kl=0.0033 lr=2.71e-04
ep=0065 [easy ] steps=082 reward= +14.330 evac=1 hp=100.0 s30=0.73 t=46s
ep=0066 [easy ] steps=015 reward= +17.400 evac=1 hp=100.0 s30=0.77 t=46s
ep=0067 [easy ] steps=018 reward= +17.970 evac=1 hp=100.0 s30=0.80 t=46s
ep=0068 [easy ] steps=200 reward= -16.405 evac=0 hp= 9.5 s30=0.77 t=47s
ep=0069 [easy ] steps=005 reward= +17.930 evac=1 hp=100.0 s30=0.80 t=47s
ep=0070 [easy ] steps=044 reward= +16.420 evac=1 hp=100.0 s30=0.80 t=47s
ep=0071 [easy ] steps=200 reward= -12.220 evac=0 hp=100.0 s30=0.77 t=48s
ep=0072 [easy ] steps=151 reward= +2.060 evac=1 hp=100.0 s30=0.80 t=49s
>> PPO update pi_loss=-0.0054 v_loss=13.8242 entropy=2.0114 kl=0.0043 lr=2.68e-04
ep=0073 [easy ] steps=005 reward= +17.990 evac=1 hp=100.0 s30=0.80 t=49s
ep=0074 [easy ] steps=200 reward= -12.780 evac=0 hp=100.0 s30=0.77 t=50s
ep=0075 [easy ] steps=034 reward= +16.900 evac=1 hp=100.0 s30=0.80 t=50s
** EVAL [hard] reward=-11.468 success=0.00
ep=0076 [easy ] steps=017 reward= +19.290 evac=1 hp=100.0 s30=0.80 t=51s
ep=0077 [easy ] steps=022 reward= +17.490 evac=1 hp=100.0 s30=0.80 t=51s
ep=0078 [easy ] steps=005 reward= +17.050 evac=1 hp=100.0 s30=0.83 t=51s
ep=0079 [easy ] steps=017 reward= +18.580 evac=1 hp=100.0 s30=0.83 t=51s
ep=0080 [easy ] steps=030 reward= +16.785 evac=1 hp= 99.0 s30=0.83 t=51s
>> PPO update pi_loss=-0.0150 v_loss=35.1268 entropy=2.0081 kl=0.0006 lr=2.64e-04
[curriculum] Advanced to 'medium' (suc30=0.87)
ep=0081 [easy ] steps=176 reward= +8.340 evac=1 hp=100.0 s30=0.87 t=52s
ep=0082 [medium] steps=050 reward= -19.900 evac=0 hp= 0.0 s30=0.83 t=53s
ep=0083 [medium] steps=029 reward= -14.630 evac=0 hp= 0.0 s30=0.80 t=53s
ep=0084 [medium] steps=025 reward= +15.600 evac=1 hp=100.0 s30=0.80 t=53s
ep=0085 [medium] steps=080 reward= -24.320 evac=0 hp= 0.0 s30=0.80 t=53s
ep=0086 [medium] steps=043 reward= +8.883 evac=1 hp= 45.5 s30=0.80 t=53s
ep=0087 [medium] steps=040 reward= -15.070 evac=0 hp= 0.0 s30=0.77 t=54s
ep=0088 [medium] steps=014 reward= -15.090 evac=0 hp= 0.0 s30=0.73 t=54s
>> PPO update pi_loss=-0.0087 v_loss=30.8968 entropy=1.8425 kl=0.0018 lr=2.60e-04
ep=0089 [medium] steps=014 reward= +16.320 evac=1 hp=100.0 s30=0.73 t=54s
ep=0090 [medium] steps=023 reward= +16.400 evac=1 hp=100.0 s30=0.73 t=54s
ep=0091 [medium] steps=026 reward= -14.010 evac=0 hp= 0.0 s30=0.70 t=54s
ep=0092 [medium] steps=017 reward= +15.510 evac=1 hp=100.0 s30=0.70 t=54s
ep=0093 [medium] steps=011 reward= +15.170 evac=1 hp=100.0 s30=0.70 t=54s
ep=0094 [medium] steps=036 reward= -20.530 evac=0 hp= 0.0 s30=0.67 t=54s
ep=0095 [medium] steps=150 reward= -14.250 evac=0 hp=100.0 s30=0.63 t=55s
ep=0096 [medium] steps=029 reward= -9.600 evac=0 hp= 0.0 s30=0.60 t=55s
>> PPO update pi_loss=-0.0083 v_loss=34.3949 entropy=1.8332 kl=0.0026 lr=2.57e-04
ep=0097 [medium] steps=016 reward= +16.800 evac=1 hp=100.0 s30=0.60 t=56s
ep=0098 [medium] steps=150 reward= -13.210 evac=0 hp= 99.0 s30=0.60 t=56s
ep=0099 [medium] steps=005 reward= +15.000 evac=1 hp=100.0 s30=0.60 t=56s
ep=0100 [medium] steps=150 reward= -17.065 evac=0 hp= 1.5 s30=0.57 t=57s
** EVAL [hard] reward=-9.827 success=0.00
ep=0101 [medium] steps=019 reward= +12.548 evac=1 hp= 70.5 s30=0.60 t=59s
ep=0102 [medium] steps=056 reward= -19.370 evac=0 hp= 0.0 s30=0.57 t=59s
ep=0103 [medium] steps=059 reward= +13.980 evac=1 hp=100.0 s30=0.57 t=59s
ep=0104 [medium] steps=150 reward= -26.195 evac=0 hp= 50.5 s30=0.57 t=60s
>> PPO update pi_loss=-0.0004 v_loss=14.6058 entropy=1.3296 kl=0.0008 lr=2.53e-04
ep=0105 [medium] steps=045 reward= -12.460 evac=0 hp= 0.0 s30=0.53 t=61s
ep=0106 [medium] steps=040 reward= +15.800 evac=1 hp=100.0 s30=0.53 t=61s
ep=0107 [medium] steps=150 reward= -25.180 evac=0 hp=100.0 s30=0.50 t=62s
ep=0108 [medium] steps=150 reward= -9.150 evac=0 hp= 97.0 s30=0.47 t=62s
ep=0109 [medium] steps=023 reward= +15.840 evac=1 hp=100.0 s30=0.47 t=62s
ep=0110 [medium] steps=130 reward= +2.095 evac=1 hp= 87.0 s30=0.47 t=63s
ep=0111 [medium] steps=150 reward= -12.520 evac=0 hp= 97.0 s30=0.43 t=64s
ep=0112 [medium] steps=058 reward= +13.820 evac=1 hp=100.0 s30=0.47 t=64s
>> PPO update pi_loss=-0.0084 v_loss=11.2875 entropy=1.7357 kl=0.0044 lr=2.50e-04
ep=0113 [medium] steps=001 reward= +14.260 evac=1 hp=100.0 s30=0.50 t=64s
ep=0114 [medium] steps=012 reward= +16.180 evac=1 hp=100.0 s30=0.50 t=65s
ep=0115 [medium] steps=042 reward= +10.953 evac=1 hp= 71.5 s30=0.53 t=65s
ep=0116 [medium] steps=116 reward= +4.300 evac=1 hp=100.0 s30=0.53 t=65s
ep=0117 [medium] steps=150 reward= -12.700 evac=0 hp=100.0 s30=0.53 t=66s
ep=0118 [medium] steps=083 reward= +12.210 evac=1 hp=100.0 s30=0.57 t=66s
ep=0119 [medium] steps=022 reward= -15.580 evac=0 hp= 0.0 s30=0.53 t=66s
ep=0120 [medium] steps=023 reward= +15.445 evac=1 hp= 99.0 s30=0.53 t=66s
>> PPO update pi_loss=-0.0151 v_loss=18.8505 entropy=1.7544 kl=0.0059 lr=2.46e-04
ep=0121 [medium] steps=005 reward= +15.020 evac=1 hp=100.0 s30=0.57 t=67s
ep=0122 [medium] steps=065 reward= +11.930 evac=1 hp=100.0 s30=0.57 t=67s
ep=0123 [medium] steps=010 reward= +11.925 evac=1 hp= 55.0 s30=0.57 t=67s
ep=0124 [medium] steps=044 reward= +15.040 evac=1 hp=100.0 s30=0.60 t=67s
ep=0125 [medium] steps=093 reward= -24.720 evac=0 hp= 0.0 s30=0.60 t=68s
** EVAL [hard] reward=-7.792 success=0.20
ep=0126 [medium] steps=010 reward= +14.650 evac=1 hp=100.0 s30=0.63 t=69s
ep=0127 [medium] steps=019 reward= -13.110 evac=0 hp= 0.0 s30=0.60 t=69s
ep=0128 [medium] steps=018 reward= +14.210 evac=1 hp=100.0 s30=0.63 t=69s
>> PPO update pi_loss=+0.0149 v_loss=31.2597 entropy=1.5042 kl=0.0039 lr=2.42e-04
ep=0129 [medium] steps=150 reward= -18.700 evac=0 hp= 95.0 s30=0.60 t=70s
ep=0130 [medium] steps=039 reward= +14.670 evac=1 hp=100.0 s30=0.63 t=70s
ep=0131 [medium] steps=034 reward= -10.750 evac=0 hp= 0.0 s30=0.60 t=70s
ep=0132 [medium] steps=045 reward= -23.760 evac=0 hp= 0.0 s30=0.60 t=70s
ep=0133 [medium] steps=150 reward= -17.000 evac=0 hp=100.0 s30=0.57 t=71s
ep=0134 [medium] steps=029 reward= +15.530 evac=1 hp=100.0 s30=0.60 t=71s
ep=0135 [medium] steps=150 reward= -27.040 evac=0 hp= 96.0 s30=0.60 t=72s
ep=0136 [medium] steps=016 reward= +15.660 evac=1 hp=100.0 s30=0.60 t=72s
>> PPO update pi_loss=-0.0045 v_loss=18.2808 entropy=1.4920 kl=0.0046 lr=2.39e-04
ep=0137 [medium] steps=016 reward= -14.740 evac=0 hp= 0.0 s30=0.60 t=72s
ep=0138 [medium] steps=012 reward= +16.740 evac=1 hp=100.0 s30=0.63 t=72s
ep=0139 [medium] steps=013 reward= +16.620 evac=1 hp=100.0 s30=0.63 t=73s
ep=0140 [medium] steps=008 reward= +14.720 evac=1 hp=100.0 s30=0.63 t=73s
ep=0141 [medium] steps=061 reward= -18.720 evac=0 hp= 0.0 s30=0.63 t=73s
ep=0142 [medium] steps=005 reward= +14.570 evac=1 hp=100.0 s30=0.63 t=73s
ep=0143 [medium] steps=027 reward= +12.642 evac=1 hp= 59.5 s30=0.63 t=73s
ep=0144 [medium] steps=060 reward= +13.820 evac=1 hp=100.0 s30=0.63 t=73s
>> PPO update pi_loss=-0.0005 v_loss=64.7183 entropy=1.4458 kl=0.0003 lr=2.35e-04
ep=0145 [medium] steps=150 reward= -16.260 evac=0 hp=100.0 s30=0.60 t=74s
ep=0146 [medium] steps=029 reward= +12.297 evac=1 hp= 94.5 s30=0.60 t=74s
ep=0147 [medium] steps=055 reward= +11.020 evac=1 hp=100.0 s30=0.63 t=74s
ep=0148 [medium] steps=015 reward= +16.030 evac=1 hp=100.0 s30=0.63 t=75s
ep=0149 [medium] steps=029 reward= +15.030 evac=1 hp=100.0 s30=0.67 t=75s
ep=0150 [medium] steps=050 reward= +7.125 evac=1 hp= 47.0 s30=0.67 t=75s
** EVAL [hard] reward=-4.237 success=0.40
ep=0151 [medium] steps=048 reward= -0.080 evac=1 hp= 14.0 s30=0.67 t=76s
ep=0152 [medium] steps=022 reward= +14.553 evac=1 hp= 81.5 s30=0.67 t=76s
>> PPO update pi_loss=-0.0157 v_loss=36.3181 entropy=1.6305 kl=0.0048 lr=2.32e-04
ep=0153 [medium] steps=022 reward= -7.700 evac=0 hp= 0.0 s30=0.63 t=77s
ep=0154 [medium] steps=150 reward= -29.100 evac=0 hp=100.0 s30=0.60 t=77s
ep=0155 [medium] steps=028 reward= -16.030 evac=0 hp= 0.0 s30=0.60 t=77s
ep=0156 [medium] steps=019 reward= +15.060 evac=1 hp=100.0 s30=0.60 t=77s
ep=0157 [medium] steps=004 reward= +15.550 evac=1 hp=100.0 s30=0.63 t=77s
ep=0158 [medium] steps=023 reward= +16.510 evac=1 hp=100.0 s30=0.63 t=78s
ep=0159 [medium] steps=020 reward= +16.350 evac=1 hp=100.0 s30=0.67 t=78s
ep=0160 [medium] steps=025 reward= -12.920 evac=0 hp= 0.0 s30=0.63 t=78s
>> PPO update pi_loss=-0.0265 v_loss=29.0037 entropy=1.5078 kl=0.0049 lr=2.28e-04
ep=0161 [medium] steps=038 reward= -12.270 evac=0 hp= 0.0 s30=0.63 t=78s
ep=0162 [medium] steps=035 reward= -13.590 evac=0 hp= 0.0 s30=0.63 t=79s
ep=0163 [medium] steps=037 reward= -16.930 evac=0 hp= 0.0 s30=0.63 t=79s
ep=0164 [medium] steps=150 reward= -11.790 evac=0 hp=100.0 s30=0.60 t=79s
ep=0165 [medium] steps=030 reward= -15.790 evac=0 hp= 0.0 s30=0.60 t=80s
ep=0166 [medium] steps=150 reward= -9.340 evac=0 hp=100.0 s30=0.57 t=80s
ep=0167 [medium] steps=094 reward= +10.390 evac=1 hp=100.0 s30=0.60 t=81s
ep=0168 [medium] steps=021 reward= +15.260 evac=1 hp=100.0 s30=0.60 t=81s
>> PPO update pi_loss=+0.0017 v_loss=17.6311 entropy=1.7762 kl=0.0062 lr=2.24e-04
ep=0169 [medium] steps=064 reward= -26.290 evac=0 hp= 0.0 s30=0.57 t=82s
ep=0170 [medium] steps=021 reward= -11.750 evac=0 hp= 0.0 s30=0.53 t=82s
ep=0171 [medium] steps=017 reward= +16.280 evac=1 hp=100.0 s30=0.57 t=82s
ep=0172 [medium] steps=028 reward= +15.830 evac=1 hp=100.0 s30=0.57 t=82s
ep=0173 [medium] steps=052 reward= +14.260 evac=1 hp=100.0 s30=0.57 t=82s
ep=0174 [medium] steps=018 reward= -12.960 evac=0 hp= 0.0 s30=0.53 t=82s
ep=0175 [medium] steps=040 reward= -19.070 evac=0 hp= 0.0 s30=0.53 t=82s
** EVAL [hard] reward=-6.674 success=0.20
ep=0176 [medium] steps=150 reward= -30.735 evac=0 hp= 36.5 s30=0.50 t=84s
>> PPO update pi_loss=-0.0060 v_loss=30.8760 entropy=1.4049 kl=0.0037 lr=2.21e-04
ep=0177 [medium] steps=008 reward= +15.130 evac=1 hp=100.0 s30=0.50 t=84s
ep=0178 [medium] steps=031 reward= +16.050 evac=1 hp=100.0 s30=0.50 t=84s
ep=0179 [medium] steps=009 reward= +15.070 evac=1 hp=100.0 s30=0.50 t=85s
ep=0180 [medium] steps=150 reward= -15.990 evac=0 hp= 6.0 s30=0.47 t=85s
ep=0181 [medium] steps=039 reward= -14.280 evac=0 hp= 0.0 s30=0.43 t=85s
ep=0182 [medium] steps=013 reward= +17.160 evac=1 hp=100.0 s30=0.43 t=85s
ep=0183 [medium] steps=026 reward= +14.380 evac=1 hp=100.0 s30=0.47 t=86s
ep=0184 [medium] steps=150 reward= -8.320 evac=0 hp=100.0 s30=0.47 t=86s
>> PPO update pi_loss=-0.0047 v_loss=30.2118 entropy=1.9547 kl=0.0032 lr=2.17e-04
ep=0185 [medium] steps=021 reward= +16.540 evac=1 hp=100.0 s30=0.50 t=87s
ep=0186 [medium] steps=081 reward= +11.440 evac=1 hp=100.0 s30=0.50 t=87s
ep=0187 [medium] steps=019 reward= +13.470 evac=1 hp= 84.0 s30=0.50 t=87s
ep=0188 [medium] steps=002 reward= +14.730 evac=1 hp=100.0 s30=0.50 t=87s
ep=0189 [medium] steps=150 reward= -12.590 evac=0 hp= 92.0 s30=0.47 t=88s
ep=0190 [medium] steps=028 reward= +16.133 evac=1 hp= 95.5 s30=0.50 t=88s
ep=0191 [medium] steps=038 reward= -20.930 evac=0 hp= 0.0 s30=0.50 t=89s
ep=0192 [medium] steps=034 reward= +14.270 evac=1 hp=100.0 s30=0.53 t=89s
>> PPO update pi_loss=-0.0183 v_loss=29.7914 entropy=1.7819 kl=0.0039 lr=2.14e-04
ep=0193 [medium] steps=037 reward= +11.290 evac=1 hp= 34.0 s30=0.57 t=89s
ep=0194 [medium] steps=020 reward= -16.220 evac=0 hp= 0.0 s30=0.57 t=89s
ep=0195 [medium] steps=017 reward= +17.320 evac=1 hp=100.0 s30=0.60 t=89s
ep=0196 [medium] steps=027 reward= +10.703 evac=1 hp= 35.5 s30=0.63 t=89s
ep=0197 [medium] steps=150 reward= -32.225 evac=0 hp= 1.5 s30=0.60 t=90s
ep=0198 [medium] steps=050 reward= -15.130 evac=0 hp= 0.0 s30=0.57 t=90s
ep=0199 [medium] steps=017 reward= +16.950 evac=1 hp=100.0 s30=0.60 t=90s
ep=0200 [medium] steps=150 reward= -26.000 evac=0 hp= 17.0 s30=0.60 t=91s
>> PPO update pi_loss=-0.0057 v_loss=18.1479 entropy=1.1786 kl=0.0061 lr=2.10e-04
** EVAL [hard] reward=-12.304 success=0.00
ep=0201 [medium] steps=150 reward= -16.065 evac=0 hp= 93.5 s30=0.57 t=94s
ep=0202 [medium] steps=021 reward= +16.650 evac=1 hp=100.0 s30=0.57 t=95s
ep=0203 [medium] steps=078 reward= -9.000 evac=0 hp= 0.0 s30=0.53 t=95s
ep=0204 [medium] steps=023 reward= -18.280 evac=0 hp= 0.0 s30=0.53 t=95s
ep=0205 [medium] steps=002 reward= +14.730 evac=1 hp=100.0 s30=0.57 t=95s
ep=0206 [medium] steps=044 reward= -9.060 evac=0 hp= 0.0 s30=0.57 t=95s
ep=0207 [medium] steps=033 reward= -17.560 evac=0 hp= 0.0 s30=0.53 t=95s
ep=0208 [medium] steps=018 reward= -16.030 evac=0 hp= 0.0 s30=0.50 t=96s
>> PPO update pi_loss=-0.0094 v_loss=20.9687 entropy=1.5537 kl=0.0047 lr=2.06e-04
ep=0209 [medium] steps=029 reward= +11.915 evac=1 hp= 63.0 s30=0.50 t=96s
ep=0210 [medium] steps=150 reward= -19.650 evac=0 hp=100.0 s30=0.50 t=97s
ep=0211 [medium] steps=013 reward= +16.290 evac=1 hp=100.0 s30=0.53 t=97s
ep=0212 [medium] steps=150 reward= -11.440 evac=0 hp= 88.0 s30=0.50 t=97s
ep=0213 [medium] steps=150 reward= -18.155 evac=0 hp= 99.5 s30=0.47 t=98s
ep=0214 [medium] steps=002 reward= +14.730 evac=1 hp=100.0 s30=0.50 t=98s
ep=0215 [medium] steps=140 reward= +9.375 evac=1 hp= 97.0 s30=0.50 t=99s
ep=0216 [medium] steps=150 reward= -20.495 evac=0 hp= 67.5 s30=0.47 t=99s
>> PPO update pi_loss=-0.0179 v_loss=3.1620 entropy=1.5723 kl=0.0041 lr=2.03e-04
ep=0217 [medium] steps=019 reward= +13.883 evac=1 hp= 89.5 s30=0.47 t=100s
ep=0218 [medium] steps=045 reward= -10.200 evac=0 hp= 0.0 s30=0.43 t=100s
ep=0219 [medium] steps=007 reward= +15.170 evac=1 hp=100.0 s30=0.47 t=100s
ep=0220 [medium] steps=019 reward= -11.850 evac=0 hp= 0.0 s30=0.43 t=100s
ep=0221 [medium] steps=010 reward= +15.740 evac=1 hp=100.0 s30=0.47 t=100s
ep=0222 [medium] steps=034 reward= +14.350 evac=1 hp=100.0 s30=0.47 t=100s
ep=0223 [medium] steps=150 reward= -10.765 evac=0 hp= 12.5 s30=0.43 t=101s
ep=0224 [medium] steps=150 reward= -8.865 evac=0 hp= 86.5 s30=0.43 t=102s
>> PPO update pi_loss=-0.0082 v_loss=28.3779 entropy=1.9019 kl=0.0052 lr=1.99e-04
ep=0225 [medium] steps=005 reward= +15.900 evac=1 hp=100.0 s30=0.43 t=102s
** EVAL [hard] reward=-11.080 success=0.00
ep=0226 [medium] steps=150 reward= -11.360 evac=0 hp=100.0 s30=0.40 t=105s
ep=0227 [medium] steps=011 reward= +15.840 evac=1 hp=100.0 s30=0.43 t=105s
ep=0228 [medium] steps=025 reward= +15.300 evac=1 hp=100.0 s30=0.47 t=105s
ep=0229 [medium] steps=068 reward= -22.710 evac=0 hp= 0.0 s30=0.43 t=106s
ep=0230 [medium] steps=015 reward= -12.360 evac=0 hp= 0.0 s30=0.43 t=106s
ep=0231 [medium] steps=044 reward= -11.060 evac=0 hp= 0.0 s30=0.43 t=106s
ep=0232 [medium] steps=015 reward= +17.160 evac=1 hp=100.0 s30=0.43 t=106s
>> PPO update pi_loss=+0.0029 v_loss=33.0012 entropy=1.4043 kl=0.0086 lr=1.96e-04
ep=0233 [medium] steps=013 reward= +16.830 evac=1 hp=100.0 s30=0.47 t=106s
ep=0234 [medium] steps=099 reward= +9.075 evac=1 hp= 59.0 s30=0.50 t=107s
ep=0235 [medium] steps=150 reward= -18.945 evac=0 hp= 66.5 s30=0.47 t=108s
ep=0236 [medium] steps=009 reward= +16.310 evac=1 hp=100.0 s30=0.50 t=108s
ep=0237 [medium] steps=023 reward= -14.880 evac=0 hp= 0.0 s30=0.50 t=108s
ep=0238 [medium] steps=150 reward= -19.820 evac=0 hp=100.0 s30=0.50 t=108s
ep=0239 [medium] steps=150 reward= -14.000 evac=0 hp=100.0 s30=0.47 t=109s
ep=0240 [medium] steps=053 reward= +10.170 evac=1 hp= 32.0 s30=0.50 t=109s
>> PPO update pi_loss=-0.0042 v_loss=13.0209 entropy=1.5150 kl=0.0011 lr=1.92e-04
ep=0241 [medium] steps=150 reward= -19.830 evac=0 hp=100.0 s30=0.47 t=110s
ep=0242 [medium] steps=150 reward= -11.270 evac=0 hp= 74.0 s30=0.47 t=111s
ep=0243 [medium] steps=047 reward= -10.300 evac=0 hp= 0.0 s30=0.47 t=111s
ep=0244 [medium] steps=046 reward= -21.860 evac=0 hp= 0.0 s30=0.43 t=111s
ep=0245 [medium] steps=150 reward= -12.495 evac=0 hp= 93.5 s30=0.40 t=112s
ep=0246 [medium] steps=030 reward= -10.980 evac=0 hp= 0.0 s30=0.40 t=112s
ep=0247 [medium] steps=031 reward= +9.955 evac=1 hp= 65.0 s30=0.40 t=112s
ep=0248 [medium] steps=050 reward= +14.810 evac=1 hp=100.0 s30=0.43 t=112s
>> PPO update pi_loss=-0.0064 v_loss=11.4714 entropy=1.6570 kl=0.0017 lr=1.88e-04
ep=0249 [medium] steps=033 reward= +15.620 evac=1 hp=100.0 s30=0.43 t=113s
ep=0250 [medium] steps=042 reward= -13.750 evac=0 hp= 0.0 s30=0.43 t=113s
** EVAL [hard] reward=-5.648 success=0.20
ep=0251 [medium] steps=150 reward= -21.140 evac=0 hp=100.0 s30=0.40 t=115s
ep=0252 [medium] steps=017 reward= +14.880 evac=1 hp=100.0 s30=0.40 t=115s
ep=0253 [medium] steps=025 reward= +16.060 evac=1 hp=100.0 s30=0.43 t=115s
ep=0254 [medium] steps=041 reward= +14.490 evac=1 hp=100.0 s30=0.47 t=115s
ep=0255 [medium] steps=150 reward= -10.270 evac=0 hp=100.0 s30=0.43 t=116s
ep=0256 [medium] steps=047 reward= +14.500 evac=1 hp=100.0 s30=0.47 t=116s
>> PPO update pi_loss=-0.0026 v_loss=19.7825 entropy=1.6007 kl=0.0029 lr=1.85e-04
ep=0257 [medium] steps=085 reward= -22.280 evac=0 hp= 0.0 s30=0.43 t=117s
ep=0258 [medium] steps=014 reward= +15.860 evac=1 hp=100.0 s30=0.43 t=117s
ep=0259 [medium] steps=150 reward= -10.100 evac=0 hp=100.0 s30=0.43 t=118s
ep=0260 [medium] steps=099 reward= -20.870 evac=0 hp= 0.0 s30=0.43 t=118s
ep=0261 [medium] steps=010 reward= +16.800 evac=1 hp=100.0 s30=0.47 t=118s
ep=0262 [medium] steps=043 reward= +15.430 evac=1 hp=100.0 s30=0.47 t=118s
ep=0263 [medium] steps=042 reward= +15.385 evac=1 hp= 97.0 s30=0.47 t=118s
ep=0264 [medium] steps=058 reward= -15.040 evac=0 hp= 0.0 s30=0.43 t=119s
>> PPO update pi_loss=+0.0024 v_loss=18.7074 entropy=1.6059 kl=0.0008 lr=1.81e-04
ep=0265 [medium] steps=028 reward= +15.670 evac=1 hp=100.0 s30=0.47 t=119s
ep=0266 [medium] steps=037 reward= -20.460 evac=0 hp= 0.0 s30=0.43 t=119s
ep=0267 [medium] steps=023 reward= +12.072 evac=1 hp= 39.5 s30=0.47 t=119s
ep=0268 [medium] steps=024 reward= +7.133 evac=1 hp= 21.5 s30=0.50 t=119s
ep=0269 [medium] steps=014 reward= +16.080 evac=1 hp=100.0 s30=0.53 t=119s
ep=0270 [medium] steps=013 reward= +10.907 evac=1 hp= 30.5 s30=0.53 t=120s
ep=0271 [medium] steps=068 reward= +13.167 evac=1 hp= 98.5 s30=0.57 t=120s
ep=0272 [medium] steps=026 reward= +9.660 evac=1 hp= 50.0 s30=0.60 t=120s
>> PPO update pi_loss=-0.0026 v_loss=51.5106 entropy=1.6209 kl=0.0003 lr=1.78e-04
ep=0273 [medium] steps=150 reward= -15.250 evac=0 hp= 18.0 s30=0.60 t=121s
ep=0274 [medium] steps=046 reward= +15.253 evac=1 hp= 99.5 s30=0.63 t=121s
ep=0275 [medium] steps=050 reward= -20.940 evac=0 hp= 0.0 s30=0.63 t=121s
** EVAL [hard] reward=-10.368 success=0.00
ep=0276 [medium] steps=016 reward= +16.070 evac=1 hp=100.0 s30=0.67 t=123s
ep=0277 [medium] steps=005 reward= +15.490 evac=1 hp=100.0 s30=0.67 t=123s
ep=0278 [medium] steps=032 reward= -12.180 evac=0 hp= 0.0 s30=0.63 t=123s
ep=0279 [medium] steps=027 reward= +14.932 evac=1 hp= 99.5 s30=0.63 t=123s
ep=0280 [medium] steps=007 reward= +15.780 evac=1 hp=100.0 s30=0.67 t=123s
>> PPO update pi_loss=-0.0248 v_loss=32.8994 entropy=1.7511 kl=0.0017 lr=1.74e-04
ep=0281 [medium] steps=008 reward= +14.720 evac=1 hp=100.0 s30=0.70 t=124s
ep=0282 [medium] steps=044 reward= +13.120 evac=1 hp= 98.0 s30=0.70 t=124s
ep=0283 [medium] steps=007 reward= +15.780 evac=1 hp=100.0 s30=0.70 t=124s
ep=0284 [medium] steps=025 reward= -13.390 evac=0 hp= 0.0 s30=0.67 t=124s
ep=0285 [medium] steps=059 reward= -18.330 evac=0 hp= 0.0 s30=0.67 t=124s
ep=0286 [medium] steps=014 reward= +13.222 evac=1 hp= 87.5 s30=0.67 t=124s
ep=0287 [medium] steps=033 reward= +14.592 evac=1 hp= 93.5 s30=0.70 t=125s
ep=0288 [medium] steps=003 reward= +15.160 evac=1 hp=100.0 s30=0.70 t=125s
>> PPO update pi_loss=-0.0022 v_loss=36.9067 entropy=1.3689 kl=0.0008 lr=1.70e-04
ep=0289 [medium] steps=017 reward= +15.430 evac=1 hp=100.0 s30=0.73 t=125s
ep=0290 [medium] steps=036 reward= -8.750 evac=0 hp= 0.0 s30=0.73 t=125s
ep=0291 [medium] steps=021 reward= +14.890 evac=1 hp=100.0 s30=0.73 t=125s
ep=0292 [medium] steps=003 reward= +15.160 evac=1 hp=100.0 s30=0.73 t=125s
ep=0293 [medium] steps=048 reward= -10.520 evac=0 hp= 0.0 s30=0.70 t=125s
ep=0294 [medium] steps=027 reward= +16.330 evac=1 hp=100.0 s30=0.73 t=125s
ep=0295 [medium] steps=011 reward= +16.130 evac=1 hp=100.0 s30=0.73 t=125s
ep=0296 [medium] steps=042 reward= -12.320 evac=0 hp= 0.0 s30=0.73 t=126s
>> PPO update pi_loss=-0.0016 v_loss=49.3725 entropy=1.6299 kl=0.0004 lr=1.67e-04
ep=0297 [medium] steps=150 reward= -30.925 evac=0 hp= 66.5 s30=0.70 t=126s
ep=0298 [medium] steps=008 reward= +15.670 evac=1 hp=100.0 s30=0.70 t=126s
ep=0299 [medium] steps=030 reward= +15.950 evac=1 hp=100.0 s30=0.70 t=127s
ep=0300 [medium] steps=019 reward= -19.910 evac=0 hp= 0.0 s30=0.67 t=127s
** EVAL [hard] reward=-4.421 success=0.20
ep=0301 [medium] steps=009 reward= +16.700 evac=1 hp=100.0 s30=0.67 t=129s
ep=0302 [medium] steps=014 reward= +15.740 evac=1 hp=100.0 s30=0.67 t=129s
ep=0303 [medium] steps=019 reward= +15.830 evac=1 hp=100.0 s30=0.70 t=129s
ep=0304 [medium] steps=008 reward= +16.290 evac=1 hp=100.0 s30=0.70 t=129s
>> PPO update pi_loss=-0.0023 v_loss=32.9150 entropy=1.0633 kl=0.0003 lr=1.63e-04
ep=0305 [medium] steps=018 reward= +11.265 evac=1 hp= 57.0 s30=0.73 t=129s
ep=0306 [medium] steps=006 reward= +15.840 evac=1 hp=100.0 s30=0.73 t=129s
ep=0307 [medium] steps=004 reward= +15.080 evac=1 hp=100.0 s30=0.73 t=129s
ep=0308 [medium] steps=150 reward= -14.790 evac=0 hp=100.0 s30=0.73 t=130s
ep=0309 [medium] steps=027 reward= +15.510 evac=1 hp=100.0 s30=0.73 t=130s
ep=0310 [medium] steps=027 reward= +16.010 evac=1 hp=100.0 s30=0.73 t=130s
ep=0311 [medium] steps=150 reward= -15.270 evac=0 hp=100.0 s30=0.70 t=131s
ep=0312 [medium] steps=047 reward= +16.010 evac=1 hp=100.0 s30=0.70 t=131s
>> PPO update pi_loss=-0.0012 v_loss=20.0507 entropy=1.7914 kl=0.0008 lr=1.60e-04
ep=0313 [medium] steps=030 reward= -11.460 evac=0 hp= 0.0 s30=0.67 t=131s
ep=0314 [medium] steps=010 reward= +16.820 evac=1 hp=100.0 s30=0.70 t=131s
ep=0315 [medium] steps=150 reward= -10.870 evac=0 hp=100.0 s30=0.70 t=132s
ep=0316 [medium] steps=150 reward= -24.845 evac=0 hp= 71.5 s30=0.67 t=133s
ep=0317 [medium] steps=005 reward= +14.940 evac=1 hp=100.0 s30=0.67 t=133s
ep=0318 [medium] steps=032 reward= -10.190 evac=0 hp= 0.0 s30=0.63 t=133s
ep=0319 [medium] steps=005 reward= +15.900 evac=1 hp=100.0 s30=0.63 t=133s
ep=0320 [medium] steps=002 reward= +14.730 evac=1 hp=100.0 s30=0.67 t=133s
>> PPO update pi_loss=+0.0007 v_loss=20.7397 entropy=1.4529 kl=0.0018 lr=1.56e-04
ep=0321 [medium] steps=004 reward= +15.550 evac=1 hp=100.0 s30=0.67 t=133s
ep=0322 [medium] steps=007 reward= +15.360 evac=1 hp=100.0 s30=0.67 t=133s
ep=0323 [medium] steps=023 reward= -8.390 evac=0 hp= 0.0 s30=0.67 t=133s
ep=0324 [medium] steps=016 reward= +16.550 evac=1 hp=100.0 s30=0.67 t=133s
ep=0325 [medium] steps=028 reward= +15.430 evac=1 hp=100.0 s30=0.67 t=134s
** EVAL [hard] reward=-11.180 success=0.00
ep=0326 [medium] steps=017 reward= +11.340 evac=1 hp= 72.0 s30=0.70 t=135s
ep=0327 [medium] steps=150 reward= -14.040 evac=0 hp=100.0 s30=0.70 t=136s
ep=0328 [medium] steps=020 reward= +16.560 evac=1 hp=100.0 s30=0.70 t=136s
>> PPO update pi_loss=-0.1127 v_loss=34.7078 entropy=1.6140 kl=0.0021 lr=1.52e-04
ep=0329 [medium] steps=001 reward= +14.260 evac=1 hp=100.0 s30=0.70 t=136s
ep=0330 [medium] steps=015 reward= +16.492 evac=1 hp= 99.5 s30=0.73 t=136s
ep=0331 [medium] steps=026 reward= -10.810 evac=0 hp= 0.0 s30=0.70 t=136s
ep=0332 [medium] steps=035 reward= -16.300 evac=0 hp= 0.0 s30=0.67 t=136s
ep=0333 [medium] steps=024 reward= +12.070 evac=1 hp= 66.0 s30=0.67 t=136s
ep=0334 [medium] steps=021 reward= +15.380 evac=1 hp=100.0 s30=0.67 t=136s
ep=0335 [medium] steps=025 reward= +13.668 evac=1 hp= 96.5 s30=0.67 t=137s
ep=0336 [medium] steps=001 reward= +14.260 evac=1 hp=100.0 s30=0.67 t=137s
>> PPO update pi_loss=-0.0010 v_loss=51.7629 entropy=1.4248 kl=0.0002 lr=1.49e-04
ep=0337 [medium] steps=150 reward= -8.340 evac=0 hp=100.0 s30=0.63 t=137s
ep=0338 [medium] steps=045 reward= -22.490 evac=0 hp= 0.0 s30=0.63 t=138s
ep=0339 [medium] steps=011 reward= +15.610 evac=1 hp=100.0 s30=0.63 t=138s
ep=0340 [medium] steps=020 reward= +15.480 evac=1 hp= 98.0 s30=0.63 t=138s
ep=0341 [medium] steps=033 reward= -13.980 evac=0 hp= 0.0 s30=0.63 t=138s
ep=0342 [medium] steps=026 reward= +16.010 evac=1 hp=100.0 s30=0.63 t=138s
ep=0343 [medium] steps=010 reward= +16.640 evac=1 hp=100.0 s30=0.67 t=138s
ep=0344 [medium] steps=017 reward= +9.383 evac=1 hp= 37.5 s30=0.67 t=138s
>> PPO update pi_loss=-0.0115 v_loss=29.0228 entropy=1.6334 kl=0.0006 lr=1.45e-04
ep=0345 [medium] steps=037 reward= +15.953 evac=1 hp= 97.5 s30=0.70 t=139s
ep=0346 [medium] steps=018 reward= +16.840 evac=1 hp=100.0 s30=0.73 t=139s
ep=0347 [medium] steps=014 reward= -14.940 evac=0 hp= 0.0 s30=0.70 t=139s
ep=0348 [medium] steps=009 reward= +16.250 evac=1 hp=100.0 s30=0.73 t=139s
ep=0349 [medium] steps=042 reward= +15.600 evac=1 hp=100.0 s30=0.73 t=139s
ep=0350 [medium] steps=015 reward= +16.420 evac=1 hp=100.0 s30=0.73 t=139s
** EVAL [hard] reward=-9.845 success=0.00
ep=0351 [medium] steps=009 reward= +15.030 evac=1 hp=100.0 s30=0.73 t=141s
ep=0352 [medium] steps=015 reward= +16.290 evac=1 hp=100.0 s30=0.73 t=141s
>> PPO update pi_loss=-0.0015 v_loss=44.9861 entropy=1.4760 kl=0.0002 lr=1.42e-04
ep=0353 [medium] steps=011 reward= +16.240 evac=1 hp=100.0 s30=0.77 t=141s
ep=0354 [medium] steps=150 reward= -25.450 evac=0 hp= 56.0 s30=0.73 t=142s
ep=0355 [medium] steps=150 reward= -12.510 evac=0 hp=100.0 s30=0.70 t=142s
ep=0356 [medium] steps=016 reward= +15.078 evac=1 hp= 98.5 s30=0.70 t=142s
ep=0357 [medium] steps=016 reward= +16.920 evac=1 hp=100.0 s30=0.73 t=143s
ep=0358 [medium] steps=013 reward= +16.350 evac=1 hp=100.0 s30=0.73 t=143s
ep=0359 [medium] steps=015 reward= +15.070 evac=1 hp=100.0 s30=0.73 t=143s
ep=0360 [medium] steps=016 reward= +17.010 evac=1 hp=100.0 s30=0.73 t=143s
>> PPO update pi_loss=+0.0065 v_loss=18.1264 entropy=1.2002 kl=0.0020 lr=1.38e-04
ep=0361 [medium] steps=014 reward= +16.450 evac=1 hp=100.0 s30=0.77 t=143s
ep=0362 [medium] steps=054 reward= -6.740 evac=0 hp= 0.0 s30=0.77 t=143s
ep=0363 [medium] steps=060 reward= -19.560 evac=0 hp= 0.0 s30=0.73 t=144s
[curriculum] Advanced to 'hard' (suc30=0.73)
ep=0364 [medium] steps=017 reward= +15.980 evac=1 hp=100.0 s30=0.73 t=144s
ep=0365 [hard ] steps=031 reward= -11.430 evac=0 hp= 0.0 s30=0.70 t=144s
ep=0366 [easy ] steps=200 reward= -17.860 evac=0 hp=100.0 s30=0.67 t=145s
ep=0367 [medium] steps=016 reward= +12.742 evac=1 hp= 91.5 s30=0.70 t=145s
ep=0368 [medium] steps=038 reward= -7.300 evac=0 hp= 0.0 s30=0.70 t=145s
>> PPO update pi_loss=-0.0051 v_loss=15.8902 entropy=1.2523 kl=0.0026 lr=1.34e-04
ep=0369 [hard ] steps=038 reward= -13.110 evac=0 hp= 0.0 s30=0.67 t=145s
ep=0370 [hard ] steps=018 reward= -12.610 evac=0 hp= 0.0 s30=0.63 t=146s
ep=0371 [hard ] steps=100 reward= -8.120 evac=0 hp=100.0 s30=0.63 t=146s
ep=0372 [easy ] steps=030 reward= +18.690 evac=1 hp=100.0 s30=0.63 t=146s
ep=0373 [medium] steps=010 reward= +16.470 evac=1 hp=100.0 s30=0.63 t=146s
ep=0374 [medium] steps=037 reward= +14.580 evac=1 hp=100.0 s30=0.63 t=146s
ep=0375 [hard ] steps=031 reward= -13.080 evac=0 hp= 0.0 s30=0.60 t=147s
** EVAL [hard] reward=-11.320 success=0.00
ep=0376 [easy ] steps=008 reward= +18.940 evac=1 hp=100.0 s30=0.60 t=147s
>> PPO update pi_loss=+0.0186 v_loss=42.2980 entropy=1.9139 kl=0.0080 lr=1.31e-04
ep=0377 [easy ] steps=022 reward= +17.990 evac=1 hp=100.0 s30=0.63 t=148s
ep=0378 [hard ] steps=042 reward= -11.380 evac=0 hp= 0.0 s30=0.60 t=148s
ep=0379 [hard ] steps=045 reward= -12.320 evac=0 hp= 0.0 s30=0.57 t=148s
ep=0380 [hard ] steps=024 reward= -10.440 evac=0 hp= 0.0 s30=0.53 t=148s
ep=0381 [hard ] steps=032 reward= -9.660 evac=0 hp= 0.0 s30=0.50 t=148s
ep=0382 [medium] steps=007 reward= +16.480 evac=1 hp=100.0 s30=0.50 t=148s
ep=0383 [medium] steps=150 reward= -13.880 evac=0 hp=100.0 s30=0.47 t=149s
ep=0384 [hard ] steps=032 reward= -10.670 evac=0 hp= 0.0 s30=0.47 t=149s
>> PPO update pi_loss=-0.0179 v_loss=15.9173 entropy=2.0312 kl=0.0022 lr=1.27e-04
ep=0385 [medium] steps=013 reward= +15.470 evac=1 hp=100.0 s30=0.50 t=150s
ep=0386 [hard ] steps=040 reward= -12.560 evac=0 hp= 0.0 s30=0.47 t=150s
ep=0387 [hard ] steps=015 reward= -11.420 evac=0 hp= 0.0 s30=0.43 t=150s
ep=0388 [hard ] steps=100 reward= -7.770 evac=0 hp=100.0 s30=0.40 t=150s
ep=0389 [medium] steps=150 reward= -15.300 evac=0 hp= 93.0 s30=0.37 t=151s
ep=0390 [medium] steps=150 reward= -10.360 evac=0 hp=100.0 s30=0.33 t=152s
ep=0391 [hard ] steps=100 reward= -9.240 evac=0 hp=100.0 s30=0.30 t=152s
ep=0392 [medium] steps=010 reward= +16.320 evac=1 hp=100.0 s30=0.33 t=152s
>> PPO update pi_loss=-0.0158 v_loss=8.8704 entropy=1.8786 kl=0.0033 lr=1.24e-04
ep=0393 [medium] steps=063 reward= -10.280 evac=0 hp= 0.0 s30=0.33 t=153s
ep=0394 [hard ] steps=100 reward= -10.105 evac=0 hp= 72.5 s30=0.30 t=154s
ep=0395 [medium] steps=002 reward= +14.730 evac=1 hp=100.0 s30=0.33 t=154s
ep=0396 [hard ] steps=019 reward= -12.630 evac=0 hp= 0.0 s30=0.33 t=154s
ep=0397 [hard ] steps=027 reward= -15.040 evac=0 hp= 0.0 s30=0.30 t=154s
ep=0398 [easy ] steps=011 reward= +18.760 evac=1 hp=100.0 s30=0.33 t=154s
ep=0399 [easy ] steps=163 reward= +10.190 evac=1 hp=100.0 s30=0.37 t=154s
ep=0400 [easy ] steps=014 reward= +18.110 evac=1 hp=100.0 s30=0.40 t=155s
>> PPO update pi_loss=+0.0007 v_loss=10.2287 entropy=1.9814 kl=0.0007 lr=1.20e-04
** EVAL [hard] reward=-12.256 success=0.00
ep=0401 [hard ] steps=030 reward= -11.450 evac=0 hp= 0.0 s30=0.40 t=156s
ep=0402 [hard ] steps=100 reward= -8.010 evac=0 hp=100.0 s30=0.37 t=157s
ep=0403 [medium] steps=027 reward= -10.860 evac=0 hp= 0.0 s30=0.33 t=157s
ep=0404 [medium] steps=004 reward= +15.550 evac=1 hp=100.0 s30=0.33 t=157s
ep=0405 [hard ] steps=100 reward= -8.615 evac=0 hp= 93.5 s30=0.33 t=157s
ep=0406 [medium] steps=018 reward= -15.140 evac=0 hp= 0.0 s30=0.30 t=157s
ep=0407 [hard ] steps=038 reward= +13.605 evac=1 hp= 97.0 s30=0.30 t=157s
ep=0408 [easy ] steps=026 reward= +17.300 evac=1 hp=100.0 s30=0.33 t=158s
>> PPO update pi_loss=-0.0036 v_loss=28.6206 entropy=1.8868 kl=0.0006 lr=1.16e-04
ep=0409 [hard ] steps=100 reward= -9.880 evac=0 hp=100.0 s30=0.33 t=158s
ep=0410 [medium] steps=049 reward= +13.980 evac=1 hp=100.0 s30=0.37 t=159s
ep=0411 [hard ] steps=054 reward= -11.150 evac=0 hp= 0.0 s30=0.37 t=159s
ep=0412 [medium] steps=150 reward= -10.620 evac=0 hp=100.0 s30=0.33 t=160s
ep=0413 [medium] steps=010 reward= +16.230 evac=1 hp=100.0 s30=0.37 t=160s
ep=0414 [hard ] steps=029 reward= +13.570 evac=1 hp=100.0 s30=0.40 t=160s
ep=0415 [easy ] steps=026 reward= +15.887 evac=1 hp= 84.5 s30=0.40 t=160s
ep=0416 [medium] steps=072 reward= -20.080 evac=0 hp= 0.0 s30=0.40 t=160s
>> PPO update pi_loss=-0.0059 v_loss=18.3843 entropy=1.7402 kl=0.0007 lr=1.13e-04
ep=0417 [easy ] steps=200 reward= -15.510 evac=0 hp=100.0 s30=0.40 t=161s
ep=0418 [easy ] steps=200 reward= -10.150 evac=0 hp=100.0 s30=0.40 t=162s
ep=0419 [medium] steps=019 reward= -20.390 evac=0 hp= 0.0 s30=0.40 t=162s
ep=0420 [easy ] steps=012 reward= +17.270 evac=1 hp=100.0 s30=0.43 t=162s
ep=0421 [hard ] steps=029 reward= -12.360 evac=0 hp= 0.0 s30=0.43 t=163s
ep=0422 [hard ] steps=056 reward= -17.780 evac=0 hp= 0.0 s30=0.40 t=163s
ep=0423 [hard ] steps=020 reward= -13.260 evac=0 hp= 0.0 s30=0.40 t=163s
ep=0424 [hard ] steps=041 reward= -12.590 evac=0 hp= 0.0 s30=0.40 t=163s
>> PPO update pi_loss=-0.0197 v_loss=16.7425 entropy=1.9853 kl=0.0074 lr=1.09e-04
ep=0425 [hard ] steps=038 reward= +13.130 evac=1 hp=100.0 s30=0.40 t=164s
** EVAL [hard] reward=-7.024 success=0.20
ep=0426 [hard ] steps=027 reward= -11.930 evac=0 hp= 0.0 s30=0.40 t=165s
ep=0427 [easy ] steps=001 reward= +16.760 evac=1 hp=100.0 s30=0.43 t=165s
ep=0428 [hard ] steps=100 reward= -9.300 evac=0 hp=100.0 s30=0.40 t=165s
ep=0429 [hard ] steps=016 reward= -11.390 evac=0 hp= 0.0 s30=0.37 t=165s
ep=0430 [medium] steps=019 reward= -13.260 evac=0 hp= 0.0 s30=0.33 t=166s
ep=0431 [hard ] steps=100 reward= -7.910 evac=0 hp=100.0 s30=0.33 t=166s
ep=0432 [easy ] steps=010 reward= +18.790 evac=1 hp=100.0 s30=0.37 t=166s
>> PPO update pi_loss=+0.0241 v_loss=23.2591 entropy=2.0669 kl=0.0023 lr=1.06e-04
ep=0433 [hard ] steps=020 reward= -13.380 evac=0 hp= 0.0 s30=0.37 t=167s
ep=0434 [easy ] steps=011 reward= +18.740 evac=1 hp=100.0 s30=0.37 t=167s
ep=0435 [medium] steps=023 reward= +13.832 evac=1 hp= 99.5 s30=0.40 t=167s
ep=0436 [hard ] steps=013 reward= +13.010 evac=1 hp=100.0 s30=0.43 t=167s
ep=0437 [hard ] steps=037 reward= -17.250 evac=0 hp= 0.0 s30=0.40 t=167s
ep=0438 [easy ] steps=014 reward= +18.400 evac=1 hp=100.0 s30=0.40 t=167s
ep=0439 [medium] steps=003 reward= +14.670 evac=1 hp=100.0 s30=0.43 t=167s
ep=0440 [medium] steps=034 reward= +14.730 evac=1 hp= 94.0 s30=0.43 t=167s
>> PPO update pi_loss=+0.0003 v_loss=67.2617 entropy=1.6175 kl=0.0005 lr=1.02e-04
ep=0441 [medium] steps=015 reward= +16.330 evac=1 hp=100.0 s30=0.47 t=167s
ep=0442 [hard ] steps=021 reward= -9.930 evac=0 hp= 0.0 s30=0.47 t=168s
ep=0443 [hard ] steps=100 reward= -7.920 evac=0 hp=100.0 s30=0.43 t=168s
ep=0444 [hard ] steps=027 reward= -10.220 evac=0 hp= 0.0 s30=0.40 t=168s
ep=0445 [easy ] steps=200 reward= -13.360 evac=0 hp=100.0 s30=0.37 t=169s
ep=0446 [medium] steps=026 reward= +14.897 evac=1 hp= 96.5 s30=0.40 t=169s
ep=0447 [hard ] steps=100 reward= -8.950 evac=0 hp=100.0 s30=0.40 t=170s
ep=0448 [hard ] steps=028 reward= -11.460 evac=0 hp= 0.0 s30=0.40 t=170s
>> PPO update pi_loss=-0.0618 v_loss=6.9971 entropy=1.9275 kl=0.0016 lr=9.84e-05
ep=0449 [hard ] steps=100 reward= -7.500 evac=0 hp=100.0 s30=0.40 t=171s
ep=0450 [hard ] steps=057 reward= +13.620 evac=1 hp=100.0 s30=0.40 t=171s
** EVAL [hard] reward=-10.726 success=0.00
ep=0451 [medium] steps=007 reward= +16.480 evac=1 hp=100.0 s30=0.43 t=173s
ep=0452 [medium] steps=014 reward= +15.950 evac=1 hp=100.0 s30=0.47 t=173s
ep=0453 [easy ] steps=016 reward= +18.320 evac=1 hp=100.0 s30=0.50 t=173s
ep=0454 [medium] steps=084 reward= -19.540 evac=0 hp= 0.0 s30=0.50 t=173s
ep=0455 [hard ] steps=100 reward= -8.100 evac=0 hp=100.0 s30=0.47 t=173s
ep=0456 [medium] steps=017 reward= +14.253 evac=1 hp= 99.5 s30=0.50 t=174s
>> PPO update pi_loss=+0.0019 v_loss=18.8275 entropy=1.9587 kl=0.0010 lr=9.48e-05
ep=0457 [medium] steps=012 reward= +15.530 evac=1 hp=100.0 s30=0.50 t=174s
ep=0458 [medium] steps=017 reward= -13.910 evac=0 hp= 0.0 s30=0.50 t=174s
ep=0459 [medium] steps=009 reward= +16.680 evac=1 hp=100.0 s30=0.53 t=174s
ep=0460 [medium] steps=150 reward= -18.775 evac=0 hp= 43.5 s30=0.53 t=175s
ep=0461 [medium] steps=150 reward= -14.020 evac=0 hp= 97.0 s30=0.53 t=175s
ep=0462 [medium] steps=150 reward= -11.730 evac=0 hp=100.0 s30=0.50 t=176s
ep=0463 [hard ] steps=024 reward= -10.500 evac=0 hp= 0.0 s30=0.50 t=176s
ep=0464 [hard ] steps=032 reward= -11.090 evac=0 hp= 0.0 s30=0.47 t=176s
>> PPO update pi_loss=-0.0086 v_loss=13.1403 entropy=1.5553 kl=0.0028 lr=9.12e-05
ep=0465 [hard ] steps=024 reward= -11.580 evac=0 hp= 0.0 s30=0.43 t=177s
ep=0466 [medium] steps=018 reward= +16.110 evac=1 hp=100.0 s30=0.43 t=177s
ep=0467 [hard ] steps=100 reward= -9.110 evac=0 hp=100.0 s30=0.43 t=177s
ep=0468 [medium] steps=019 reward= +15.550 evac=1 hp=100.0 s30=0.43 t=178s
ep=0469 [easy ] steps=200 reward= -23.130 evac=0 hp=100.0 s30=0.40 t=178s
ep=0470 [hard ] steps=025 reward= -14.400 evac=0 hp= 0.0 s30=0.37 t=179s
ep=0471 [medium] steps=062 reward= -13.210 evac=0 hp= 0.0 s30=0.33 t=179s
ep=0472 [medium] steps=150 reward= -23.735 evac=0 hp= 96.5 s30=0.33 t=179s
>> PPO update pi_loss=-0.0109 v_loss=10.0870 entropy=1.3562 kl=0.0016 lr=8.76e-05
ep=0473 [hard ] steps=023 reward= -13.950 evac=0 hp= 0.0 s30=0.33 t=180s
ep=0474 [hard ] steps=100 reward= -10.855 evac=0 hp= 98.5 s30=0.33 t=181s
ep=0475 [hard ] steps=100 reward= -9.080 evac=0 hp=100.0 s30=0.33 t=181s
** EVAL [hard] reward=-9.072 success=0.00
ep=0476 [hard ] steps=100 reward= -9.125 evac=0 hp= 74.5 s30=0.30 t=184s
ep=0477 [easy ] steps=011 reward= +18.740 evac=1 hp=100.0 s30=0.33 t=184s
ep=0478 [easy ] steps=016 reward= +17.720 evac=1 hp=100.0 s30=0.37 t=184s
ep=0479 [medium] steps=150 reward= -17.390 evac=0 hp=100.0 s30=0.37 t=185s
ep=0480 [easy ] steps=004 reward= +18.050 evac=1 hp=100.0 s30=0.37 t=185s
>> PPO update pi_loss=-0.0001 v_loss=7.6346 entropy=1.6924 kl=0.0008 lr=8.40e-05
ep=0481 [easy ] steps=123 reward= +12.600 evac=1 hp=100.0 s30=0.37 t=186s
ep=0482 [hard ] steps=100 reward= -10.260 evac=0 hp=100.0 s30=0.33 t=186s
ep=0483 [easy ] steps=007 reward= +18.650 evac=1 hp=100.0 s30=0.33 t=186s
ep=0484 [medium] steps=016 reward= +16.180 evac=1 hp=100.0 s30=0.37 t=186s
ep=0485 [easy ] steps=200 reward= -21.930 evac=0 hp=100.0 s30=0.37 t=187s
ep=0486 [easy ] steps=017 reward= +17.980 evac=1 hp=100.0 s30=0.37 t=187s
ep=0487 [hard ] steps=100 reward= -7.490 evac=0 hp=100.0 s30=0.33 t=188s
ep=0488 [hard ] steps=043 reward= -12.790 evac=0 hp= 0.0 s30=0.33 t=188s
>> PPO update pi_loss=-0.0032 v_loss=7.0721 entropy=1.6047 kl=0.0014 lr=8.04e-05
ep=0489 [hard ] steps=100 reward= -8.410 evac=0 hp=100.0 s30=0.30 t=189s
ep=0490 [medium] steps=015 reward= +16.340 evac=1 hp=100.0 s30=0.33 t=189s
ep=0491 [easy ] steps=029 reward= +18.520 evac=1 hp=100.0 s30=0.37 t=189s
ep=0492 [easy ] steps=012 reward= +18.370 evac=1 hp=100.0 s30=0.40 t=189s
ep=0493 [hard ] steps=036 reward= -13.860 evac=0 hp= 0.0 s30=0.40 t=190s
ep=0494 [medium] steps=150 reward= -17.640 evac=0 hp= 78.0 s30=0.40 t=190s
ep=0495 [medium] steps=003 reward= +15.160 evac=1 hp=100.0 s30=0.43 t=190s
ep=0496 [hard ] steps=025 reward= -9.910 evac=0 hp= 0.0 s30=0.40 t=190s
>> PPO update pi_loss=+0.0086 v_loss=13.3058 entropy=1.0254 kl=0.0005 lr=7.68e-05
ep=0497 [hard ] steps=032 reward= -12.570 evac=0 hp= 0.0 s30=0.40 t=191s
ep=0498 [hard ] steps=100 reward= -9.605 evac=0 hp= 56.5 s30=0.37 t=191s
ep=0499 [easy ] steps=009 reward= +18.100 evac=1 hp=100.0 s30=0.40 t=191s
ep=0500 [hard ] steps=078 reward= +11.160 evac=1 hp=100.0 s30=0.43 t=192s
** EVAL [hard] reward=-12.050 success=0.00
ep=0501 [medium] steps=026 reward= +6.730 evac=1 hp= 14.0 s30=0.47 t=194s
ep=0502 [medium] steps=017 reward= +15.100 evac=1 hp=100.0 s30=0.50 t=194s
ep=0503 [hard ] steps=037 reward= -12.610 evac=0 hp= 0.0 s30=0.50 t=194s
ep=0504 [easy ] steps=200 reward= -17.930 evac=0 hp=100.0 s30=0.50 t=195s
>> PPO update pi_loss=+0.0009 v_loss=15.9530 entropy=1.5948 kl=0.0006 lr=7.32e-05
ep=0505 [easy ] steps=200 reward= -15.700 evac=0 hp=100.0 s30=0.50 t=196s
ep=0506 [hard ] steps=100 reward= -10.160 evac=0 hp=100.0 s30=0.50 t=197s
ep=0507 [medium] steps=008 reward= +16.740 evac=1 hp=100.0 s30=0.50 t=197s
ep=0508 [hard ] steps=100 reward= -8.520 evac=0 hp=100.0 s30=0.47 t=197s
ep=0509 [hard ] steps=028 reward= -11.370 evac=0 hp= 0.0 s30=0.47 t=197s
ep=0510 [hard ] steps=032 reward= -9.960 evac=0 hp= 0.0 s30=0.43 t=198s
ep=0511 [medium] steps=017 reward= +15.520 evac=1 hp=100.0 s30=0.43 t=198s
ep=0512 [medium] steps=013 reward= +16.330 evac=1 hp=100.0 s30=0.47 t=198s
>> PPO update pi_loss=-0.0003 v_loss=10.8486 entropy=2.0690 kl=0.0005 lr=6.96e-05
ep=0513 [hard ] steps=040 reward= -10.930 evac=0 hp= 0.0 s30=0.43 t=198s
ep=0514 [hard ] steps=100 reward= -9.770 evac=0 hp=100.0 s30=0.40 t=199s
ep=0515 [easy ] steps=003 reward= +17.170 evac=1 hp=100.0 s30=0.43 t=199s
ep=0516 [hard ] steps=026 reward= +13.380 evac=1 hp=100.0 s30=0.43 t=199s
ep=0517 [hard ] steps=035 reward= -11.330 evac=0 hp= 0.0 s30=0.43 t=199s
ep=0518 [medium] steps=020 reward= +13.890 evac=1 hp= 86.0 s30=0.47 t=199s
ep=0519 [easy ] steps=019 reward= +19.480 evac=1 hp=100.0 s30=0.50 t=199s
ep=0520 [hard ] steps=033 reward= -11.450 evac=0 hp= 0.0 s30=0.47 t=200s
>> PPO update pi_loss=-0.0634 v_loss=27.4050 entropy=1.8160 kl=0.0016 lr=6.60e-05
ep=0521 [medium] steps=009 reward= +16.700 evac=1 hp=100.0 s30=0.47 t=200s
ep=0522 [medium] steps=017 reward= -13.670 evac=0 hp= 0.0 s30=0.43 t=200s
ep=0523 [hard ] steps=014 reward= -13.050 evac=0 hp= 0.0 s30=0.43 t=200s
ep=0524 [medium] steps=150 reward= -6.580 evac=0 hp=100.0 s30=0.43 t=201s
ep=0525 [hard ] steps=100 reward= -10.760 evac=0 hp=100.0 s30=0.40 t=201s
** EVAL [hard] reward=-5.528 success=0.20
ep=0526 [medium] steps=080 reward= -25.070 evac=0 hp= 0.0 s30=0.40 t=203s
ep=0527 [medium] steps=150 reward= -13.725 evac=0 hp= 91.5 s30=0.40 t=203s
ep=0528 [medium] steps=068 reward= -10.380 evac=0 hp= 0.0 s30=0.40 t=204s
>> PPO update pi_loss=+0.0074 v_loss=18.0052 entropy=1.7626 kl=0.0021 lr=6.24e-05
ep=0529 [hard ] steps=100 reward= -8.910 evac=0 hp=100.0 s30=0.37 t=205s
ep=0530 [easy ] steps=200 reward= -11.215 evac=0 hp= 95.5 s30=0.33 t=206s
ep=0531 [hard ] steps=047 reward= +14.700 evac=1 hp=100.0 s30=0.33 t=206s
ep=0532 [hard ] steps=037 reward= -11.340 evac=0 hp= 0.0 s30=0.30 t=206s
ep=0533 [hard ] steps=028 reward= -10.550 evac=0 hp= 0.0 s30=0.30 t=206s
ep=0534 [medium] steps=014 reward= +16.560 evac=1 hp=100.0 s30=0.33 t=206s
ep=0535 [medium] steps=067 reward= -21.630 evac=0 hp= 0.0 s30=0.33 t=207s
ep=0536 [hard ] steps=053 reward= -10.830 evac=0 hp= 0.0 s30=0.33 t=207s
>> PPO update pi_loss=-0.0165 v_loss=15.0212 entropy=1.7101 kl=0.0018 lr=5.88e-05
ep=0537 [medium] steps=150 reward= -6.565 evac=0 hp= 63.5 s30=0.30 t=208s
ep=0538 [hard ] steps=100 reward= -8.425 evac=0 hp= 95.5 s30=0.30 t=209s
ep=0539 [medium] steps=015 reward= +16.190 evac=1 hp=100.0 s30=0.33 t=209s
ep=0540 [hard ] steps=100 reward= -9.880 evac=0 hp=100.0 s30=0.33 t=209s
ep=0541 [medium] steps=014 reward= +17.190 evac=1 hp=100.0 s30=0.33 t=209s
ep=0542 [hard ] steps=044 reward= -10.020 evac=0 hp= 0.0 s30=0.30 t=209s
ep=0543 [easy ] steps=072 reward= +15.770 evac=1 hp=100.0 s30=0.33 t=210s
ep=0544 [hard ] steps=100 reward= -8.800 evac=0 hp=100.0 s30=0.33 t=210s
>> PPO update pi_loss=+0.0102 v_loss=7.6336 entropy=1.9990 kl=0.0004 lr=5.52e-05
ep=0545 [hard ] steps=024 reward= +14.900 evac=1 hp=100.0 s30=0.33 t=211s
ep=0546 [hard ] steps=027 reward= -14.450 evac=0 hp= 0.0 s30=0.30 t=211s
ep=0547 [easy ] steps=011 reward= +17.590 evac=1 hp=100.0 s30=0.33 t=211s
ep=0548 [easy ] steps=017 reward= +18.350 evac=1 hp=100.0 s30=0.33 t=211s
ep=0549 [hard ] steps=022 reward= -14.160 evac=0 hp= 0.0 s30=0.30 t=211s
ep=0550 [medium] steps=019 reward= +9.498 evac=1 hp= 64.5 s30=0.33 t=211s
** EVAL [hard] reward=-11.274 success=0.00
ep=0551 [easy ] steps=033 reward= +17.740 evac=1 hp=100.0 s30=0.33 t=213s
ep=0552 [medium] steps=003 reward= +14.670 evac=1 hp=100.0 s30=0.37 t=213s
>> PPO update pi_loss=-0.0022 v_loss=59.5561 entropy=1.5268 kl=0.0003 lr=5.16e-05
ep=0553 [medium] steps=014 reward= +16.810 evac=1 hp=100.0 s30=0.40 t=213s
ep=0554 [hard ] steps=022 reward= +14.210 evac=1 hp=100.0 s30=0.43 t=213s
ep=0555 [hard ] steps=035 reward= -13.820 evac=0 hp= 0.0 s30=0.43 t=213s
ep=0556 [easy ] steps=047 reward= +17.360 evac=1 hp=100.0 s30=0.47 t=214s
ep=0557 [easy ] steps=048 reward= +13.830 evac=1 hp=100.0 s30=0.50 t=214s
ep=0558 [medium] steps=008 reward= +16.440 evac=1 hp=100.0 s30=0.53 t=214s
ep=0559 [hard ] steps=100 reward= -7.800 evac=0 hp=100.0 s30=0.53 t=214s
ep=0560 [hard ] steps=025 reward= -11.750 evac=0 hp= 0.0 s30=0.53 t=215s
>> PPO update pi_loss=-0.0207 v_loss=16.8330 entropy=1.8783 kl=0.0006 lr=4.80e-05
ep=0561 [medium] steps=008 reward= +16.440 evac=1 hp=100.0 s30=0.53 t=215s
ep=0562 [easy ] steps=112 reward= +10.037 evac=1 hp= 80.5 s30=0.57 t=215s
ep=0563 [easy ] steps=016 reward= +18.610 evac=1 hp=100.0 s30=0.60 t=216s
ep=0564 [medium] steps=004 reward= +15.600 evac=1 hp=100.0 s30=0.60 t=216s
ep=0565 [medium] steps=031 reward= -9.960 evac=0 hp= 0.0 s30=0.60 t=216s
ep=0566 [hard ] steps=014 reward= -11.270 evac=0 hp= 0.0 s30=0.60 t=216s
ep=0567 [hard ] steps=100 reward= -5.240 evac=0 hp=100.0 s30=0.60 t=216s
ep=0568 [easy ] steps=045 reward= +16.020 evac=1 hp=100.0 s30=0.63 t=216s
>> PPO update pi_loss=-0.0140 v_loss=16.1354 entropy=1.8723 kl=0.0002 lr=4.44e-05
ep=0569 [medium] steps=004 reward= +15.550 evac=1 hp=100.0 s30=0.63 t=217s
ep=0570 [hard ] steps=021 reward= -9.080 evac=0 hp= 0.0 s30=0.63 t=217s
ep=0571 [hard ] steps=100 reward= -10.860 evac=0 hp=100.0 s30=0.60 t=217s
ep=0572 [medium] steps=015 reward= -19.150 evac=0 hp= 0.0 s30=0.60 t=217s
ep=0573 [hard ] steps=025 reward= +14.630 evac=1 hp=100.0 s30=0.60 t=218s
ep=0574 [hard ] steps=034 reward= -11.920 evac=0 hp= 0.0 s30=0.60 t=218s
ep=0575 [medium] steps=079 reward= -19.430 evac=0 hp= 0.0 s30=0.57 t=218s
** EVAL [hard] reward=-10.578 success=0.00
ep=0576 [medium] steps=013 reward= +16.620 evac=1 hp=100.0 s30=0.60 t=219s
>> PPO update pi_loss=+0.0070 v_loss=26.4067 entropy=1.4878 kl=0.0001 lr=4.08e-05
ep=0577 [medium] steps=001 reward= +14.260 evac=1 hp=100.0 s30=0.60 t=220s
ep=0578 [hard ] steps=030 reward= -12.950 evac=0 hp= 0.0 s30=0.57 t=220s
ep=0579 [medium] steps=150 reward= -25.410 evac=0 hp= 57.0 s30=0.57 t=221s
ep=0580 [hard ] steps=100 reward= -8.330 evac=0 hp=100.0 s30=0.53 t=221s
ep=0581 [hard ] steps=021 reward= -11.530 evac=0 hp= 0.0 s30=0.50 t=221s
ep=0582 [medium] steps=011 reward= +15.930 evac=1 hp=100.0 s30=0.50 t=221s
ep=0583 [medium] steps=014 reward= +12.270 evac=1 hp= 76.0 s30=0.50 t=221s
ep=0584 [easy ] steps=200 reward= -11.730 evac=0 hp= 67.0 s30=0.47 t=222s
>> PPO update pi_loss=-0.0108 v_loss=10.7300 entropy=1.5916 kl=0.0001 lr=3.72e-05
ep=0585 [medium] steps=016 reward= +12.483 evac=1 hp= 67.5 s30=0.50 t=223s
ep=0586 [medium] steps=031 reward= +14.330 evac=1 hp=100.0 s30=0.50 t=223s
ep=0587 [hard ] steps=027 reward= -9.930 evac=0 hp= 0.0 s30=0.47 t=223s
ep=0588 [hard ] steps=100 reward= -8.820 evac=0 hp= 99.0 s30=0.43 t=223s
ep=0589 [hard ] steps=100 reward= -8.980 evac=0 hp=100.0 s30=0.43 t=224s
ep=0590 [hard ] steps=100 reward= -10.670 evac=0 hp=100.0 s30=0.43 t=225s
ep=0591 [easy ] steps=095 reward= +11.080 evac=1 hp=100.0 s30=0.43 t=225s
ep=0592 [hard ] steps=021 reward= -10.660 evac=0 hp= 0.0 s30=0.40 t=225s
>> PPO update pi_loss=-0.0007 v_loss=5.2941 entropy=1.8152 kl=0.0001 lr=3.36e-05
ep=0593 [easy ] steps=012 reward= +18.350 evac=1 hp=100.0 s30=0.40 t=226s
ep=0594 [medium] steps=021 reward= +10.757 evac=1 hp= 74.5 s30=0.40 t=226s
ep=0595 [medium] steps=012 reward= +15.830 evac=1 hp=100.0 s30=0.43 t=226s
ep=0596 [medium] steps=028 reward= -10.200 evac=0 hp= 0.0 s30=0.43 t=226s
ep=0597 [medium] steps=019 reward= +13.750 evac=1 hp= 88.0 s30=0.47 t=226s
ep=0598 [medium] steps=009 reward= +16.700 evac=1 hp=100.0 s30=0.47 t=226s
ep=0599 [hard ] steps=100 reward= -8.560 evac=0 hp=100.0 s30=0.43 t=226s
ep=0600 [medium] steps=044 reward= +13.770 evac=1 hp=100.0 s30=0.47 t=227s
>> PPO update pi_loss=-0.0007 v_loss=29.9181 entropy=1.5398 kl=0.0001 lr=3.00e-05
** EVAL [hard] reward=-12.068 success=0.00