Update reward_shaping.py
Browse files
- reward_shaping.py +1 -19
reward_shaping.py
CHANGED
|
@@ -3,7 +3,6 @@ import numpy as np
|
|
| 3 |
from gymnasium.spaces import Box
|
| 4 |
from gymnasium import Wrapper
|
| 5 |
|
| 6 |
-
# --- CONFIGURATION FOR REWARD SHAPING ---
|
| 7 |
# Constant added to the reward on every step() call; the negative value acts
# as a small per-step cost.
TIME_PENALTY = -0.05
|
| 8 |
# ----------------------------------------
|
| 9 |
|
|
@@ -19,7 +18,6 @@ class LunarLanderRewardShaping(Wrapper):
|
|
| 19 |
def step(self, action):
# Forward `action` to the wrapped env, then adjust the returned reward with a
# clipped shaping-difference term plus TIME_PENALTY. observation, terminated,
# truncated and info are returned unchanged.
|
| 20 |
observation, reward, terminated, truncated, info = self.env.step(action)
|
| 21 |
|
| 22 |
-
# 1. Unpack relevant variables from observation
# presumably the 8-value Gymnasium LunarLander observation layout — confirm.
|
| 23 |
x_pos = observation[0]
|
| 24 |
y_pos = observation[1]
|
| 25 |
x_vel = observation[2]
|
|
# NOTE(review): the hunk break below hides file line 26. y_vel is used further
# down, so its assignment (presumably from observation[3]) must be on that
# hidden line — confirm against the full file.
@@ -27,61 +25,45 @@ class LunarLanderRewardShaping(Wrapper):
|
|
| 27 |
angle = observation[4]
|
| 28 |
angular_vel = observation[5]
|
| 29 |
|
| 30 |
-
# Leg contact boolean/float values
|
| 31 |
left_leg_contact = observation[6]
|
| 32 |
right_leg_contact = observation[7]
|
| 33 |
|
| 34 |
-
# Determine if the main engine (action 2) was fired
|
| 35 |
main_engine_fired = 1 if action == 2 else 0
|
| 36 |
|
| 37 |
-
# Determine if *any* thruster (actions 1, 2, or 3) was fired
# The check itself is simply `action != 0`.
|
| 38 |
any_thruster_fired = 1 if action != 0 else 0
|
| 39 |
|
| 40 |
-
# 2. Calculate the total current shaping value
# Only the change in this value since the previous step is added to the reward.
|
| 41 |
current_shaping_reward = 0.0
|
| 42 |
|
| 43 |
-
# A. Horizontal Position Penalty (Forces agent toward X=0)
|
| 44 |
-
# This ensures landing between the flags.
|
| 45 |
current_shaping_reward += -30 * np.abs(x_pos)
|
| 46 |
|
| 47 |
-
# B. Sharpened Vertical velocity penalty near the ground
# NOTE(review): 1.0 - y_pos is negative whenever y_pos > 1, which turns this
# penalty into a bonus for vertical speed — confirm the y_pos range or clamp at 0.
|
| 48 |
|
| 49 |
y_factor = 1.0 - y_pos
|
| 50 |
current_shaping_reward += -20 * np.abs(y_vel) * y_factor
|
| 51 |
|
| 52 |
-
# C. Scaled back Horizontal speed penalty
|
| 53 |
current_shaping_reward += -10 * np.abs(x_vel)
|
| 54 |
|
| 55 |
-
# D. Penalize severe angle
|
| 56 |
current_shaping_reward += -5 * np.abs(angle)
|
| 57 |
|
| 58 |
-
# E. Penalize high spin rate
|
| 59 |
current_shaping_reward += -10 * np.abs(angular_vel)
|
| 60 |
|
| 61 |
-
# F. Main Engine usage penalty near the ground
# Uses the same (1.0 - y_pos) expression as y_factor, so the same sign caveat applies.
|
| 62 |
current_shaping_reward += -50 * main_engine_fired * (1.0 - y_pos)
|
| 63 |
|
| 64 |
-
# G. Height Penalty (Penalizes distance from the ground)
|
| 65 |
current_shaping_reward += -20 * y_pos
|
| 66 |
|
| 67 |
-
# H. Post-Contact Thrust Penalty
# NOTE(review): the per-step difference is clipped to [-10, 10] further down, so
# the effective size of the -900 term is capped — confirm this is intended.
|
| 68 |
contact_sum = left_leg_contact + right_leg_contact
|
| 69 |
if contact_sum > 0:
|
| 70 |
current_shaping_reward += -900 * any_thruster_fired * contact_sum
|
| 71 |
|
| 72 |
-
# I. Aggressive Landing Leg Use Incentive
# Indentation is lost in this view; the term is 0 when contact_sum is 0, so it
# gives the same result inside or outside the `if` above.
|
| 73 |
current_shaping_reward += 15 * contact_sum
|
| 74 |
|
| 75 |
-
# 3. Calculate the differential shaping reward (reward for improvement)
# Skipped while last_shaping_reward is None. NOTE(review): where that attribute
# is initialised/reset is not visible here — presumably __init__/reset; confirm.
|
| 76 |
if self.last_shaping_reward is not None:
|
| 77 |
shaping_reward_diff = current_shaping_reward - self.last_shaping_reward
|
| 78 |
-
|
| 79 |
reward += np.clip(shaping_reward_diff, -10.0, 10.0)
|
| 80 |
|
| 81 |
self.last_shaping_reward = current_shaping_reward
|
| 82 |
|
| 83 |
reward += TIME_PENALTY
|
| 84 |
-
# ---------------------------------------------
|
| 85 |
|
| 86 |
return observation, reward, terminated, truncated, info
|
| 87 |
|
|
|
|
| 3 |
from gymnasium.spaces import Box
|
| 4 |
from gymnasium import Wrapper
|
| 5 |
|
|
|
|
| 6 |
# Constant added to the reward on every step() call; the negative value acts
# as a small per-step cost.
TIME_PENALTY = -0.05
|
| 7 |
# ----------------------------------------
|
| 8 |
|
|
|
|
| 18 |
def step(self, action):
# Forward `action` to the wrapped env, then adjust the returned reward with a
# clipped shaping-difference term plus TIME_PENALTY. observation, terminated,
# truncated and info are returned unchanged.
|
| 19 |
observation, reward, terminated, truncated, info = self.env.step(action)
|
| 20 |
|
|
|
|
| 21 |
# Unpack the observation entries used by the shaping terms below.
# presumably the 8-value Gymnasium LunarLander observation layout — confirm.
x_pos = observation[0]
|
| 22 |
y_pos = observation[1]
|
| 23 |
x_vel = observation[2]
|
|
|
|
# NOTE(review): file line 24 is not shown in this view. y_vel is used further
# down, so its assignment (presumably from observation[3]) must be on that
# hidden line — confirm against the full file.
| 25 |
angle = observation[4]
|
| 26 |
angular_vel = observation[5]
|
| 27 |
|
|
|
|
| 28 |
# Leg-contact entries; summed below as contact_sum.
left_leg_contact = observation[6]
|
| 29 |
right_leg_contact = observation[7]
|
| 30 |
|
|
|
|
| 31 |
# 1 when action == 2 (treated here as the main engine), else 0.
main_engine_fired = 1 if action == 2 else 0
|
| 32 |
|
|
|
|
| 33 |
# 1 for any action other than 0, else 0.
any_thruster_fired = 1 if action != 0 else 0
|
| 34 |
|
|
|
|
| 35 |
# Accumulate this step's shaping value; only its change since the previous
# step is added to the reward.
current_shaping_reward = 0.0
|
| 36 |
|
|
|
|
|
|
|
| 37 |
# Penalize horizontal offset from x = 0.
current_shaping_reward += -30 * np.abs(x_pos)
|
| 38 |
|
|
|
|
| 39 |
|
| 40 |
# Vertical-speed penalty whose weight grows as y_pos decreases.
# NOTE(review): 1.0 - y_pos is negative whenever y_pos > 1, which turns this
# penalty into a bonus for vertical speed — confirm the y_pos range or clamp at 0.
y_factor = 1.0 - y_pos
|
| 41 |
current_shaping_reward += -20 * np.abs(y_vel) * y_factor
|
| 42 |
|
|
|
|
| 43 |
# Penalize horizontal speed.
current_shaping_reward += -10 * np.abs(x_vel)
|
| 44 |
|
|
|
|
| 45 |
# Penalize tilt magnitude.
current_shaping_reward += -5 * np.abs(angle)
|
| 46 |
|
|
|
|
| 47 |
# Penalize spin rate.
current_shaping_reward += -10 * np.abs(angular_vel)
|
| 48 |
|
|
|
|
| 49 |
# Penalize main-engine use, weighted by the same (1.0 - y_pos) expression as
# y_factor, so the same sign caveat applies.
current_shaping_reward += -50 * main_engine_fired * (1.0 - y_pos)
|
| 50 |
|
|
|
|
| 51 |
# Penalize height, linear in y_pos.
current_shaping_reward += -20 * y_pos
|
| 52 |
|
|
|
|
| 53 |
# Sum of the two leg-contact values.
contact_sum = left_leg_contact + right_leg_contact
|
| 54 |
# While any leg reports contact, firing any thruster costs 900 per unit of
# contact_sum. NOTE(review): the per-step difference is clipped to [-10, 10]
# further down, so the effective size of this term is capped — confirm intended.
if contact_sum > 0:
|
| 55 |
current_shaping_reward += -900 * any_thruster_fired * contact_sum
|
| 56 |
|
|
|
|
| 57 |
# Bonus of 15 per unit of contact_sum. Indentation is lost in this view; the
# term is 0 when contact_sum is 0, so it gives the same result inside or
# outside the `if` above.
current_shaping_reward += 15 * contact_sum
|
| 58 |
|
|
|
|
| 59 |
# Add the change in shaping value since the previous step; skipped while
# last_shaping_reward is None. NOTE(review): where that attribute is
# initialised/reset is not visible here — presumably __init__/reset; confirm.
if self.last_shaping_reward is not None:
|
| 60 |
shaping_reward_diff = current_shaping_reward - self.last_shaping_reward
|
| 61 |
+
|
| 62 |
# Clip the per-step shaping difference to [-10, 10] before adding it.
reward += np.clip(shaping_reward_diff, -10.0, 10.0)
|
| 63 |
|
| 64 |
# Remember this step's shaping value for the next difference.
self.last_shaping_reward = current_shaping_reward
|
| 65 |
|
| 66 |
# Constant per-step offset.
reward += TIME_PENALTY
|
|
|
|
| 67 |
|
| 68 |
return observation, reward, terminated, truncated, info
|
| 69 |
|