arjeet committed on
Commit
e9c9d34
·
1 Parent(s): e26941e

inference update v5

Browse files
Files changed (1) hide show
  1. server/cust_env_environment.py +13 -3
server/cust_env_environment.py CHANGED
@@ -62,7 +62,8 @@ class DocSweeperEnvironment(Environment):
62
  active_file=""
63
  )
64
 
65
- return self._make_observation(reward=0.0, done=False)
 
66
 
67
  def step(self, action: DocAction):
68
  if self._state is None:
@@ -109,10 +110,19 @@ class DocSweeperEnvironment(Environment):
109
 
110
  new_score = self._calculate_state_score()
111
 
 
112
  delta_reward = (new_score - old_score)
113
- total_step_reward = delta_reward + step_penalty
 
 
 
 
 
 
 
 
114
 
115
- return self._make_observation(reward=total_step_reward, done=done)
116
 
117
  def _handle_edit(self, action: DocAction) -> float:
118
  """Executes the edit and returns a penalty if it fails."""
 
62
  active_file=""
63
  )
64
 
65
+ # Baseline reward is exactly 0.5 (neutral)
66
+ return self._make_observation(reward=0.5, done=False)
67
 
68
  def step(self, action: DocAction):
69
  if self._state is None:
 
110
 
111
  new_score = self._calculate_state_score()
112
 
113
+ # Calculate raw delta reward
114
  delta_reward = (new_score - old_score)
115
+ raw_step_reward = delta_reward + step_penalty
116
+
117
+ # Map reward to be strictly within (0.0, 1.0)
118
+ # raw_step_reward ranges roughly from -1.0 to 1.0. We map it so 0.0 raw = 0.5 mapped.
119
+ mapped_reward = (raw_step_reward + 1.0) / 2.0
120
+
121
+ # Clamp strictly to (0.0, 1.0) boundaries using a 0.01 epsilon
122
+ EPSILON = 0.01
123
+ final_reward = max(EPSILON, min(1.0 - EPSILON, mapped_reward))
124
 
125
+ return self._make_observation(reward=final_reward, done=done)
126
 
127
  def _handle_edit(self, action: DocAction) -> float:
128
  """Executes the edit and returns a penalty if it fails."""