TheRealAIGuy commited on
Commit
e01c591
·
verified ·
1 Parent(s): d65f1eb

Grader Reward Fix (#2)

Browse files
Files changed (1) hide show
  1. server/fin_auditor_environment.py +13 -33
server/fin_auditor_environment.py CHANGED
@@ -82,10 +82,8 @@ class FinAuditorEnvironment(Environment):
82
  self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
83
  self.sim_time_ns = 0
84
 
85
- # 1. READ TASK_ID FROM ENVIRONMENT
86
  task_id = os.getenv("TASK_ID", "anomaly_detection_hard").lower()
87
 
88
- # 2. MAP TO C++ DIFFICULTY ENUM & SYNC YAML STEPS
89
  if "easy" in task_id:
90
  self.difficulty = hft_auditor.Difficulty.EASY
91
  self._MAX_EPISODE_STEPS = 5
@@ -98,60 +96,42 @@ class FinAuditorEnvironment(Environment):
98
 
99
  def reset(self) -> AuditorObservation:
100
  self._state = State(episode_id=str(uuid4()), step_count=0)
101
-
102
- # Re-initialize the engine for a clean episode
103
- self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
104
- self.sim_time_ns = 0
105
-
106
- # Generate the first batch so step 1 has data to evaluate
107
- self.engine.generate_batch(self.difficulty, self._INGEST_CHUNK_SIZE, self.sim_time_ns)
108
-
109
- # Advance time past Δ_max to expire the batch
110
- self.sim_time_ns += 6_000_000_000
111
  self.engine.tick(self.sim_time_ns)
112
 
113
- # Get the anomaly matrix for the agent (features for step 1)
114
- anomalies: list[list[float]] = self.engine.get_anomaly_matrix().tolist()
115
-
116
  return FinAuditorObservation(
117
- features=anomalies,
118
- message=f"Engine ready. {len(anomalies)} trades awaiting audit.",
119
- reward=0.001 / self._MAX_EPISODE_STEPS, # Safe fractional minimum
120
  done=False
121
  )
122
 
123
  def step(self, action: AuditorAction) -> AuditorObservation: # type: ignore[override]
124
  self._state.step_count += 1
125
 
126
- # 1. REWARD CALCULATION & MATHEMATICAL NORMALIZATION
127
- step_reward = 0.0
128
  if action and action.decisions:
129
  action_array = np.array(action.decisions, dtype=np.uint8)
130
  raw_reward = float(self.engine.compute_reward(action_array))
131
-
132
- # Map raw reward bounds [-4.0, 40.0] to a [0, 1] percentage
133
  normalized_raw = (raw_reward + 4.0) / 44.0
134
-
135
- # Clamp to prevent EXACT 0.0 or 1.0 boundary hits
136
- safe_clamped = max(0.01, min(0.99, normalized_raw))
137
-
138
- # Distribute over episode length so the SUM is strictly in (0, 1)
139
- step_reward = safe_clamped / self._MAX_EPISODE_STEPS
140
-
141
- # 2. GENERATE NEW DATA (Using procedural C++ engine)
142
  self.engine.generate_batch(self.difficulty, self._INGEST_CHUNK_SIZE, self.sim_time_ns)
143
 
144
- # 3. ADVANCE TIME & EXPIRE
145
  self.sim_time_ns += 6_000_000_000
146
  self.engine.tick(self.sim_time_ns)
147
 
148
- # 4. EXTRACT NEW MATRIX
149
  anomalies: list[list[float]] = self.engine.get_anomaly_matrix().tolist()
150
  total_anomalies = len(anomalies)
151
 
152
  done = self._state.step_count >= self._MAX_EPISODE_STEPS
153
 
154
- # Expose C++ tracking metrics to the Python state
155
  self._state.last_tp = self.engine.last_tp
156
  self._state.last_tn = self.engine.last_tn
157
  self._state.last_fp = self.engine.last_fp
 
82
  self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
83
  self.sim_time_ns = 0
84
 
 
85
  task_id = os.getenv("TASK_ID", "anomaly_detection_hard").lower()
86
 
 
87
  if "easy" in task_id:
88
  self.difficulty = hft_auditor.Difficulty.EASY
89
  self._MAX_EPISODE_STEPS = 5
 
96
 
97
  def reset(self) -> AuditorObservation:
98
  self._state = State(episode_id=str(uuid4()), step_count=0)
99
+ self.sim_time_ns += self._DELTA_MAX_NS
 
 
 
 
 
 
 
 
 
100
  self.engine.tick(self.sim_time_ns)
101
 
 
 
 
102
  return FinAuditorObservation(
103
+ features=[],
104
+ message="Fin Auditor engine ready.",
105
+ reward=0.001, # Safe minimum floor, not divided
106
  done=False
107
  )
108
 
109
  def step(self, action: AuditorAction) -> AuditorObservation: # type: ignore[override]
110
  self._state.step_count += 1
111
 
112
+ # FIX: OpenEnv grader:reward evaluates EACH step's reward independently.
113
+ # Must be strictly in (0.001, 0.999) for every step, no exceptions.
114
  if action and action.decisions:
115
  action_array = np.array(action.decisions, dtype=np.uint8)
116
  raw_reward = float(self.engine.compute_reward(action_array))
117
+ # Map raw bounds [-4.0, 40.0] -> [0.0, 1.0]
 
118
  normalized_raw = (raw_reward + 4.0) / 44.0
119
+ # Clamp strictly inside (0.001, 0.999)
120
+ step_reward = max(0.001, min(0.999, normalized_raw))
121
+ else:
122
+ # Empty decisions (no-op step) - return safe floor, NOT 0.0
123
+ step_reward = 0.001
124
+
 
 
125
  self.engine.generate_batch(self.difficulty, self._INGEST_CHUNK_SIZE, self.sim_time_ns)
126
 
 
127
  self.sim_time_ns += 6_000_000_000
128
  self.engine.tick(self.sim_time_ns)
129
 
 
130
  anomalies: list[list[float]] = self.engine.get_anomaly_matrix().tolist()
131
  total_anomalies = len(anomalies)
132
 
133
  done = self._state.step_count >= self._MAX_EPISODE_STEPS
134
 
 
135
  self._state.last_tp = self.engine.last_tp
136
  self._state.last_tn = self.engine.last_tn
137
  self._state.last_fp = self.engine.last_fp