Spaces:
Sleeping
Sleeping
Grader Reward Fix (#2)
Browse files
server/fin_auditor_environment.py
CHANGED
|
@@ -82,10 +82,8 @@ class FinAuditorEnvironment(Environment):
|
|
| 82 |
self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
|
| 83 |
self.sim_time_ns = 0
|
| 84 |
|
| 85 |
-
# 1. READ TASK_ID FROM ENVIRONMENT
|
| 86 |
task_id = os.getenv("TASK_ID", "anomaly_detection_hard").lower()
|
| 87 |
|
| 88 |
-
# 2. MAP TO C++ DIFFICULTY ENUM & SYNC YAML STEPS
|
| 89 |
if "easy" in task_id:
|
| 90 |
self.difficulty = hft_auditor.Difficulty.EASY
|
| 91 |
self._MAX_EPISODE_STEPS = 5
|
|
@@ -98,60 +96,42 @@ class FinAuditorEnvironment(Environment):
|
|
| 98 |
|
| 99 |
def reset(self) -> AuditorObservation:
|
| 100 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 101 |
-
|
| 102 |
-
# Re-initialize the engine for a clean episode
|
| 103 |
-
self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
|
| 104 |
-
self.sim_time_ns = 0
|
| 105 |
-
|
| 106 |
-
# Generate the first batch so step 1 has data to evaluate
|
| 107 |
-
self.engine.generate_batch(self.difficulty, self._INGEST_CHUNK_SIZE, self.sim_time_ns)
|
| 108 |
-
|
| 109 |
-
# Advance time past Δ_max to expire the batch
|
| 110 |
-
self.sim_time_ns += 6_000_000_000
|
| 111 |
self.engine.tick(self.sim_time_ns)
|
| 112 |
|
| 113 |
-
# Get the anomaly matrix for the agent (features for step 1)
|
| 114 |
-
anomalies: list[list[float]] = self.engine.get_anomaly_matrix().tolist()
|
| 115 |
-
|
| 116 |
return FinAuditorObservation(
|
| 117 |
-
features=
|
| 118 |
-
message=
|
| 119 |
-
reward=0.001
|
| 120 |
done=False
|
| 121 |
)
|
| 122 |
|
| 123 |
def step(self, action: AuditorAction) -> AuditorObservation: # type: ignore[override]
|
| 124 |
self._state.step_count += 1
|
| 125 |
|
| 126 |
-
#
|
| 127 |
-
|
| 128 |
if action and action.decisions:
|
| 129 |
action_array = np.array(action.decisions, dtype=np.uint8)
|
| 130 |
raw_reward = float(self.engine.compute_reward(action_array))
|
| 131 |
-
|
| 132 |
-
# Map raw reward bounds [-4.0, 40.0] to a [0, 1] percentage
|
| 133 |
normalized_raw = (raw_reward + 4.0) / 44.0
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
# 2. GENERATE NEW DATA (Using procedural C++ engine)
|
| 142 |
self.engine.generate_batch(self.difficulty, self._INGEST_CHUNK_SIZE, self.sim_time_ns)
|
| 143 |
|
| 144 |
-
# 3. ADVANCE TIME & EXPIRE
|
| 145 |
self.sim_time_ns += 6_000_000_000
|
| 146 |
self.engine.tick(self.sim_time_ns)
|
| 147 |
|
| 148 |
-
# 4. EXTRACT NEW MATRIX
|
| 149 |
anomalies: list[list[float]] = self.engine.get_anomaly_matrix().tolist()
|
| 150 |
total_anomalies = len(anomalies)
|
| 151 |
|
| 152 |
done = self._state.step_count >= self._MAX_EPISODE_STEPS
|
| 153 |
|
| 154 |
-
# Expose C++ tracking metrics to the Python state
|
| 155 |
self._state.last_tp = self.engine.last_tp
|
| 156 |
self._state.last_tn = self.engine.last_tn
|
| 157 |
self._state.last_fp = self.engine.last_fp
|
|
|
|
| 82 |
self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
|
| 83 |
self.sim_time_ns = 0
|
| 84 |
|
|
|
|
| 85 |
task_id = os.getenv("TASK_ID", "anomaly_detection_hard").lower()
|
| 86 |
|
|
|
|
| 87 |
if "easy" in task_id:
|
| 88 |
self.difficulty = hft_auditor.Difficulty.EASY
|
| 89 |
self._MAX_EPISODE_STEPS = 5
|
|
|
|
| 96 |
|
| 97 |
def reset(self) -> AuditorObservation:
|
| 98 |
self._state = State(episode_id=str(uuid4()), step_count=0)
|
| 99 |
+
self.sim_time_ns += self._DELTA_MAX_NS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
self.engine.tick(self.sim_time_ns)
|
| 101 |
|
|
|
|
|
|
|
|
|
|
| 102 |
return FinAuditorObservation(
|
| 103 |
+
features=[],
|
| 104 |
+
message="Fin Auditor engine ready.",
|
| 105 |
+
reward=0.001, # Safe minimum floor, not divided
|
| 106 |
done=False
|
| 107 |
)
|
| 108 |
|
| 109 |
def step(self, action: AuditorAction) -> AuditorObservation: # type: ignore[override]
|
| 110 |
self._state.step_count += 1
|
| 111 |
|
| 112 |
+
# FIX: OpenEnv grader:reward evaluates EACH step's reward independently.
|
| 113 |
+
# Must be strictly in (0.001, 0.999) for every step, no exceptions.
|
| 114 |
if action and action.decisions:
|
| 115 |
action_array = np.array(action.decisions, dtype=np.uint8)
|
| 116 |
raw_reward = float(self.engine.compute_reward(action_array))
|
| 117 |
+
# Map raw bounds [-4.0, 40.0] -> [0.0, 1.0]
|
|
|
|
| 118 |
normalized_raw = (raw_reward + 4.0) / 44.0
|
| 119 |
+
# Clamp strictly inside (0.001, 0.999)
|
| 120 |
+
step_reward = max(0.001, min(0.999, normalized_raw))
|
| 121 |
+
else:
|
| 122 |
+
# Empty decisions (no-op step) - return safe floor, NOT 0.0
|
| 123 |
+
step_reward = 0.001
|
| 124 |
+
|
|
|
|
|
|
|
| 125 |
self.engine.generate_batch(self.difficulty, self._INGEST_CHUNK_SIZE, self.sim_time_ns)
|
| 126 |
|
|
|
|
| 127 |
self.sim_time_ns += 6_000_000_000
|
| 128 |
self.engine.tick(self.sim_time_ns)
|
| 129 |
|
|
|
|
| 130 |
anomalies: list[list[float]] = self.engine.get_anomaly_matrix().tolist()
|
| 131 |
total_anomalies = len(anomalies)
|
| 132 |
|
| 133 |
done = self._state.step_count >= self._MAX_EPISODE_STEPS
|
| 134 |
|
|
|
|
| 135 |
self._state.last_tp = self.engine.last_tp
|
| 136 |
self._state.last_tn = self.engine.last_tn
|
| 137 |
self._state.last_fp = self.engine.last_fp
|