feat: add reward evolution chart to Gradio dashboard
Browse files
- evolution_logs.txt +30 -0
- inference.py +2 -2
- models.py +1 -0
- server/app.py +30 -16
- server/environment.py +2 -0
evolution_logs.txt
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[START] task=task_easy env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
|
| 2 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 3 |
+
[STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
|
| 4 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 5 |
+
[STEP] step=2 action=propose_clarification reward=0.86 done=false error=null
|
| 6 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 7 |
+
[STEP] step=3 action=propose_clarification reward=0.89 done=false error=null
|
| 8 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 9 |
+
[STEP] step=4 action=propose_clarification reward=0.00 done=false error=null
|
| 10 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 11 |
+
[STEP] step=5 action=propose_clarification reward=0.86 done=true error=null
|
| 12 |
+
[END] success=true steps=5 score=0.865 rewards=0.12,0.86,0.89,0.00,0.86
|
| 13 |
+
[START] task=task_medium env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
|
| 14 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 15 |
+
[STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
|
| 16 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 17 |
+
[STEP] step=2 action=propose_clarification reward=0.00 done=false error=null
|
| 18 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 19 |
+
[STEP] step=3 action=propose_new_rule reward=0.80 done=false error=null
|
| 20 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 21 |
+
[STEP] step=4 action=propose_clarification reward=0.00 done=false error=null
|
| 22 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 23 |
+
[STEP] step=5 action=propose_clarification reward=0.00 done=true error=null
|
| 24 |
+
[END] success=false steps=5 score=0.000 rewards=0.12,0.00,0.80,0.00,0.00
|
| 25 |
+
[START] task=task_hard env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
|
| 26 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 27 |
+
[STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
|
| 28 |
+
INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
|
| 29 |
+
[STEP] step=2 action=evolve_policy reward=0.90 done=true error=null
|
| 30 |
+
[END] success=true steps=2 score=0.900 rewards=0.12,0.90
|
inference.py
CHANGED
|
@@ -103,8 +103,8 @@ def run_episode(task_id: str):
|
|
| 103 |
rewards = []
|
| 104 |
success = False
|
| 105 |
|
| 106 |
-
# Strategic refinement for
|
| 107 |
-
for _ in range(
|
| 108 |
step_num += 1
|
| 109 |
action_dict = agent.act(task_id, obs.model_dump())
|
| 110 |
|
|
|
|
| 103 |
rewards = []
|
| 104 |
success = False
|
| 105 |
|
| 106 |
+
# Strategic refinement for 5 steps (Audit Mode: 5 iterations)
|
| 107 |
+
for _ in range(5):
|
| 108 |
step_num += 1
|
| 109 |
action_dict = agent.act(task_id, obs.model_dump())
|
| 110 |
|
models.py
CHANGED
|
@@ -100,3 +100,4 @@ class State(BaseModel):
|
|
| 100 |
current_score: float = 0.0
|
| 101 |
best_score: float = 0.0
|
| 102 |
actions_taken: List[str] = Field(default_factory=list)
|
|
|
|
|
|
| 100 |
current_score: float = 0.0
|
| 101 |
best_score: float = 0.0
|
| 102 |
actions_taken: List[str] = Field(default_factory=list)
|
| 103 |
+
rewards_history: List[float] = Field(default_factory=list)
|
server/app.py
CHANGED
|
@@ -148,12 +148,15 @@ def build_custom_ui():
|
|
| 148 |
})
|
| 149 |
df_corpus = pd.DataFrame(corpus_data) if corpus_data else pd.DataFrame(columns=["ID", "Content", "System Action"])
|
| 150 |
|
| 151 |
-
#
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
| 157 |
best_score = obs.get("info", {}).get("best_score", 0.0)
|
| 158 |
steps_left = obs.get("info", {}).get("steps_remaining", 5)
|
| 159 |
episode_id = obs.get("episode_id", "N/A")[:8]
|
|
@@ -162,13 +165,13 @@ def build_custom_ui():
|
|
| 162 |
total = obs.get("corpus_size", len(corpus_data))
|
| 163 |
corpus_stat = f"### ๐ Corpus: **{shown}** of **{total}** incidents displayed"
|
| 164 |
|
| 165 |
-
return df_corpus, policy_md, best_score, steps_left, episode_id, corpus_stat
|
| 166 |
|
| 167 |
def handle_reset(task_id):
|
| 168 |
obs = env.reset(task_id=task_id).model_dump()
|
| 169 |
-
df, pol, score, steps, ep, stat = format_obs(obs)
|
| 170 |
reward_msg = "### ๐ Scenario Initialized\nReview the Data Corpus and Active Framework to identify gaps."
|
| 171 |
-
return df, pol, score, steps, ep, stat, reward_msg, json.dumps(obs, indent=2)
|
| 172 |
|
| 173 |
def handle_step(task_id, action_type, easy_term, easy_def, easy_just, easy_think,
|
| 174 |
med_domain, med_rule, med_scope, med_just, med_think,
|
|
@@ -185,15 +188,15 @@ def build_custom_ui():
|
|
| 185 |
validated_action = Action.model_validate(payload)
|
| 186 |
obs_obj = env.step(validated_action)
|
| 187 |
obs = obs_obj.model_dump()
|
| 188 |
-
df, pol, score, steps, ep, stat = format_obs(obs)
|
| 189 |
|
| 190 |
reward = obs.get("reward", 0.0)
|
| 191 |
color = "green" if reward > 0 else "orange" if reward == 0 else "red"
|
| 192 |
reward_msg = f"### <span style='color:{color}'>Latest Strategic Reward: {reward}</span>\nCurrent Project Score: {score}"
|
| 193 |
|
| 194 |
-
return df, pol, score, steps, ep, stat, reward_msg, json.dumps(obs, indent=2)
|
| 195 |
except Exception as e:
|
| 196 |
-
return pd.DataFrame(), f"### Execution Error\n{str(e)}", 0, 0, "ERROR", "### ERROR", f"Traceback:\n{traceback.format_exc()}", "{}"
|
| 197 |
|
| 198 |
with gr.Blocks(
|
| 199 |
title="PolicyEvolver Judge Console",
|
|
@@ -210,6 +213,17 @@ def build_custom_ui():
|
|
| 210 |
best_score_disp = gr.Number(label="Environment Best Score", value=0.0, interactive=False)
|
| 211 |
steps_left_disp = gr.Number(label="Remaining Execution Steps", value=5, interactive=False)
|
| 212 |
episode_disp = gr.Textbox(label="Active Episode ID", value="N/A", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
reward_outcome_disp = gr.Markdown("### Awaiting Scenario...")
|
| 214 |
|
| 215 |
gr.Markdown("---")
|
|
@@ -330,19 +344,19 @@ def build_custom_ui():
|
|
| 330 |
return (t_id, mode) + res
|
| 331 |
|
| 332 |
# Event Listeners
|
| 333 |
-
reset_btn.click(handle_reset, inputs=[task_id], outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box])
|
| 334 |
|
| 335 |
# Automatic Sync: Radio -> Dropdown & Initialize
|
| 336 |
action_mode.change(
|
| 337 |
sync_from_mode,
|
| 338 |
inputs=[action_mode],
|
| 339 |
-
outputs=[task_id, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
|
| 340 |
)
|
| 341 |
|
| 342 |
# Automatic Sync: Tab -> Dropdown & Radio & Initialize
|
| 343 |
action_tabs.select(
|
| 344 |
sync_from_tab,
|
| 345 |
-
outputs=[task_id, action_mode, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
|
| 346 |
)
|
| 347 |
|
| 348 |
step_btn.click(
|
|
@@ -353,7 +367,7 @@ def build_custom_ui():
|
|
| 353 |
med_domain, med_rule, med_scope, med_just, med_think,
|
| 354 |
hard_mods, hard_outcomes, hard_just, hard_think
|
| 355 |
],
|
| 356 |
-
outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
|
| 357 |
)
|
| 358 |
|
| 359 |
return demo
|
|
|
|
| 148 |
})
|
| 149 |
df_corpus = pd.DataFrame(corpus_data) if corpus_data else pd.DataFrame(columns=["ID", "Content", "System Action"])
|
| 150 |
|
| 151 |
+
# 3. Simple Stats & Reward History
|
| 152 |
+
history = obs.get("info", {}).get("rewards_history", [])
|
| 153 |
+
df_reward = pd.DataFrame({
|
| 154 |
+
"Step": [i + 1 for i in range(len(history))],
|
| 155 |
+
"Reward": history
|
| 156 |
+
})
|
| 157 |
+
if df_reward.empty:
|
| 158 |
+
df_reward = pd.DataFrame({"Step": [0], "Reward": [0.0]})
|
| 159 |
+
|
| 160 |
best_score = obs.get("info", {}).get("best_score", 0.0)
|
| 161 |
steps_left = obs.get("info", {}).get("steps_remaining", 5)
|
| 162 |
episode_id = obs.get("episode_id", "N/A")[:8]
|
|
|
|
| 165 |
total = obs.get("corpus_size", len(corpus_data))
|
| 166 |
corpus_stat = f"### ๐ Corpus: **{shown}** of **{total}** incidents displayed"
|
| 167 |
|
| 168 |
+
return df_corpus, policy_md, best_score, steps_left, episode_id, corpus_stat, df_reward
|
| 169 |
|
| 170 |
def handle_reset(task_id):
|
| 171 |
obs = env.reset(task_id=task_id).model_dump()
|
| 172 |
+
df, pol, score, steps, ep, stat, df_hist = format_obs(obs)
|
| 173 |
reward_msg = "### ๐ Scenario Initialized\nReview the Data Corpus and Active Framework to identify gaps."
|
| 174 |
+
return df, pol, score, steps, ep, stat, df_hist, reward_msg, json.dumps(obs, indent=2)
|
| 175 |
|
| 176 |
def handle_step(task_id, action_type, easy_term, easy_def, easy_just, easy_think,
|
| 177 |
med_domain, med_rule, med_scope, med_just, med_think,
|
|
|
|
| 188 |
validated_action = Action.model_validate(payload)
|
| 189 |
obs_obj = env.step(validated_action)
|
| 190 |
obs = obs_obj.model_dump()
|
| 191 |
+
df, pol, score, steps, ep, stat, df_hist = format_obs(obs)
|
| 192 |
|
| 193 |
reward = obs.get("reward", 0.0)
|
| 194 |
color = "green" if reward > 0 else "orange" if reward == 0 else "red"
|
| 195 |
reward_msg = f"### <span style='color:{color}'>Latest Strategic Reward: {reward}</span>\nCurrent Project Score: {score}"
|
| 196 |
|
| 197 |
+
return df, pol, score, steps, ep, stat, df_hist, reward_msg, json.dumps(obs, indent=2)
|
| 198 |
except Exception as e:
|
| 199 |
+
return pd.DataFrame(), f"### Execution Error\n{str(e)}", 0, 0, "ERROR", "### ERROR", pd.DataFrame(), f"Traceback:\n{traceback.format_exc()}", "{}"
|
| 200 |
|
| 201 |
with gr.Blocks(
|
| 202 |
title="PolicyEvolver Judge Console",
|
|
|
|
| 213 |
best_score_disp = gr.Number(label="Environment Best Score", value=0.0, interactive=False)
|
| 214 |
steps_left_disp = gr.Number(label="Remaining Execution Steps", value=5, interactive=False)
|
| 215 |
episode_disp = gr.Textbox(label="Active Episode ID", value="N/A", interactive=False)
|
| 216 |
+
|
| 217 |
+
gr.Markdown("### ๐ Reward Evolution")
|
| 218 |
+
reward_plot = gr.LinePlot(
|
| 219 |
+
label="Strategic Reward Trend",
|
| 220 |
+
x="Step",
|
| 221 |
+
y="Reward",
|
| 222 |
+
tooltip=["Step", "Reward"],
|
| 223 |
+
width=300,
|
| 224 |
+
height=200,
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
reward_outcome_disp = gr.Markdown("### Awaiting Scenario...")
|
| 228 |
|
| 229 |
gr.Markdown("---")
|
|
|
|
| 344 |
return (t_id, mode) + res
|
| 345 |
|
| 346 |
# Event Listeners
|
| 347 |
+
reset_btn.click(handle_reset, inputs=[task_id], outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box])
|
| 348 |
|
| 349 |
# Automatic Sync: Radio -> Dropdown & Initialize
|
| 350 |
action_mode.change(
|
| 351 |
sync_from_mode,
|
| 352 |
inputs=[action_mode],
|
| 353 |
+
outputs=[task_id, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
|
| 354 |
)
|
| 355 |
|
| 356 |
# Automatic Sync: Tab -> Dropdown & Radio & Initialize
|
| 357 |
action_tabs.select(
|
| 358 |
sync_from_tab,
|
| 359 |
+
outputs=[task_id, action_mode, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
|
| 360 |
)
|
| 361 |
|
| 362 |
step_btn.click(
|
|
|
|
| 367 |
med_domain, med_rule, med_scope, med_just, med_think,
|
| 368 |
hard_mods, hard_outcomes, hard_just, hard_think
|
| 369 |
],
|
| 370 |
+
outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
|
| 371 |
)
|
| 372 |
|
| 373 |
return demo
|
server/environment.py
CHANGED
|
@@ -132,6 +132,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
|
|
| 132 |
self._state.current_score = reward
|
| 133 |
self._state.best_score = max(self._state.best_score, reward)
|
| 134 |
self._persistent_best_score = max(self._persistent_best_score, reward)
|
|
|
|
| 135 |
|
| 136 |
action_type = action_dict.get("action_type", "unknown") if isinstance(action_dict, dict) else "unknown"
|
| 137 |
self._state.actions_taken.append(action_type)
|
|
@@ -178,6 +179,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
|
|
| 178 |
info={
|
| 179 |
"best_score": self._state.best_score,
|
| 180 |
"last_reward": reward,
|
|
|
|
| 181 |
"action_history": self._state.actions_taken,
|
| 182 |
"steps_remaining": self._state.max_steps - self._state.step_count,
|
| 183 |
},
|
|
|
|
| 132 |
self._state.current_score = reward
|
| 133 |
self._state.best_score = max(self._state.best_score, reward)
|
| 134 |
self._persistent_best_score = max(self._persistent_best_score, reward)
|
| 135 |
+
self._state.rewards_history.append(reward)
|
| 136 |
|
| 137 |
action_type = action_dict.get("action_type", "unknown") if isinstance(action_dict, dict) else "unknown"
|
| 138 |
self._state.actions_taken.append(action_type)
|
|
|
|
| 179 |
info={
|
| 180 |
"best_score": self._state.best_score,
|
| 181 |
"last_reward": reward,
|
| 182 |
+
"rewards_history": self._state.rewards_history,
|
| 183 |
"action_history": self._state.actions_taken,
|
| 184 |
"steps_remaining": self._state.max_steps - self._state.step_count,
|
| 185 |
},
|