Somuai12 committed on
Commit
d78cfdc
·
1 Parent(s): 933baa6

feat: add reward evolution chart to Gradio dashboard

Browse files
Files changed (5) hide show
  1. evolution_logs.txt +30 -0
  2. inference.py +2 -2
  3. models.py +1 -0
  4. server/app.py +30 -16
  5. server/environment.py +2 -0
evolution_logs.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [START] task=task_easy env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
2
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
3
+ [STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
4
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
5
+ [STEP] step=2 action=propose_clarification reward=0.86 done=false error=null
6
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
7
+ [STEP] step=3 action=propose_clarification reward=0.89 done=false error=null
8
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
9
+ [STEP] step=4 action=propose_clarification reward=0.00 done=false error=null
10
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
11
+ [STEP] step=5 action=propose_clarification reward=0.86 done=true error=null
12
+ [END] success=true steps=5 score=0.865 rewards=0.12,0.86,0.89,0.00,0.86
13
+ [START] task=task_medium env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
14
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
15
+ [STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
16
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
17
+ [STEP] step=2 action=propose_clarification reward=0.00 done=false error=null
18
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
19
+ [STEP] step=3 action=propose_new_rule reward=0.80 done=false error=null
20
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
21
+ [STEP] step=4 action=propose_clarification reward=0.00 done=false error=null
22
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
23
+ [STEP] step=5 action=propose_clarification reward=0.00 done=true error=null
24
+ [END] success=false steps=5 score=0.000 rewards=0.12,0.00,0.80,0.00,0.00
25
+ [START] task=task_hard env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
26
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
27
+ [STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
28
+ INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
29
+ [STEP] step=2 action=evolve_policy reward=0.90 done=true error=null
30
+ [END] success=true steps=2 score=0.900 rewards=0.12,0.90
inference.py CHANGED
@@ -103,8 +103,8 @@ def run_episode(task_id: str):
103
  rewards = []
104
  success = False
105
 
106
- # Strategic refinement for 3 steps (Fix C: Limit steps for 20min run)
107
- for _ in range(3):
108
  step_num += 1
109
  action_dict = agent.act(task_id, obs.model_dump())
110
 
 
103
  rewards = []
104
  success = False
105
 
106
+ # Strategic refinement for 5 steps (Audit Mode: 5 iterations)
107
+ for _ in range(5):
108
  step_num += 1
109
  action_dict = agent.act(task_id, obs.model_dump())
110
 
models.py CHANGED
@@ -100,3 +100,4 @@ class State(BaseModel):
100
  current_score: float = 0.0
101
  best_score: float = 0.0
102
  actions_taken: List[str] = Field(default_factory=list)
 
 
100
  current_score: float = 0.0
101
  best_score: float = 0.0
102
  actions_taken: List[str] = Field(default_factory=list)
103
+ rewards_history: List[float] = Field(default_factory=list)
server/app.py CHANGED
@@ -148,12 +148,15 @@ def build_custom_ui():
148
  })
149
  df_corpus = pd.DataFrame(corpus_data) if corpus_data else pd.DataFrame(columns=["ID", "Content", "System Action"])
150
 
151
- # 2. Policy List (Markdown)
152
- policy_md = "### 📜 Active Governance Framework\n"
153
- for p in obs.get("current_policies", []):
154
- policy_md += f"- **{p.get('id')}**: {p.get('text')}\n"
155
-
156
- # 3. Simple Stats
 
 
 
157
  best_score = obs.get("info", {}).get("best_score", 0.0)
158
  steps_left = obs.get("info", {}).get("steps_remaining", 5)
159
  episode_id = obs.get("episode_id", "N/A")[:8]
@@ -162,13 +165,13 @@ def build_custom_ui():
162
  total = obs.get("corpus_size", len(corpus_data))
163
  corpus_stat = f"### 📊 Corpus: **{shown}** of **{total}** incidents displayed"
164
 
165
- return df_corpus, policy_md, best_score, steps_left, episode_id, corpus_stat
166
 
167
  def handle_reset(task_id):
168
  obs = env.reset(task_id=task_id).model_dump()
169
- df, pol, score, steps, ep, stat = format_obs(obs)
170
  reward_msg = "### 🏁 Scenario Initialized\nReview the Data Corpus and Active Framework to identify gaps."
171
- return df, pol, score, steps, ep, stat, reward_msg, json.dumps(obs, indent=2)
172
 
173
  def handle_step(task_id, action_type, easy_term, easy_def, easy_just, easy_think,
174
  med_domain, med_rule, med_scope, med_just, med_think,
@@ -185,15 +188,15 @@ def build_custom_ui():
185
  validated_action = Action.model_validate(payload)
186
  obs_obj = env.step(validated_action)
187
  obs = obs_obj.model_dump()
188
- df, pol, score, steps, ep, stat = format_obs(obs)
189
 
190
  reward = obs.get("reward", 0.0)
191
  color = "green" if reward > 0 else "orange" if reward == 0 else "red"
192
  reward_msg = f"### <span style='color:{color}'>Latest Strategic Reward: {reward}</span>\nCurrent Project Score: {score}"
193
 
194
- return df, pol, score, steps, ep, stat, reward_msg, json.dumps(obs, indent=2)
195
  except Exception as e:
196
- return pd.DataFrame(), f"### Execution Error\n{str(e)}", 0, 0, "ERROR", "### ERROR", f"Traceback:\n{traceback.format_exc()}", "{}"
197
 
198
  with gr.Blocks(
199
  title="PolicyEvolver Judge Console",
@@ -210,6 +213,17 @@ def build_custom_ui():
210
  best_score_disp = gr.Number(label="Environment Best Score", value=0.0, interactive=False)
211
  steps_left_disp = gr.Number(label="Remaining Execution Steps", value=5, interactive=False)
212
  episode_disp = gr.Textbox(label="Active Episode ID", value="N/A", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
213
  reward_outcome_disp = gr.Markdown("### Awaiting Scenario...")
214
 
215
  gr.Markdown("---")
@@ -330,19 +344,19 @@ def build_custom_ui():
330
  return (t_id, mode) + res
331
 
332
  # Event Listeners
333
- reset_btn.click(handle_reset, inputs=[task_id], outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box])
334
 
335
  # Automatic Sync: Radio -> Dropdown & Initialize
336
  action_mode.change(
337
  sync_from_mode,
338
  inputs=[action_mode],
339
- outputs=[task_id, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
340
  )
341
 
342
  # Automatic Sync: Tab -> Dropdown & Radio & Initialize
343
  action_tabs.select(
344
  sync_from_tab,
345
- outputs=[task_id, action_mode, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
346
  )
347
 
348
  step_btn.click(
@@ -353,7 +367,7 @@ def build_custom_ui():
353
  med_domain, med_rule, med_scope, med_just, med_think,
354
  hard_mods, hard_outcomes, hard_just, hard_think
355
  ],
356
- outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
357
  )
358
 
359
  return demo
 
148
  })
149
  df_corpus = pd.DataFrame(corpus_data) if corpus_data else pd.DataFrame(columns=["ID", "Content", "System Action"])
150
 
151
+ # 3. Simple Stats & Reward History
152
+ history = obs.get("info", {}).get("rewards_history", [])
153
+ df_reward = pd.DataFrame({
154
+ "Step": [i + 1 for i in range(len(history))],
155
+ "Reward": history
156
+ })
157
+ if df_reward.empty:
158
+ df_reward = pd.DataFrame({"Step": [0], "Reward": [0.0]})
159
+
160
  best_score = obs.get("info", {}).get("best_score", 0.0)
161
  steps_left = obs.get("info", {}).get("steps_remaining", 5)
162
  episode_id = obs.get("episode_id", "N/A")[:8]
 
165
  total = obs.get("corpus_size", len(corpus_data))
166
  corpus_stat = f"### 📊 Corpus: **{shown}** of **{total}** incidents displayed"
167
 
168
+ return df_corpus, policy_md, best_score, steps_left, episode_id, corpus_stat, df_reward
169
 
170
  def handle_reset(task_id):
171
  obs = env.reset(task_id=task_id).model_dump()
172
+ df, pol, score, steps, ep, stat, df_hist = format_obs(obs)
173
  reward_msg = "### 🏁 Scenario Initialized\nReview the Data Corpus and Active Framework to identify gaps."
174
+ return df, pol, score, steps, ep, stat, df_hist, reward_msg, json.dumps(obs, indent=2)
175
 
176
  def handle_step(task_id, action_type, easy_term, easy_def, easy_just, easy_think,
177
  med_domain, med_rule, med_scope, med_just, med_think,
 
188
  validated_action = Action.model_validate(payload)
189
  obs_obj = env.step(validated_action)
190
  obs = obs_obj.model_dump()
191
+ df, pol, score, steps, ep, stat, df_hist = format_obs(obs)
192
 
193
  reward = obs.get("reward", 0.0)
194
  color = "green" if reward > 0 else "orange" if reward == 0 else "red"
195
  reward_msg = f"### <span style='color:{color}'>Latest Strategic Reward: {reward}</span>\nCurrent Project Score: {score}"
196
 
197
+ return df, pol, score, steps, ep, stat, df_hist, reward_msg, json.dumps(obs, indent=2)
198
  except Exception as e:
199
+ return pd.DataFrame(), f"### Execution Error\n{str(e)}", 0, 0, "ERROR", "### ERROR", pd.DataFrame(), f"Traceback:\n{traceback.format_exc()}", "{}"
200
 
201
  with gr.Blocks(
202
  title="PolicyEvolver Judge Console",
 
213
  best_score_disp = gr.Number(label="Environment Best Score", value=0.0, interactive=False)
214
  steps_left_disp = gr.Number(label="Remaining Execution Steps", value=5, interactive=False)
215
  episode_disp = gr.Textbox(label="Active Episode ID", value="N/A", interactive=False)
216
+
217
+ gr.Markdown("### 📈 Reward Evolution")
218
+ reward_plot = gr.LinePlot(
219
+ label="Strategic Reward Trend",
220
+ x="Step",
221
+ y="Reward",
222
+ tooltip=["Step", "Reward"],
223
+ width=300,
224
+ height=200,
225
+ )
226
+
227
  reward_outcome_disp = gr.Markdown("### Awaiting Scenario...")
228
 
229
  gr.Markdown("---")
 
344
  return (t_id, mode) + res
345
 
346
  # Event Listeners
347
+ reset_btn.click(handle_reset, inputs=[task_id], outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box])
348
 
349
  # Automatic Sync: Radio -> Dropdown & Initialize
350
  action_mode.change(
351
  sync_from_mode,
352
  inputs=[action_mode],
353
+ outputs=[task_id, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
354
  )
355
 
356
  # Automatic Sync: Tab -> Dropdown & Radio & Initialize
357
  action_tabs.select(
358
  sync_from_tab,
359
+ outputs=[task_id, action_mode, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
360
  )
361
 
362
  step_btn.click(
 
367
  med_domain, med_rule, med_scope, med_just, med_think,
368
  hard_mods, hard_outcomes, hard_just, hard_think
369
  ],
370
+ outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
371
  )
372
 
373
  return demo
server/environment.py CHANGED
@@ -132,6 +132,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
132
  self._state.current_score = reward
133
  self._state.best_score = max(self._state.best_score, reward)
134
  self._persistent_best_score = max(self._persistent_best_score, reward)
 
135
 
136
  action_type = action_dict.get("action_type", "unknown") if isinstance(action_dict, dict) else "unknown"
137
  self._state.actions_taken.append(action_type)
@@ -178,6 +179,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
178
  info={
179
  "best_score": self._state.best_score,
180
  "last_reward": reward,
 
181
  "action_history": self._state.actions_taken,
182
  "steps_remaining": self._state.max_steps - self._state.step_count,
183
  },
 
132
  self._state.current_score = reward
133
  self._state.best_score = max(self._state.best_score, reward)
134
  self._persistent_best_score = max(self._persistent_best_score, reward)
135
+ self._state.rewards_history.append(reward)
136
 
137
  action_type = action_dict.get("action_type", "unknown") if isinstance(action_dict, dict) else "unknown"
138
  self._state.actions_taken.append(action_type)
 
179
  info={
180
  "best_score": self._state.best_score,
181
  "last_reward": reward,
182
+ "rewards_history": self._state.rewards_history,
183
  "action_history": self._state.actions_taken,
184
  "steps_remaining": self._state.max_steps - self._state.step_count,
185
  },