Spaces:

luciferai-devil
/

devil-policyevolverenv

Sleeping

App Files Files Community

Somuai12 committed on Apr 5

Commit

d78cfdc

1 Parent(s): 933baa6

feat: add reward evolution chart to Gradio dashboard

Browse files

Files changed (5) hide show

evolution_logs.txt +30 -0
inference.py +2 -2
models.py +1 -0
server/app.py +30 -16
server/environment.py +2 -0

evolution_logs.txt ADDED Viewed

	@@ -0,0 +1,30 @@

+[START] task=task_easy env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=2 action=propose_clarification reward=0.86 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=3 action=propose_clarification reward=0.89 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=4 action=propose_clarification reward=0.00 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=5 action=propose_clarification reward=0.86 done=true error=null
+[END] success=true steps=5 score=0.865 rewards=0.12,0.86,0.89,0.00,0.86
+[START] task=task_medium env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=2 action=propose_clarification reward=0.00 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=3 action=propose_new_rule reward=0.80 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=4 action=propose_clarification reward=0.00 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=5 action=propose_clarification reward=0.00 done=true error=null
+[END] success=false steps=5 score=0.000 rewards=0.12,0.00,0.80,0.00,0.00
+[START] task=task_hard env=PolicyEvolverEnv model=meta-llama/Llama-3.3-70B-Instruct
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=1 action=propose_clarification reward=0.12 done=false error=null
+INFO:httpx:HTTP Request: POST https://router.huggingface.co/v1/chat/completions "HTTP/1.1 200 OK"
+[STEP] step=2 action=evolve_policy reward=0.90 done=true error=null
+[END] success=true steps=2 score=0.900 rewards=0.12,0.90

inference.py CHANGED Viewed

@@ -103,8 +103,8 @@ def run_episode(task_id: str):
     rewards = []
     success = False
-    # Strategic refinement for 3 steps (Fix C: Limit steps for 20min run)
-    for _ in range(3):
         step_num += 1
         action_dict = agent.act(task_id, obs.model_dump())

     rewards = []
     success = False
+    # Strategic refinement for 5 steps (Audit Mode: 5 iterations)
+    for _ in range(5):
         step_num += 1
         action_dict = agent.act(task_id, obs.model_dump())

models.py CHANGED Viewed

@@ -100,3 +100,4 @@ class State(BaseModel):
     current_score: float = 0.0
     best_score: float = 0.0
     actions_taken: List[str] = Field(default_factory=list)

     current_score: float = 0.0
     best_score: float = 0.0
     actions_taken: List[str] = Field(default_factory=list)
+    rewards_history: List[float] = Field(default_factory=list)

server/app.py CHANGED Viewed

@@ -148,12 +148,15 @@ def build_custom_ui():
             })
         df_corpus = pd.DataFrame(corpus_data) if corpus_data else pd.DataFrame(columns=["ID", "Content", "System Action"])
-        # 2. Policy List (Markdown)
-        policy_md = "### 📜 Active Governance Framework\n"
-        for p in obs.get("current_policies", []):
-            policy_md += f"- **{p.get('id')}**: {p.get('text')}\n"
-        # 3. Simple Stats
         best_score = obs.get("info", {}).get("best_score", 0.0)
         steps_left = obs.get("info", {}).get("steps_remaining", 5)
         episode_id = obs.get("episode_id", "N/A")[:8]
@@ -162,13 +165,13 @@ def build_custom_ui():
         total = obs.get("corpus_size", len(corpus_data))
         corpus_stat = f"### 📊 Corpus: **{shown}** of **{total}** incidents displayed"
-        return df_corpus, policy_md, best_score, steps_left, episode_id, corpus_stat
     def handle_reset(task_id):
         obs = env.reset(task_id=task_id).model_dump()
-        df, pol, score, steps, ep, stat = format_obs(obs)
         reward_msg = "### 🏁 Scenario Initialized\nReview the Data Corpus and Active Framework to identify gaps."
-        return df, pol, score, steps, ep, stat, reward_msg, json.dumps(obs, indent=2)
     def handle_step(task_id, action_type, easy_term, easy_def, easy_just, easy_think,
                     med_domain, med_rule, med_scope, med_just, med_think,
@@ -185,15 +188,15 @@ def build_custom_ui():
             validated_action = Action.model_validate(payload)
             obs_obj = env.step(validated_action)
             obs = obs_obj.model_dump()
-            df, pol, score, steps, ep, stat = format_obs(obs)
             reward = obs.get("reward", 0.0)
             color = "green" if reward > 0 else "orange" if reward == 0 else "red"
             reward_msg = f"### <span style='color:{color}'>Latest Strategic Reward: {reward}</span>\nCurrent Project Score: {score}"
-            return df, pol, score, steps, ep, stat, reward_msg, json.dumps(obs, indent=2)
         except Exception as e:
-            return pd.DataFrame(), f"### Execution Error\n{str(e)}", 0, 0, "ERROR", "### ERROR", f"Traceback:\n{traceback.format_exc()}", "{}"
     with gr.Blocks(
         title="PolicyEvolver Judge Console",
@@ -210,6 +213,17 @@ def build_custom_ui():
                 best_score_disp = gr.Number(label="Environment Best Score", value=0.0, interactive=False)
                 steps_left_disp = gr.Number(label="Remaining Execution Steps", value=5, interactive=False)
                 episode_disp = gr.Textbox(label="Active Episode ID", value="N/A", interactive=False)
                 reward_outcome_disp = gr.Markdown("### Awaiting Scenario...")
                 gr.Markdown("---")
@@ -330,19 +344,19 @@ def build_custom_ui():
             return (t_id, mode) + res
         # Event Listeners
-        reset_btn.click(handle_reset, inputs=[task_id], outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box])
         # Automatic Sync: Radio -> Dropdown & Initialize
         action_mode.change(
             sync_from_mode,
             inputs=[action_mode],
-            outputs=[task_id, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
         )
         # Automatic Sync: Tab -> Dropdown & Radio & Initialize
         action_tabs.select(
             sync_from_tab,
-            outputs=[task_id, action_mode, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
         )
         step_btn.click(
@@ -353,7 +367,7 @@ def build_custom_ui():
                 med_domain, med_rule, med_scope, med_just, med_think,
                 hard_mods, hard_outcomes, hard_just, hard_think
             ],
-            outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_outcome_disp, raw_json_box]
         )
     return demo

             })
         df_corpus = pd.DataFrame(corpus_data) if corpus_data else pd.DataFrame(columns=["ID", "Content", "System Action"])
+        # 3. Simple Stats & Reward History
+        history = obs.get("info", {}).get("rewards_history", [])
+        df_reward = pd.DataFrame({
+            "Step": [i + 1 for i in range(len(history))],
+            "Reward": history
+        })
+        if df_reward.empty:
+            df_reward = pd.DataFrame({"Step": [0], "Reward": [0.0]})
         best_score = obs.get("info", {}).get("best_score", 0.0)
         steps_left = obs.get("info", {}).get("steps_remaining", 5)
         episode_id = obs.get("episode_id", "N/A")[:8]
         total = obs.get("corpus_size", len(corpus_data))
         corpus_stat = f"### 📊 Corpus: **{shown}** of **{total}** incidents displayed"
+        return df_corpus, policy_md, best_score, steps_left, episode_id, corpus_stat, df_reward
     def handle_reset(task_id):
         obs = env.reset(task_id=task_id).model_dump()
+        df, pol, score, steps, ep, stat, df_hist = format_obs(obs)
         reward_msg = "### 🏁 Scenario Initialized\nReview the Data Corpus and Active Framework to identify gaps."
+        return df, pol, score, steps, ep, stat, df_hist, reward_msg, json.dumps(obs, indent=2)
     def handle_step(task_id, action_type, easy_term, easy_def, easy_just, easy_think,
                     med_domain, med_rule, med_scope, med_just, med_think,
             validated_action = Action.model_validate(payload)
             obs_obj = env.step(validated_action)
             obs = obs_obj.model_dump()
+            df, pol, score, steps, ep, stat, df_hist = format_obs(obs)
             reward = obs.get("reward", 0.0)
             color = "green" if reward > 0 else "orange" if reward == 0 else "red"
             reward_msg = f"### <span style='color:{color}'>Latest Strategic Reward: {reward}</span>\nCurrent Project Score: {score}"
+            return df, pol, score, steps, ep, stat, df_hist, reward_msg, json.dumps(obs, indent=2)
         except Exception as e:
+            return pd.DataFrame(), f"### Execution Error\n{str(e)}", 0, 0, "ERROR", "### ERROR", pd.DataFrame(), f"Traceback:\n{traceback.format_exc()}", "{}"
     with gr.Blocks(
         title="PolicyEvolver Judge Console",
                 best_score_disp = gr.Number(label="Environment Best Score", value=0.0, interactive=False)
                 steps_left_disp = gr.Number(label="Remaining Execution Steps", value=5, interactive=False)
                 episode_disp = gr.Textbox(label="Active Episode ID", value="N/A", interactive=False)
+                gr.Markdown("### 📈 Reward Evolution")
+                reward_plot = gr.LinePlot(
+                    label="Strategic Reward Trend",
+                    x="Step",
+                    y="Reward",
+                    tooltip=["Step", "Reward"],
+                    width=300,
+                    height=200,
+                )
                 reward_outcome_disp = gr.Markdown("### Awaiting Scenario...")
                 gr.Markdown("---")
             return (t_id, mode) + res
         # Event Listeners
+        reset_btn.click(handle_reset, inputs=[task_id], outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box])
         # Automatic Sync: Radio -> Dropdown & Initialize
         action_mode.change(
             sync_from_mode,
             inputs=[action_mode],
+            outputs=[task_id, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
         )
         # Automatic Sync: Tab -> Dropdown & Radio & Initialize
         action_tabs.select(
             sync_from_tab,
+            outputs=[task_id, action_mode, corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
         )
         step_btn.click(
                 med_domain, med_rule, med_scope, med_just, med_think,
                 hard_mods, hard_outcomes, hard_just, hard_think
             ],
+            outputs=[corpus_table, policy_display, best_score_disp, steps_left_disp, episode_disp, corpus_count_disp, reward_plot, reward_outcome_disp, raw_json_box]
         )
     return demo

server/environment.py CHANGED Viewed

@@ -132,6 +132,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
         self._state.current_score = reward
         self._state.best_score = max(self._state.best_score, reward)
         self._persistent_best_score = max(self._persistent_best_score, reward)
         action_type = action_dict.get("action_type", "unknown") if isinstance(action_dict, dict) else "unknown"
         self._state.actions_taken.append(action_type)
@@ -178,6 +179,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
             info={
                 "best_score": self._state.best_score,
                 "last_reward": reward,
                 "action_history": self._state.actions_taken,
                 "steps_remaining": self._state.max_steps - self._state.step_count,
             },

         self._state.current_score = reward
         self._state.best_score = max(self._state.best_score, reward)
         self._persistent_best_score = max(self._persistent_best_score, reward)
+        self._state.rewards_history.append(reward)
         action_type = action_dict.get("action_type", "unknown") if isinstance(action_dict, dict) else "unknown"
         self._state.actions_taken.append(action_type)
             info={
                 "best_score": self._state.best_score,
                 "last_reward": reward,
+                "rewards_history": self._state.rewards_history,
                 "action_history": self._state.actions_taken,
                 "steps_remaining": self._state.max_steps - self._state.step_count,
             },