Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- constants.py +7 -2
- rewards/README.md +24 -17
- rewards/exploration.py +3 -3
- rewards/generation.py +3 -3
- server/explainer_env_environment.py +4 -3
- tests/test_rewards.py +4 -3
constants.py
CHANGED
|
@@ -12,12 +12,17 @@ AVAILABLE_TOOLS = (
|
|
| 12 |
"search_hf_hub",
|
| 13 |
)
|
| 14 |
|
| 15 |
-
MAX_EXPLORE_REWARD =
|
| 16 |
MAX_GENERATE_REWARD = 1.0
|
| 17 |
-
MAX_REPAIR_REWARD =
|
| 18 |
SUCCESS_SCORE_THRESHOLD = 0.3
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
def normalized_episode_score(total_reward: float) -> float:
|
| 22 |
"""Normalize an episode's accumulated reward to the required [0, 1] range.
|
| 23 |
|
|
|
|
| 12 |
"search_hf_hub",
|
| 13 |
)
|
| 14 |
|
| 15 |
+
MAX_EXPLORE_REWARD = 1.0
|
| 16 |
MAX_GENERATE_REWARD = 1.0
|
| 17 |
+
MAX_REPAIR_REWARD = 1.0
|
| 18 |
SUCCESS_SCORE_THRESHOLD = 0.3
|
| 19 |
|
| 20 |
|
| 21 |
+
def clamp_action_reward(value: float) -> float:
|
| 22 |
+
"""Clamp any single action reward to the required [0, 1] range."""
|
| 23 |
+
return min(max(value, 0.0), 1.0)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
def normalized_episode_score(total_reward: float) -> float:
|
| 27 |
"""Normalize an episode's accumulated reward to the required [0, 1] range.
|
| 28 |
|
rewards/README.md
CHANGED
|
@@ -5,10 +5,11 @@ Multi-component reward system for the explore -> generate -> repair episode.
|
|
| 5 |
## Episode Flow
|
| 6 |
|
| 7 |
```
|
| 8 |
-
reset() --> [explore x 0..
|
| 9 |
```
|
| 10 |
|
| 11 |
Each step returns a per-step reward. The agent learns what tool to use, what to retrieve, when to stop exploring, and how to repair broken artifacts.
|
|
|
|
| 12 |
|
| 13 |
## Exploration Rewards (`exploration.py`)
|
| 14 |
|
|
@@ -16,12 +17,10 @@ Per-step reward for each `explore` action. Gated by information need -- once the
|
|
| 16 |
|
| 17 |
| Component | Weight | Range | Description |
|
| 18 |
|---|---|---|---|
|
| 19 |
-
| `
|
| 20 |
-
| `
|
| 21 |
-
| `
|
| 22 |
-
| `
|
| 23 |
-
| `result_novelty` | 0.15 | 0-1 | New normalized terms vs. previous context |
|
| 24 |
-
| `diversity` | 0.10 | 0-1 | Useful new source/tool diversity |
|
| 25 |
| `step_cost` | -0.05 | flat | Per-step penalty -- exploration must justify itself |
|
| 26 |
|
| 27 |
**Gating mechanism**: `info_need = 1 - sufficiency`. Raw reward is scaled by `0.3 + 0.7 * info_need`, so high sufficiency -> low reward for more exploration. This teaches the agent to stop when it has enough.
|
|
@@ -35,20 +34,20 @@ Reward on `generate` and `repair` actions. Uses **multiplicative gates** instead
|
|
| 35 |
| Condition | Effect |
|
| 36 |
|---|---|
|
| 37 |
| Code doesn't parse (AST fails) | total = 0 |
|
| 38 |
-
|
|
|
|
|
| 39 |
| Code executes successfully | total = quality * 1.0 |
|
| 40 |
|
| 41 |
### Quality components
|
| 42 |
|
| 43 |
| Component | Weight | Range | Description |
|
| 44 |
|---|---|---|---|
|
| 45 |
-
| `
|
| 46 |
-
| `
|
| 47 |
-
| `structure` | 0.
|
| 48 |
-
| `
|
| 49 |
-
| `context_usage` | 0.35 | 0-1 | Code references terms from exploration research |
|
| 50 |
|
| 51 |
-
|
| 52 |
|
| 53 |
### Marimo structure scoring
|
| 54 |
|
|
@@ -74,15 +73,23 @@ Clean code (no violations) gets +0.1 bonus.
|
|
| 74 |
|
| 75 |
### Repair scoring
|
| 76 |
|
| 77 |
-
If generation fails lint/build validation, the observation enters `repair` and exposes structured errors.
|
| 78 |
|
| 79 |
| Condition | Effect |
|
| 80 |
|---|---|
|
| 81 |
| First generation succeeds | Full eligible generation reward; episode ends |
|
| 82 |
-
| Repair succeeds | Base generation reward * 0.
|
| 83 |
-
| Repair fails | Base generation reward * 0.
|
| 84 |
| Code repeated unchanged | Additional penalty |
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
## Search Sources (`sources.py`)
|
| 87 |
|
| 88 |
All search calls are **async** (httpx + wikipediaapi.AsyncWikipedia). Content is retrieved at section/chunk level and ranked using **BM25** to surface the most relevant parts.
|
|
|
|
| 5 |
## Episode Flow
|
| 6 |
|
| 7 |
```
|
| 8 |
+
reset() --> [explore x 0..6] --> generate x 1 --> [repair x 0..3] --> done
|
| 9 |
```
|
| 10 |
|
| 11 |
Each step returns a per-step reward. The agent learns what tool to use, what to retrieve, when to stop exploring, and how to repair broken artifacts.
|
| 12 |
+
Every action reward and `*_total` component is clamped to the `0-1` range.
|
| 13 |
|
| 14 |
## Exploration Rewards (`exploration.py`)
|
| 15 |
|
|
|
|
| 17 |
|
| 18 |
| Component | Weight | Range | Description |
|
| 19 |
|---|---|---|---|
|
| 20 |
+
| `query_quality` | 0.20 | 0-1 | Query relevance plus tool fit |
|
| 21 |
+
| `evidence_quality` | 0.25 | 0-1 | Retrieved chunk quality plus useful source diversity |
|
| 22 |
+
| `information_gain` | 0.40 | 0-1 | Newly covered concepts plus result novelty |
|
| 23 |
+
| `efficiency` | 0.15 | 0-1 | Action novelty scaled by remaining information need |
|
|
|
|
|
|
|
| 24 |
| `step_cost` | -0.05 | flat | Per-step penalty -- exploration must justify itself |
|
| 25 |
|
| 26 |
**Gating mechanism**: `info_need = 1 - sufficiency`. Raw reward is scaled by `0.3 + 0.7 * info_need`, so high sufficiency -> low reward for more exploration. This teaches the agent to stop when it has enough.
|
|
|
|
| 34 |
| Condition | Effect |
|
| 35 |
|---|---|
|
| 36 |
| Code doesn't parse (AST fails) | total = 0 |
|
| 37 |
+
| Static check fails | total = quality * (0.12 to 0.18) |
|
| 38 |
+
| Code doesn't execute | total = quality * 0.30 |
|
| 39 |
| Code executes successfully | total = quality * 1.0 |
|
| 40 |
|
| 41 |
### Quality components
|
| 42 |
|
| 43 |
| Component | Weight | Range | Description |
|
| 44 |
|---|---|---|---|
|
| 45 |
+
| `validity` | 0.15 | 0-1 | Parse/static-check/execution validity |
|
| 46 |
+
| `task_alignment` | 0.30 | 0-1 | Keyword coverage plus preferred format match |
|
| 47 |
+
| `structure` | 0.30 | 0-1 | Structural quality (cells/scenes, UI, viz, `marimo check`) |
|
| 48 |
+
| `research_usage` | 0.25 | 0-1 | Code references terms from exploration research |
|
|
|
|
| 49 |
|
| 50 |
+
For manim, `structure` includes scene structure plus narration quality.
|
| 51 |
|
| 52 |
### Marimo structure scoring
|
| 53 |
|
|
|
|
| 73 |
|
| 74 |
### Repair scoring
|
| 75 |
|
| 76 |
+
If generation fails lint/build validation, the observation enters `repair` and exposes structured errors. Up to three repair attempts are allowed:
|
| 77 |
|
| 78 |
| Condition | Effect |
|
| 79 |
|---|---|
|
| 80 |
| First generation succeeds | Full eligible generation reward; episode ends |
|
| 81 |
+
| Repair succeeds | Base generation reward * 0.6, plus small bonuses for fixing prior error codes and changing code |
|
| 82 |
+
| Repair fails | Base generation reward * 0.25, plus a small bonus if prior error codes are fixed; episode ends |
|
| 83 |
| Code repeated unchanged | Additional penalty |
|
| 84 |
|
| 85 |
+
Repair reward components are:
|
| 86 |
+
|
| 87 |
+
| Component | Range | Description |
|
| 88 |
+
|---|---|---|
|
| 89 |
+
| `repair_success` | 0/1 | Whether the repaired artifact executes successfully |
|
| 90 |
+
| `fixed_prior_errors` | 0/1 | Whether previous error codes are gone |
|
| 91 |
+
| `changed_code` | 0/1 | Whether the repair changed the submitted code |
|
| 92 |
+
|
| 93 |
## Search Sources (`sources.py`)
|
| 94 |
|
| 95 |
All search calls are **async** (httpx + wikipediaapi.AsyncWikipedia). Content is retrieved at section/chunk level and ranked using **BM25** to surface the most relevant parts.
|
rewards/exploration.py
CHANGED
|
@@ -3,11 +3,11 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
try:
|
| 6 |
-
from ..constants import MAX_EXPLORE_REWARD
|
| 7 |
from ..research.retrieval import tokenize
|
| 8 |
from ..research.types import ResearchResult
|
| 9 |
except ImportError: # pragma: no cover - supports direct test execution
|
| 10 |
-
from constants import MAX_EXPLORE_REWARD
|
| 11 |
from research.retrieval import tokenize
|
| 12 |
from research.types import ResearchResult
|
| 13 |
|
|
@@ -238,7 +238,7 @@ def compute_explore_reward(
|
|
| 238 |
)
|
| 239 |
gate = _exploration_gate(sufficiency_after) if result_ok else 0.0
|
| 240 |
total = raw * gate + 0.08 * info_need - STEP_COST
|
| 241 |
-
total =
|
| 242 |
|
| 243 |
components = {
|
| 244 |
"query_quality": round(query_quality, 3),
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
try:
|
| 6 |
+
from ..constants import MAX_EXPLORE_REWARD, clamp_action_reward
|
| 7 |
from ..research.retrieval import tokenize
|
| 8 |
from ..research.types import ResearchResult
|
| 9 |
except ImportError: # pragma: no cover - supports direct test execution
|
| 10 |
+
from constants import MAX_EXPLORE_REWARD, clamp_action_reward
|
| 11 |
from research.retrieval import tokenize
|
| 12 |
from research.types import ResearchResult
|
| 13 |
|
|
|
|
| 238 |
)
|
| 239 |
gate = _exploration_gate(sufficiency_after) if result_ok else 0.0
|
| 240 |
total = raw * gate + 0.08 * info_need - STEP_COST
|
| 241 |
+
total = min(MAX_EXPLORE_REWARD, clamp_action_reward(total))
|
| 242 |
|
| 243 |
components = {
|
| 244 |
"query_quality": round(query_quality, 3),
|
rewards/generation.py
CHANGED
|
@@ -22,9 +22,9 @@ from typing import TYPE_CHECKING
|
|
| 22 |
from .sandbox import ast_parses, check_marimo, extract_scene_class
|
| 23 |
|
| 24 |
try:
|
| 25 |
-
from ..constants import MAX_REPAIR_REWARD
|
| 26 |
except ImportError: # pragma: no cover - supports direct test execution
|
| 27 |
-
from constants import MAX_REPAIR_REWARD
|
| 28 |
|
| 29 |
if TYPE_CHECKING:
|
| 30 |
from ..task_bank import Task
|
|
@@ -369,7 +369,7 @@ def adjust_repair_reward(
|
|
| 369 |
if not changed:
|
| 370 |
reward -= 0.15
|
| 371 |
|
| 372 |
-
reward =
|
| 373 |
return reward, {
|
| 374 |
"repair_success": 1.0 if repair_success else 0.0,
|
| 375 |
"fixed_prior_errors": 1.0 if fixed_prior else 0.0,
|
|
|
|
| 22 |
from .sandbox import ast_parses, check_marimo, extract_scene_class
|
| 23 |
|
| 24 |
try:
|
| 25 |
+
from ..constants import MAX_REPAIR_REWARD, clamp_action_reward
|
| 26 |
except ImportError: # pragma: no cover - supports direct test execution
|
| 27 |
+
from constants import MAX_REPAIR_REWARD, clamp_action_reward
|
| 28 |
|
| 29 |
if TYPE_CHECKING:
|
| 30 |
from ..task_bank import Task
|
|
|
|
| 369 |
if not changed:
|
| 370 |
reward -= 0.15
|
| 371 |
|
| 372 |
+
reward = min(MAX_REPAIR_REWARD, clamp_action_reward(reward))
|
| 373 |
return reward, {
|
| 374 |
"repair_success": 1.0 if repair_success else 0.0,
|
| 375 |
"fixed_prior_errors": 1.0 if fixed_prior else 0.0,
|
server/explainer_env_environment.py
CHANGED
|
@@ -21,7 +21,7 @@ from openenv.core.env_server.interfaces import Environment
|
|
| 21 |
from openenv.core.env_server.types import State
|
| 22 |
|
| 23 |
try:
|
| 24 |
-
from ..constants import MAX_EXPLORE_STEPS, MAX_REPAIR_STEPS
|
| 25 |
from ..models import ExplainerAction, ExplainerObservation
|
| 26 |
from ..research import AVAILABLE_TOOLS, run_research_tool
|
| 27 |
from ..rewards.exploration import compute_explore_reward
|
|
@@ -29,7 +29,7 @@ try:
|
|
| 29 |
from ..rewards.sandbox import validate_code
|
| 30 |
from ..task_bank import ALL_TASKS, EASY_TASKS, HARD_TASKS, MEDIUM_TASKS, Task
|
| 31 |
except ImportError:
|
| 32 |
-
from constants import MAX_EXPLORE_STEPS, MAX_REPAIR_STEPS
|
| 33 |
from models import ExplainerAction, ExplainerObservation
|
| 34 |
from research import AVAILABLE_TOOLS, run_research_tool
|
| 35 |
from rewards.exploration import compute_explore_reward
|
|
@@ -385,7 +385,8 @@ class ExplainerEnvironment(Environment):
|
|
| 385 |
static_check_passed=sandbox.check_passed,
|
| 386 |
error_codes=sandbox.error_codes,
|
| 387 |
)
|
| 388 |
-
reward =
|
|
|
|
| 389 |
|
| 390 |
self._last_code = code
|
| 391 |
self._last_format = fmt
|
|
|
|
| 21 |
from openenv.core.env_server.types import State
|
| 22 |
|
| 23 |
try:
|
| 24 |
+
from ..constants import MAX_EXPLORE_STEPS, MAX_REPAIR_STEPS, clamp_action_reward
|
| 25 |
from ..models import ExplainerAction, ExplainerObservation
|
| 26 |
from ..research import AVAILABLE_TOOLS, run_research_tool
|
| 27 |
from ..rewards.exploration import compute_explore_reward
|
|
|
|
| 29 |
from ..rewards.sandbox import validate_code
|
| 30 |
from ..task_bank import ALL_TASKS, EASY_TASKS, HARD_TASKS, MEDIUM_TASKS, Task
|
| 31 |
except ImportError:
|
| 32 |
+
from constants import MAX_EXPLORE_STEPS, MAX_REPAIR_STEPS, clamp_action_reward
|
| 33 |
from models import ExplainerAction, ExplainerObservation
|
| 34 |
from research import AVAILABLE_TOOLS, run_research_tool
|
| 35 |
from rewards.exploration import compute_explore_reward
|
|
|
|
| 385 |
static_check_passed=sandbox.check_passed,
|
| 386 |
error_codes=sandbox.error_codes,
|
| 387 |
)
|
| 388 |
+
reward = clamp_action_reward(reward + skip_penalty)
|
| 389 |
+
components["generate_total"] = round(reward, 4)
|
| 390 |
|
| 391 |
self._last_code = code
|
| 392 |
self._last_format = fmt
|
tests/test_rewards.py
CHANGED
|
@@ -412,7 +412,7 @@ def test_reward_spread():
|
|
| 412 |
assert len(unique) >= 3
|
| 413 |
|
| 414 |
|
| 415 |
-
def
|
| 416 |
reward, comp = adjust_repair_reward(
|
| 417 |
1.0,
|
| 418 |
repair_success=True,
|
|
@@ -421,7 +421,8 @@ def test_repair_reward_success_is_capped_and_changed():
|
|
| 421 |
previous_code="x =",
|
| 422 |
repaired_code="x = 1",
|
| 423 |
)
|
| 424 |
-
assert reward ==
|
|
|
|
| 425 |
assert comp["repair_success"] == 1.0
|
| 426 |
assert comp["fixed_prior_errors"] == 1.0
|
| 427 |
assert comp["changed_code"] == 1.0
|
|
@@ -494,7 +495,7 @@ if __name__ == "__main__":
|
|
| 494 |
test_marimo_static_failure_is_not_code_valid,
|
| 495 |
test_generate_reward_wrong_format,
|
| 496 |
test_reward_spread,
|
| 497 |
-
|
| 498 |
test_repair_reward_penalizes_repeated_code,
|
| 499 |
test_repair_reward_failed_fix_stays_discounted,
|
| 500 |
test_normalized_episode_score_bounds,
|
|
|
|
| 412 |
assert len(unique) >= 3
|
| 413 |
|
| 414 |
|
| 415 |
+
def test_repair_reward_success_is_discounted_and_changed():
|
| 416 |
reward, comp = adjust_repair_reward(
|
| 417 |
1.0,
|
| 418 |
repair_success=True,
|
|
|
|
| 421 |
previous_code="x =",
|
| 422 |
repaired_code="x = 1",
|
| 423 |
)
|
| 424 |
+
assert reward == 0.72
|
| 425 |
+
assert 0.0 <= reward <= MAX_REPAIR_REWARD
|
| 426 |
assert comp["repair_success"] == 1.0
|
| 427 |
assert comp["fixed_prior_errors"] == 1.0
|
| 428 |
assert comp["changed_code"] == 1.0
|
|
|
|
| 495 |
test_marimo_static_failure_is_not_code_valid,
|
| 496 |
test_generate_reward_wrong_format,
|
| 497 |
test_reward_spread,
|
| 498 |
+
test_repair_reward_success_is_discounted_and_changed,
|
| 499 |
test_repair_reward_penalizes_repeated_code,
|
| 500 |
test_repair_reward_failed_fix_stays_discounted,
|
| 501 |
test_normalized_episode_score_bounds,
|