# NOTE: the next three lines are page-scrape header text (not code), kept as comments.
# CreativeEngineer's picture
# feat: reward verifier alignment, notebook hardening, model name fix
# cdc237b
from __future__ import annotations
from typing import Literal
from openenv.core import Action, Observation, State
from pydantic import BaseModel, Field
# Closed string vocabularies used as field types by the models in this module.

# Values accepted by the ``intent`` fields.
ActionIntent = Literal[
    "run",
    "submit",
    "restore_best",
]

# Values accepted by ``dominant_constraint``; "none" is that field's default.
ConstraintName = Literal["none", "aspect_ratio", "average_triangularity", "edge_iota_over_nfp"]

# Values accepted by the ``parameter`` fields; they match the field names of
# ``LowDimBoundaryParams``.
ParameterName = Literal["aspect_ratio", "elongation", "rotational_transform", "triangularity_scale"]

# Values accepted by the ``direction`` fields.
DirectionName = Literal[
    "increase",
    "decrease",
]

# Values accepted by the ``magnitude`` fields.
MagnitudeName = Literal[
    "small",
    "medium",
    "large",
]

# Values accepted by ``evaluation_fidelity``.
EvaluationFidelityName = Literal[
    "low",
    "high",
]
class LowDimBoundaryParams(BaseModel):
    """Four float parameters, one per name listed in ``ParameterName``.

    None of the fields declares a default, so all four must be supplied when
    the model is constructed.
    """

    aspect_ratio: float
    elongation: float
    rotational_transform: float
    triangularity_scale: float
def default_low_dim_boundary_params() -> LowDimBoundaryParams:
    """Build a new ``LowDimBoundaryParams`` populated with the baseline values.

    Returns:
        A freshly constructed instance; every call produces a separate object.
    """
    baseline = {
        "aspect_ratio": 3.6,
        "elongation": 1.4,
        "rotational_transform": 1.5,
        "triangularity_scale": 0.55,
    }
    return LowDimBoundaryParams(**baseline)
class RewardBreakdown(BaseModel):
    """Itemised record of a reward: intent, total, flags, and named components.

    Every field has a default, so ``RewardBreakdown()`` is valid and yields an
    all-zero / all-False / all-None record with ``intent="run"``.
    """

    # Intent the breakdown is recorded for, and the overall reward value.
    # NOTE(review): presumably ``total`` aggregates the components below --
    # confirm against the code that fills this model in.
    intent: ActionIntent = "run"
    total: float = 0.0

    # Boolean outcome flags; all default to False.
    evaluation_failed: bool = False
    recovered_from_failure: bool = False
    reference_constraints_satisfied: bool = False

    # Optional float metrics; None until a value is assigned.
    reference_score: float | None = None
    reference_feasibility: float | None = None
    reference_max_elongation: float | None = None
    initial_reference_score: float | None = None
    terminal_score_ratio: float | None = None

    # Named float components; each defaults to 0.0.
    invalid_action_penalty: float = 0.0
    failure_penalty: float = 0.0
    failure_submit_penalty: float = 0.0
    failure_budget_penalty: float = 0.0
    feasibility_crossing_bonus: float = 0.0
    feasibility_regression_penalty: float = 0.0
    feasibility_delta_reward: float = 0.0
    best_feasibility_bonus: float = 0.0
    near_feasible_bonus: float = 0.0
    aspect_ratio_repair_reward: float = 0.0
    triangularity_repair_reward: float = 0.0
    iota_repair_reward: float = 0.0
    objective_delta_reward: float = 0.0
    best_score_bonus: float = 0.0
    step_cost: float = 0.0
    no_progress_penalty: float = 0.0
    repeat_state_penalty: float = 0.0
    recovery_bonus: float = 0.0
    terminal_improvement_bonus: float = 0.0
    terminal_budget_bonus: float = 0.0
    terminal_no_improvement_penalty: float = 0.0
def default_reward_breakdown() -> RewardBreakdown:
    """Return a new ``RewardBreakdown`` with every field at its declared default."""
    breakdown = RewardBreakdown()
    return breakdown
class ActionMonitor(BaseModel):
    """Record of one action: what was requested, parameter snapshots, and flags.

    All fields have defaults. ``params_before`` and ``params_after`` each call
    ``default_low_dim_boundary_params`` separately, so a default-constructed
    monitor holds two distinct parameter objects.
    """

    # The request: intent plus the optional parameter / direction / magnitude.
    intent: ActionIntent = "run"
    parameter: ParameterName | None = None
    direction: DirectionName | None = None
    magnitude: MagnitudeName | None = None

    # Parameter snapshots.
    params_before: LowDimBoundaryParams = Field(
        default_factory=default_low_dim_boundary_params,
    )
    params_after: LowDimBoundaryParams = Field(
        default_factory=default_low_dim_boundary_params,
    )

    # Boolean flags; all default to False.
    clamped: bool = False
    no_op: bool = False
    repeat_state: bool = False
    used_best_params: bool = False
def default_action_monitor() -> ActionMonitor:
    """Return a new ``ActionMonitor`` whose two snapshots hold the default parameters.

    ``params_before`` and ``params_after`` are built by separate calls to
    ``default_low_dim_boundary_params``. Passing a single instance to both
    fields would let them share one mutable object (pydantic v2 does not copy
    model instances during validation by default), so a later in-place change
    to one snapshot would silently alter the other. Building two instances
    also matches what the fields' own ``default_factory`` does.

    Returns:
        A monitor with equal-valued but independent parameter snapshots and
        every other field at its declared default.
    """
    return ActionMonitor(
        params_before=default_low_dim_boundary_params(),
        params_after=default_low_dim_boundary_params(),
    )
class StellaratorAction(Action):
    """Action model extending the OpenEnv ``Action`` base class.

    ``intent`` is the only field without a default. ``parameter``,
    ``direction`` and ``magnitude`` default to None, and ``reasoning``
    defaults to the empty string.
    """

    intent: ActionIntent

    # Optional details of the requested change.
    parameter: ParameterName | None = None
    direction: DirectionName | None = None
    magnitude: MagnitudeName | None = None

    # Free-form text; empty by default.
    reasoning: str = ""
class StellaratorObservation(Observation):
    """Observation model extending the OpenEnv ``Observation`` base class.

    Every field declared here has a default, so none is required at
    construction (the base class may add its own fields).
    """

    diagnostics_text: str = ""

    # Float metrics; each defaults to 0.0.
    max_elongation: float = 0.0
    aspect_ratio: float = 0.0
    average_triangularity: float = 0.0
    edge_iota_over_nfp: float = 0.0

    # Per-constraint violation values and the dominant constraint label.
    aspect_ratio_violation: float = 0.0
    triangularity_violation: float = 0.0
    iota_violation: float = 0.0
    dominant_constraint: ConstraintName = "none"

    # Score, feasibility and vacuum-well values; each defaults to 0.0.
    p1_score: float = 0.0
    p1_feasibility: float = 0.0
    vacuum_well: float = 0.0

    # Evaluation status.
    evaluation_fidelity: EvaluationFidelityName = "low"
    evaluation_failed: bool = False
    failure_reason: str = ""

    # Episode counters; ``budget_remaining`` starts at 6.
    step_number: int = 0
    budget_remaining: int = 6
    no_progress_steps: int = 0

    # Best-so-far values.
    # NOTE(review): float("inf") has no strict-JSON representation -- confirm
    # how this default is handled wherever observations are serialized.
    best_low_fidelity_score: float = 0.0
    best_low_fidelity_feasibility: float = float("inf")
    constraints_satisfied: bool = True

    target_spec: str = ""

    # Nested records; each instance gets its own object via the factories.
    reward_breakdown: RewardBreakdown = Field(
        default_factory=default_reward_breakdown,
    )
    action_monitor: ActionMonitor = Field(
        default_factory=default_action_monitor,
    )

    episode_total_reward: float = 0.0
    trajectory_summary: str = ""
class StellaratorState(State):
    """State model extending the OpenEnv ``State`` base class.

    Every field declared here has a default. The three parameter fields and
    the two list fields use ``default_factory``, so each instance receives its
    own objects rather than sharing them across instances.
    """

    # Parameter sets; each is built by its own factory call.
    initial_params: LowDimBoundaryParams = Field(
        default_factory=default_low_dim_boundary_params,
    )
    current_params: LowDimBoundaryParams = Field(
        default_factory=default_low_dim_boundary_params,
    )
    best_params: LowDimBoundaryParams = Field(
        default_factory=default_low_dim_boundary_params,
    )

    # Low-fidelity score / feasibility values.
    initial_low_fidelity_score: float = 0.0
    best_low_fidelity_score: float = 0.0
    best_low_fidelity_feasibility: float = float("inf")

    # Budget counters; both default to 6.
    budget_total: int = 6
    budget_remaining: int = 6

    # Episode status and running totals.
    episode_done: bool = False
    constraints_satisfied: bool = True
    total_reward: float = 0.0
    no_progress_steps: int = 0

    # Per-instance string lists, empty by default.
    visited_state_keys: list[str] = Field(default_factory=list)
    history: list[str] = Field(default_factory=list)