# Falgunisharma's picture
# Improve environment depth: investigate action, cascading deps, rich observations, harder hard task
# 5f6895d
"""Pydantic models for the Feature Flag Cleanup environment."""
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field, ConfigDict
class FlagAction(BaseModel):
    """A single decision the agent submits for a feature flag.

    Attributes:
        action: The chosen verb. Validated against a pattern that admits
            ``remove``, ``keep``, ``deprecate``, ``escalate`` and
            ``investigate``.
        reasoning: Free-text justification; an empty string when omitted.
        metadata: Arbitrary extra key/value data; an empty dict when omitted.

    The model rejects unknown fields (``extra="forbid"``) and re-validates
    values on attribute assignment (``validate_assignment=True``).
    """

    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    # Required: there is no default action, the agent must always pick one.
    action: str = Field(
        ...,
        pattern="^(remove|keep|deprecate|escalate|investigate)$",
        description="One of: remove, keep, deprecate, escalate, investigate",
    )
    reasoning: str = Field(
        description="Optional reasoning for the action",
        default="",
    )
    # default_factory gives every instance its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
class FlagObservation(BaseModel):
    """Everything the agent is shown about one feature flag.

    The fields fall into a few groups: the flag's identity and rollout
    statistics, ownership, dependency and risk signals, episode progress,
    richer textual context, cascading context from earlier decisions, and
    step bookkeeping (``done``, ``reward``, ``metadata``).

    Unknown fields are rejected (``extra="forbid"``) and values are
    re-validated on attribute assignment (``validate_assignment=True``).
    """

    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    # --- Identity and rollout statistics (all required) ---
    flag_name: str = Field(..., description="Name of the feature flag")
    description: str = Field(..., description="What the flag controls")
    # A fraction in [0.0, 1.0], not a 0-100 percentage.
    rollout_percentage: float = Field(
        ...,
        description="Current rollout 0.0-1.0",
        ge=0.0,
        le=1.0,
    )
    age_days: int = Field(
        ...,
        description="Days since flag was created",
        ge=0,
    )
    last_modified_days: int = Field(
        ...,
        description="Days since last modification",
        ge=0,
    )

    # --- Ownership (required) ---
    owner: str = Field(..., description="Team or person who owns the flag")
    owner_active: bool = Field(..., description="Whether the owner is still active")

    # --- Code usage and dependencies ---
    num_code_references: int = Field(
        ...,
        description="Number of code references",
        ge=0,
    )
    has_dependencies: bool = Field(
        ...,
        description="Whether other flags depend on this one",
    )
    dependent_flags: List[str] = Field(
        description="Flags depending on this one",
        default_factory=list,
    )

    # --- Risk signals (optional; default to the "nothing unusual" value) ---
    is_kill_switch: bool = Field(
        description="Emergency kill switch flag",
        default=False,
    )
    has_active_incident: bool = Field(
        description="Active incident involving this flag",
        default=False,
    )
    usage_last_30d: int = Field(
        description="Flag evaluations in last 30 days",
        default=0,
        ge=0,
    )
    in_active_experiment: bool = Field(
        description="Part of a running A/B test",
        default=False,
    )
    services: List[str] = Field(
        description="Services referencing this flag",
        default_factory=list,
    )

    # --- Episode progress (required) ---
    task_id: str = Field(..., description="Current task identifier")
    flags_remaining: int = Field(
        ...,
        description="Flags left to process",
        ge=0,
    )

    # --- Rich context: free-text evidence about how the flag is used ---
    code_snippet: str = Field(
        description="Code snippet showing how the flag is used",
        default="",
    )
    last_commit_message: str = Field(
        description="Last git commit that modified this flag",
        default="",
    )
    pr_context: str = Field(
        description="Context from the PR that introduced this flag",
        default="",
    )
    related_incidents: List[str] = Field(
        description="Past incident IDs involving this flag",
        default_factory=list,
    )
    investigation_notes: str = Field(
        description="Notes revealed by investigate action",
        default="",
    )

    # --- Cascading context: effects of decisions made earlier in the episode ---
    previously_removed: List[str] = Field(
        description="Flags already removed this episode",
        default_factory=list,
    )
    cascade_warning: str = Field(
        description="Warning if a dependency was removed earlier",
        default="",
    )

    # --- Step bookkeeping ---
    # NOTE(review): these look like the usual episode-finished flag and
    # per-step reward, but nothing here shows who sets them; confirm
    # against the environment code.
    done: bool = Field(default=False)
    reward: Optional[float] = Field(default=None)
    metadata: Dict[str, Any] = Field(default_factory=dict)
class FlagState(BaseModel):
    """Bookkeeping the environment keeps about the current episode.

    Unlike the action and observation models, this one accepts extra,
    undeclared fields (``extra="allow"``). Values are still re-validated on
    attribute assignment (``validate_assignment=True``).

    Every field has a default, so ``FlagState()`` builds an empty state with
    ``task_id`` set to ``"easy"``.
    """

    model_config = ConfigDict(validate_assignment=True, extra="allow")

    # Episode identity and task selection.
    episode_id: Optional[str] = Field(default=None)
    step_count: int = Field(default=0, ge=0)
    task_id: str = Field(default="easy")

    # Position within the episode's flags. The names suggest an index and a
    # total count; both are constrained to be non-negative.
    current_flag_index: int = Field(default=0, ge=0)
    total_flags: int = Field(default=0, ge=0)

    # Running totals; cumulative_reward has no bounds, so it may go negative.
    cumulative_reward: float = Field(default=0.0)
    flags_processed: List[str] = Field(default_factory=list)
    flags_removed: List[str] = Field(default_factory=list)
    investigation_count: int = Field(default=0, ge=0)