File size: 6,255 Bytes
5dd1bb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
{
  "$schema": "autocode-verification-input-v1",
  "feature_id": "F005",
  "spec_path": "specs/F005-IMPLEMENTATION_SPEC.md",
  "generated": "2026-03-27T12:00:00Z",
  "verification_mode": "mvp",

  "overview": {
    "summary": "Automated evaluation wrapper that runs N episodes with a given policy against SQLEnvironment and returns structured metrics (success_rate, avg_reward, avg_steps). Includes a built-in RandomPolicy for instant baseline comparison. Results are collected incrementally so partial failures do not lose completed episode data.",
    "goal": "Enable single-command evaluation: 'How does policy X perform over 100 episodes?' with structured output for training comparison (random vs trained)."
  },

  "interfaces": {
    "types": [
      {
        "name": "Policy",
        "description": "Protocol (structural subtype) for any evaluation policy. Any object with a matching select_action method satisfies this interface.",
        "fields": [
          {"name": "select_action", "type": "(observation: SQLObservation) -> SQLAction", "description": "Choose an action given the current observation"}
        ]
      },
      {
        "name": "EpisodeResult",
        "description": "Per-episode evaluation metrics. Frozen dataclass.",
        "fields": [
          {"name": "episode_index", "type": "int", "description": "0-based episode number"},
          {"name": "correct", "type": "bool", "description": "Whether the ANSWER action matched the gold answer"},
          {"name": "total_reward", "type": "float", "description": "Cumulative reward for the episode"},
          {"name": "steps", "type": "int", "description": "Number of steps taken in the episode"},
          {"name": "error", "type": "str | None", "optional": true, "description": "Error message if episode failed, None otherwise"}
        ]
      },
      {
        "name": "EvaluationResult",
        "description": "Aggregate evaluation metrics with per-episode breakdown. Frozen dataclass.",
        "fields": [
          {"name": "success_rate", "type": "float", "description": "Fraction of correct episodes in [0.0, 1.0]"},
          {"name": "avg_reward", "type": "float", "description": "Mean total_reward across completed episodes"},
          {"name": "avg_steps", "type": "float", "description": "Mean steps across completed episodes"},
          {"name": "n_episodes", "type": "int", "description": "Total number of episodes attempted"},
          {"name": "n_completed", "type": "int", "description": "Episodes that completed without error"},
          {"name": "episodes", "type": "list[EpisodeResult]", "description": "Per-episode breakdown for analysis"}
        ]
      }
    ],
    "functions": [
      {
        "name": "RandomPolicy.__init__",
        "params": [
          {"name": "seed", "type": "int | None", "default": "None", "description": "Random seed for reproducibility"}
        ],
        "returns": "None",
        "description": "Initialize random baseline policy. Deterministic given a seed."
      },
      {
        "name": "RandomPolicy.select_action",
        "params": [
          {"name": "observation", "type": "SQLObservation", "description": "Current environment observation"}
        ],
        "returns": "SQLAction",
        "description": "Pick a random action. If budget_remaining > 1: randomly choose DESCRIBE, SAMPLE, or QUERY. If budget_remaining == 1: ANSWER with a random guess."
      },
      {
        "name": "evaluate",
        "params": [
          {"name": "env", "type": "SQLEnvironment", "description": "The environment to evaluate against"},
          {"name": "policy", "type": "Policy", "description": "Any object satisfying the Policy protocol"},
          {"name": "n_episodes", "type": "int", "default": "100", "description": "Number of episodes to run"},
          {"name": "seed", "type": "int | None", "default": "None", "description": "Base seed for reproducibility; episode i uses seed+i"},
          {"name": "progress_callback", "type": "Callable[[int, int], None] | None", "default": "None", "description": "Optional callback(current, total) for progress reporting"}
        ],
        "returns": "EvaluationResult",
        "raises": ["ValueError"],
        "description": "Run automated evaluation of a policy over multiple episodes. Collects results incrementally -- failed episodes are recorded and evaluation continues."
      }
    ],
    "api_endpoints": []
  },

  "data_flow": {
    "primary_flow": [
      "evaluate() called with env, policy, n_episodes, optional seed",
      "For each episode: env.reset(seed=base_seed+i) returns initial SQLObservation",
      "Loop: policy.select_action(obs) -> SQLAction, then env.step(action) -> SQLObservation, accumulate reward",
      "Episode ends when obs.done is True; record EpisodeResult with correct/reward/steps",
      "Aggregate all EpisodeResults into EvaluationResult with success_rate, avg_reward, avg_steps"
    ],
    "alternative_flows": [
      {
        "condition": "n_episodes is 0",
        "steps": ["Return EvaluationResult with all zeros and empty episodes list"]
      },
      {
        "condition": "Exception during episode (reset, select_action, or step fails)",
        "steps": [
          "Catch exception",
          "Record EpisodeResult with correct=False, total_reward=0.0, steps=0, error=str(exc)",
          "Continue to next episode"
        ]
      }
    ]
  },

  "error_handling": {
    "error_types": [
      {
        "name": "ValueError",
        "when": "n_episodes < 0",
        "handling": "Raise immediately before starting evaluation"
      },
      {
        "name": "Exception (per-episode)",
        "when": "Any exception during env.reset(), policy.select_action(), or env.step()",
        "handling": "Catch, record as failed EpisodeResult with error field, continue to next episode"
      }
    ],
    "retry_strategy": null
  },

  "dependencies": {
    "external": [],
    "internal": [
      {"name": "models.SQLAction", "usage": "Action type returned by policies"},
      {"name": "models.SQLObservation", "usage": "Observation type passed to policies"},
      {"name": "server.sql_environment.SQLEnvironment", "usage": "Environment with reset() and step() methods"}
    ]
  }
}