ajaxwin committed on
Commit
dccaaac
·
1 Parent(s): c6002b4

refactor: Update task configurations and grading logic for improved scoring and consistency

Browse files
README.md CHANGED
@@ -357,18 +357,18 @@ tasks:
357
  - id: task1_vuln_detection
358
  name: Targeted Vulnerability Detection
359
  difficulty: medium
360
- max_steps: 15
361
- max_score: 5.0
362
  - id: task2_property_discovery
363
  name: Property Discovery
364
  difficulty: hard
365
- max_steps: 10
366
- max_score: 5.0
367
  - id: task3_rule_checker
368
  name: Rule Checker
369
  difficulty: easy
370
- max_steps: 10
371
- max_score: 5.0
372
  observation_schema: models/observation.py
373
  action_schema: models/action.py
374
  app_port: 7860
 
357
  - id: task1_vuln_detection
358
  name: Targeted Vulnerability Detection
359
  difficulty: medium
360
+ max_steps: 40
361
+ max_score: 1.0
362
  - id: task2_property_discovery
363
  name: Property Discovery
364
  difficulty: hard
365
+ max_steps: 40
366
+ max_score: 1.0
367
  - id: task3_rule_checker
368
  name: Rule Checker
369
  difficulty: easy
370
+ max_steps: 20
371
+ max_score: 1.0
372
  observation_schema: models/observation.py
373
  action_schema: models/action.py
374
  app_port: 7860
inference.py CHANGED
@@ -41,11 +41,14 @@ from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
41
 
42
  load_dotenv()
43
  API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
44
- MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o")
45
  HF_TOKEN = os.getenv("HF_TOKEN", "")
46
 
47
  if not HF_TOKEN:
48
  raise RuntimeError("HF_TOKEN environment variable not set")
 
 
 
49
 
50
  client = AsyncOpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
51
 
@@ -129,6 +132,9 @@ def log_end( success: bool, steps: int, score: float, rewards: List[float]) -> N
129
  flush=True,
130
  )
131
 
 
 
 
132
  # ─────────────────────────────────────────────────────────────────────────────
133
  # Generic episode runner
134
  # ─────────────────────────────────────────────────────────────────────────────
@@ -189,7 +195,6 @@ async def run_episode(
189
 
190
  step_rewards.append(r_val)
191
  steps_taken = step
192
- print(raw, at.value, r_val)
193
  log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
194
 
195
  if done:
@@ -205,7 +210,7 @@ async def run_episode(
205
  result_dict = {
206
  "episode": ep_num,
207
  "seed": seed,
208
- "grader_score": grader_score,
209
  "contract": obs.get("contract_name", ""),
210
  }
211
  if extra_fields:
@@ -280,14 +285,14 @@ async def run_task(
280
  episodes = await asyncio.gather(*tasks)
281
  avg_score = sum(e["grader_score"] for e in episodes) / num_episodes
282
 
283
- print(f"\n Avg grader score : {avg_score:.4f}", flush=True)
284
  return {
285
  "task_id": task_id,
286
  "name": task_name,
287
  "status": "active",
288
  "num_episodes": num_episodes,
289
  "episodes": episodes,
290
- "avg_grader_score": avg_score,
291
  }
292
 
293
  # ─────────────────────────────────────────────────────────────────────────────
@@ -351,7 +356,7 @@ async def main() -> None:
351
  print("BASELINE SUMMARY", flush=True)
352
  print("=" * 60, flush=True)
353
  for t in results["tasks"]:
354
- print(f" ✅ {t['name']:40s}: {t['avg_grader_score']:.3f}", flush=True)
355
  print(f"\n Overall avg grader score: {overall:.4f}", flush=True)
356
 
357
  with open("baseline_scores.json", "w") as f:
 
41
 
42
  load_dotenv()
43
  API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
44
+ MODEL_NAME = os.getenv("MODEL_NAME", "")
45
  HF_TOKEN = os.getenv("HF_TOKEN", "")
46
 
47
  if not HF_TOKEN:
48
  raise RuntimeError("HF_TOKEN environment variable not set")
49
+
50
+ if not MODEL_NAME:
51
+ raise RuntimeError("MODEL_NAME not set")
52
 
53
  client = AsyncOpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
54
 
 
132
  flush=True,
133
  )
134
 
135
+ def _clamp(reward: float) -> float:
136
+ return max(0.001, min(0.999, reward))
137
+
138
  # ─────────────────────────────────────────────────────────────────────────────
139
  # Generic episode runner
140
  # ─────────────────────────────────────────────────────────────────────────────
 
195
 
196
  step_rewards.append(r_val)
197
  steps_taken = step
 
198
  log_step(step=step, action=at.value, reward=r_val, done=done, error=error_msg)
199
 
200
  if done:
 
210
  result_dict = {
211
  "episode": ep_num,
212
  "seed": seed,
213
+ "grader_score": _clamp(grader_score),
214
  "contract": obs.get("contract_name", ""),
215
  }
216
  if extra_fields:
 
285
  episodes = await asyncio.gather(*tasks)
286
  avg_score = sum(e["grader_score"] for e in episodes) / num_episodes
287
 
288
+ print(f"\n Avg grader score : {_clamp(avg_score):.4f}", flush=True)
289
  return {
290
  "task_id": task_id,
291
  "name": task_name,
292
  "status": "active",
293
  "num_episodes": num_episodes,
294
  "episodes": episodes,
295
+ "avg_grader_score": _clamp(avg_score),
296
  }
297
 
298
  # ─────────────────────────────────────────────────────────────────────────────
 
356
  print("BASELINE SUMMARY", flush=True)
357
  print("=" * 60, flush=True)
358
  for t in results["tasks"]:
359
+ print(f" ✅ {t['name']:40s}: {_clamp(t['avg_grader_score']):.3f}", flush=True)
360
  print(f"\n Overall avg grader score: {overall:.4f}", flush=True)
361
 
362
  with open("baseline_scores.json", "w") as f:
openenv.yaml CHANGED
@@ -16,10 +16,10 @@ tasks:
16
  description: >
17
  Given a Solidity contract (4-6 functions), identify the single vulnerable
18
  function and describe its vulnerability type in 2-3 words.
19
- max_steps: 20
20
- reward_range: [-10.0, 10.0]
21
  grader: tasks/task1/grader.py
22
- grader_score_range: [0.0, 1.0]
23
 
24
  - id: task2_property_discovery
25
  name: Property Discovery
@@ -28,8 +28,8 @@ tasks:
28
  description: >
29
  Given a single Solidity function with known properties, discover the
30
  correct natural-language postcondition describing its correct behaviour.
31
- max_steps: 15
32
- reward_range: [-5.0, 5.0]
33
  grader: tasks/task2/grader.py
34
  grader_score_range: [0.0, 1.0]
35
 
@@ -40,8 +40,8 @@ tasks:
40
  description: >
41
  Given a natural-language property and a Solidity contract, identify the
42
  function that violates that property. Partial credit for internal subfunctions.
43
- max_steps: 15
44
- reward_range: [-5.0, 5.0]
45
  grader: tasks/task3/grader.py
46
  grader_score_range: [0.0, 1.0]
47
 
@@ -60,30 +60,80 @@ observation_space:
60
  extra: {type: object}
61
 
62
  action_space:
 
 
 
 
 
 
63
  task1:
64
- list_functions: {params: {}, reward: -0.05}
65
- get_function_code: {params: {function_name: string}, reward: "+0.05 / -0.10"}
66
- get_function_summary: {params: {function_name: string}, reward: "+0.03 / -0.05"}
67
- get_file_metadata: {params: {}, reward: -0.04}
68
- get_state_variable: {params: {variable_name: "string opt"}, reward: -0.05}
69
- get_call_graph: {params: {}, reward: -0.08}
70
- submit: {params: {function_name: string, vulnerability_type: string}, reward: "+5.0 / +1.0 / -1.5"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  task2:
72
- get_function_code: {params: {}, reward: -0.06}
73
- get_function_natspec: {params: {}, reward: -0.08}
74
- get_file_natspec: {params: {}, reward: -0.03}
75
- get_related_functions: {params: {}, reward: -0.06}
76
- get_io: {params: {}, reward: -0.04}
77
- get_similar_rule: {params: {}, reward: -0.20}
78
- submit_property: {params: {property: string}, reward: "0.0-5.0 keyword-weighted, one attempt"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  task3:
80
- list_functions: {params: {}, reward: -0.05}
81
- get_function_metadata: {params: {function_name: string}, reward: -0.05}
82
- get_function_code: {params: {function_name: string}, reward: -0.10}
83
- get_state_variable: {params: {variable_name: "string opt"}, reward: -0.05}
84
- get_call_graph: {params: {}, reward: -0.08}
85
- get_formalized_property: {params: {}, reward: -0.03}
86
- submit_function: {params: {function_name: string}, reward: "+5.0 / +1.5 / -1.5, one attempt"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  reward:
89
  type: shaped
@@ -95,16 +145,11 @@ reward:
95
  get_function_summary_correct: +0.03
96
  get_function_summary_wrong: -0.05
97
  task1_terminal:
98
- correct: +5.0
99
- partial: +1.0
100
- wrong: -1.5
101
  task2_terminal:
102
- formula: "score * 5.0 where score = 0.70*(key_matches/key_total) + 0.30*(bonus_matches/bonus_total)"
103
- range: [0.0, 5.0]
104
  task3_terminal:
105
- correct_function: +5.0
106
- subfunction: +1.5
107
- wrong_function: -1.5
108
 
109
  data:
110
  source: "Certora audited DeFi projects"
 
16
  description: >
17
  Given a Solidity contract (4-6 functions), identify the single vulnerable
18
  function and describe its vulnerability type in 2-3 words.
19
+ max_steps: 40
20
+ reward_range: [0, 1]
21
  grader: tasks/task1/grader.py
22
+ grader_score_range: [0, 1]
23
 
24
  - id: task2_property_discovery
25
  name: Property Discovery
 
28
  description: >
29
  Given a single Solidity function with known properties, discover the
30
  correct natural-language postcondition describing its correct behaviour.
31
+ max_steps: 30
32
+ reward_range: [0, 1]
33
  grader: tasks/task2/grader.py
34
  grader_score_range: [0.0, 1.0]
35
 
 
40
  description: >
41
  Given a natural-language property and a Solidity contract, identify the
42
  function that violates that property. Partial credit for internal subfunctions.
43
+ max_steps: 20
44
+ reward_range: [0, 1]
45
  grader: tasks/task3/grader.py
46
  grader_score_range: [0.0, 1.0]
47
 
 
60
  extra: {type: object}
61
 
62
  action_space:
63
+ # General actions applicable across all tasks
64
+ general:
65
+ unknown: {reward: 0.0} # UNKNOWN action cost
66
+ repeated: {reward: -0.22} # REPEATED action cost
67
+ resubmit: {reward: 0.0} # RESUBMIT action cost
68
+
69
  task1:
70
+ list_functions:
71
+ params: {}
72
+ reward: -0.04
73
+ get_function_code:
74
+ params: {function_name: string}
75
+ reward: -0.14
76
+ get_function_summary:
77
+ params: {function_name: string}
78
+ reward: -0.07
79
+ get_file_metadata:
80
+ params: {}
81
+ reward: -0.02
82
+ get_state_variable:
83
+ params: {variable_name: "string opt"}
84
+ reward: -0.06
85
+ get_call_graph:
86
+ params: {}
87
+ reward: -0.08
88
+ submit:
89
+ params: {function_name: string, vulnerability_type: string}
90
+ reward: 0.0 # terminal reward handled by grader
91
+
92
  task2:
93
+ get_function_code:
94
+ params: {}
95
+ reward: -0.14
96
+ get_function_natspec:
97
+ params: {}
98
+ reward: -0.08
99
+ get_file_natspec:
100
+ params: {}
101
+ reward: 0.05
102
+ get_related_functions:
103
+ params: {}
104
+ reward: 0.07
105
+ get_signature:
106
+ params: {}
107
+ reward: 0.04
108
+ get_similar_rule:
109
+ params: {}
110
+ reward: 0.15
111
+ submit_property:
112
+ params: {property: string}
113
+ reward: 0.0 # terminal reward handled by grader
114
+
115
  task3:
116
+ list_functions:
117
+ params: {}
118
+ reward: -0.04
119
+ get_function_metadata:
120
+ params: {function_name: string}
121
+ reward: 0.04
122
+ get_function_code:
123
+ params: {function_name: string}
124
+ reward: -0.14
125
+ get_state_variable:
126
+ params: {variable_name: "string opt"}
127
+ reward: -0.06
128
+ get_call_graph:
129
+ params: {}
130
+ reward: -0.08
131
+ get_property_specification: # replaces get_formalized_property
132
+ params: {}
133
+ reward: 0.02
134
+ submit_function:
135
+ params: {function_name: string}
136
+ reward: 0.0 # terminal reward handled by grader
137
 
138
  reward:
139
  type: shaped
 
145
  get_function_summary_correct: +0.03
146
  get_function_summary_wrong: -0.05
147
  task1_terminal:
148
+ range: [0.0, 1.0]
 
 
149
  task2_terminal:
150
+ range: [0.0, 1.0]
 
151
  task3_terminal:
152
+ range: [0.0, 1.0]
 
 
153
 
154
  data:
155
  source: "Certora audited DeFi projects"
server/tasks/task1/grader.py CHANGED
@@ -27,7 +27,7 @@ class Task1Grader:
27
 
28
  # Score formula
29
  free_budget = (cummulative_cost / steps) * (self.n + 2)
30
- reward = func_match * issue_match * (self._decay ** max(0, cummulative_cost - free_budget))
31
  return self._clamp(reward)
32
 
33
  def get_canonical_answer(self) -> Dict[str, str]:
 
27
 
28
  # Score formula
29
  free_budget = (cummulative_cost / steps) * (self.n + 2)
30
+ reward = (func_match * 0.5) + (issue_match * 0.5) + (self._decay ** max(0, cummulative_cost - free_budget))
31
  return self._clamp(reward)
32
 
33
  def get_canonical_answer(self) -> Dict[str, str]:
server/tasks/task2/grader.py CHANGED
@@ -36,6 +36,6 @@ class Task2Grader:
36
  matcher = SemanticMatcher()
37
  match_score = matcher.matchscore(self.property, submitted)
38
  free_budget = (cummulative_cost / steps) * (self.n + 2)
39
- final_score = match_score * (self._decay ** max(0, cummulative_cost - free_budget))
40
 
41
  return self._clamp(final_score), matcher.confidence()
 
36
  matcher = SemanticMatcher()
37
  match_score = matcher.matchscore(self.property, submitted)
38
  free_budget = (cummulative_cost / steps) * (self.n + 2)
39
+ final_score = (match_score * 0.5) + (self._decay ** max(0, cummulative_cost - free_budget))
40
 
41
  return self._clamp(final_score), matcher.confidence()