Mohammed-Altaf committed on
Commit
38ee4ab
·
1 Parent(s): d7b24b0

fixed scoring and logging for final submission

Browse files
helpers/logging.py CHANGED
@@ -1,19 +1,33 @@
1
- from typing import List, Optional, Union
2
 
3
 
4
  def log_start(task: str, env: str, model: str) -> None:
 
5
  print(f"[START] task={task} env={env} model={model}", flush=True)
6
 
7
 
8
- def log_step(step: int, action: Union[dict, str], reward: float, done: bool, error: Optional[str]) -> None:
9
- error_val = error if error else "null"
 
 
 
 
 
 
 
 
 
10
  done_val = str(done).lower()
11
- print(
12
- f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
13
- flush=True,
14
- )
15
 
 
 
16
 
17
- def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
 
 
 
 
18
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
19
- print(f"[END] success={str(success).lower()} steps={steps} score={int(score)} rewards={rewards_str}\n", flush=True)
 
1
+ from typing import List, Optional
2
 
3
 
4
  def log_start(task: str, env: str, model: str) -> None:
5
+ """Emit the [START] line at episode begin."""
6
  print(f"[START] task={task} env={env} model={model}", flush=True)
7
 
8
 
9
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
10
+ """Emit one [STEP] line immediately after env.step() returns.
11
+
12
+ Args:
13
+ step: 1-based step number.
14
+ action: Compact single-line action label (e.g. 'execute_code').
15
+ reward: Step reward, formatted to 2 decimal places.
16
+ done: Whether the episode ended after this step.
17
+ error: Raw error string from the env, or None.
18
+ """
19
+ error_val = error.replace("\n", " ") if error else "null"
20
  done_val = str(done).lower()
21
+ print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
22
+
 
 
23
 
24
+ def log_end(success: bool, steps: int, rewards: List[float]) -> None:
25
+ """Emit the [END] line after env.close(), always emitted even on exception.
26
 
27
+ Args:
28
+ success: Whether the episode was successful.
29
+ steps: Total number of steps taken.
30
+ rewards: List of per-step rewards, each formatted to 2 decimal places.
31
+ """
32
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
33
+ print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)
inference.py CHANGED
@@ -76,14 +76,14 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
76
  done = exec_result.done
77
  except Exception as exc:
78
  print(f"[DEBUG] env step failed: {exc}", flush=True)
79
- log_step(step=step + 1, action=action, reward=0.0, done=False, error=str(exc))
80
  rewards.append(0.0)
81
  continue
82
 
83
  rewards.append(reward)
84
  error = exec_obs.error if not exec_obs.success else None
85
  result_text = f"Output: {exec_obs.output}" if not exec_obs.error else f"Error: {exec_obs.error}"
86
- log_step(step=step + 1, action=action, reward=reward, done=done, error=error)
87
 
88
  messages.append({"role": "assistant", "content": response_text})
89
  messages.append({"role": "user", "content": [{"type": "text", "text": result_text}]})
@@ -97,13 +97,14 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
97
  score = float(submit_obs.metadata.get("score", 0.0) if submit_obs.metadata else submit_result.reward)
98
  except Exception as exc:
99
  print(f"[DEBUG] env step failed: {exc}", flush=True)
100
- log_step(step=step + 1, action=action, reward=0.0, done=True, error=str(exc))
101
- log_end(success=False, steps=step + 1, score=0.0, rewards=rewards)
102
  return 0.0
103
 
 
104
  rewards.append(score)
105
- log_step(step=step + 1, action=action, reward=score, done=True, error=None)
106
- log_end(success=score > 0.0, steps=step + 1, score=score, rewards=rewards)
107
  return score
108
 
109
  else:
@@ -127,7 +128,7 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
127
  }
128
  )
129
 
130
- log_end(success=False, steps=MAX_STEPS, score=0.0, rewards=rewards)
131
  return 0.0
132
 
133
 
 
76
  done = exec_result.done
77
  except Exception as exc:
78
  print(f"[DEBUG] env step failed: {exc}", flush=True)
79
+ log_step(step=step + 1, action=action_type, reward=0.0, done=False, error=str(exc))
80
  rewards.append(0.0)
81
  continue
82
 
83
  rewards.append(reward)
84
  error = exec_obs.error if not exec_obs.success else None
85
  result_text = f"Output: {exec_obs.output}" if not exec_obs.error else f"Error: {exec_obs.error}"
86
+ log_step(step=step + 1, action=action_type, reward=reward, done=done, error=error)
87
 
88
  messages.append({"role": "assistant", "content": response_text})
89
  messages.append({"role": "user", "content": [{"type": "text", "text": result_text}]})
 
97
  score = float(submit_obs.metadata.get("score", 0.0) if submit_obs.metadata else submit_result.reward)
98
  except Exception as exc:
99
  print(f"[DEBUG] env step failed: {exc}", flush=True)
100
+ log_step(step=step + 1, action=action_type, reward=0.0, done=True, error=str(exc))
101
+ log_end(success=False, steps=step + 1, rewards=rewards)
102
  return 0.0
103
 
104
+ score = max(0.01, min(0.99, score))
105
  rewards.append(score)
106
+ log_step(step=step + 1, action=action_type, reward=score, done=True, error=None)
107
+ log_end(success=score > 0.01, steps=step + 1, rewards=rewards)
108
  return score
109
 
110
  else:
 
128
  }
129
  )
130
 
131
+ log_end(success=False, steps=MAX_STEPS, rewards=rewards)
132
  return 0.0
133
 
134
 
openenv.yaml CHANGED
@@ -6,3 +6,16 @@ type: space
6
  runtime: fastapi
7
  app: server.app:app
8
  port: 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  runtime: fastapi
7
  app: server.app:app
8
  port: 8000
9
+ tasks:
10
+ - id: task_easy
11
+ grader: tasks.task_easy:TopRevenueCategoryTask
12
+ - id: task_medium
13
+ grader: tasks.task_medium:CityRevenueShareTask
14
+ - id: task_medium_2
15
+ grader: tasks.task_medium_2:MonthlyRevenueRatioTask
16
+ - id: task_hard
17
+ grader: tasks.task_hard:RepeatCustomerCohortTask
18
+ - id: task_hard_2
19
+ grader: tasks.task_hard_2:CustomerLoyaltyRevenueTask
20
+ - id: task_hard_3
21
+ grader: tasks.task_hard_3:SupplierProfitabilityTask
tasks/task_easy.py CHANGED
@@ -49,4 +49,5 @@ class TopRevenueCategoryTask(BaseTask):
49
  """
50
  expected = self.expected_answer().strip().lower()
51
  submitted = answer.strip().lower()
52
- return 1.0 if expected in submitted else 0.0
 
 
49
  """
50
  expected = self.expected_answer().strip().lower()
51
  submitted = answer.strip().lower()
52
+ raw = 1.0 if expected in submitted else 0.0
53
+ return max(0.01, min(0.99, raw))
tasks/task_hard.py CHANGED
@@ -100,4 +100,4 @@ class RepeatCustomerCohortTask(BaseTask):
100
  except ValueError:
101
  pass
102
 
103
- return score
 
100
  except ValueError:
101
  pass
102
 
103
+ return max(0.01, min(0.99, score))
tasks/task_hard_2.py CHANGED
@@ -100,4 +100,4 @@ class CustomerLoyaltyRevenueTask(BaseTask):
100
  except ValueError:
101
  pass
102
 
103
- return score
 
100
  except ValueError:
101
  pass
102
 
103
+ return max(0.01, min(0.99, score))
tasks/task_hard_3.py CHANGED
@@ -104,4 +104,4 @@ class SupplierProfitabilityTask(BaseTask):
104
  except ValueError:
105
  pass
106
 
107
- return score
 
104
  except ValueError:
105
  pass
106
 
107
+ return max(0.01, min(0.99, score))
tasks/task_medium.py CHANGED
@@ -74,4 +74,4 @@ class CityRevenueShareTask(BaseTask):
74
  except ValueError:
75
  pass
76
 
77
- return score
 
74
  except ValueError:
75
  pass
76
 
77
+ return max(0.01, min(0.99, score))
tasks/task_medium_2.py CHANGED
@@ -85,4 +85,4 @@ class MonthlyRevenueRatioTask(BaseTask):
85
  except ValueError:
86
  pass
87
 
88
- return score
 
85
  except ValueError:
86
  pass
87
 
88
+ return max(0.01, min(0.99, score))