Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
server/rust_coder_environment.py
CHANGED
|
@@ -194,6 +194,7 @@ class RustCoderEnvironment(Environment):
|
|
| 194 |
problem.get("id"),
|
| 195 |
problem.get("title"),
|
| 196 |
)
|
|
|
|
| 197 |
done = False
|
| 198 |
return RustCoderObservation(
|
| 199 |
problem_description=problem["description"],
|
|
@@ -219,6 +220,13 @@ class RustCoderEnvironment(Environment):
|
|
| 219 |
# ── 1. Compilation (40%) ──────────────────────────────────────
|
| 220 |
compilation_success, compilation_output = self._compile_check(code)
|
| 221 |
r_compilation = 1.0 if compilation_success else 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
# ── 2. Correctness + Coverage (20% each) ─────────────────────
|
| 224 |
test_results: List[Dict] = []
|
|
@@ -242,12 +250,16 @@ class RustCoderEnvironment(Environment):
|
|
| 242 |
# Only score elegance for code that compiles; otherwise it can
|
| 243 |
# incorrectly award points for non-compiling submissions.
|
| 244 |
r_elegance = self._score_elegance(code) if compilation_success else 0.0
|
|
|
|
|
|
|
| 245 |
|
| 246 |
# ── 4. Efficiency (10%) ───────────────────────────────────────
|
| 247 |
baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
|
| 248 |
r_efficiency = 0.0
|
| 249 |
if compilation_success:
|
| 250 |
r_efficiency = self._score_efficiency(code, baseline_ms)
|
|
|
|
|
|
|
| 251 |
|
| 252 |
# ── Total reward ──────────────────────────────────────────────
|
| 253 |
reward_breakdown = {
|
|
@@ -272,6 +284,8 @@ class RustCoderEnvironment(Environment):
|
|
| 272 |
)
|
| 273 |
|
| 274 |
# ── Advance Logic ─────────────────────────────────────────────
|
|
|
|
|
|
|
| 275 |
self.current_problem_idx += 1
|
| 276 |
done = self.current_problem_idx >= len(self.problems)
|
| 277 |
|
|
|
|
| 194 |
problem.get("id"),
|
| 195 |
problem.get("title"),
|
| 196 |
)
|
| 197 |
+
# Episode is not finished; allow retry on same problem.
|
| 198 |
done = False
|
| 199 |
return RustCoderObservation(
|
| 200 |
problem_description=problem["description"],
|
|
|
|
| 220 |
# ── 1. Compilation (40%) ──────────────────────────────────────
|
| 221 |
compilation_success, compilation_output = self._compile_check(code)
|
| 222 |
r_compilation = 1.0 if compilation_success else 0.0
|
| 223 |
+
# Warnings are not compilation errors in Rust, but they indicate lower quality.
|
| 224 |
+
# Penalize compilation score slightly when warnings are present.
|
| 225 |
+
warning_count = 0
|
| 226 |
+
if compilation_output:
|
| 227 |
+
warning_count = len(re.findall(r'(?m)^warning:', compilation_output))
|
| 228 |
+
if compilation_success and warning_count > 0:
|
| 229 |
+
r_compilation = max(0.6, 1.0 - min(0.05 * warning_count, 0.4))
|
| 230 |
|
| 231 |
# ── 2. Correctness + Coverage (20% each) ─────────────────────
|
| 232 |
test_results: List[Dict] = []
|
|
|
|
| 250 |
# Only score elegance for code that compiles; otherwise it can
|
| 251 |
# incorrectly award points for non-compiling submissions.
|
| 252 |
r_elegance = self._score_elegance(code) if compilation_success else 0.0
|
| 253 |
+
if compilation_success and warning_count > 0:
|
| 254 |
+
r_elegance = max(0.0, round(r_elegance - min(0.02 * warning_count, 0.2), 4))
|
| 255 |
|
| 256 |
# ── 4. Efficiency (10%) ───────────────────────────────────────
|
| 257 |
baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
|
| 258 |
r_efficiency = 0.0
|
| 259 |
if compilation_success:
|
| 260 |
r_efficiency = self._score_efficiency(code, baseline_ms)
|
| 261 |
+
if warning_count > 0:
|
| 262 |
+
r_efficiency = max(0.0, round(r_efficiency - min(0.02 * warning_count, 0.2), 4))
|
| 263 |
|
| 264 |
# ── Total reward ──────────────────────────────────────────────
|
| 265 |
reward_breakdown = {
|
|
|
|
| 284 |
)
|
| 285 |
|
| 286 |
# ── Advance Logic ─────────────────────────────────────────────
|
| 287 |
+
# One step = one evaluated task. We advance to the next task, and the episode
|
| 288 |
+
# ends only after the final task has been evaluated.
|
| 289 |
self.current_problem_idx += 1
|
| 290 |
done = self.current_problem_idx >= len(self.problems)
|
| 291 |
|