File size: 20,995 Bytes
f15113b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
"""

Task definitions and graders for the Methanol APC Environment.



Four tasks with increasing difficulty, each with a deterministic grader

that returns a score in [0.0, 1.0].



Tasks

-----

1. startup          (Easy)   β€” Ramp reactor from idle to operating temperature

2. optimization     (Medium) β€” Maximize profit at steady state

3. disturbance_rejection (Hard) β€” Handle cooling system failure

4. long_horizon_production (Expert) β€” Catalyst-aware marathon production

"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict, List

from .reactor_sim import ReactorState, EMERGENCY_SHUTDOWN_TEMP


# ---------------------------------------------------------------------------
# Base task
# ---------------------------------------------------------------------------
@dataclass
class TaskConfig:
    """Static description of one scenario.

    ``name`` and ``max_steps`` are required; every other field has a default,
    so a task only needs to spell out what differs from the defaults below.
    """

    # --- identity and horizon ---
    name: str
    max_steps: int

    # --- initial operating point ---
    initial_temperature: float = 250.0
    initial_pressure: float = 50.0
    initial_feed_h2: float = 4.0
    initial_feed_co: float = 2.0
    initial_cooling_flow: float = 50.0
    initial_cooling_temp: float = 25.0
    initial_compressor: float = 40.0
    initial_catalyst: float = 1.0

    # --- scheduled disturbances, shaped as {step: {field: value}} ---
    # A fresh dict is created per instance so configs never share a schedule.
    disturbances: Dict[int, Dict[str, float]] = field(default_factory=dict)

    # --- operating regime ---
    # One of "steady_state", "periodic" or "batch".
    operation_mode: str = "steady_state"
    # Periodic mode only: length of one demand cycle, in steps.
    demand_period: int = 50
    # Batch mode only: production target, in kg.
    batch_target_kg: float = 0.0


# ---------------------------------------------------------------------------
# Task definitions
# ---------------------------------------------------------------------------

# Operating point shared by every scenario that begins in normal production.
# Fields not listed here fall back to the TaskConfig defaults.
_NOMINAL_START = dict(
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
)

# Easy: start below operating temperature with zero feed.
STARTUP_TASK = TaskConfig(
    name="startup",
    max_steps=50,
    initial_temperature=150.0,
    initial_pressure=30.0,
    initial_feed_h2=0.0,
    initial_feed_co=0.0,
    initial_cooling_flow=20.0,
    initial_compressor=20.0,
)

# Medium: undisturbed run from the nominal operating point.
OPTIMIZATION_TASK = TaskConfig(name="optimization", max_steps=100, **_NOMINAL_START)

# Hard: at step 25 the cooling water temperature rises from 25 -> 45 degC
# (cooling tower failure — reduces cooling capacity, pushes toward runaway).
DISTURBANCE_TASK = TaskConfig(
    name="disturbance_rejection",
    max_steps=100,
    disturbances={25: {"cooling_water_temp": 45.0}},
    **_NOMINAL_START,
)

# Expert: batch-mode marathon with a 5000 kg methanol target.
LONG_HORIZON_TASK = TaskConfig(
    name="long_horizon_production",
    max_steps=500,
    operation_mode="batch",
    batch_target_kg=5000.0,
    **_NOMINAL_START,
)

# ---------------------------------------------------------------------------
# NEW TASKS — 8 additional scenarios for increased difficulty range
# ---------------------------------------------------------------------------

# Easy-Medium: Emergency recovery — start near shutdown, cool down safely.
EMERGENCY_RECOVERY_TASK = TaskConfig(
    name="emergency_recovery",
    max_steps=80,
    initial_temperature=290.0,  # dangerously close to 300C shutdown
    initial_pressure=70.0,
    initial_feed_h2=6.0,
    initial_feed_co=3.0,
    initial_cooling_flow=40.0,
    initial_compressor=60.0,
)

# Medium: Feed composition upset — H2/CO ratio shifts at step 30.
# Simulates an upstream reformer fluctuation; the agent must compensate by
# adjusting feed rates. The scheduled entry is a placeholder — the actual
# feed upset is handled in the env.
FEED_UPSET_TASK = TaskConfig(
    name="feed_composition_upset",
    max_steps=100,
    disturbances={30: {"cooling_water_temp": 25.0}},
    **_NOMINAL_START,
)

# Medium: Cost minimization — fixed production target, minimize opex.
COST_MINIMIZATION_TASK = TaskConfig(
    name="cost_minimization",
    max_steps=100,
    **_NOMINAL_START,
)

# Hard: Pressure loss — compressor drops 40% at step 20.
# The scheduled entry is a placeholder for the compressor output drop.
PRESSURE_LOSS_TASK = TaskConfig(
    name="pressure_loss",
    max_steps=100,
    disturbances={20: {"cooling_water_temp": 25.0}},
    **_NOMINAL_START,
)

# Hard: Day-night cycle — cooling water temperature oscillates, changing
# every 25 steps: 25->35->25->35->25->35.
DAY_NIGHT_TASK = TaskConfig(
    name="day_night_cycle",
    max_steps=150,
    disturbances={
        25: {"cooling_water_temp": 35.0},
        50: {"cooling_water_temp": 25.0},
        75: {"cooling_water_temp": 35.0},
        100: {"cooling_water_temp": 25.0},
        125: {"cooling_water_temp": 35.0},
    },
    operation_mode="periodic",
    demand_period=50,
    **_NOMINAL_START,
)

# Hard: Catalyst degradation — start with a severely aged catalyst.
AGED_CATALYST_TASK = TaskConfig(
    name="aged_catalyst",
    max_steps=100,
    initial_catalyst=0.4,
    **_NOMINAL_START,
)

# Expert: Multi-disturbance — cascading failures: cooling degrades at step 25,
# then gets worse at step 50.
MULTI_DISTURBANCE_TASK = TaskConfig(
    name="multi_disturbance",
    max_steps=150,
    disturbances={
        25: {"cooling_water_temp": 35.0},
        50: {"cooling_water_temp": 45.0},
    },
    **_NOMINAL_START,
)

# Expert: Maximum yield challenge — produce as much as possible in 200 steps.
MAX_YIELD_TASK = TaskConfig(name="maximum_yield", max_steps=200, **_NOMINAL_START)

# Registry of every task, keyed by its own ``name`` (original four first,
# then the eight additional scenarios, in definition order).
TASKS: Dict[str, TaskConfig] = {
    config.name: config
    for config in (
        STARTUP_TASK,
        OPTIMIZATION_TASK,
        DISTURBANCE_TASK,
        LONG_HORIZON_TASK,
        EMERGENCY_RECOVERY_TASK,
        FEED_UPSET_TASK,
        COST_MINIMIZATION_TASK,
        PRESSURE_LOSS_TASK,
        DAY_NIGHT_TASK,
        AGED_CATALYST_TASK,
        MULTI_DISTURBANCE_TASK,
        MAX_YIELD_TASK,
    )
}


# ---------------------------------------------------------------------------
# Graders — deterministic. Raw grade_* functions return a float in [0.0, 1.0];
# the wrapped versions registered in GRADERS are strictly inside (0.0, 1.0).
# ---------------------------------------------------------------------------

def _clamp_score(score: float) -> float:
    """Map score in [0, 1] to strictly (0, 1) using centered sigmoid.



    sigmoid(k*(x - 0.5)) centers the S-curve so that:

      0.0 -> ~0.02  (bad stays clearly bad)

      0.5 -> 0.50   (midpoint preserved)

      1.0 -> ~0.98  (good stays clearly good)

    k=10 gives wide spread; final affine scales to [0.01, 0.99].

    """
    import math
    mapped = 1.0 / (1.0 + math.exp(-10.0 * (score - 0.5)))
    return 0.01 + 0.98 * mapped  # scale to (0.01, 0.99)

def grade_startup(trajectory: List[ReactorState]) -> float:
    """Score a startup run against the 250 degC setpoint.

    Scoring rules, in order:
    - empty trajectory or any emergency shutdown: 0.0
    - never within 5 degC below the setpoint: up to 0.1, proportional to the
      peak temperature reached
    - peak more than 20 degC above the setpoint: flat 0.1
    - otherwise 1.0 minus the overshoot fraction, plus a 0.1 bonus when the
      final temperature is within 5 degC of the setpoint (capped at 1.0)
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.0

    setpoint = 250.0
    temps = [state.temperature for state in trajectory]
    peak = max(temps)

    # Partial credit only: the reactor never got close to the setpoint.
    if not any(t >= setpoint - 5.0 for t in temps):
        return 0.1 * min(peak / setpoint, 1.0)

    overshoot = max(0.0, peak - setpoint)
    if overshoot > 20.0:
        return 0.1

    score = 1.0 - overshoot / 20.0

    # Stability bonus for ending the run near the setpoint.
    if abs(temps[-1] - setpoint) < 5.0:
        score = min(1.0, score + 0.1)

    return max(0.0, min(1.0, score))


def grade_optimization(trajectory: List[ReactorState]) -> float:
    """Score the optimization task from the final cumulative profit.

    Without a shutdown the profit is mapped linearly from the baseline
    ($5, conservative operation over 100 steps) to the assumed maximum
    ($25, aggressive-but-safe operation) and clamped to [0, 1]. After a
    shutdown only 20% of the profit/maximum ratio is credited.
    """
    if not trajectory:
        return 0.0

    floor_profit, ceiling_profit = 5.0, 25.0
    earned = trajectory[-1].cumulative_profit

    if any(state.emergency_shutdown for state in trajectory):
        # Partial credit for whatever was earned before the trip.
        raw = 0.2 * max(0.0, earned / ceiling_profit)
    else:
        span = max(ceiling_profit - floor_profit, 1e-6)
        raw = (earned - floor_profit) / span

    return max(0.0, min(1.0, raw))


def grade_disturbance(
    trajectory: List[ReactorState],
    disturbance_step: int = 25,
    expected_production: float = 12.0,
) -> float:
    """Grade a disturbance-rejection run.

    50% of the score is for survival (no emergency shutdown) and 50% for
    production maintained once the disturbance has hit.

    Args:
        trajectory: Recorded states, in chronological order.
        disturbance_step: First ``time_step`` counted as post-disturbance.
            The default of 25 matches the ``disturbance_rejection`` task.
        expected_production: Methanol (kg) that earns the full production
            half of the score. The default of ~12 kg is the expected
            steady-state output over the 75 post-disturbance steps.

    Returns:
        A score in [0.0, 1.0]; 0.0 for an empty trajectory.
    """
    if not trajectory:
        return 0.0

    shutdown = any(s.emergency_shutdown for s in trajectory)
    survival_score = 0.0 if shutdown else 0.5

    post_disturbance = [s for s in trajectory if s.time_step >= disturbance_step]
    if not post_disturbance:
        return survival_score

    # Sum step-to-step gains in cumulative production; a drop between two
    # consecutive states contributes nothing rather than a negative amount.
    production_after = sum(
        max(0.0, later.methanol_produced - earlier.methanol_produced)
        for earlier, later in zip(post_disturbance, post_disturbance[1:])
    )

    yield_score = min(
        0.5, 0.5 * production_after / max(expected_production, 1e-6)
    )

    return max(0.0, min(1.0, survival_score + yield_score))


def grade_long_horizon(trajectory: List[ReactorState]) -> float:
    """Score the long-horizon production task.

    The goal is 50,000 kg of methanol, judged on the final state:
    - emergency shutdown at any point, or final catalyst health <= 0.01:
      at most 0.1, proportional to the fraction of the goal produced
    - goal reached: 1 - steps/500, clamped to [0.3, 1.0] (faster is better)
    - goal missed: at most 0.3, proportional to the fraction produced

    NOTE(review): LONG_HORIZON_TASK sets batch_target_kg=5000.0, a tenth of
    the target used here — confirm which figure is intended.
    """
    if not trajectory:
        return 0.0

    goal_kg = 50_000.0
    last = trajectory[-1]
    produced = last.methanol_produced
    fraction_done = min(produced / goal_kg, 1.0)

    tripped = any(state.emergency_shutdown for state in trajectory)
    if tripped or last.catalyst_health <= 0.01:
        # Shutdown and a destroyed catalyst carry the same heavy penalty.
        return 0.1 * fraction_done

    if produced >= goal_kg:
        speed_score = 1.0 - (last.time_step / 500.0)
        return max(0.3, min(1.0, speed_score))

    return 0.3 * fraction_done


def _clamped_grader(fn):
    """Wrap a grader to ensure score is strictly in (0, 1)."""
    def wrapper(trajectory):
        return _clamp_score(fn(trajectory))
    return wrapper


# Registry of clamped graders for the four core tasks, keyed by task name.
GRADERS = {
    task_name: _clamped_grader(raw_grader)
    for task_name, raw_grader in (
        ("startup", grade_startup),
        ("optimization", grade_optimization),
        ("disturbance_rejection", grade_disturbance),
        ("long_horizon_production", grade_long_horizon),
    )
}


# ---------------------------------------------------------------------------
# Graders for new tasks — reuse patterns from existing graders
# ---------------------------------------------------------------------------

def grade_emergency_recovery(trajectory: List[ReactorState]) -> float:
    """Score emergency recovery: cool down from 290C without a shutdown.

    An empty trajectory or any shutdown scores 0.0, and a final temperature
    above 270 scores a flat 0.2. Otherwise half the score comes from how
    close the final temperature is to 250 (reaching zero 40 degrees away)
    and half from final production relative to 200.
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.0

    last = trajectory[-1]
    end_temp = last.temperature
    if end_temp > 270:
        return 0.2  # still too hot

    closeness = max(0.0, 1.0 - abs(end_temp - 250.0) / 40.0)
    output_ratio = min(1.0, last.methanol_produced / 200.0)
    return 0.5 * closeness + 0.5 * output_ratio


def grade_feed_upset(trajectory: List[ReactorState]) -> float:
    """Score the feed-composition-upset task from final cumulative profit.

    An empty trajectory scores 0.0 and any shutdown a flat 0.1. Otherwise
    the score is profit / 20, clamped to [0, 1].
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.1

    profit_ratio = trajectory[-1].cumulative_profit / 20.0
    return min(1.0, max(0.0, profit_ratio))


def grade_cost_minimization(trajectory: List[ReactorState]) -> float:
    """Score cost minimization by profit per kg of methanol produced.

    An empty trajectory scores 0.0. A shutdown, or final production below
    10, scores a flat 0.1. Otherwise the score is (profit / production)
    relative to 0.5 — about $0.50/kg counts as excellent — clamped to [0, 1].
    """
    if not trajectory:
        return 0.0

    tripped = any(state.emergency_shutdown for state in trajectory)
    last = trajectory[-1]
    produced = last.methanol_produced
    if tripped or produced < 10.0:
        return 0.1

    profit_per_kg = last.cumulative_profit / max(produced, 1.0)
    return min(1.0, max(0.0, profit_per_kg / 0.5))


def grade_pressure_loss(trajectory: List[ReactorState]) -> float:
    """Score the pressure-loss task with the disturbance-rejection rules.

    Delegates entirely to ``grade_disturbance``.

    NOTE(review): the pressure_loss task schedules its disturbance at step
    20, while ``grade_disturbance`` counts production from step 25 onward by
    default — confirm the five-step gap is intended.
    """
    delegated_score = grade_disturbance(trajectory)
    return delegated_score


def grade_day_night(trajectory: List[ReactorState]) -> float:
    """Score the day-night cycle on temperature stability and production.

    An empty trajectory scores 0.0 and any shutdown a flat 0.1. Otherwise
    half the score rewards a low population variance of temperature
    (reaching zero at a variance of 100) and half rewards final production
    relative to 500.
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.1

    temps = [state.temperature for state in trajectory]
    count = len(temps)
    centre = sum(temps) / count
    spread = sum((t - centre) ** 2 for t in temps) / count

    steadiness = 0.5 * max(0.0, 1.0 - spread / 100.0)
    output = 0.5 * min(1.0, trajectory[-1].methanol_produced / 500.0)
    return steadiness + output


def grade_aged_catalyst(trajectory: List[ReactorState]) -> float:
    """Score the aged-catalyst task on production and catalyst preservation.

    An empty trajectory scores 0.0 and any shutdown a flat 0.1. Otherwise
    60% of the score is final production relative to 200 and 40% is final
    catalyst health relative to the 0.4 the task starts with; the sum is
    capped at 1.0.
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.1

    last = trajectory[-1]
    # Starting from health 0.4, any production at all is a good outcome.
    output_part = 0.6 * min(1.0, last.methanol_produced / 200.0)
    health_part = 0.4 * (last.catalyst_health / 0.4)  # relative preservation
    return min(1.0, output_part + health_part)


def grade_multi_disturbance(trajectory: List[ReactorState]) -> float:
    """Grade multi-disturbance: survive cascading failures and keep producing.

    40% of the score is for survival (no emergency shutdown). The other 60%
    is for methanol produced from the second disturbance (step 50) onward,
    measured between the first and last state at or after step 50 and
    scaled against 300.

    Returns:
        A score in [0.0, 1.0]; 0.0 for an empty trajectory.
    """
    if not trajectory:
        return 0.0

    shutdown = any(s.emergency_shutdown for s in trajectory)
    survival = 0.0 if shutdown else 0.4

    # Production after second disturbance (step 50+)
    post = [s for s in trajectory if s.time_step >= 50]
    if not post:
        return survival

    production_after = post[-1].methanol_produced - post[0].methanol_produced
    # Clamp on both sides: a net drop in the production counter must not
    # push the total below the survival share (or below zero).
    yield_score = 0.6 * max(0.0, min(1.0, production_after / 300.0))
    return survival + yield_score


def grade_max_yield(trajectory: List[ReactorState]) -> float:
    """Score the maximum-yield task from total methanol produced.

    The score is final production relative to 1000 (1000 kg in 200 steps is
    excellent), capped at 1.0. A shutdown cuts that score to a tenth, and an
    empty trajectory scores 0.0.
    """
    if not trajectory:
        return 0.0

    tripped = any(state.emergency_shutdown for state in trajectory)
    fraction = min(1.0, trajectory[-1].methanol_produced / 1000.0)
    if tripped:
        return 0.1 * fraction
    return fraction


# Register the clamped graders for the eight additional scenarios.
GRADERS.update(
    emergency_recovery=_clamped_grader(grade_emergency_recovery),
    feed_composition_upset=_clamped_grader(grade_feed_upset),
    cost_minimization=_clamped_grader(grade_cost_minimization),
    pressure_loss=_clamped_grader(grade_pressure_loss),
    day_night_cycle=_clamped_grader(grade_day_night),
    aged_catalyst=_clamped_grader(grade_aged_catalyst),
    multi_disturbance=_clamped_grader(grade_multi_disturbance),
    maximum_yield=_clamped_grader(grade_max_yield),
)


# ---------------------------------------------------------------------------
# Step reward computation (dense, per-step)
# ---------------------------------------------------------------------------

def _sigmoid_reward(raw: float) -> float:
    """Map an unbounded raw reward into (0.01, 0.99) with a slope-3 logistic."""
    import math

    try:
        mapped = 1.0 / (1.0 + math.exp(-3.0 * raw))
    except OverflowError:
        # Only reachable for an extremely negative raw reward, where the
        # logistic is 0 to within float precision.
        mapped = 0.0
    return 0.01 + 0.98 * mapped


def compute_step_reward(
    prev: ReactorState,
    curr: ReactorState,
    task: TaskConfig,
) -> float:
    """Compute the dense per-step reward.

    Five shaped components are summed into a raw reward, which is then
    squashed by a sigmoid so that relative signal is preserved:

    1. profit_reward:        normalized step profit
    2. safety_reward:        distance from safety limits
    3. stability_reward:     low step-to-step temperature change
    4. catalyst_reward:      catalyst health preservation
    5. task_progress_reward: task-specific progress signal (only for the
       startup, optimization, disturbance_rejection and
       long_horizon_production tasks; 0 for every other task name)

    An emergency shutdown short-circuits all of the above and is scored as a
    raw reward of -1.0.

    Args:
        prev: State before the step.
        curr: State after the step.
        task: Configuration of the running task; only ``task.name`` is read.

    Returns:
        The squashed reward, strictly inside (0.01, 0.99).
    """
    if curr.emergency_shutdown:
        return _sigmoid_reward(-1.0)  # ≈ 0.06

    # 1. Profit reward (-0.2 to +0.4)
    profit_reward = max(-0.2, min(0.4, curr.profit_this_step / 0.5))

    # 2. Safety reward: penalty grows monotonically with temperature.
    if curr.temperature > 280:
        # Continuous ramp from -0.1 at 280 degC down to -0.3 at 300 degC, so
        # running hotter is never rewarded more than the 270-280 band.
        safety_reward = -0.1 - 0.2 * (curr.temperature - 280) / 20.0
    elif curr.temperature > 270:
        safety_reward = -0.1
    else:
        # Small bonus proportional to the relative margin below the limit.
        temp_margin = (
            EMERGENCY_SHUTDOWN_TEMP - curr.temperature
        ) / EMERGENCY_SHUTDOWN_TEMP
        safety_reward = 0.1 * temp_margin

    # 3. Stability reward: low temperature change (+0.0 to +0.1)
    temp_change = abs(curr.temperature - prev.temperature)
    stability_reward = 0.1 * max(0.0, 1.0 - temp_change / 5.0)

    # 4. Catalyst reward: 0.1 x catalyst health
    catalyst_reward = 0.1 * curr.catalyst_health

    # 5. Task-specific progress
    progress_reward = 0.0
    if task.name == "startup":
        target = 250.0
        dist_now = abs(curr.temperature - target)
        dist_prev = abs(prev.temperature - target)
        if dist_now < dist_prev:
            # Moving toward the setpoint.
            progress_reward = 0.2 * (dist_prev - dist_now) / target
        elif curr.temperature > target + 5:
            # Not improving and already past the setpoint.
            progress_reward = -0.1
    elif task.name == "optimization":
        progress_reward = 0.2 * max(0.0, min(1.0, curr.profit_this_step / 0.3))
    elif task.name == "disturbance_rejection":
        if curr.time_step > 25:
            # Reward stability after the disturbance.
            progress_reward = 0.2 * max(0.0, 1.0 - temp_change / 3.0)
        else:
            progress_reward = 0.1 * max(0.0, curr.profit_this_step / 0.3)
    elif task.name == "long_horizon_production":
        # Reward production rate while preserving catalyst.
        production_rate = curr.methanol_produced - prev.methanol_produced
        progress_reward = 0.15 * min(1.0, production_rate / 0.2)
        progress_reward += 0.05 * curr.catalyst_health

    total = (
        profit_reward
        + safety_reward
        + stability_reward
        + catalyst_reward
        + progress_reward
    )
    # Sigmoid mapping: preserves relative signal in (0.01, 0.99)
    return _sigmoid_reward(total)