# Spaces: glitchfilter / methanol_apc (Sleeping)
# File size: 20,995 Bytes
# Revision: f15113b

"""

Task definitions and graders for the Methanol APC Environment.

Four core tasks of increasing difficulty are listed below; eight additional
scenarios are defined and registered further down in this module. Every task
has a deterministic grader. Raw grader scores are nominally in [0.0, 1.0];
the wrapped graders stored in ``GRADERS`` map them into the open interval
(0.0, 1.0).

Tasks
-----
1. startup                 (Easy)   — Ramp reactor from idle to operating temperature
2. optimization            (Medium) — Maximize profit at steady state
3. disturbance_rejection   (Hard)   — Handle cooling system failure
4. long_horizon_production (Expert) — Catalyst-aware marathon production

"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict, List

from .reactor_sim import ReactorState, EMERGENCY_SHUTDOWN_TEMP


# ---------------------------------------------------------------------------
# Base task
# ---------------------------------------------------------------------------
@dataclass
class TaskConfig:
    """Static settings that define one task scenario.

    Bundles the episode length, the reactor's starting conditions, an
    optional disturbance schedule and the operating-mode parameters.
    Only ``name`` and ``max_steps`` are required; every other field has a
    default.
    """

    # --- identity and episode length ---
    name: str
    max_steps: int

    # --- starting conditions ---
    initial_temperature: float = 250.0
    initial_pressure: float = 50.0
    initial_feed_h2: float = 4.0
    initial_feed_co: float = 2.0
    initial_cooling_flow: float = 50.0
    initial_cooling_temp: float = 25.0
    initial_compressor: float = 40.0
    initial_catalyst: float = 1.0

    # --- scheduled disturbances, keyed by step: {step: {field: value}} ---
    # A fresh dict is created per instance so schedules are never shared.
    disturbances: Dict[int, Dict[str, float]] = field(default_factory=dict)

    # --- operating mode: "steady_state" | "periodic" | "batch" ---
    operation_mode: str = "steady_state"
    demand_period: int = 50  # periodic mode: demand cycle period (steps)
    batch_target_kg: float = 0.0  # batch mode: target production (kg)


# ---------------------------------------------------------------------------
# Task definitions
# ---------------------------------------------------------------------------

# Task 1 (Easy): reactor starts at 150 degrees with no feed flowing and
# reduced pressure, cooling flow and compressor settings.
STARTUP_TASK = TaskConfig(
    name="startup",
    max_steps=50,
    initial_temperature=150.0,
    initial_pressure=30.0,
    initial_feed_h2=0.0,
    initial_feed_co=0.0,
    initial_cooling_flow=20.0,
    initial_compressor=20.0,
)

# Task 2 (Medium): already at operating conditions; no disturbances.
OPTIMIZATION_TASK = TaskConfig(
    name="optimization",
    max_steps=100,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
)

# Task 3 (Hard): same start as the optimization task, but at step 25 the
# cooling water temperature jumps from 25 to 45 degC (cooling tower failure:
# less cooling capacity, pushes the reactor toward runaway).
DISTURBANCE_TASK = TaskConfig(
    name="disturbance_rejection",
    max_steps=100,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
    disturbances={25: {"cooling_water_temp": 45.0}},
)

# Task 4 (Expert): 500-step batch run with a 5000 kg methanol target.
# NOTE(review): grade_long_horizon scores against a 50,000 kg target, not the
# 5000 kg configured here — confirm which figure is intended.
LONG_HORIZON_TASK = TaskConfig(
    name="long_horizon_production",
    max_steps=500,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
    operation_mode="batch",
    batch_target_kg=5000.0,
)

# Registry of the core tasks, keyed by each task's own ``name``.
TASKS: Dict[str, TaskConfig] = {
    task.name: task
    for task in (
        STARTUP_TASK,
        OPTIMIZATION_TASK,
        DISTURBANCE_TASK,
        LONG_HORIZON_TASK,
    )
}

# ---------------------------------------------------------------------------
# NEW TASKS — 8 additional scenarios for increased difficulty range
# ---------------------------------------------------------------------------

# Easy-Medium — emergency recovery: the episode starts at 290 degrees
# (dangerously close to the 300C shutdown) and must be cooled down safely.
EMERGENCY_RECOVERY_TASK = TaskConfig(
    name="emergency_recovery",
    max_steps=80,
    initial_temperature=290.0,
    initial_pressure=70.0,
    initial_feed_h2=6.0,
    initial_feed_co=3.0,
    initial_cooling_flow=40.0,
    initial_compressor=60.0,
)

# Medium — feed composition upset: H2/CO ratio shifts at step 30 (upstream
# reformer fluctuation); the agent must compensate by adjusting feed rates.
# The schedule entry below is only a placeholder that re-states the default
# cooling water temperature; per the original note, the actual feed upset is
# handled in the environment (not visible in this file).
FEED_UPSET_TASK = TaskConfig(
    name="feed_composition_upset",
    max_steps=100,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
    disturbances={30: {"cooling_water_temp": 25.0}},
)

# Medium — cost minimization: fixed production target, minimize opex.
COST_MINIMIZATION_TASK = TaskConfig(
    name="cost_minimization",
    max_steps=100,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
)

# Hard — pressure loss: compressor drops 40% at step 20.
# The schedule entry is a placeholder (it only re-states the default cooling
# water temperature); no compressor change is encoded in this config.
PRESSURE_LOSS_TASK = TaskConfig(
    name="pressure_loss",
    max_steps=100,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
    disturbances={20: {"cooling_water_temp": 25.0}},
)

# Hard — day-night cycle: cooling water temperature flips every 25 steps,
# 35 on odd multiples of 25 (steps 25, 75, 125) and back to 25 on even ones
# (steps 50, 100).
DAY_NIGHT_TASK = TaskConfig(
    name="day_night_cycle",
    max_steps=150,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
    disturbances={
        step: {"cooling_water_temp": 35.0 if (step // 25) % 2 else 25.0}
        for step in range(25, 150, 25)
    },
    operation_mode="periodic",
    demand_period=50,
)

# Hard — catalyst degradation: start with a severely aged catalyst (0.4).
AGED_CATALYST_TASK = TaskConfig(
    name="aged_catalyst",
    max_steps=100,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
    initial_catalyst=0.4,
)

# Expert — multi-disturbance: cascading cooling failures, cooling water
# rises to 35 at step 25 and then to 45 at step 50.
MULTI_DISTURBANCE_TASK = TaskConfig(
    name="multi_disturbance",
    max_steps=150,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
    disturbances={
        25: {"cooling_water_temp": 35.0},
        50: {"cooling_water_temp": 45.0},
    },
)

# Expert — maximum yield: produce as much as possible in 200 steps.
MAX_YIELD_TASK = TaskConfig(
    name="maximum_yield",
    max_steps=200,
    initial_temperature=250.0,
    initial_pressure=60.0,
    initial_feed_h2=4.0,
    initial_feed_co=2.0,
    initial_cooling_flow=50.0,
    initial_compressor=50.0,
)

# Register the additional tasks under their own names.
TASKS.update(
    {
        task.name: task
        for task in (
            EMERGENCY_RECOVERY_TASK,
            FEED_UPSET_TASK,
            COST_MINIMIZATION_TASK,
            PRESSURE_LOSS_TASK,
            DAY_NIGHT_TASK,
            AGED_CATALYST_TASK,
            MULTI_DISTURBANCE_TASK,
            MAX_YIELD_TASK,
        )
    }
)


# ---------------------------------------------------------------------------
# Graders — deterministic; raw graders return a float nominally in [0.0, 1.0],
# and the wrapped versions registered in GRADERS map it strictly into (0.0, 1.0)
# ---------------------------------------------------------------------------

def _clamp_score(score: float) -> float:
    """Map a raw score (nominally in [0, 1]) strictly into (0, 1).

    Applies a centered sigmoid, ``sigmoid(10 * (score - 0.5))``, followed by
    an affine rescale into (0.01, 0.99), so that:

      0.0 -> ~0.02  (bad stays clearly bad)
      0.5 -> 0.50   (midpoint preserved)
      1.0 -> ~0.98  (good stays clearly good)

    Args:
        score: Raw grader score. Values outside [0, 1] are accepted and
            saturate toward the ends of the output range.

    Returns:
        A float in [0.01, 0.99].
    """
    import math

    try:
        mapped = 1.0 / (1.0 + math.exp(-10.0 * (score - 0.5)))
    except OverflowError:
        # exp() overflows only for extremely negative scores (below roughly
        # -70); the sigmoid's limit there is 0, so return the lower bound
        # instead of crashing the grader.
        mapped = 0.0
    return 0.01 + 0.98 * mapped  # scale to (0.01, 0.99)

def grade_startup(trajectory: List[ReactorState]) -> float:
    """Grade the startup task.

    Scoring rules, in order:

    - Empty trajectory or any emergency shutdown: 0.0.
    - Target band (245 degC and above) never reached: partial credit of
      ``0.1 * max_temp / 250``, which stays below 0.1.
    - Overshoot above 250 degC of more than 20 degC: flat 0.1.
    - Otherwise ``1 - overshoot / 20``, floored at 0.1, plus a 0.1 bonus
      (capped at 1.0) when the final temperature is within 5 degC of target.

    The 0.1 floor keeps the score monotone: a run that reaches the target
    with a moderate overshoot never scores below a run that overshoots by
    more than 20 degC or never reaches the target at all.

    Args:
        trajectory: Reactor states in time order; only ``temperature`` and
            ``emergency_shutdown`` are read.

    Returns:
        Score in [0.0, 1.0].
    """
    if not trajectory:
        return 0.0

    if any(s.emergency_shutdown for s in trajectory):
        return 0.0

    target = 250.0
    max_temp = max(s.temperature for s in trajectory)
    final_temp = trajectory[-1].temperature

    # Did we get to within 5 degC below the target at any point?
    reached = any(s.temperature >= target - 5.0 for s in trajectory)
    if not reached:
        # Partial credit for getting close.
        return 0.1 * min(max_temp / target, 1.0)

    # Overshoot penalty.
    overshoot = max(0.0, max_temp - target)
    if overshoot > 20.0:
        return 0.1
    # Floor at 0.1 so a 15-20 degC overshoot is not scored worse than a
    # larger one (which gets the flat 0.1 above).
    score = max(0.1, 1.0 - (overshoot / 20.0))

    # Stability bonus: final temperature should be near the target.
    if abs(final_temp - target) < 5.0:
        score = min(1.0, score + 0.1)

    return max(0.0, min(1.0, score))


def grade_optimization(trajectory: List[ReactorState]) -> float:
    """Grade the optimization task from the final cumulative profit.

    Without a shutdown, profit is normalized linearly between a baseline
    (conservative operation, ~$5 over 100 steps) and a ceiling
    (aggressive-but-safe operation, ~$25 over 100 steps). If an emergency
    shutdown occurred anywhere in the run, only 20% of the profit/ceiling
    ratio is awarded. The result is clamped to [0.0, 1.0]; an empty
    trajectory scores 0.0.
    """
    if not trajectory:
        return 0.0

    baseline_profit, max_profit = 5.0, 25.0
    tripped = any(state.emergency_shutdown for state in trajectory)
    earned = trajectory[-1].cumulative_profit

    if tripped:
        # Partial credit for whatever profit was earned before the shutdown.
        raw = 0.2 * max(0.0, earned / max_profit)
    else:
        span = max(max_profit - baseline_profit, 1e-6)
        raw = (earned - baseline_profit) / span

    return max(0.0, min(1.0, raw))


def grade_disturbance(trajectory: List[ReactorState]) -> float:
    """Grade the disturbance rejection task.

    Half of the score is for survival (no emergency shutdown anywhere in
    the run); the other half is for methanol produced from step 25 onward,
    relative to an expected ~12 kg at steady state over 75 steps.

    Production is accumulated as the sum of non-negative step-to-step
    increases in ``methanol_produced`` across the states whose
    ``time_step`` is at least 25. An empty trajectory scores 0.0.
    """
    if not trajectory:
        return 0.0

    tripped = any(state.emergency_shutdown for state in trajectory)
    survival_score = 0.0 if tripped else 0.5

    # Only states at or after the disturbance step count toward production.
    after = [state for state in trajectory if state.time_step >= 25]
    if not after:
        return survival_score

    # Pair every post-disturbance state with its successor and add up the
    # increases, ignoring any decrease.
    production_after = sum(
        max(0.0, later.methanol_produced - earlier.methanol_produced)
        for earlier, later in zip(after, after[1:])
    )

    expected = 12.0
    yield_score = min(0.5, 0.5 * production_after / max(expected, 1e-6))

    return max(0.0, min(1.0, survival_score + yield_score))


def grade_long_horizon(trajectory: List[ReactorState]) -> float:
    """Grade the long-horizon production task.

    Everything is judged from the final state against a 50,000 kg target:

    - Emergency shutdown anywhere, or final catalyst health at or below
      0.01: 10% of the production fraction (capped at 1.0).
    - Target reached: ``1 - final_step / 500``, clamped to [0.3, 1.0], so
      reaching it sooner scores higher.
    - Target missed: 30% of the production fraction.

    An empty trajectory scores 0.0.

    NOTE(review): ``LONG_HORIZON_TASK`` sets ``batch_target_kg=5000.0``
    while this grader uses 50,000 — confirm which target is intended.
    """
    if not trajectory:
        return 0.0

    target = 50_000.0
    last = trajectory[-1]
    produced = last.methanol_produced
    fraction = min(produced / target, 1.0)
    tripped = any(state.emergency_shutdown for state in trajectory)

    # Shutdown and a destroyed catalyst carry the same heavy penalty.
    if tripped or last.catalyst_health <= 0.01:
        return 0.1 * fraction

    if produced >= target:
        # Reached the target: score by how few steps it took.
        speed_score = 1.0 - (last.time_step / 500.0)
        return max(0.3, min(1.0, speed_score))

    # Fell short of the target: partial credit only.
    return 0.3 * fraction


def _clamped_grader(fn):
    """Wrap a grader so its score is strictly inside (0, 1).

    Args:
        fn: A grader taking a trajectory and returning a raw score.

    Returns:
        A callable with the same single ``trajectory`` parameter that passes
        the raw score through ``_clamp_score``. The wrapped grader's
        ``__name__`` and ``__doc__`` are preserved, and the original is
        reachable via ``__wrapped__``, so registry entries stay identifiable
        in logs and debuggers.
    """
    import functools

    @functools.wraps(fn)
    def wrapper(trajectory):
        return _clamp_score(fn(trajectory))

    return wrapper


# Registry of wrapped graders for the four core tasks, keyed by task name.
# Each raw grader is passed through _clamped_grader before being stored.
GRADERS = {
    task_name: _clamped_grader(raw_grader)
    for task_name, raw_grader in (
        ("startup", grade_startup),
        ("optimization", grade_optimization),
        ("disturbance_rejection", grade_disturbance),
        ("long_horizon_production", grade_long_horizon),
    )
}


# ---------------------------------------------------------------------------
# Graders for new tasks — reuse patterns from existing graders
# ---------------------------------------------------------------------------

def grade_emergency_recovery(trajectory: List[ReactorState]) -> float:
    """Grade emergency recovery: cool down from 290C without shutdown.

    Only the final state is scored (speed of recovery is not measured):

    - Empty trajectory or any emergency shutdown: 0.0.
    - Final temperature above 270: flat 0.2 (still too hot).
    - Otherwise up to 0.5 for closeness of the final temperature to 250
      (falling linearly to zero at 40 degrees away) plus up to 0.5 for
      total methanol produced, saturating at 200.
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.0

    last = trajectory[-1]
    if last.temperature > 270:
        return 0.2  # still too hot

    closeness = max(0.0, 1.0 - abs(last.temperature - 250.0) / 40.0)
    output = min(1.0, last.methanol_produced / 200.0)
    return 0.5 * closeness + 0.5 * output


def grade_feed_upset(trajectory: List[ReactorState]) -> float:
    """Grade the feed composition upset task by final cumulative profit.

    An empty trajectory scores 0.0 and any emergency shutdown yields a flat
    0.1. Otherwise the score is the final cumulative profit divided by 20,
    clamped to [0.0, 1.0].
    """
    if not trajectory:
        return 0.0

    for state in trajectory:
        if state.emergency_shutdown:
            return 0.1

    normalized = trajectory[-1].cumulative_profit / 20.0
    return min(1.0, max(0.0, normalized))


def grade_cost_minimization(trajectory: List[ReactorState]) -> float:
    """Grade cost minimization by profit per kg of methanol produced.

    An empty trajectory scores 0.0. A run with an emergency shutdown, or
    with less than 10 of final methanol production, gets a flat 0.1.
    Otherwise the final cumulative profit is divided by the final
    production, and that efficiency is scaled so that ~$0.50/kg (treated as
    excellent) maps to 1.0; the result is clamped to [0.0, 1.0].
    """
    if not trajectory:
        return 0.0

    tripped = any(state.emergency_shutdown for state in trajectory)
    last = trajectory[-1]
    earned = last.cumulative_profit
    produced = last.methanol_produced

    if tripped or produced < 10.0:
        return 0.1

    per_kg = earned / max(produced, 1.0)
    scaled = per_kg / 0.5
    return min(1.0, max(0.0, scaled))


def grade_pressure_loss(trajectory: List[ReactorState]) -> float:
    """Grade pressure loss: maintain production after compressor drops.

    Delegates entirely to ``grade_disturbance``, so the score is that
    grader's survival component plus its post-disturbance production
    component.

    NOTE(review): ``grade_disturbance`` counts production from step 25
    onward, whereas ``PRESSURE_LOSS_TASK`` schedules its (placeholder)
    disturbance at step 20 — confirm the step-25 window is intended here.
    """
    return grade_disturbance(trajectory)  # same scoring as disturbance rejection


def grade_day_night(trajectory: List[ReactorState]) -> float:
    """Grade the day-night cycle task on stability plus production.

    An empty trajectory scores 0.0 and any emergency shutdown yields a flat
    0.1. Otherwise the score is the sum of:

    - stability (up to 0.5): based on the population variance of the
      temperature over the whole run, reaching zero at a variance of 100;
    - production (up to 0.5): final methanol produced, saturating at 500.
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.1

    # Population variance of temperature across every recorded state.
    readings = [state.temperature for state in trajectory]
    count = len(readings)
    average = sum(readings) / count
    spread = sum((reading - average) ** 2 for reading in readings) / count

    steadiness = max(0.0, 1.0 - spread / 100.0)
    output = min(1.0, trajectory[-1].methanol_produced / 500.0)
    return 0.5 * steadiness + 0.5 * output


def grade_aged_catalyst(trajectory: List[ReactorState]) -> float:
    """Grade the aged-catalyst task on production and catalyst preservation.

    An empty trajectory scores 0.0 and any emergency shutdown yields a flat
    0.1. Otherwise the score, capped at 1.0, is the sum of:

    - production (up to 0.6): final methanol produced, saturating at 200;
    - preservation (weight 0.4): final catalyst health relative to the 0.4
      the task starts with.
    """
    if not trajectory:
        return 0.0
    if any(state.emergency_shutdown for state in trajectory):
        return 0.1

    last = trajectory[-1]
    # With an aged catalyst (starting at 0.4) any production is valuable.
    output_part = 0.6 * min(1.0, last.methanol_produced / 200.0)
    health_part = 0.4 * (last.catalyst_health / 0.4)
    return min(1.0, output_part + health_part)


def grade_multi_disturbance(trajectory: List[ReactorState]) -> float:
    """Grade the multi-disturbance task: survive cascading failures.

    An empty trajectory scores 0.0. Otherwise 0.4 is awarded for finishing
    without any emergency shutdown, plus up to 0.6 for methanol produced
    between the first and last states whose ``time_step`` is at least 50
    (the second disturbance), saturating at 300. If no state reaches step
    50, only the survival part is returned.
    """
    if not trajectory:
        return 0.0

    tripped = any(state.emergency_shutdown for state in trajectory)
    survival = 0.0 if tripped else 0.4

    # States recorded at or after the second disturbance.
    late = [state for state in trajectory if state.time_step >= 50]
    if not late:
        return survival

    gained = late[-1].methanol_produced - late[0].methanol_produced
    return survival + 0.6 * min(1.0, gained / 300.0)


def grade_max_yield(trajectory: List[ReactorState]) -> float:
    """Grade the maximum-yield task by total methanol produced.

    The final production is expressed as a fraction of 1000 (treated as an
    excellent result for 200 steps) and capped at 1.0. If an emergency
    shutdown occurred anywhere in the run, only 10% of that fraction is
    awarded. An empty trajectory scores 0.0.
    """
    if not trajectory:
        return 0.0

    tripped = any(state.emergency_shutdown for state in trajectory)
    fraction = min(1.0, trajectory[-1].methanol_produced / 1000.0)
    return 0.1 * fraction if tripped else fraction


# Add the wrapped graders for the additional tasks to the registry,
# keyed by task name.
GRADERS.update(
    {
        task_name: _clamped_grader(raw_grader)
        for task_name, raw_grader in (
            ("emergency_recovery", grade_emergency_recovery),
            ("feed_composition_upset", grade_feed_upset),
            ("cost_minimization", grade_cost_minimization),
            ("pressure_loss", grade_pressure_loss),
            ("day_night_cycle", grade_day_night),
            ("aged_catalyst", grade_aged_catalyst),
            ("multi_disturbance", grade_multi_disturbance),
            ("maximum_yield", grade_max_yield),
        )
    }
)


# ---------------------------------------------------------------------------
# Step reward computation (dense, per-step)
# ---------------------------------------------------------------------------

def compute_step_reward(
    prev: ReactorState,
    curr: ReactorState,
    task: TaskConfig,
) -> float:
    """Compute the dense per-step reward, squashed into (0.01, 0.99).

    Five raw components are summed and the total is passed through
    ``0.01 + 0.98 * sigmoid(3 * total)``:

    1. profit:    ``curr.profit_this_step / 0.5`` clamped to [-0.2, +0.4].
    2. safety:    ``0.1 * (limit - T) / limit`` at or below 270 degC;
                  -0.1 between 270 and 280 degC; above 280 degC it falls
                  linearly from -0.1 to -0.3 at 300 degC.
    3. stability: up to +0.1 when the temperature change since ``prev`` is
                  small, reaching zero at a 5 degC change.
    4. catalyst:  ``0.1 * curr.catalyst_health``.
    5. progress:  task-specific term for "startup", "optimization",
                  "disturbance_rejection" and "long_horizon_production";
                  0.0 for every other task name.

    An emergency shutdown short-circuits all of the above and returns the
    squashed value of a raw -1.0 (about 0.06).

    Args:
        prev: State before the step.
        curr: State after the step.
        task: Task being run; only ``task.name`` is read.

    Returns:
        Reward strictly between 0.01 and 0.99.
    """
    import math

    def squash(raw: float) -> float:
        # Sigmoid mapping: preserves relative signal in (0.01, 0.99).
        return 0.01 + 0.98 * (1.0 / (1.0 + math.exp(-3.0 * raw)))

    if curr.emergency_shutdown:
        return squash(-1.0)  # ≈ 0.06

    # 1. Profit reward (-0.2 to +0.4)
    profit_reward = max(-0.2, min(0.4, curr.profit_this_step / 0.5))

    # 2. Safety reward: penalize approaching the 300 degC limit.
    if curr.temperature > 280:
        # Start at the -0.1 of the band below so the penalty is continuous
        # at 280 and keeps growing with temperature; previously the penalty
        # restarted from zero here, so 281 degC was punished less than 275.
        safety_reward = -0.1 - 0.2 * (curr.temperature - 280) / 20.0
    elif curr.temperature > 270:
        safety_reward = -0.1
    else:
        temp_margin = (
            EMERGENCY_SHUTDOWN_TEMP - curr.temperature
        ) / EMERGENCY_SHUTDOWN_TEMP
        safety_reward = 0.1 * temp_margin

    # 3. Stability reward: low temperature change (+0.0 to +0.1)
    temp_change = abs(curr.temperature - prev.temperature)
    stability_reward = 0.1 * max(0.0, 1.0 - temp_change / 5.0)

    # 4. Catalyst reward (+0.0 to +0.1 for health in [0, 1])
    catalyst_reward = 0.1 * curr.catalyst_health

    # 5. Task-specific progress
    progress_reward = 0.0
    if task.name == "startup":
        target = 250.0
        dist_now = abs(curr.temperature - target)
        dist_prev = abs(prev.temperature - target)
        if dist_now < dist_prev:
            # Reward closing the gap to the target temperature.
            progress_reward = 0.2 * (dist_prev - dist_now) / target
        elif curr.temperature > target + 5:
            # Not getting closer and already overshooting.
            progress_reward = -0.1
    elif task.name == "optimization":
        progress_reward = 0.2 * max(0.0, min(1.0, curr.profit_this_step / 0.3))
    elif task.name == "disturbance_rejection":
        if curr.time_step > 25:
            # After the disturbance: reward holding temperature steady.
            progress_reward = 0.2 * max(0.0, 1.0 - temp_change / 3.0)
        else:
            progress_reward = 0.1 * max(0.0, curr.profit_this_step / 0.3)
    elif task.name == "long_horizon_production":
        # Reward production rate while preserving catalyst.
        production_rate = curr.methanol_produced - prev.methanol_produced
        progress_reward = 0.15 * min(1.0, production_rate / 0.2)
        progress_reward += 0.05 * curr.catalyst_health

    total = (
        profit_reward
        + safety_reward
        + stability_reward
        + catalyst_reward
        + progress_reward
    )
    return squash(total)