# Source: Hugging Face Space "nirmalpratheep/Car-Racing-Agent" (Space status: Sleeping)
# File size: 4,516 bytes, revision 41a9651

"""

Observation encoder for the car racing PPO agent.



Input

-----

img     : (B, 3, 64, 64)  float32, pixels normalised to 0..1

scalars : (B, 9)           float32, [angular_velocity, speed, ray×5, wp_sin, wp_cos]



Output

------

(B, 288)  flat feature vector → feed directly into actor / critic heads.



Architecture

------------

ImpalaCNN  (Espeholt et al., IMPALA 2018)

  3 blocks × (Conv → MaxPool → ResBlock → ResBlock)

  channels : 16 → 32 → 32

  64×64 input shrinks to 8×8 after 3 stride-2 MaxPools  →  32×8×8 = 2048 → FC(256)



  Key difference vs Nature CNN: each block adds two residual (skip) connections.

  Gradients flow straight back through the shortcuts, so early conv filters keep

  updating throughout training.  Empirically 3-5× more sample-efficient on

  visual RL tasks at identical inference cost.



Scalar MLP

  9 → 32 → 32  (angular_velocity, speed, ray×5, wp_sin, wp_cos)



Combined

  cat([img_features, scalar_features])  →  288-d vector

"""

import torch
import torch.nn as nn


# ── Building blocks ───────────────────────────────────────────────────────────

class _ResBlock(nn.Module):
    """Residual unit computing ``x + Conv(ReLU(Conv(ReLU(x))))``.

    Both convolutions are 3x3 with padding 1 and keep the channel count, so
    the output tensor has exactly the same shape as the input.

    Args:
        channels: Number of input (and output) channels.
    """

    def __init__(self, channels: int):
        super().__init__()
        # Two identical (ReLU -> 3x3 conv) stages.  The Sequential layout
        # (ReLU, Conv, ReLU, Conv) fixes the state_dict keys net.1.* / net.3.*.
        layers = []
        for _ in range(2):
            layers.append(nn.ReLU())
            layers.append(
                nn.Conv2d(channels, channels, kernel_size=3, padding=1)
            )
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the input plus the output of the residual branch."""
        residual = self.net(x)
        return x + residual  # identity shortcut


class _ImpalaBlock(nn.Module):
    """One encoder stage: 3x3 conv, stride-2 max-pool, then two residual blocks.

    The convolution maps ``in_ch`` to ``out_ch`` channels at full resolution.
    The 3x3 / stride-2 / padding-1 max-pool then halves height and width
    (rounding up for odd sizes); the residual blocks leave the shape unchanged.

    Args:
        in_ch: Number of channels of the incoming feature map.
        out_ch: Number of channels produced by this stage.
    """

    def __init__(self, in_ch: int, out_ch: int):
        super().__init__()
        # Attribute names and creation order are deliberately kept stable:
        # they determine the state_dict keys and the weight-initialisation
        # order under a fixed seed.
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.res1 = _ResBlock(out_ch)
        self.res2 = _ResBlock(out_ch)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply conv, pool and both residual blocks in sequence."""
        for stage in (self.conv, self.pool, self.res1, self.res2):
            x = stage(x)
        return x


# ── Encoders ──────────────────────────────────────────────────────────────────

class ImpalaCNN(nn.Module):
    """Encode a (B, C, H, W) image batch into a (B, out_features) vector.

    One ``_ImpalaBlock`` is stacked per entry of ``CHANNELS``; each block
    halves the spatial resolution (rounding up).  With the defaults:

        input  64x64
        block1 32x32  (16 ch)
        block2 16x16  (32 ch)
        block3  8x8   (32 ch)  ->  flatten 2048  ->  FC 256

    Args:
        in_channels: Number of channels of the input image.
        out_features: Width of the final fully-connected layer.
        input_hw: ``(height, width)`` of the images that will be fed to
            ``forward``.  Only used to size the fully-connected layer; the
            default ``(64, 64)`` reproduces the original 32*8*8 = 2048 inputs.

    Raises:
        ValueError: If ``input_hw`` contains a non-positive dimension.
    """

    CHANNELS = [16, 32, 32]

    def __init__(self, in_channels: int = 3, out_features: int = 256,
                 input_hw: tuple = (64, 64)):
        super().__init__()
        height, width = input_hw
        if height < 1 or width < 1:
            raise ValueError(
                f"input_hw must be positive, got {(height, width)}"
            )

        blocks, ch = [], in_channels
        for out_ch in self.CHANNELS:
            blocks.append(_ImpalaBlock(ch, out_ch))
            ch = out_ch
            # Each block's 3x3 / stride-2 / padding-1 max-pool maps a side of
            # length n to floor((n - 1) / 2) + 1 == (n + 1) // 2; the convs
            # (3x3, padding 1) keep the size.
            height = (height + 1) // 2
            width = (width + 1) // 2
        self.cnn = nn.Sequential(*blocks, nn.ReLU())
        self.fc = nn.Sequential(
            nn.Flatten(),
            # Previously hard-coded as ch * 8 * 8, which is only correct for
            # 64x64 input; now derived from input_hw.
            nn.Linear(ch * height * width, out_features),
            nn.ReLU(),
        )
        self.out_features = out_features
        self.input_hw = tuple(input_hw)

    def forward(self, img: torch.Tensor) -> torch.Tensor:
        """Return features of shape (B, out_features) for ``img``.

        ``img`` must have the channel count and spatial size the encoder was
        built for, otherwise the fully-connected layer rejects the shape.
        """
        return self.fc(self.cnn(img))


class RaceEncoder(nn.Module):
    """Full observation encoder: ImpalaCNN for the image + MLP for scalars.

    The two feature vectors are concatenated along the last dimension and are
    meant to be fed to the actor / critic heads.

    out_features = img_features (256) + scalar_features (32) = 288 by default.

    Args:
        img_features: Output width of the image encoder.
        scalar_features: Hidden and output width of the scalar MLP.
        n_scalars: Number of scalar inputs per observation.  Defaults to 9,
            the width the first MLP layer has always had
            (angular_velocity, speed, ray x5, wp_sin, wp_cos).
    """

    def __init__(self, img_features: int = 256, scalar_features: int = 32,
                 n_scalars: int = 9):
        super().__init__()
        self.cnn = ImpalaCNN(out_features=img_features)
        self.scalar_mlp = nn.Sequential(
            # Default 9 scalars: angular_velocity, speed, ray×5, wp_sin, wp_cos
            nn.Linear(n_scalars, scalar_features),
            nn.ReLU(),
            nn.Linear(scalar_features, scalar_features),
            nn.ReLU(),
        )
        self.n_scalars = n_scalars
        self.out_features = img_features + scalar_features

    def forward(self, img: torch.Tensor, scalars: torch.Tensor) -> torch.Tensor:
        """Encode one batch of observations.

        img     : (B, 3, 64, 64)   float32  pixels / 255
        scalars : (B, n_scalars)   float32  obs.scalars; with the default
                  n_scalars = 9 (NOT 7, as earlier docs stated):
                  [angular_velocity, speed,
                   ray_left, ray_front_left, ray_front, ray_front_right, ray_right,
                   wp_sin, wp_cos]
                  NOTE(review): the position of wp_sin / wp_cos after the rays
                  follows the layer comment; confirm the exact ordering (and
                  that "wp" means waypoint) against the environment code.
        returns : (B, out_features)
        """
        return torch.cat([self.cnn(img), self.scalar_mlp(scalars)], dim=-1)