# File size: 10,920 Bytes
#
# 5a64d6e

"""

BaramNuri (바람누리) - Lightweight Driver Behavior Detection Model

A hybrid architecture combining:
- Video Swin Transformer (Stage 1-3) for spatial features
- Selective State Space Model (SSM) for temporal modeling

Trained via Knowledge Distillation from a Video Swin-T teacher.

Author: C-Team
License: Apache-2.0

"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models.video import swin3d_t, Swin3D_T_Weights
from typing import Dict, Tuple


class SelectiveSSM(nn.Module):
    """Selective state-space layer (Mamba-style) with a residual connection.

    The input/output matrices and the step size of the recurrence are not
    fixed weights: they are produced from the input at every time step, so
    the layer can decide per step how strongly to write to, read from and
    decay its hidden state.

    In this implementation the step size is a single scalar per time step
    (shared by all channels) and the diagonal ``A`` holds ``d_state`` values
    shared by all ``d_inner`` channels.
    """

    def __init__(self, d_model: int, d_state: int = 16, d_conv: int = 4, expand: int = 2, dropout: float = 0.1):
        """
        Args:
            d_model: Width of the incoming and outgoing features.
            d_state: Size of the hidden state kept per inner channel.
            d_conv: Kernel size of the depthwise temporal convolution.
            expand: Multiplier giving the inner width ``d_model * expand``.
            dropout: Dropout probability applied after the output projection.
        """
        super().__init__()

        inner = d_model * expand
        self.d_model, self.d_state = d_model, d_state
        self.d_conv, self.expand = d_conv, expand
        self.d_inner = inner

        # One projection yields both the SSM branch and the gate branch.
        self.in_proj = nn.Linear(d_model, 2 * inner, bias=False)

        # Depthwise conv over time; padded by d_conv - 1 on both sides and
        # cropped back to T in forward(), which makes it causal.
        self.conv1d = nn.Conv1d(
            in_channels=inner,
            out_channels=inner,
            kernel_size=d_conv,
            padding=d_conv - 1,
            groups=inner,
        )

        # Produces, per time step: B (d_state), C (d_state) and one step size.
        self.x_proj = nn.Linear(inner, 2 * d_state + 1, bias=False)

        # log(1..d_state); forward() uses A = -exp(A_log), i.e. always negative.
        state_index = torch.arange(1, d_state + 1, dtype=torch.float32)
        self.A_log = nn.Parameter(torch.log(state_index))
        # Per-channel weight of the direct input-to-output skip term.
        self.D = nn.Parameter(torch.ones(inner))

        self.out_proj = nn.Linear(inner, d_model, bias=False)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the layer over a sequence.

        Args:
            x: [B, T, D] input sequence.

        Returns:
            [B, T, D] tensor: the layer output added to the unnormalised input.
        """
        skip = x
        normed = self.layer_norm(x)
        batch, steps, _ = normed.shape

        # [B, T, 2 * d_inner] -> SSM branch `u` and gate branch `gate`.
        u, gate = self.in_proj(normed).chunk(2, dim=-1)

        # Conv1d wants [B, C, T]; keep only the first T outputs (causal crop).
        u = self.conv1d(u.transpose(1, 2))[:, :, :steps].transpose(1, 2)
        u = F.silu(u)

        # Input-dependent SSM parameters.
        in_mat, out_mat, raw_step = self.x_proj(u).split(
            [self.d_state, self.d_state, 1], dim=-1
        )
        step = F.softplus(raw_step)  # [B, T, 1], strictly positive

        # Discretised decay exp(step * A) with A < 0 -> values in (0, 1).
        neg_a = -torch.exp(self.A_log)
        decay = torch.exp(step * neg_a.view(1, 1, -1))  # [B, T, d_state]

        # Sequential scan over time with state of shape [B, d_inner, d_state].
        state = torch.zeros(batch, self.d_inner, self.d_state, device=u.device, dtype=u.dtype)
        per_step = []
        scan_inputs = zip(u.unbind(1), in_mat.unbind(1), out_mat.unbind(1), decay.unbind(1))
        for u_t, b_t, c_t, a_t in scan_inputs:
            # state <- decay * state + B_t (outer) u_t
            state = state * a_t.unsqueeze(1) + b_t.unsqueeze(1) * u_t.unsqueeze(-1)
            # y_t = <C_t, state> + D * u_t
            per_step.append((c_t.unsqueeze(1) * state).sum(dim=-1) + self.D * u_t)

        y = torch.stack(per_step, dim=1)  # [B, T, d_inner]

        # Gate, project back to d_model, regularise, add residual.
        y = y * F.silu(gate)
        y = self.dropout(self.out_proj(y))
        return y + skip


class TemporalSSMBlock(nn.Module):
    """Stack of SelectiveSSM layers that collapses a sequence to one vector.

    Consumes a [B, T, C] sequence, passes it through every SSM layer in
    order, then averages over the time axis.
    """

    def __init__(self, d_model: int, d_state: int = 16, n_layers: int = 2, dropout: float = 0.1):
        """
        Args:
            d_model: Feature width of the sequence.
            d_state: Hidden-state size forwarded to each SelectiveSSM.
            n_layers: How many SelectiveSSM layers to stack.
            dropout: Dropout probability forwarded to each SelectiveSSM.
        """
        super().__init__()

        stack = nn.ModuleList()
        for _ in range(n_layers):
            stack.append(SelectiveSSM(d_model, d_state=d_state, dropout=dropout))
        self.ssm_layers = stack

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the SSM stack and mean-pool over time.

        Args:
            x: [B, T, D] sequence.

        Returns:
            [B, D] representation (mean over the T axis).
        """
        hidden = x
        for layer in self.ssm_layers:
            hidden = layer(hidden)

        # Temporal average pooling: [B, T, D] -> [B, D].
        pooled = hidden.mean(dim=1)
        return pooled


class BaramNuri(nn.Module):
    """BaramNuri (바람누리) - lightweight driver behavior detection model.

    Architecture:
        1. Video Swin-T patch embedding plus the first five entries of its
           ``features`` stack (Stages 1-3, 384-dim output).
        2. Selective SSM block for temporal modeling.
        3. Classification head (LayerNorm -> Dropout -> Linear).

    Figures reported by the authors:
        Parameters: 14.20M (49% reduction from the teacher's 27.86M)
        Performance: 96.17% accuracy, 0.9504 Macro F1
    """

    # Display names for the default 5-class setup, indexed by class id.
    CLASS_NAMES = ["정상", "졸음운전", "물건찾기", "휴대폰 사용", "운전자 폭행"]
    CLASS_NAMES_EN = ["normal", "drowsy_driving", "searching_object", "phone_usage", "driver_assault"]

    def __init__(
        self,
        num_classes: int = 5,
        pretrained: bool = True,
        d_state: int = 16,
        ssm_layers: int = 2,
        dropout: float = 0.2,
    ):
        """
        Args:
            num_classes: Number of output logits.
            pretrained: If True, initialise the Swin backbone from the
                Kinetics-400 weights (may trigger a download); otherwise the
                backbone is randomly initialised.
            d_state: Hidden-state size of each SSM layer.
            ssm_layers: Number of stacked SSM layers.
            dropout: Dropout probability for the SSM layers and the head.
        """
        super().__init__()

        self.num_classes = num_classes

        # Build the full Video Swin-T, then keep only the parts we need.
        if pretrained:
            print("Loading Swin backbone (Kinetics-400 pretrained)...")
            full_swin = swin3d_t(weights=Swin3D_T_Weights.KINETICS400_V1)
        else:
            full_swin = swin3d_t(weights=None)

        # Patch embedding
        self.patch_embed = full_swin.patch_embed

        # Only features[0:5] are kept (Stages 1-3); the remaining entries of
        # the backbone are dropped, which is where the size reduction comes from.
        self.features = nn.Sequential(*[full_swin.features[i] for i in range(5)])

        # Channel width after Stage 3.
        self.feature_dim = 384

        # Global average pooling. Not used by the forward path in this class;
        # kept so the attribute remains available to existing callers.
        self.avgpool = nn.AdaptiveAvgPool3d(output_size=1)

        # SSM temporal modeling block
        self.temporal_ssm = TemporalSSMBlock(
            d_model=self.feature_dim,
            d_state=d_state,
            n_layers=ssm_layers,
            dropout=dropout,
        )

        # Classification head
        self.head = nn.Sequential(
            nn.LayerNorm(self.feature_dim),
            nn.Dropout(p=dropout),
            nn.Linear(self.feature_dim, num_classes),
        )

        self._init_head()

        # Drop the reference to the full backbone (including unused Stage 4).
        del full_swin

    def _init_head(self):
        """Initialise Linear layers of the head: truncated-normal weights, zero bias."""
        for m in self.head.modules():
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def extract_features(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the pooled clip representation (used for knowledge distillation).

        Args:
            x: [B, C, T, H, W] video tensor.

        Returns:
            [B, feature_dim] features.
        """
        # Backbone output is channels-last: [B, T', H', W', C].
        x = self.patch_embed(x)
        x = self.features(x)

        # Average over the two spatial axes -> [B, T', C] sequence.
        x = x.mean(dim=[2, 3])

        # SSM temporal modeling + temporal pooling -> [B, C].
        return self.temporal_ssm(x)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x: [B, C, T, H, W] video tensor.

        Returns:
            [B, num_classes] logits.
        """
        return self.head(self.extract_features(x))

    def forward_with_features(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(logits, features)`` in one pass (for knowledge distillation).

        Args:
            x: [B, C, T, H, W] video tensor.

        Returns:
            Tuple of [B, num_classes] logits and [B, feature_dim] features.
        """
        features = self.extract_features(x)
        logits = self.head(features)
        return logits, features

    def predict(self, x: torch.Tensor, return_english: bool = False) -> Dict:
        """Classify a single clip.

        Switches the module to eval mode (and leaves it there). Only the
        first sample of the batch is reported.

        Args:
            x: [1, C, T, H, W] single video.
            return_english: Use English class names instead of Korean ones.

        Returns:
            Dict with keys ``class`` (int index), ``confidence`` (float),
            ``class_name`` (str) and ``all_probs`` (name -> probability).
        """
        self.eval()
        with torch.no_grad():
            logits = self.forward(x)
            probs = F.softmax(logits, dim=-1)[0]

        values = probs.tolist()
        class_idx = int(probs.argmax().item())

        # Derive one name per actual output so models built with
        # num_classes != 5 neither raise IndexError nor drop classes.
        known = self.CLASS_NAMES_EN if return_english else self.CLASS_NAMES
        names = [
            known[i] if i < len(known) else f"class_{i}"
            for i in range(len(values))
        ]

        return {
            "class": class_idx,
            "confidence": values[class_idx],
            "class_name": names[class_idx],
            "all_probs": dict(zip(names, values)),
        }

    @classmethod
    def from_pretrained(cls, checkpoint_path: str, device: str = 'cpu'):
        """Load a trained model from a checkpoint.

        Args:
            checkpoint_path: Path to a .pth file holding either a state dict
                or a dict with a ``model_state_dict`` entry.
            device: 'cpu' or 'cuda'.

        Returns:
            Loaded model on ``device``, in eval mode.
        """
        # pretrained=False: load_state_dict below is strict, so every weight
        # comes from the checkpoint. Fetching the Kinetics-400 backbone first
        # would only add a pointless download / network dependency.
        model = cls(num_classes=5, pretrained=False)

        # NOTE(security): torch.load unpickles the file. Only load checkpoints
        # from trusted sources, or pass weights_only=True where the installed
        # torch version and the checkpoint contents allow it.
        checkpoint = torch.load(checkpoint_path, map_location=device)

        if 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            state_dict = checkpoint
        model.load_state_dict(state_dict)

        model = model.to(device)
        model.eval()

        return model


def count_parameters(model: nn.Module) -> int:
    """Return the number of scalar elements summed over all of *model*'s parameters.

    Every tensor yielded by ``model.parameters()`` is counted, regardless of
    its ``requires_grad`` flag.
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total


if __name__ == "__main__":
    # Smoke test: build the model, report its size, and push random clips
    # through both the raw forward pass and the predict() helper.
    print("=" * 60, "BaramNuri Model Test", "=" * 60, sep="\n")

    # Build the model with the Kinetics-400 backbone and put it in eval mode
    # up front (parameter counting does not depend on the mode).
    model = BaramNuri(num_classes=5, pretrained=True)
    model.eval()

    total_params = count_parameters(model)
    print(f"\nTotal parameters: {total_params:,} ({total_params/1e6:.2f}M)")

    # Batch of two random clips: [B, C, T, H, W].
    dummy_input = torch.randn(2, 3, 30, 224, 224)
    print(f"\nInput shape: {dummy_input.shape}")

    with torch.no_grad():
        output = model(dummy_input)
    print(f"Output shape: {output.shape}")

    # predict() expects a single clip; query it once per naming scheme.
    single_input = torch.randn(1, 3, 30, 224, 224)

    prediction = model.predict(single_input)
    print(f"\nPrediction (Korean): {prediction['class_name']} ({prediction['confidence']:.2%})")

    prediction_en = model.predict(single_input, return_english=True)
    print(f"Prediction (English): {prediction_en['class_name']} ({prediction_en['confidence']:.2%})")

    print("\nModel test passed!")