connerohnesorge committed on
Commit
a69fe43
·
1 Parent(s): 1777497
__pycache__/app.cpython-313.pyc ADDED
Binary file (11 kB). View file
 
app.py CHANGED
@@ -1,8 +1,393 @@
 
 
 
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- def greet(name):
5
- return "Hello " + name + "!!"
 
 
6
 
7
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
8
- demo.launch()
 
1
#!/usr/bin/env python3
"""
NSA Pupil Segmentation Gradio Demo - Native Sparse Attention Web Application

This Gradio application performs real-time pupil segmentation on webcam input
using the NSAPupilSeg model (Native Sparse Attention). It demonstrates eye tracking
and pupil detection capabilities for the VisionAssist medical assistive technology project.

NSA Key Features:
- Token Compression: Global coarse-grained context
- Token Selection: Fine-grained focus on important regions (pupil)
- Sliding Window: Local context for precise boundaries
- Gated Aggregation: Learned combination of attention paths
"""

import cv2
import numpy as np
import torch
import gradio as gr
import mediapipe as mp

from nsa import create_nsa_pupil_seg

# =============================================================================
# Model Loading (at module startup)
# =============================================================================

print("Loading NSA Pupil Segmentation model...")

# Smallest ("pico") NSA variant: single grayscale input channel, 2 output
# classes (background vs. pupil).
model = create_nsa_pupil_seg(size="pico", in_channels=1, num_classes=2)
# NOTE(review): weights_only=False un-pickles arbitrary objects from the
# checkpoint, which can execute code — acceptable only because
# best_model.pth ships with this app; never point this at untrusted files.
checkpoint = torch.load("best_model.pth", map_location="cpu", weights_only=False)
if "model_state_dict" in checkpoint:
    # Full training checkpoint: extract just the weights (and report IoU).
    model.load_state_dict(checkpoint["model_state_dict"])
    print(f"Loaded checkpoint with IoU: {checkpoint.get('valid_iou', 'N/A')}")
else:
    # Bare state-dict checkpoint.
    model.load_state_dict(checkpoint)
model.eval()  # inference mode: freezes dropout / batch-norm statistics

print("Model loaded successfully!")
40
+
41
# =============================================================================
# MediaPipe Face Mesh Setup
# =============================================================================

mp_face_mesh = mp.solutions.face_mesh
# Single-face tracker; refine_landmarks=True enables the refined eye/iris
# landmark set used below.
face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# =============================================================================
# Constants (from demo.py - MUST match training exactly)
# =============================================================================

# MediaPipe left eye landmark indices (12 points around the eye)
LEFT_EYE_INDICES = [362, 385, 387, 263, 373, 380, 374, 381, 382, 384, 398, 466]

# Target aspect ratio for eye region (width:height = 640:400 = 1.6:1)
TARGET_ASPECT_RATIO = 640 / 400  # 1.6:1

# Model input/output dimensions
MODEL_WIDTH = 640
MODEL_HEIGHT = 400

# Preprocessing parameters (MUST match training exactly)
NORMALIZE_MEAN = 0.5
NORMALIZE_STD = 0.5

# Eye extraction settings
BBOX_PADDING = 0.2  # 20% padding on each side
MIN_EYE_REGION_SIZE = 50  # Minimum bounding box size

# Visualization settings
OVERLAY_ALPHA = 0.5  # blend weight of the green pupil overlay
77
+
78
+
79
+ # =============================================================================
80
+ # Eye Region Extraction Function
81
+ # =============================================================================
82
+
83
def extract_eye_region(frame, landmarks):
    """
    Extract the left eye region from a frame using MediaPipe landmarks.

    The tight landmark bounding box is padded by BBOX_PADDING on each side,
    then the shorter dimension is expanded toward TARGET_ASPECT_RATIO
    (1.6:1), with all coordinates clamped to the frame borders.

    Args:
        frame: Input BGR frame of shape (H, W, 3).
        landmarks: MediaPipe face landmarks (normalized [0, 1] coordinates).

    Returns:
        tuple: (eye_crop, bbox) where bbox is (x, y, w, h) in pixels, or
        (None, None) if the eye region is too small or the crop is empty.
    """
    h, w = frame.shape[:2]

    # Landmark coordinates are normalized; scale to pixel space.
    eye_points = np.array([
        [int(landmarks.landmark[idx].x * w), int(landmarks.landmark[idx].y * h)]
        for idx in LEFT_EYE_INDICES
    ], dtype=np.int32)

    # Tight bounding box around the eye landmarks.
    x_min, y_min = eye_points.min(axis=0)
    x_max, y_max = eye_points.max(axis=0)

    bbox_w = x_max - x_min
    bbox_h = y_max - y_min

    # Reject regions too small for reliable segmentation.
    if bbox_w < MIN_EYE_REGION_SIZE or bbox_h < MIN_EYE_REGION_SIZE:
        return None, None

    # Add padding (20% on each side), clamped to the frame.
    pad_w = int(bbox_w * BBOX_PADDING)
    pad_h = int(bbox_h * BBOX_PADDING)

    x_min = max(0, x_min - pad_w)
    y_min = max(0, y_min - pad_h)
    x_max = min(w, x_max + pad_w)
    y_max = min(h, y_max + pad_h)

    bbox_w = x_max - x_min
    bbox_h = y_max - y_min

    # Expand to 1.6:1 aspect ratio (640:400).
    # BUGFIX: an odd `diff` is now split floor/ceil instead of floor/floor,
    # so the full difference is distributed and the target ratio is reached.
    current_ratio = bbox_w / bbox_h
    if current_ratio < TARGET_ASPECT_RATIO:
        # Too narrow, expand width
        target_w = int(bbox_h * TARGET_ASPECT_RATIO)
        diff = target_w - bbox_w
        x_min = max(0, x_min - diff // 2)
        x_max = min(w, x_max + (diff - diff // 2))
        bbox_w = x_max - x_min
    else:
        # Too short, expand height
        target_h = int(bbox_w / TARGET_ASPECT_RATIO)
        diff = target_h - bbox_h
        y_min = max(0, y_min - diff // 2)
        y_max = min(h, y_max + (diff - diff // 2))
        bbox_h = y_max - y_min

    # Extract region
    eye_crop = frame[y_min:y_max, x_min:x_max]

    # Clamping at the frame edge can still yield a degenerate crop.
    if eye_crop.size == 0:
        return None, None

    return eye_crop, (x_min, y_min, bbox_w, bbox_h)
150
+
151
+
152
+ # =============================================================================
153
+ # Preprocessing Function (CRITICAL - must match training exactly)
154
+ # =============================================================================
155
+
156
def preprocess(eye_crop):
    """
    Preprocess an eye region for model inference.

    CRITICAL: Must match the training pipeline exactly — resize, then
    grayscale, then normalize to [-1, 1], then transpose into the model's
    (B, C, W, H) layout.

    Args:
        eye_crop: BGR image of the eye region.

    Returns:
        torch.Tensor: Preprocessed tensor of shape (1, 1, 640, 400).
    """
    # Resize first (training order matters), then drop color.
    scaled = cv2.resize(
        eye_crop, (MODEL_WIDTH, MODEL_HEIGHT), interpolation=cv2.INTER_LINEAR
    )
    gray = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)

    # Map [0, 255] -> [-1, 1] using the training mean/std of 0.5.
    unit = gray.astype(np.float32) / 255.0
    normalized = (unit - NORMALIZE_MEAN) / NORMALIZE_STD

    # The model consumes (B, C, W, H), NOT the usual (B, C, H, W):
    # transpose (400, 640) -> (640, 400) and prepend batch/channel axes.
    transposed = np.transpose(normalized)
    batched = transposed[None, None, :, :]
    return torch.from_numpy(batched)
185
+
186
+
187
+ # =============================================================================
188
+ # Inference Function
189
+ # =============================================================================
190
+
191
def run_inference(input_tensor):
    """
    Run model inference on a preprocessed input tensor.

    Args:
        input_tensor: Preprocessed tensor of shape (1, 1, 640, 400).

    Returns:
        np.ndarray: Binary segmentation mask of shape (400, 640), uint8.
    """
    # No gradients needed for inference.
    with torch.no_grad():
        logits = model(input_tensor)

    # Model emits (B, C, W, H) = (1, 2, 640, 400): pick the highest-scoring
    # class per pixel, then transpose (W, H) -> (H, W) for visualization.
    class_map = np.argmax(logits.cpu().numpy()[0], axis=0)
    return class_map.T.astype(np.uint8)
213
+
214
+
215
+ # =============================================================================
216
+ # Visualization Function
217
+ # =============================================================================
218
+
219
def visualize(frame, eye_crop, mask, bbox, face_detected):
    """
    Visualize segmentation results on frame.

    Draws a semi-transparent status banner, overlays the pupil mask in green
    on the eye region, outlines the eye bounding box, and stamps the model
    name in the bottom-left corner.

    Args:
        frame: Original BGR frame
        eye_crop: Eye region crop (currently unused; kept for interface symmetry)
        mask: Binary segmentation mask (400, 640), or None
        bbox: Bounding box (x, y, w, h), or None
        face_detected: Whether face was detected

    Returns:
        np.ndarray: Annotated BGR frame (a copy; the input is not modified)
    """
    annotated = frame.copy()

    # Draw status banner at top center
    banner_height = 50
    banner_w = annotated.shape[1]

    # Semi-transparent black background for banner (darken pixels by 50%)
    banner_region = annotated[0:banner_height, 0:banner_w].astype(np.float32)
    banner_region *= 0.5
    annotated[0:banner_height, 0:banner_w] = banner_region.astype(np.uint8)

    # Status text: yellow for "needs user action" states, green when tracking
    if not face_detected:
        status_text = "No Face Detected"
        status_color = (0, 255, 255)  # Yellow (BGR)
    elif mask is None:
        status_text = "Move Closer"
        status_color = (0, 255, 255)  # Yellow
    else:
        status_text = "Face Detected"
        status_color = (0, 255, 0)  # Green

    # Center the status text horizontally and vertically in the banner
    text_size = cv2.getTextSize(status_text, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 2)[0]
    text_x = (banner_w - text_size[0]) // 2
    text_y = (banner_height + text_size[1]) // 2
    cv2.putText(
        annotated,
        status_text,
        (text_x, text_y),
        cv2.FONT_HERSHEY_SIMPLEX,
        1.0,
        status_color,
        2,
    )

    # If we have a valid mask, overlay it on the eye region
    if mask is not None and bbox is not None:
        x, y, w, h = bbox

        # Resize mask to match eye crop size (nearest-neighbor keeps it binary)
        mask_resized = cv2.resize(mask, (w, h), interpolation=cv2.INTER_NEAREST)

        # Create green overlay where mask==1 (pupil detected)
        green_overlay = np.zeros((h, w, 3), dtype=np.uint8)
        green_overlay[mask_resized == 1] = (0, 255, 0)  # Green in BGR

        # Blend with original eye region
        eye_region = annotated[y:y + h, x:x + w]
        blended = cv2.addWeighted(
            eye_region,
            1 - OVERLAY_ALPHA,
            green_overlay,
            OVERLAY_ALPHA,
            0
        )
        annotated[y:y + h, x:x + w] = blended

        # Draw bounding box
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 3)

    # Draw model info (bottom-left)
    cv2.putText(
        annotated,
        "NSA-pico",
        (10, annotated.shape[0] - 20),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.7,
        (0, 255, 0),
        2,
    )

    return annotated
305
+
306
+
307
+ # =============================================================================
308
+ # Main Process Function
309
+ # =============================================================================
310
+
311
def process_frame(image):
    """
    Process a single frame from webcam for pupil segmentation.

    Pipeline: RGB->BGR conversion, MediaPipe face detection, left-eye
    extraction, NSA model inference, overlay visualization, BGR->RGB.

    Args:
        image: Input RGB image from Gradio (numpy array), or None

    Returns:
        np.ndarray: Annotated RGB image for Gradio output, or None when no
        input frame was provided.
    """
    if image is None:
        return None

    # Gradio provides RGB, convert to BGR for OpenCV
    frame_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    # Run MediaPipe face detection on RGB image
    results = face_mesh.process(image)  # MediaPipe expects RGB
    face_detected = results.multi_face_landmarks is not None

    # Initialize variables (so visualize() sees None when steps are skipped)
    eye_crop = None
    bbox = None
    mask = None

    # Process if face detected
    if face_detected:
        landmarks = results.multi_face_landmarks[0]

        # Extract eye region (from BGR frame)
        eye_crop, bbox = extract_eye_region(frame_bgr, landmarks)

        if eye_crop is not None:
            # Preprocess
            input_tensor = preprocess(eye_crop)

            # Run inference
            mask = run_inference(input_tensor)

    # Visualize (on BGR frame)
    annotated_bgr = visualize(frame_bgr, eye_crop, mask, bbox, face_detected)

    # Convert back to RGB for Gradio output
    annotated_rgb = cv2.cvtColor(annotated_bgr, cv2.COLOR_BGR2RGB)

    return annotated_rgb
357
+
358
+
359
# =============================================================================
# Gradio Interface
# =============================================================================

# Live streaming interface: every webcam frame is pushed through
# process_frame and the annotated result is shown next to the input.
demo = gr.Interface(
    fn=process_frame,
    inputs=gr.Image(sources=["webcam"], streaming=True, label="Webcam Input"),
    outputs=gr.Image(label="Pupil Segmentation"),
    live=True,
    title="NSA Pupil Segmentation Demo",
    description="""
    Real-time pupil segmentation using Native Sparse Attention (NSA).

    This demo uses the NSAPupilSeg model from the VisionAssist project to detect
    and segment the pupil region in real-time from your webcam feed.

    **How it works:**
    1. MediaPipe Face Mesh detects your face and eye landmarks
    2. The left eye region is extracted and preprocessed
    3. The NSA model performs semantic segmentation to identify the pupil
    4. Results are overlaid on the video feed with a green highlight

    **Tips for best results:**
    - Ensure good lighting on your face
    - Look directly at the camera
    - Keep your face within the frame
    - Move closer if the eye region is too small

    **Model:** NSA-pico (Native Sparse Attention)
    """,
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()
best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9de8d19344d1567ba49dc011a0d149f557c734f26ed70beaaa033568c774b8f
3
+ size 253744
nsa/__init__.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
NSA (Native Sparse Attention) for Pupil Segmentation.

This module implements a Native Sparse Attention mechanism adapted from
DeepSeek's NSA paper for efficient pupil segmentation in eye images.

Key components:
- Token Compression: Coarse-grained global context
- Token Selection: Fine-grained important region focus
- Sliding Window: Local context for precise boundaries
- Gated Aggregation: Learned combination of all attention paths

Adapted for 2D vision tasks (segmentation) from the original 1D NLP formulation.
"""

# Re-export the public API from the implementation module.
from .model import (
    NSAPupilSeg,
    NSABlock,
    SpatialNSA,
    TokenCompression,
    TokenSelection,
    SlidingWindowAttention,
    CombinedLoss,
    create_nsa_pupil_seg,
)

# Explicit public surface for `from nsa import *`.
__all__ = [
    "NSAPupilSeg",
    "NSABlock",
    "SpatialNSA",
    "TokenCompression",
    "TokenSelection",
    "SlidingWindowAttention",
    "CombinedLoss",
    "create_nsa_pupil_seg",
]
nsa/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (978 Bytes). View file
 
nsa/__pycache__/model.cpython-313.pyc ADDED
Binary file (46.6 kB). View file
 
nsa/model.py ADDED
@@ -0,0 +1,1921 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Native Sparse Attention (NSA) Model for Pupil Segmentation.
3
+
4
+ Implementation based on DeepSeek's NSA paper:
5
+ "Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention"
6
+
7
+ Adapted for 2D vision/segmentation tasks with domain-specific optimizations for
8
+ pupil segmentation where:
9
+ - Intense pixel localization is required
10
+ - The pupil is only found on the eye (spatial locality)
11
+ - OpenEDS provides multi-class data beyond pupil
12
+
13
+ Architecture:
14
+ - Encoder with NSA blocks for hierarchical feature extraction
15
+ - Decoder with skip connections for precise segmentation
16
+ - NSA combines: Compression (global), Selection (important), Sliding Window (local)
17
+ """
18
+
19
+ import math
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+
25
+ # =============================================================================
26
+ # Core Building Blocks
27
+ # =============================================================================
28
+
29
+
30
class ConvBNReLU(nn.Module):
    """Conv2d -> BatchNorm2d -> activation, packaged as one reusable block.

    Despite the historical name, the activation is GELU (or identity when
    ``activation=False``); the conv defaults to a bias-free 3x3.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: int = 1,
        groups: int = 1,
        bias: bool = False,
        activation: bool = True,
    ):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=bias,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.GELU() if activation else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply convolution, then batch normalization, then the activation."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)
69
+
70
+
71
class PatchEmbedding(nn.Module):
    """
    Embed image patches into tokens for attention processing.

    Two stride-2 convolutions reduce spatial resolution 4x overall, which
    gives a smoother feature transition than one stride-4 projection.
    """

    def __init__(
        self,
        in_channels: int = 1,
        embed_dim: int = 32,
        patch_size: int = 4,
    ):
        super().__init__()
        self.patch_size = patch_size
        mid_dim = embed_dim // 2

        # Stage 1: halve resolution, lift channels to embed_dim // 2.
        self.conv1 = ConvBNReLU(
            in_channels, mid_dim, kernel_size=3, stride=2, padding=1
        )
        # Stage 2: halve again, reaching the full embedding width.
        self.conv2 = ConvBNReLU(
            mid_dim, embed_dim, kernel_size=3, stride=2, padding=1
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input image (B, C, H, W)
        Returns:
            Embedded patches (B, embed_dim, H//4, W//4)
        """
        return self.conv2(self.conv1(x))
115
+
116
+
117
+ # =============================================================================
118
+ # Token Compression Module
119
+ # =============================================================================
120
+
121
+
122
class TokenCompression(nn.Module):
    """
    Compress spatial blocks into single tokens for coarse-grained attention.

    From NSA paper Eq. 7:
        K_cmp = {φ(k_{id+1:id+l}) | 0 <= i <= ⌊(t-l)/d⌋}

    Adapted for 2D: overlapping ``block_size`` x ``block_size`` spatial
    blocks are flattened and projected to one token each by a learnable MLP.
    """

    def __init__(
        self,
        dim: int,
        block_size: int = 4,
        stride: int = 2,
    ):
        super().__init__()
        self.block_size = block_size
        self.stride = stride

        # Learnable compression MLPs, (dim * bs * bs) -> dim, one each for
        # keys and values.
        self.compress_k = nn.Sequential(
            nn.Linear(dim * block_size * block_size, dim * 2),
            nn.GELU(),
            nn.Linear(dim * 2, dim),
        )
        self.compress_v = nn.Sequential(
            nn.Linear(dim * block_size * block_size, dim * 2),
            nn.GELU(),
            nn.Linear(dim * 2, dim),
        )

        # Intra-block position encoding, shape (1, bs*bs, dim); the leading
        # singleton dim broadcasts over (B, n_blocks, ...) during addition.
        self.pos_embed = nn.Parameter(
            torch.randn(1, block_size * block_size, dim) * 0.02
        )

    def forward(
        self,
        k: torch.Tensor,
        v: torch.Tensor,
        spatial_size: tuple[int, int],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Compress keys and values into block-level representations.

        Args:
            k: Keys (B, N, dim) where N = H * W
            v: Values (B, N, dim)
            spatial_size: (H, W) tuple for non-square inputs
        Returns:
            k_cmp: Compressed keys (B, N_cmp, dim)
            v_cmp: Compressed values (B, N_cmp, dim), where
            N_cmp = ((H - bs) // stride + 1) * ((W - bs) // stride + 1)
        """
        B, N, dim = k.shape
        H, W = spatial_size
        bs = self.block_size
        stride = self.stride

        # Tokens -> 2D maps (B, dim, H, W); reshape (not view) because the
        # inputs may be non-contiguous.
        k_2d = k.reshape(B, H, W, dim).permute(0, 3, 1, 2).contiguous()
        v_2d = v.reshape(B, H, W, dim).permute(0, 3, 1, 2).contiguous()

        # Extract overlapping blocks: (B, dim*bs*bs, n_blocks).
        k_blocks = F.unfold(k_2d, kernel_size=bs, stride=stride)
        v_blocks = F.unfold(v_2d, kernel_size=bs, stride=stride)

        # Reshape for compression: (B, n_blocks, dim*bs*bs).
        n_blocks = k_blocks.shape[2]
        k_blocks = k_blocks.permute(0, 2, 1).contiguous()
        v_blocks = v_blocks.permute(0, 2, 1).contiguous()

        # Add intra-block position encoding to the keys before compression.
        # NOTE(review): values are compressed WITHOUT position encoding —
        # confirm this asymmetry is intended.
        k_blocks_pos = (
            k_blocks.reshape(B, n_blocks, bs * bs, dim) + self.pos_embed
        ).reshape(B, n_blocks, bs * bs * dim)

        # Project each block to a single token.
        k_cmp = self.compress_k(k_blocks_pos)
        v_cmp = self.compress_v(v_blocks)

        return k_cmp, v_cmp
275
+
276
+
277
+ # =============================================================================
278
+ # Token Selection Module
279
+ # =============================================================================
280
+
281
+
282
class TokenSelection(nn.Module):
    """
    Select important token blocks based on attention scores.

    From NSA paper Eq. 8-12:
    - Compute importance from compressed attention scores
    - Select top-n blocks for fine-grained attention

    For pupil segmentation: identifies the most relevant spatial regions.

    NOTE: the mapping from compressed (overlapping, strided) block indices
    back to the non-overlapping blocks gathered here is approximate —
    indices are clamped into range rather than remapped exactly.
    """

    def __init__(
        self,
        dim: int,
        block_size: int = 4,
        num_select: int = 4,
    ):
        super().__init__()
        self.block_size = block_size
        self.num_select = num_select
        self.dim = dim

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        attn_scores_cmp: torch.Tensor,
        spatial_size: tuple[int, int],
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Select important blocks based on compressed attention scores.

        Args:
            q: Queries (B, H, N, dim) — currently unused; kept for
               interface compatibility with the NSA formulation.
            k: Keys (B, N, dim)
            v: Values (B, N, dim)
            attn_scores_cmp: Attention from compression (B, H, N, N_cmp)
            spatial_size: (height, width) of feature map
        Returns:
            k_slc: Selected keys (B, num_select * bs * bs, dim)
            v_slc: Selected values (B, num_select * bs * bs, dim)
            indices: Selected block indices (B, num_select)
        """
        B, num_heads, N, N_cmp = attn_scores_cmp.shape
        H, W = spatial_size
        bs = self.block_size

        # Sum attention across heads for shared selection (GQA-style),
        # then average across queries to score each compressed block.
        importance = attn_scores_cmp.sum(dim=1)  # (B, N, N_cmp)
        block_importance = importance.mean(dim=1)  # (B, N_cmp)

        # Pick the top-n most attended blocks.
        num_select = min(self.num_select, N_cmp)
        _, indices = torch.topk(
            block_importance, num_select, dim=-1
        )  # (B, num_select)

        # Re-tile k/v into non-overlapping bs x bs blocks.
        k_2d = k.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        v_2d = v.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()

        # (B, dim*bs*bs, n_blocks) -> (B, n_blocks, bs*bs, dim)
        k_blocks = F.unfold(k_2d, kernel_size=bs, stride=bs)
        v_blocks = F.unfold(v_2d, kernel_size=bs, stride=bs)

        n_blocks = k_blocks.shape[2]
        k_blocks = (
            k_blocks.permute(0, 2, 1).contiguous().reshape(B, n_blocks, bs * bs, -1)
        )
        v_blocks = (
            v_blocks.permute(0, 2, 1).contiguous().reshape(B, n_blocks, bs * bs, -1)
        )

        # Compressed blocks can outnumber non-overlapping blocks (overlap
        # stride < bs), so clamp into the valid range instead of remapping.
        indices = indices.clamp(0, n_blocks - 1)

        # Gather the selected blocks: (B, num_select, bs*bs, dim).
        indices_expanded = (
            indices.unsqueeze(-1)
            .unsqueeze(-1)
            .expand(-1, -1, bs * bs, k.shape[-1])
        )
        k_slc = torch.gather(k_blocks, 1, indices_expanded)
        v_slc = torch.gather(v_blocks, 1, indices_expanded)

        # Flatten the block tokens into one sequence per batch element.
        k_slc = k_slc.view(B, num_select * bs * bs, -1)
        v_slc = v_slc.view(B, num_select * bs * bs, -1)

        return k_slc, v_slc, indices
440
+
441
+
442
+ # =============================================================================
443
+ # Sliding Window Attention
444
+ # =============================================================================
445
+
446
+
447
class SlidingWindowAttention(nn.Module):
    """
    Local sliding window attention for fine-grained local context.

    From NSA paper Section 3.3.3:
    Maintains recent tokens in a window for local pattern recognition.

    For pupil segmentation: critical for precise boundary delineation.

    Implementation note: this is non-overlapping window attention in the
    style of Swin Transformer — the feature map is partitioned into
    ``window_size x window_size`` tiles and full self-attention is computed
    independently inside each tile, with a learned relative position bias.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 2,
        window_size: int = 7,
        qkv_bias: bool = True,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5  # 1/sqrt(d_head) attention scaling

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        # Relative position bias: one learned scalar per head for each of the
        # (2*ws-1)^2 possible (dy, dx) offsets between two tokens in a window.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(
                (2 * window_size - 1) * (2 * window_size - 1),
                num_heads,
            )
        )
        nn.init.trunc_normal_(
            self.relative_position_bias_table,
            std=0.02,
        )

        # Create position index: maps each (query, key) pair inside a window
        # to a row of the bias table (standard Swin-style construction).
        coords_h = torch.arange(window_size)
        coords_w = torch.arange(window_size)
        coords = torch.stack(
            torch.meshgrid(coords_h, coords_w, indexing="ij")
        )
        coords_flatten = coords.flatten(1)  # (2, ws*ws)
        # Pairwise (dy, dx) offsets between all positions in the window.
        relative_coords = (
            coords_flatten[:, :, None] - coords_flatten[:, None, :]
        )
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        # Shift offsets to be non-negative, then fold (dy, dx) into a single
        # flat index into the bias table.
        relative_coords[:, :, 0] += window_size - 1
        relative_coords[:, :, 1] += window_size - 1
        relative_coords[:, :, 0] *= 2 * window_size - 1
        relative_position_index = relative_coords.sum(-1)
        # Buffer (not a parameter): moves with the module across devices but
        # receives no gradients.
        self.register_buffer(
            "relative_position_index",
            relative_position_index,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply sliding window attention.

        Args:
            x: Input features (B, C, H, W)
        Returns:
            Output features (B, C, H, W)
        """
        B, C, H, W = x.shape
        ws = self.window_size

        # Pad bottom/right so H and W become multiples of the window size.
        pad_h = (ws - H % ws) % ws
        pad_w = (ws - W % ws) % ws
        if pad_h > 0 or pad_w > 0:
            x = F.pad(x, (0, pad_w, 0, pad_h))

        _, _, Hp, Wp = x.shape

        # Reshape to windows: (B*num_windows, ws*ws, C)
        x = x.view(B, C, Hp // ws, ws, Wp // ws, ws)
        x = x.permute(0, 2, 4, 3, 5, 1).contiguous()
        x = x.view(-1, ws * ws, C)

        # Compute QKV
        B_win = x.shape[0]
        qkv = self.qkv(x).reshape(
            B_win, ws * ws, 3, self.num_heads, self.head_dim
        )
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B_win, heads, ws*ws, d_head)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Attention
        attn = (q @ k.transpose(-2, -1)) * self.scale

        # Add relative position bias (broadcast over all windows in the batch)
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(ws * ws, ws * ws, -1)
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1
        ).contiguous()
        attn = attn + relative_position_bias.unsqueeze(0)

        attn = attn.softmax(dim=-1)
        x = (
            (attn @ v)
            .transpose(1, 2)
            .reshape(B_win, ws * ws, C)
        )
        x = self.proj(x)

        # Reshape back to the padded spatial layout (inverse of partitioning).
        num_windows_h = Hp // ws
        num_windows_w = Wp // ws
        x = x.view(B, num_windows_h, num_windows_w, ws, ws, C)
        x = x.permute(0, 5, 1, 3, 2, 4).contiguous()
        x = x.view(B, C, Hp, Wp)

        # Remove padding
        if pad_h > 0 or pad_w > 0:
            x = x[:, :, :H, :W]

        return x
634
+
635
+
636
+ # =============================================================================
637
+ # Native Sparse Attention (NSA) - Core Module
638
+ # =============================================================================
639
+
640
+
641
class SpatialNSA(nn.Module):
    """
    Native Sparse Attention adapted for 2D spatial features.

    Combines three attention paths (NSA paper Eq. 5):
        o* = Σ g_c · Attn(q, K̃_c, Ṽ_c)  for c ∈ {cmp, slc, win}

    Components:
        1. Compressed Attention: Global coarse-grained context
        2. Selected Attention: Fine-grained important regions
        3. Sliding Window: Local context for precise boundaries
        4. Gated Aggregation: Learned combination
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 2,
        compress_block_size: int = 4,
        compress_stride: int = 2,
        select_block_size: int = 4,
        num_select: int = 4,
        window_size: int = 7,
        qkv_bias: bool = True,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5

        # Separate QKV for each branch (prevents shortcut learning)
        self.qkv_cmp = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.qkv_slc = nn.Linear(dim, dim * 3, bias=qkv_bias)

        # Token compression module (defined earlier in this file)
        self.compression = TokenCompression(
            dim=dim,
            block_size=compress_block_size,
            stride=compress_stride,
        )

        # Token selection module (defined earlier in this file)
        self.selection = TokenSelection(
            dim=dim,
            block_size=select_block_size,
            num_select=num_select,
        )

        # Sliding window attention (branch 3 runs directly on the 2D map)
        self.window_attn = SlidingWindowAttention(
            dim=dim,
            num_heads=num_heads,
            window_size=window_size,
            qkv_bias=qkv_bias,
        )

        # Output projections for the two sequence-form branches
        self.proj_cmp = nn.Linear(dim, dim)
        self.proj_slc = nn.Linear(dim, dim)

        # Gating mechanism (NSA paper Eq. 5): per-token sigmoid gate for each
        # branch. NOTE: gates are independent sigmoids, not softmax-normalized,
        # so branch contributions do not sum to 1 by construction.
        self.gate = nn.Sequential(
            nn.Linear(dim, dim // 4),
            nn.GELU(),
            nn.Linear(dim // 4, 3),
            nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply Native Sparse Attention.

        Args:
            x: Input features (B, C, H, W)
        Returns:
            Output features (B, C, H, W)
        """
        B, C, H, W = x.shape
        N = H * W

        # Reshape to sequence
        x_seq = x.flatten(2).transpose(1, 2)  # (B, N, C)

        # =================================================================
        # Branch 1: Compressed Attention (Global Coarse-Grained)
        # =================================================================
        qkv_cmp = self.qkv_cmp(x_seq)
        qkv_cmp = qkv_cmp.reshape(
            B, N, 3, self.num_heads, self.head_dim
        )
        qkv_cmp = qkv_cmp.permute(2, 0, 3, 1, 4)
        q_cmp, k_cmp_raw, v_cmp_raw = (
            qkv_cmp[0],
            qkv_cmp[1],
            qkv_cmp[2],
        )

        # Reshape k, v back to (B, N, C) so the compression module can pool
        # them over spatial blocks.
        k_for_cmp = k_cmp_raw.transpose(1, 2).reshape(B, N, C)
        v_for_cmp = v_cmp_raw.transpose(1, 2).reshape(B, N, C)

        # Compress tokens
        k_cmp, v_cmp = self.compression(
            k_for_cmp, v_for_cmp, (H, W)
        )
        N_cmp = k_cmp.shape[1]

        # Reshape for multi-head attention
        k_cmp = k_cmp.view(
            B, N_cmp, self.num_heads, self.head_dim
        ).transpose(1, 2)
        v_cmp = v_cmp.view(
            B, N_cmp, self.num_heads, self.head_dim
        ).transpose(1, 2)

        # Compute compressed attention: full queries attend to the (much
        # shorter) compressed key sequence.
        attn_cmp = (
            q_cmp @ k_cmp.transpose(-2, -1)
        ) * self.scale
        # Softmax is kept around (not just the output) because the selection
        # branch reuses these scores to rank spatial blocks.
        attn_cmp_softmax = attn_cmp.softmax(dim=-1)
        o_cmp = attn_cmp_softmax @ v_cmp
        o_cmp = o_cmp.transpose(1, 2).reshape(B, N, C)
        o_cmp = self.proj_cmp(o_cmp)

        # =================================================================
        # Branch 2: Selected Attention (Fine-Grained Important)
        # =================================================================
        qkv_slc = self.qkv_slc(x_seq)
        qkv_slc = qkv_slc.reshape(
            B, N, 3, self.num_heads, self.head_dim
        )
        qkv_slc = qkv_slc.permute(2, 0, 3, 1, 4)
        q_slc, k_slc_raw, v_slc_raw = (
            qkv_slc[0],
            qkv_slc[1],
            qkv_slc[2],
        )

        k_for_slc = k_slc_raw.transpose(1, 2).reshape(B, N, C)
        v_for_slc = v_slc_raw.transpose(1, 2).reshape(B, N, C)

        # Select important blocks based on compressed attention scores
        k_slc, v_slc, _ = self.selection(
            q_slc,
            k_for_slc,
            v_for_slc,
            attn_cmp_softmax,
            (H, W),
        )

        N_slc = k_slc.shape[1]
        k_slc = k_slc.view(
            B, N_slc, self.num_heads, self.head_dim
        ).transpose(1, 2)
        v_slc = v_slc.view(
            B, N_slc, self.num_heads, self.head_dim
        ).transpose(1, 2)

        # Compute selected attention (queries attend only to selected tokens)
        attn_slc = (
            q_slc @ k_slc.transpose(-2, -1)
        ) * self.scale
        attn_slc = attn_slc.softmax(dim=-1)
        o_slc = attn_slc @ v_slc
        o_slc = o_slc.transpose(1, 2).reshape(B, N, C)
        o_slc = self.proj_slc(o_slc)

        # =================================================================
        # Branch 3: Sliding Window Attention (Local Context)
        # =================================================================
        o_win = self.window_attn(x)
        o_win = o_win.flatten(2).transpose(1, 2)  # (B, N, C)

        # =================================================================
        # Gated Aggregation
        # =================================================================
        # Compute per-token gates
        gates = self.gate(x_seq)  # (B, N, 3)
        g_cmp = gates[:, :, 0:1]
        g_slc = gates[:, :, 1:2]
        g_win = gates[:, :, 2:3]

        # Weighted combination
        out = (
            g_cmp * o_cmp
            + g_slc * o_slc
            + g_win * o_win
        )

        # Reshape back to spatial
        out = out.transpose(1, 2).view(B, C, H, W)

        return out
901
+
902
+
903
+ # =============================================================================
904
+ # NSA Block (Attention + FFN)
905
+ # =============================================================================
906
+
907
+
908
class NSABlock(nn.Module):
    """
    Complete NSA block with attention, normalization, and FFN.

    Three residual sub-layers, applied in order:
      1. depthwise 3x3 conv for local features (like EfficientViT),
      2. Native Sparse Attention for global/selective features,
      3. LayerNorm + MLP (FFN) for channel mixing on the token sequence.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 2,
        mlp_ratio: float = 2.0,
        compress_block_size: int = 4,
        compress_stride: int = 2,
        select_block_size: int = 4,
        num_select: int = 4,
        window_size: int = 7,
    ):
        super().__init__()

        # Sub-layer 1: local feature extraction (depthwise conv).
        self.norm1 = nn.BatchNorm2d(dim)
        self.dw_conv = nn.Conv2d(
            dim, dim, kernel_size=3, padding=1, groups=dim
        )

        # Sub-layer 2: NSA attention.
        self.norm2 = nn.BatchNorm2d(dim)
        self.nsa = SpatialNSA(
            dim=dim,
            num_heads=num_heads,
            compress_block_size=compress_block_size,
            compress_stride=compress_stride,
            select_block_size=select_block_size,
            num_select=num_select,
            window_size=window_size,
        )

        # Sub-layer 3: FFN over the flattened token sequence.
        self.norm3 = nn.LayerNorm(dim)
        hidden_dim = int(dim * mlp_ratio)
        self.ffn = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input features (B, C, H, W)
        Returns:
            Output features (B, C, H, W)
        """
        # Residual depthwise conv, then residual sparse attention.
        x = x + self.dw_conv(self.norm1(x))
        x = x + self.nsa(self.norm2(x))

        # FFN operates on tokens, so round-trip through (B, N, C).
        batch, channels, height, width = x.shape
        tokens = x.flatten(2).transpose(1, 2)  # (B, N, C)
        tokens = tokens + self.ffn(self.norm3(tokens))
        return tokens.transpose(1, 2).view(batch, channels, height, width)
994
+
995
+
996
+ # =============================================================================
997
+ # NSA Stage (Multiple Blocks + Optional Downsampling)
998
+ # =============================================================================
999
+
1000
+
1001
class NSAStage(nn.Module):
    """
    Stage containing multiple NSA blocks with optional downsampling.

    The stage first adapts the channel count / resolution (strided conv when
    ``downsample`` is True, a 1x1 projection when only the width changes,
    identity otherwise), then runs ``depth`` NSABlocks at ``out_dim``.
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        depth: int = 1,
        num_heads: int = 2,
        mlp_ratio: float = 2.0,
        compress_block_size: int = 4,
        compress_stride: int = 2,
        select_block_size: int = 4,
        num_select: int = 4,
        window_size: int = 7,
        downsample: bool = True,
    ):
        super().__init__()

        # Input adapter: strided conv halves resolution; a 1x1 conv only
        # changes channels; None means pass-through.
        if downsample:
            self.downsample = nn.Sequential(
                ConvBNReLU(
                    in_dim,
                    out_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                ),
            )
        elif in_dim != out_dim:
            self.downsample = ConvBNReLU(
                in_dim,
                out_dim,
                kernel_size=1,
                stride=1,
                padding=0,
            )
        else:
            self.downsample = None

        # NSA blocks, all operating at out_dim channels.
        self.blocks = nn.ModuleList(
            NSABlock(
                dim=out_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                compress_block_size=compress_block_size,
                compress_stride=compress_stride,
                select_block_size=select_block_size,
                num_select=num_select,
                window_size=window_size,
            )
            for _ in range(depth)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.downsample is not None:
            x = self.downsample(x)
        for blk in self.blocks:
            x = blk(x)
        return x
1072
+
1073
+
1074
+ # =============================================================================
1075
+ # NSA Encoder
1076
+ # =============================================================================
1077
+
1078
+
1079
class NSAEncoder(nn.Module):
    """
    NSA-based encoder for hierarchical feature extraction.
    Produces multi-scale features for segmentation decoder.

    Layout: patch embedding (1/4 resolution), then three NSAStages named
    ``stage1``..``stage3``. Stage 1 keeps resolution; stages 2 and 3 each
    downsample 2x, yielding 1/4, 1/8 and 1/16 feature maps.
    """

    def __init__(
        self,
        in_channels: int = 1,
        embed_dims: tuple = (32, 64, 96),
        depths: tuple = (1, 1, 1),
        num_heads: tuple = (2, 2, 4),
        mlp_ratios: tuple = (2, 2, 2),
        compress_block_sizes: tuple = (4, 4, 4),
        compress_strides: tuple = (2, 2, 2),
        select_block_sizes: tuple = (4, 4, 4),
        num_selects: tuple = (4, 4, 4),
        window_sizes: tuple = (7, 7, 7),
    ):
        super().__init__()

        # Patch embedding (already downsamples to 1/4 resolution).
        self.patch_embed = PatchEmbedding(
            in_channels=in_channels,
            embed_dim=embed_dims[0],
        )

        # Input dims per stage: stage1 keeps embed_dims[0]; later stages take
        # the previous stage's output width. Only stages 2+ downsample.
        stage_in_dims = (embed_dims[0],) + tuple(embed_dims[:2])
        for idx, (in_dim, out_dim) in enumerate(zip(stage_in_dims, embed_dims)):
            stage = NSAStage(
                in_dim=in_dim,
                out_dim=out_dim,
                depth=depths[idx],
                num_heads=num_heads[idx],
                mlp_ratio=mlp_ratios[idx],
                compress_block_size=compress_block_sizes[idx],
                compress_stride=compress_strides[idx],
                select_block_size=select_block_sizes[idx],
                num_select=num_selects[idx],
                window_size=window_sizes[idx],
                downsample=idx > 0,
            )
            # Keep the historical attribute names (stage1/stage2/stage3) so
            # state dicts and external references stay compatible.
            setattr(self, f"stage{idx + 1}", stage)

    def forward(self, x: torch.Tensor) -> tuple:
        """
        Args:
            x: Input image (B, C, H, W)
        Returns:
            Multi-scale features (f1, f2, f3)
        """
        x = self.patch_embed(x)
        f1 = self.stage1(x)   # 1/4 resolution
        f2 = self.stage2(f1)  # 1/8 resolution
        f3 = self.stage3(f2)  # 1/16 resolution
        return f1, f2, f3
1207
+
1208
+
1209
+ # =============================================================================
1210
+ # Segmentation Decoder
1211
+ # =============================================================================
1212
+
1213
+
1214
class SegmentationDecoder(nn.Module):
    """
    FPN-style decoder with skip connections for precise segmentation.
    Progressively upsamples features to input resolution.

    Each encoder scale gets a 1x1 lateral projection to ``decoder_dim``;
    coarser levels are bilinearly upsampled and summed into finer ones,
    each fusion followed by a depthwise "smooth" conv. A final 1x1 head
    produces per-class logits, upsampled to the requested output size.
    """

    def __init__(
        self,
        encoder_dims: tuple = (32, 64, 96),
        decoder_dim: int = 32,
        num_classes: int = 2,
    ):
        super().__init__()

        def _lateral(in_dim: int) -> nn.Module:
            # 1x1 projection onto the shared decoder width.
            return nn.Conv2d(in_dim, decoder_dim, kernel_size=1)

        def _smooth() -> nn.Module:
            # Depthwise 3x3 conv + BN + GELU to blend fused features.
            return nn.Sequential(
                nn.Conv2d(
                    decoder_dim,
                    decoder_dim,
                    kernel_size=3,
                    padding=1,
                    groups=decoder_dim,
                ),
                nn.BatchNorm2d(decoder_dim),
                nn.GELU(),
            )

        # Lateral connections (coarsest first, matching the top-down pass).
        self.lateral3 = _lateral(encoder_dims[2])
        self.lateral2 = _lateral(encoder_dims[1])
        self.lateral1 = _lateral(encoder_dims[0])

        # Smoothing convolutions, one per pyramid level.
        self.smooth3 = _smooth()
        self.smooth2 = _smooth()
        self.smooth1 = _smooth()

        # Segmentation head
        self.head = nn.Conv2d(decoder_dim, num_classes, kernel_size=1)

    def forward(
        self,
        f1: torch.Tensor,
        f2: torch.Tensor,
        f3: torch.Tensor,
        target_size: tuple,
    ) -> torch.Tensor:
        """
        Args:
            f1, f2, f3: Multi-scale encoder features
            target_size: (H, W) of output
        Returns:
            Segmentation logits (B, num_classes, H, W)
        """

        def _up_to(src: torch.Tensor, ref: torch.Tensor) -> torch.Tensor:
            # Bilinear upsample `src` to `ref`'s spatial size.
            return F.interpolate(
                src,
                size=ref.shape[2:],
                mode="bilinear",
                align_corners=False,
            )

        # Top-down path with lateral connections.
        p3 = self.smooth3(self.lateral3(f3))
        p2 = self.smooth2(self.lateral2(f2) + _up_to(p3, f2))
        p1 = self.smooth1(self.lateral1(f1) + _up_to(p2, f1))

        # Per-class logits at full target resolution.
        logits = self.head(p1)
        return F.interpolate(
            logits,
            size=target_size,
            mode="bilinear",
            align_corners=False,
        )
1339
+
1340
+
1341
+ # =============================================================================
1342
+ # Complete NSA Pupil Segmentation Model
1343
+ # =============================================================================
1344
+
1345
+
1346
class NSAPupilSeg(nn.Module):
    """
    Native Sparse Attention model for Pupil Segmentation.

    Architecture:
    - NSA Encoder: Hierarchical feature extraction with sparse attention
    - FPN Decoder: Multi-scale feature fusion for precise segmentation

    Key NSA components for pupil segmentation:
    - Compression: Captures global eye context (is this an eye? rough pupil location)
    - Selection: Focuses on pupil region with fine-grained attention
    - Sliding Window: Precise local boundaries for pixel-accurate segmentation
    """

    def __init__(
        self,
        in_channels: int = 1,
        num_classes: int = 2,
        embed_dims: tuple = (32, 64, 96),
        depths: tuple = (1, 1, 1),
        num_heads: tuple = (2, 2, 4),
        mlp_ratios: tuple = (2, 2, 2),
        compress_block_sizes: tuple = (4, 4, 4),
        compress_strides: tuple = (2, 2, 2),
        select_block_sizes: tuple = (4, 4, 4),
        num_selects: tuple = (4, 4, 4),
        window_sizes: tuple = (7, 7, 7),
        decoder_dim: int = 32,
    ):
        super().__init__()

        self.encoder = NSAEncoder(
            in_channels=in_channels,
            embed_dims=embed_dims,
            depths=depths,
            num_heads=num_heads,
            mlp_ratios=mlp_ratios,
            compress_block_sizes=compress_block_sizes,
            compress_strides=compress_strides,
            select_block_sizes=select_block_sizes,
            num_selects=num_selects,
            window_sizes=window_sizes,
        )

        self.decoder = SegmentationDecoder(
            encoder_dims=embed_dims,
            decoder_dim=decoder_dim,
            num_classes=num_classes,
        )

        self._initialize_weights()

    def _initialize_weights(self) -> None:
        """Initialize model weights (Kaiming for convs, trunc-normal for linears)."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(
                    module.weight,
                    mode="fan_out",
                    nonlinearity="relu",
                )
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, (nn.BatchNorm2d, nn.LayerNorm)):
                # Affine norms start as identity transforms.
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.trunc_normal_(module.weight, std=0.02)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Input image (B, C, H, W)
        Returns:
            Segmentation logits (B, num_classes, H, W)
        """
        # Decoder upsamples back to the original input resolution.
        target_size = (x.shape[2], x.shape[3])
        f1, f2, f3 = self.encoder(x)
        return self.decoder(f1, f2, f3, target_size)
1468
+
1469
+
1470
+ # =============================================================================
1471
+ # Loss Function (same as src/ for compatibility)
1472
+ # =============================================================================
1473
+
1474
+
1475
def focal_surface_loss(
    probs: torch.Tensor,
    dist_map: torch.Tensor,
    gamma: float = 2.0,
) -> torch.Tensor:
    """Surface loss with focal weighting for hard boundary pixels.

    Pixels the model is already confident about (probs near 1) receive a
    near-zero weight via the (1 - p)^gamma factor, concentrating the
    distance-map penalty on uncertain boundary pixels.

    Args:
        probs: Predicted probabilities (B, C, H, W)
        dist_map: Distance transform (B, 2, H, W)
        gamma: Focal weighting exponent

    Returns:
        Focal-weighted surface loss scalar
    """
    weighted = ((1.0 - probs) ** gamma) * probs * dist_map
    # Average over spatial positions, then classes, then the batch.
    per_class = weighted.flatten(start_dim=2).mean(dim=2)
    return per_class.mean(dim=1).mean()
1498
+
1499
+
1500
def boundary_dice_loss(
    probs: torch.Tensor,
    target: torch.Tensor,
    kernel_size: int = 3,
    epsilon: float = 1e-5,
) -> torch.Tensor:
    """Dice loss computed only on boundary pixels.

    Args:
        probs: Predicted probabilities (B, C, H, W)
        target: Ground truth labels (B, H, W)
        kernel_size: Size of kernel for boundary extraction
        epsilon: Small constant for numerical stability

    Returns:
        Boundary dice loss scalar
    """
    # Morphological gradient (dilation minus erosion) marks the boundary
    # band of the ground-truth mask. Erosion is implemented as a max-pool
    # of the negated mask.
    mask = target.float().unsqueeze(1)
    pad = kernel_size // 2
    dilation = F.max_pool2d(mask, kernel_size, stride=1, padding=pad)
    erosion = -F.max_pool2d(-mask, kernel_size, stride=1, padding=pad)
    boundary = (dilation - erosion).squeeze(1)  # (B, H, W)

    # Restrict both prediction (pupil class, index 1) and target to the
    # boundary band before computing Dice.
    pred_boundary = probs[:, 1] * boundary
    true_boundary = target.float() * boundary

    inter = (pred_boundary * true_boundary).sum(dim=(1, 2))
    denom = pred_boundary.sum(dim=(1, 2)) + true_boundary.sum(dim=(1, 2))

    dice = (2.0 * inter + epsilon) / (denom + epsilon)
    return (1.0 - dice).mean()
1550
+
1551
+
1552
class CombinedLoss(nn.Module):
    """
    Combined loss for pupil segmentation:
    - Weighted Cross Entropy: Handles class imbalance
    - Dice Loss: Better for small regions like pupils
    - Focal Surface Loss: Boundary-aware optimization with focal weighting
    - Boundary Dice Loss: Explicit optimization for edge pixels
    """

    def __init__(
        self,
        epsilon: float = 1e-5,
        focal_gamma: float = 2.0,
        boundary_weight: float = 0.3,
        boundary_kernel_size: int = 3,
    ):
        super().__init__()
        # epsilon: numerical-stability floor used by the dice terms
        self.epsilon = epsilon
        # focal_gamma: exponent forwarded to focal_surface_loss
        self.focal_gamma = focal_gamma
        # boundary_weight: fixed weight of the boundary-dice term
        self.boundary_weight = boundary_weight
        # boundary_kernel_size: morphological kernel for boundary extraction
        self.boundary_kernel_size = boundary_kernel_size
        # reduction="none" keeps the per-pixel CE map so spatial weights can
        # be applied before averaging.
        self.nll = nn.NLLLoss(reduction="none")

    def forward(
        self,
        logits: torch.Tensor,
        target: torch.Tensor,
        spatial_weights: torch.Tensor,
        dist_map: torch.Tensor,
        alpha: float,
        eye_weight: torch.Tensor = None,  # optional; None disables eye weighting
    ) -> tuple:
        """
        Args:
            logits: Model output (B, C, H, W)
            target: Ground truth (B, H, W)
            spatial_weights: Spatial weighting map (B, H, W)
            dist_map: Distance map for surface loss (B, 2, H, W)
            alpha: Balance between dice and surface loss
            eye_weight: Soft distance weighting from eye region (B, H, W)
        Returns:
            (total_loss, ce_loss, dice_loss, surface_loss, boundary_loss)
        """
        probs = F.softmax(logits, dim=1)
        log_probs = F.log_softmax(logits, dim=1)

        # Weighted Cross Entropy (per-pixel, reduced after weighting)
        ce_loss = self.nll(log_probs, target)
        # Apply spatial weights and optional eye weight; the +1.0 keeps a
        # baseline contribution for pixels with zero spatial weight.
        weight_factor = 1.0 + spatial_weights
        if eye_weight is not None:
            weight_factor = weight_factor * eye_weight
        weighted_ce = (ce_loss * weight_factor).mean()

        # Dice Loss (generalized: per-class weights ~ 1 / |class|^2, which
        # boosts the small pupil class relative to background)
        target_onehot = (
            F.one_hot(target, num_classes=2)
            .permute(0, 3, 1, 2)
            .float()
        )
        probs_flat = probs.flatten(start_dim=2)
        target_flat = target_onehot.flatten(start_dim=2)

        intersection = (probs_flat * target_flat).sum(dim=2)
        cardinality = (probs_flat + target_flat).sum(dim=2)
        class_weights = 1.0 / (
            target_flat.sum(dim=2) ** 2
        ).clamp(min=self.epsilon)

        dice = (
            2.0
            * (class_weights * intersection).sum(dim=1)
            / (class_weights * cardinality).sum(dim=1)
        )
        dice_loss = (
            1.0 - dice.clamp(min=self.epsilon)
        ).mean()

        # Focal Surface Loss (replaces standard surface loss)
        surface_loss = focal_surface_loss(
            probs,
            dist_map,
            gamma=self.focal_gamma,
        )

        # Boundary Dice Loss
        bdice_loss = boundary_dice_loss(
            probs,
            target,
            kernel_size=self.boundary_kernel_size,
            epsilon=self.epsilon,
        )

        # Total loss with updated weighting.
        # Use max(1 - alpha, 0.2) so the surface term never vanishes even
        # when alpha (the dice weight) approaches 1 during training.
        surface_weight = max(1.0 - alpha, 0.2)
        total_loss = (
            weighted_ce
            + alpha * dice_loss
            + surface_weight * surface_loss
            + self.boundary_weight * bdice_loss
        )

        return (
            total_loss,
            weighted_ce,
            dice_loss,
            surface_loss,
            bdice_loss,
        )
1691
+
1692
+
1693
+ # =============================================================================
1694
+ # Factory function for easy model creation
1695
+ # =============================================================================
1696
+
1697
+
1698
def create_nsa_pupil_seg(
    size: str = "small",
    in_channels: int = 1,
    num_classes: int = 2,
) -> NSAPupilSeg:
    """
    Create NSA Pupil Segmentation model with predefined configurations.

    Args:
        size: Model size ('pico', 'nano', 'tiny', 'small', 'medium')
        in_channels: Number of input channels
        num_classes: Number of output classes
    Returns:
        Configured NSAPupilSeg model
    Raises:
        ValueError: If ``size`` is not one of the predefined names.
    """
    configs = {
        "pico": dict(
            embed_dims=(4, 4, 4),
            depths=(1, 1, 1),
            num_heads=(1, 1, 1),
            mlp_ratios=(1.0, 1.0, 1.0),
            compress_block_sizes=(4, 4, 4),
            compress_strides=(4, 4, 4),
            select_block_sizes=(4, 4, 4),
            num_selects=(1, 1, 1),
            window_sizes=(3, 3, 3),
            decoder_dim=4,
        ),
        "nano": dict(
            embed_dims=(4, 8, 12),
            depths=(1, 1, 1),
            num_heads=(1, 1, 1),
            mlp_ratios=(1.0, 1.0, 1.0),
            compress_block_sizes=(4, 4, 4),
            compress_strides=(4, 4, 4),
            select_block_sizes=(4, 4, 4),
            num_selects=(1, 1, 1),
            window_sizes=(3, 3, 3),
            decoder_dim=4,
        ),
        "tiny": dict(
            embed_dims=(8, 12, 16),
            depths=(1, 1, 1),
            num_heads=(1, 1, 1),
            mlp_ratios=(1.5, 1.5, 1.5),
            compress_block_sizes=(4, 4, 4),
            compress_strides=(4, 4, 4),
            select_block_sizes=(4, 4, 4),
            num_selects=(1, 1, 1),
            window_sizes=(3, 3, 3),
            decoder_dim=8,
        ),
        "small": dict(
            embed_dims=(12, 24, 32),
            depths=(1, 1, 1),
            num_heads=(1, 1, 2),
            mlp_ratios=(1.5, 1.5, 1.5),
            compress_block_sizes=(4, 4, 4),
            compress_strides=(4, 4, 4),
            select_block_sizes=(4, 4, 4),
            num_selects=(1, 1, 1),
            window_sizes=(3, 3, 3),
            decoder_dim=12,
        ),
        "medium": dict(
            embed_dims=(16, 32, 48),
            depths=(1, 1, 1),
            num_heads=(1, 2, 2),
            mlp_ratios=(1.5, 1.5, 1.5),
            compress_block_sizes=(4, 4, 4),
            # Medium uses a denser compression stride and more selected
            # blocks than the smaller presets.
            compress_strides=(3, 3, 3),
            select_block_sizes=(4, 4, 4),
            num_selects=(2, 2, 2),
            window_sizes=(3, 3, 3),
            decoder_dim=16,
        ),
    }

    if size not in configs:
        raise ValueError(
            f"Unknown size: {size}. Choose from {list(configs.keys())}"
        )

    return NSAPupilSeg(
        in_channels=in_channels,
        num_classes=num_classes,
        **configs[size],
    )
1866
+
1867
+
1868
+ # =============================================================================
1869
+ # Testing / Verification
1870
+ # =============================================================================
1871
+
1872
+
1873
if __name__ == "__main__":
    # Smoke-test: build every predefined model size and run one forward pass
    # on a dummy OpenEDS-shaped batch, printing parameter counts and shapes.
    print("Testing NSA Pupil Segmentation Model")
    print("=" * 60)

    for size in ["pico", "nano", "tiny", "small", "medium"]:
        model = create_nsa_pupil_seg(size=size)

        # Total trainable + non-trainable parameter count.
        n_params = sum(p.numel() for p in model.parameters())

        # Dummy batch at the OpenEDS image size.
        x = torch.randn(2, 1, 400, 640)

        model.eval()
        with torch.no_grad():
            out = model(x)

        print(f"\n{size.upper()} Model:")
        print(f" Parameters: {n_params:,}")
        print(f" Input shape: {x.shape}")
        print(f" Output shape: {out.shape}")

    print("\n" + "=" * 60)
    print("All tests passed!")
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ libgl1-mesa-glx
2
+ libglib2.0-0
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ numpy>=1.21.0
3
+ opencv-python-headless>=4.5.0
4
+ mediapipe>=0.10.21
5
+ gradio==6.1.0
6
+ Pillow>=8.3.0