import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models


class VideoEncoder(nn.Module):
    """Encode a batch of video clips into L2-normalized fixed-size embeddings.

    Per-frame features come from a ResNet-18 backbone with the classifier
    head removed, are projected down to ``embed_dim`` by a small MLP, and
    are then mean-pooled over the temporal axis.
    """

    def __init__(self, embed_dim: int = 128, pretrained: bool = True):
        """
        Args:
            embed_dim: Dimensionality of the output clip embedding.
            pretrained: If True, initialize the backbone with the default
                ImageNet weights; otherwise start from random weights.
        """
        super().__init__()
        # Modern torchvision weights API (the old `pretrained=` flag is deprecated).
        weights = models.ResNet18_Weights.DEFAULT if pretrained else None
        backbone = models.resnet18(weights=weights)
        # Read the feature width off the backbone instead of hard-coding 512,
        # so swapping the backbone only requires changing one line above.
        self.feature_dim = backbone.fc.in_features
        # Keep everything up to (and including) the global average pool;
        # drop only the final fully-connected classification layer.
        self.feature_extractor = nn.Sequential(*list(backbone.children())[:-1])
        self.frame_projection = nn.Sequential(
            nn.Linear(self.feature_dim, 256),
            nn.ReLU(),
            nn.Linear(256, embed_dim),
        )
        # Averages over the temporal axis (output length 1 == mean over T).
        self.temporal_pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode a batch of clips.

        Args:
            x: Video tensor of shape ``(B, T, 3, H, W)``.

        Returns:
            L2-normalized embeddings of shape ``(B, embed_dim)``.
        """
        B, T, C, H, W = x.shape
        # Fold time into the batch so the 2D CNN sees ordinary images.
        # `reshape` (not `view`) also accepts non-contiguous inputs,
        # e.g. a tensor the caller produced via permute().
        frames = x.reshape(B * T, C, H, W)
        feats = self.feature_extractor(frames)           # (B*T, feat, 1, 1)
        feats = feats.reshape(B, T, self.feature_dim)    # (B, T, feat)
        feats = self.frame_projection(feats)             # (B, T, embed_dim)
        feats = feats.permute(0, 2, 1)                   # (B, embed_dim, T)
        pooled = self.temporal_pool(feats).squeeze(-1)   # (B, embed_dim)
        # Unit-norm embeddings make cosine similarity a plain dot product.
        return F.normalize(pooled, dim=1)