|
|
import torch |
|
|
import torch.nn as nn |
|
|
from transformers import DistilBertModel, DistilBertConfig |
|
|
from diffusers import UNet2DConditionModel |
|
|
|
|
|
class VideoJEPA(nn.Module):
    """Joint video/text embedding model with a transformer fusion stage.

    Pipeline implemented by :meth:`forward`:

    1. A small 3D-convolutional encoder turns the video into one
       ``128 x 8 x 8`` feature map per frame, which is projected to
       ``video_dim`` and then to ``latent_dim``.
    2. A pretrained DistilBERT encodes the text; the hidden state of the first
       token is projected to ``latent_dim``.
    3. The per-frame video tokens and the single text token are concatenated
       along the sequence axis and passed through a transformer encoder.

    ``diffusion_decoder`` is instantiated here but is not called by
    :meth:`forward`; it is kept so that external code relying on the attribute
    keeps working.

    Args:
        text_dim: Width of the text encoder's hidden states. It must match the
            hidden size of the pretrained text encoder (768 for
            ``distilbert-base-uncased``).
        video_dim: Width of the intermediate per-frame video embedding.
        latent_dim: Width of the shared latent space used by the fusion
            transformer. Must be divisible by the number of attention heads
            (8).
    """

    def __init__(self, text_dim=768, video_dim=512, latent_dim=1024):
        super().__init__()

        # --- Video branch ---------------------------------------------------
        self.video_encoder = self._build_video_encoder()
        self.video_proj = nn.Linear(128 * 8 * 8, video_dim)
        # The fusion transformer works in ``latent_dim``; without this layer
        # the video tokens (``video_dim`` wide) could not be concatenated with
        # the text token (``latent_dim`` wide).
        if video_dim == latent_dim:
            self.video_latent_proj = nn.Identity()
        else:
            self.video_latent_proj = nn.Linear(video_dim, latent_dim)

        # --- Text branch ----------------------------------------------------
        self.text_encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_proj = nn.Linear(text_dim, latent_dim)

        # --- Fusion ---------------------------------------------------------
        self.fusion_transformer = self._build_fusion_transformer(latent_dim)

        # --- Decoder (not used by ``forward``) --------------------------------
        self.diffusion_decoder = UNet2DConditionModel(
            sample_size=64,
            in_channels=3,
            out_channels=3,
            cross_attention_dim=latent_dim,
        )

    @staticmethod
    def _build_video_encoder():
        """Build the 3D-conv encoder mapping (B, 3, T, H, W) to (B, 128, T, 8, 8)."""
        return nn.Sequential(
            # Temporal padding of 1 with a temporal kernel of 3 keeps the
            # number of frames unchanged, so the model emits one token per
            # input frame.
            nn.Conv3d(3, 64, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 0, 0)),
            nn.ReLU(),
            nn.MaxPool3d((1, 2, 2)),
            nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 0, 0)),
            nn.ReLU(),
            # ``None`` leaves the temporal axis untouched; spatial axes are
            # pooled to a fixed 8 x 8 grid regardless of input resolution.
            nn.AdaptiveAvgPool3d((None, 8, 8)),
        )

    @staticmethod
    def _build_fusion_transformer(latent_dim):
        """Build the 4-layer, 8-head transformer encoder used for fusion."""
        # ``batch_first=True`` because ``forward`` feeds (B, tokens, latent_dim);
        # with the default layout attention would run across the batch axis.
        layer = nn.TransformerEncoderLayer(d_model=latent_dim, nhead=8, batch_first=True)
        return nn.TransformerEncoder(layer, num_layers=4)

    def forward(self, video, text_input):
        """Encode a video clip and its text, and return the fused token sequence.

        Args:
            video: Tensor of shape ``(B, 3, T, H, W)``.
            text_input: Mapping of keyword arguments forwarded unchanged to the
                text encoder (e.g. tokenizer output).

        Returns:
            Tensor of shape ``(B, T + 1, latent_dim)``: one token per video
            frame followed by one text token, after the fusion transformer.

        Raises:
            ValueError: If ``video`` is not a 5-D tensor.
        """
        if video.dim() != 5:
            raise ValueError(
                f"video must have shape (B, C, T, H, W); got {tuple(video.shape)}"
            )

        # (B, 128, T, 8, 8) -> (B, T, 128, 8, 8) -> (B, T, 128 * 8 * 8).
        # The frame count is taken from the encoder output rather than from
        # the input so the reshape can never disagree with the features.
        frame_maps = self.video_encoder(video).permute(0, 2, 1, 3, 4)
        frame_feats = frame_maps.flatten(start_dim=2)
        video_emb = self.video_latent_proj(self.video_proj(frame_feats))

        # Use the hidden state of the first token as the sentence summary.
        text_hidden = self.text_encoder(**text_input).last_hidden_state
        text_emb = self.text_proj(text_hidden[:, 0])

        # Append the text token after the frame tokens along the sequence axis.
        fused_emb = torch.cat([video_emb, text_emb.unsqueeze(1)], dim=1)
        return self.fusion_transformer(fused_emb)