Spaces:
Paused
Paused
| from __future__ import annotations | |
| import torch | |
| import torch.nn as nn | |
class LipFDNet(nn.Module):
    """Minimal LipFD-compatible network wrapper for Space inference.

    The hosted checkpoint is loaded into this module by modules.m1_lipsync.
    The forward signature follows the app contract: visual lip crops plus an
    audio mel spectrogram produce one logit per input image.

    Architecture, as built in ``__init__``:

    * ``visual``: two stride-2 3x3 convolutions (3 -> 16 -> 32 channels) with
      ReLU, global average pooling and flatten, giving 32 features per image.
    * ``audio``: a single ``Linear(1, 16)`` + ReLU applied to one scalar
      summarising the audio input.
    * ``classifier``: ``Linear(48, 32)`` + ReLU + ``Linear(32, 1)`` over the
      concatenated 32 visual + 16 audio features.

    The submodule names and layer order define the ``state_dict`` keys, so
    they must stay unchanged for checkpoints to load.
    """

    def __init__(self):
        super().__init__()
        # Image branch: downsample twice, then pool each of the 32 channels
        # to a single value so any spatial size maps to a 32-dim vector.
        self.visual = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
        )
        # Audio branch: embeds a single scalar (the mean audio level).
        self.audio = nn.Sequential(
            nn.Linear(1, 16),
            nn.ReLU(),
        )
        # Head over the 32 visual + 16 audio features.
        self.classifier = nn.Sequential(
            nn.Linear(48, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, frames: torch.Tensor, audio: torch.Tensor) -> torch.Tensor:
        """Compute one logit per image in ``frames``.

        Args:
            frames: Image tensor with 3 channels. A 3-D input is treated as a
                single image and given a leading dimension of 1; otherwise it
                is passed to the convolutional stack as-is (presumably
                ``(N, 3, H, W)`` - confirm against the caller).
            audio: Audio tensor of any shape and numeric dtype. It is reduced
                to a single mean over ALL of its elements, and that one scalar
                is shared by every image in ``frames``.

        Returns:
            A 1-D tensor of logits whose length is the leading dimension of
            ``frames`` (length 1 for a single 3-D image).
        """
        if frames.ndim == 3:
            frames = frames.unsqueeze(0)
        visual_feat = self.visual(frames)

        # Average in float32 (also handles integer inputs), then align the
        # scalar with the visual features' dtype/device so the audio Linear
        # layer also works when the module has been cast (e.g. .half() or
        # .double()) or when the audio tensor lives on another device.
        audio_level = audio.float().mean().to(
            dtype=visual_feat.dtype, device=visual_feat.device
        )
        # Broadcast the single scalar to one row per image (no copy).
        audio_level = audio_level.reshape(1, 1).expand(visual_feat.size(0), 1)
        audio_feat = self.audio(audio_level)

        fused = torch.cat([visual_feat, audio_feat], dim=-1)
        return self.classifier(fused).squeeze(-1)