| # My CLIP Video-Text Model | |
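| This model was trained on the MSR-VTT dataset using a custom CLIP-based architecture. Its video projection head (`video_proj`) is a three-layer MLP with ReLU activations that maps 512-dimensional inputs to 2048 dimensions and back to 512: | |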
| self.video_proj = nn.Sequential( | |
| nn.Linear(512, 2048), | |
| nn.ReLU(), | |
| nn.Linear(2048, 2048), | |
| nn.ReLU(), | |
| nn.Linear(2048, 512) | |
| ) |
| # My CLIP Video-Text Model | |
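| This model was trained on the MSR-VTT dataset using a custom CLIP-based architecture. Its video projection head (`video_proj`) is a three-layer MLP with ReLU activations that maps 512-dimensional inputs to 2048 dimensions and back to 512: | |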
| self.video_proj = nn.Sequential( | |
| nn.Linear(512, 2048), | |
| nn.ReLU(), | |
| nn.Linear(2048, 2048), | |
| nn.ReLU(), | |
| nn.Linear(2048, 512) | |
| ) |