Upload tsm-using-ucf101.ipynb
#28
by DHafez - opened
- tsm-using-ucf101.ipynb +1 -0
tsm-using-ucf101.ipynb
ADDED
@@ -0,0 +1 @@
+
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# ==========================================\n# 1. Install Dependencies & Download UCF101\n# ==========================================\n# We need 'av' installed for torchvision video decoding\n!pip install av -q\n\nimport os\nif not os.path.exists('UCF-101'):\n print('Downloading UCF101 dataset (This might take ~5-10 minutes over Colab network...)')\n # Standard UCF101 original source from UCF\n !wget --no-check-certificate 'https://www.crcv.ucf.edu/data/UCF101/UCF101.rar'\n # The official Train/Test Splits\n !wget --no-check-certificate 'https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip'\n\n print('Extracting splits...')\n !unzip -q UCF101TrainTestSplits-RecognitionTask.zip\n\n print('Extracting videos...')\n !apt-get install unrar -y -q\n !unrar x UCF101.rar > /dev/null\n print('Download and extraction complete!')\nelse:\n print('UCF101 already exists in directory.')","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ==========================================\n# 2. Import Libraries\n# ==========================================\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nfrom torch.utils.data import DataLoader\nimport torchvision.models as models\nimport torchvision.transforms as transforms\nimport torchvision.datasets as datasets\nimport time\n\ndevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\nprint(f\"Executing on hardware: {device}\")","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ==========================================\n# 3. Core Temporal Shift Implementation\n# ==========================================\nclass TemporalShift(nn.Module):\n \"\"\"\n Shifts a predetermined proportion of channels forwards and backwards in time.\n \"\"\"\n def __init__(self, net, n_segment=8, n_div=8):\n super(TemporalShift, self).__init__()\n self.net = net\n self.n_segment = n_segment\n self.fold_div = n_div\n\n def forward(self, x):\n x = self.shift(x, self.n_segment, fold_div=self.fold_div)\n return self.net(x)\n\n @staticmethod\n def shift(x, n_segment, fold_div=8):\n nt, c, h, w = x.size()\n n_batch = nt // n_segment\n x = x.view(n_batch, n_segment, c, h, w)\n\n fold = c // fold_div\n out = torch.zeros_like(x)\n\n # Shift LEFT (Current frame sees Future frame)\n out[:, :-1, :fold] = x[:, 1:, :fold]\n # Shift RIGHT (Current frame sees Past frame)\n out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold]\n # The remaining channels remain entirely spatial (no temporal shift)\n out[:, :, 2 * fold:] = x[:, :, 2 * fold:]\n\n return out.view(nt, c, h, w)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ==========================================\n# 3. 
Core Temporal Shift Implementation\n# ==========================================\nclass TemporalShift(nn.Module):\n \"\"\"\n Shifts a predetermined proportion of channels forwards and backwards in time.\n \"\"\"\n def __init__(self, net, n_segment=8, n_div=8):\n super(TemporalShift, self).__init__()\n self.net = net\n self.n_segment = n_segment\n self.fold_div = n_div\n\n def forward(self, x):\n x = self.shift(x, self.n_segment, fold_div=self.fold_div)\n return self.net(x)\n\n @staticmethod\n def shift(x, n_segment, fold_div=8):\n nt, c, h, w = x.size()\n n_batch = nt // n_segment\n x = x.view(n_batch, n_segment, c, h, w)\n\n fold = c // fold_div\n out = torch.zeros_like(x)\n\n # Shift LEFT (Current frame sees Future frame)\n out[:, :-1, :fold] = x[:, 1:, :fold]\n # Shift RIGHT (Current frame sees Past frame)\n out[:, 1:, fold: 2 * fold] = x[:, :-1, fold: 2 * fold]\n # The remaining channels remain entirely spatial (no temporal shift)\n out[:, :, 2 * fold:] = x[:, :, 2 * fold:]\n\n return out.view(nt, c, h, w)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ==========================================\n# 4. ResNet-50 Wrapping\n# ==========================================\ndef make_temporal_shift(net, n_segment, n_div=8):\n \"\"\"\n Binds the Temporal Shift Module natively inside standard ResNet.\n \"\"\"\n if isinstance(net, models.resnet.Bottleneck):\n net.conv1 = TemporalShift(net.conv1, n_segment=n_segment, n_div=n_div)\n else:\n for m in net.children():\n make_temporal_shift(m, n_segment, n_div)\n\nclass TSMResNet(nn.Module):\n def __init__(self, num_classes=101, n_segment=8):\n super(TSMResNet, self).__init__()\n self.n_segment = n_segment\n \n # 1. Base ImageNet trained ResNet50\n self.base_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)\n \n # 2. Upgrade base architecture natively with TSM logic\n make_temporal_shift(self.base_model, n_segment=n_segment)\n \n # 3. Change Final Layer to UCF101 Classes\n in_features = self.base_model.fc.in_features\n self.base_model.fc = nn.Linear(in_features, num_classes)\n \n def forward(self, x):\n # x shape initially: [Batch, Segments, Channels, Height, Width]\n b, t, c, h, w = x.size()\n \n # Merge Batch and Time so ResNet standard 2D convolutions don't crash\n x = x.view(b * t, c, h, w)\n out = self.base_model(x)\n \n # Consensus Function: Average temporal predictions back down to video-level prediction\n out = out.view(b, t, -1)\n out = out.mean(dim=1) \n return out","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ==========================================\n# 5. 
{"cell_type":"code","source":"# ==========================================\n# 5. Setup UCF101 Dataloader\n# ==========================================\nNUM_CLASSES = 101\nN_SEGMENT = 8\nBATCH_SIZE = 8\n\nclass VideoTransform:\n    \"\"\"\n    Custom transform wrapper that converts torchvision's video output\n    into dimensions acceptable to standard per-frame PyTorch transforms.\n    \"\"\"\n    def __init__(self, transform):\n        self.transform = transform\n\n    def __call__(self, video):\n        # torchvision's video output format is [T, H, W, C] with 0-255 scaling;\n        # we must transform each frame into ImageNet specs [C, H, W]\n        video = video.permute(0, 3, 1, 2) / 255.0\n\n        processed_frames = []\n        for t in range(video.size(0)):\n            frame = transforms.ToPILImage()(video[t])\n            processed_frames.append(self.transform(frame))\n\n        return torch.stack(processed_frames)\n\n# Standard 2D ImageNet normalization strategy\ntransform = transforms.Compose([\n    transforms.Resize((256, 256)),\n    transforms.CenterCrop((224, 224)),\n    transforms.ToTensor(),\n    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n])\n\nprint(\"Indexing datasets. Depending on disk speed, this may take 10-60 seconds...\")\ntrain_dataset = datasets.UCF101(\n    root='UCF-101',\n    annotation_path='ucfTrainTestlist',\n    frames_per_clip=N_SEGMENT,\n    step_between_clips=16,\n    train=True,\n    transform=VideoTransform(transform),\n    num_workers=2\n)\n\ntrain_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)\nprint(f\"Loaded {len(train_dataset)} unique training video clips.\")","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"# ==========================================\n# 6. Training Execution\n# ==========================================\nmodel = TSMResNet(num_classes=NUM_CLASSES, n_segment=N_SEGMENT).to(device)\ncriterion = nn.CrossEntropyLoss()\noptimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)\nscheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[20, 40], gamma=0.1)\n\n# Keeping epochs low for a demo proof-of-concept.\nEPOCHS = 1\n\n# Custom collate_fn to handle variable-length clips\ndef custom_video_collate_fn(batch):\n    # 'batch' is a list of tuples (video_tensor, audio_tensor, label),\n    # where video_tensor might have a variable number of frames.\n    videos, audios, labels = zip(*batch)\n\n    padded_videos = []\n    for video in videos:\n        num_frames = video.size(0)  # current number of frames in the clip\n        # Target number of frames is N_SEGMENT (defined in the dataloader cell above)\n        if num_frames < N_SEGMENT:\n            if num_frames > 0:\n                # Pad by repeating the last frame until we reach N_SEGMENT\n                padding = video[-1].unsqueeze(0).repeat(N_SEGMENT - num_frames, 1, 1, 1)\n                padded_videos.append(torch.cat([video, padding], dim=0))\n            else:\n                # A clip with 0 frames (highly unlikely for UCF101):\n                # substitute zeros with the expected shape [N_SEGMENT, 3, 224, 224]\n                # (C, H, W follow the transforms above). Kept on CPU so that\n                # torch.stack below sees a homogeneous batch; the training loop\n                # moves the whole batch to 'device' later.\n                padded_videos.append(torch.zeros(N_SEGMENT, 3, 224, 224, dtype=torch.float32))\n        elif num_frames > N_SEGMENT:\n            # If a clip somehow has more frames than N_SEGMENT, truncate it.\n            # This shouldn't happen if torchvision's UCF101 honors frames_per_clip.\n            padded_videos.append(video[:N_SEGMENT])\n        else:  # num_frames == N_SEGMENT\n            padded_videos.append(video)\n\n    videos_tensor = torch.stack(padded_videos)\n    # Labels arrive as a tuple of ints; pack them into a LongTensor.\n    labels_tensor = torch.tensor(labels, dtype=torch.long)\n\n    return videos_tensor, list(audios), labels_tensor  # audios are returned but unused\n\n# Re-create the train_loader with the custom collate_fn\ntrain_loader = DataLoader(\n    train_dataset,\n    batch_size=BATCH_SIZE,\n    shuffle=True,\n    pin_memory=True,\n    collate_fn=custom_video_collate_fn\n)\n\nprint(f\"\\n🚀 Initiating Training Loop on UCF101\")\nfor epoch in range(1, EPOCHS + 1):\n    model.train()\n    running_loss = 0.0\n    correct = 0\n    total = 0\n\n    # A torchvision video dataset yields (video, audio, label)\n    for batch_idx, (videos, _, labels) in enumerate(train_loader):\n        videos, labels = videos.to(device), labels.to(device)\n\n        optimizer.zero_grad()\n        outputs = model(videos)\n\n        loss = criterion(outputs, labels)\n        loss.backward()\n        optimizer.step()\n\n        running_loss += loss.item()\n        _, predicted = torch.max(outputs.data, 1)\n        total += labels.size(0)\n        correct += (predicted == labels).sum().item()\n\n        # Periodic status logging\n        if batch_idx % 50 == 0:\n            print(f\"  [Epoch {epoch}] Batch [{batch_idx}/{len(train_loader)}] Current Running Loss: {loss.item():.4f}\")\n\n    scheduler.step()\n\n    epoch_loss = running_loss / len(train_loader)\n    epoch_acc = 100 * correct / total\n    print(f\"\\n>>> Epoch [{epoch:02d}/{EPOCHS}] - Loss: {epoch_loss:.4f} | Accuracy: {epoch_acc:.2f}% | LR: {scheduler.get_last_lr()[0]:.4f}\\n\")\n\nprint(\"Training routine completed successfully.\")","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}
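Reviewer note: the core mechanic of this notebook is the partial channel shift in section 3. With `fold_div=8`, 1/8 of the channels look one frame ahead, 1/8 look one frame back, and the rest stay put. Below is a minimal standalone sanity check of that fold arithmetic; the `shift` logic is copied from the cell above, while the tensor sizes are arbitrary toy values chosen for this check:

```python
import torch

# Toy-sized sanity check of TemporalShift.shift (logic copied from the notebook).
def shift(x, n_segment, fold_div=8):
    nt, c, h, w = x.size()
    n_batch = nt // n_segment
    x = x.view(n_batch, n_segment, c, h, w)
    fold = c // fold_div
    out = torch.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]                  # channels 0..fold-1 see the future frame
    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # channels fold..2*fold-1 see the past frame
    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # remaining channels are untouched
    return out.view(nt, c, h, w)

# One clip of 4 frames with 8 channels, so fold = 1.
x = torch.arange(4 * 8, dtype=torch.float32).view(4, 8, 1, 1)
y = shift(x, n_segment=4)
assert y[0, 0].item() == x[1, 0].item()  # frame 0, channel 0 now holds frame 1's value
assert y[3, 1].item() == x[2, 1].item()  # frame 3, channel 1 now holds frame 2's value
assert torch.equal(y[:, 2:], x[:, 2:])   # spatial-only channels are unchanged
print('temporal shift sanity check passed')
```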
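Also worth noting for anyone reading section 4: the model never uses 3D convolutions. Time is folded into the batch dimension before the 2D ResNet and averaged back out afterwards. A short shape walk-through of that folding (the random tensors here are stand-ins for real clips and for the per-frame logits `base_model` would produce):

```python
import torch

b, t, num_classes = 2, 8, 101
clips = torch.randn(b, t, 3, 224, 224)           # [Batch, Segments, C, H, W]
frames = clips.view(b * t, 3, 224, 224)          # fold time into batch for 2D convs
frame_logits = torch.randn(b * t, num_classes)   # stand-in for base_model(frames)
video_logits = frame_logits.view(b, t, -1).mean(dim=1)  # temporal average consensus
print(video_logits.shape)                        # torch.Size([2, 101])
```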