Spaces:

Creator-090
/

isl-api

Sleeping

App Files Files Community

Creator-090 committed on Apr 6

Commit

b7dcf66

1 Parent(s): c126626

add: implement SwinTClassifications model and video processing functions

Browse files

Files changed (1) hide show

model.py +134 -0

model.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import torch
+import torch.nn as nn
+from torchvision.models import video as ptv
+from torchvision.transforms import v2
+from transformers import VivitImageProcessor
+from decord import VideoReader
+from decord.bridge import set_bridge
+import gc
+import tempfile
+import os
+# The 76 class labels (from the notebook metadata, per the original note); a label's position is the output index it is decoded from, so the order must not change.
+CLASSES = [
+    'afternoon', 'animal', 'bad', 'beautiful', 'big', 'bird', 'blind',
+    'cat', 'cheap', 'clothing', 'cold', 'cow', 'curved', 'deaf', 'dog',
+    'dress', 'dry', 'evening', 'expensive', 'famous', 'fast', 'female',
+    'fish', 'flat', 'friday', 'good', 'happy', 'hat', 'healthy', 'horse',
+    'hot', 'hour', 'light', 'long', 'loose', 'loud', 'minute', 'monday',
+    'month', 'morning', 'mouse', 'narrow', 'new', 'night', 'old', 'pant',
+    'pocket', 'quiet', 'sad', 'saturday', 'second', 'shirt', 'shoes',
+    'short', 'sick', 'skirt', 'slow', 'small', 'suit', 'sunday', 't_shirt',
+    'tall', 'thursday', 'time', 'today', 'tomorrow', 'tuesday', 'ugly',
+    'warm', 'wednesday', 'week', 'wet', 'wide', 'year', 'yesterday', 'young'
+]
+# Inference constants; per the original note these mirror the training hyperparameters.
+CLIP_LENGTH = 16  # number of frames sampled from each video
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, else CPU
+class SwinTClassifications(nn.Module):
+    """Model architecture from your notebook cell 79/197"""
+    def __init__(self, classes, weights="KINETICS400_V1"):
+        super().__init__()
+        self.classes = classes
+        # Load Swin3D-S backbone
+        self.base_model = ptv.swin3d_s(weights=weights)
+        # Classification head with your 76 output features
+        self.classification_head = nn.Sequential(
+            nn.Linear(self.base_model.head.in_features, len(self.classes))
+        )
+        # Head replaced with Identity as per your architecture
+        self.base_model.head = nn.Identity()
+    def forward(self, x):
+        x = self.base_model(x)
+        x = self.classification_head(x)
+        return x
+def load_model():
+    """Downloads best model from your HF repo and loads weights"""
+    from huggingface_hub import hf_hub_download
+    print("Fetching model from Hugging Face Hub...")
+    model_path = hf_hub_download(
+        repo_id="Creator-090/isl-swin3d-model",
+        filename="ISL_best_model.pt"
+    )
+    model = SwinTClassifications(classes=CLASSES)
+    model.load_state_dict(
+        torch.load(model_path, map_location=DEVICE, weights_only=True)
+    )
+    model = model.to(DEVICE)
+    model.eval()
+    return model
+def preprocess_video(video_bytes: bytes):
+    """Preprocessing logic utilizing VivitImageProcessor and Decord"""
+    set_bridge("torch")
+    # Save bytes to temporary file for decord VideoReader
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
+        f.write(video_bytes)
+        tmp_path = f.name
+    try:
+        # Manual processor configuration from your notebook
+        image_processor = VivitImageProcessor(
+            do_resize=True,
+            size={"shortest_edge": 224},
+            do_center_crop=True,
+            crop_size={"height": 224, "width": 224},
+            do_rescale=True,
+            rescale_factor=1/255,
+            do_normalize=True,
+            image_mean=[0.5, 0.5, 0.5],
+            image_std=[0.5, 0.5, 0.5],
+        )
+        vr = VideoReader(tmp_path)
+        # Ensure we get exactly CLIP_LENGTH frames
+        total_frames = len(vr)
+        indices = list(range(min(total_frames, CLIP_LENGTH)))
+        if len(indices) < CLIP_LENGTH:
+            # Pad if video is too short
+            indices += [indices[-1]] * (CLIP_LENGTH - len(indices))
+        video = vr.get_batch(indices)
+        # Format: (C, T, H, W) as required by Swin3D
+        video = v2.functional.to_dtype(video.permute(0, 3, 1, 2), torch.uint8, scale=False)
+        processed = image_processor(list(video), return_tensors='pt', input_data_format='channels_first')
+        pixel_values = processed['pixel_values'].squeeze(0)
+        pixel_values = pixel_values.permute(1, 0, 2, 3) # Permute to (C, T, H, W)
+        return pixel_values.unsqueeze(0) # Add batch dimension
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
+def predict(model, video_bytes: bytes, top_k: int = 5):
+    """Runs inference and returns the top results"""
+    pixel_values = preprocess_video(video_bytes).to(DEVICE)
+    with torch.no_grad():
+        # Standardize for CPU/GPU mixed precision
+        outputs = model(pixel_values)
+        probabilities = torch.nn.functional.softmax(outputs, dim=-1)[0]
+    top_probs, top_indices = torch.topk(probabilities, k=top_k)
+    results = []
+    for i in range(top_k):
+        results.append({
+            "class": CLASSES[top_indices[i].item()],
+            "confidence": float(top_probs[i].item())
+        })
+    return {
+        "prediction": results[0]["class"],
+        "confidence": results[0]["confidence"],
+        "top_k": results
+    }