Spaces:

Creator-090
/

isl-api

Sleeping

App Files Files Community

Creator-090 committed on Apr 6

Commit

d0dc8d3

1 Parent(s): b7dcf66

fix: update Dockerfile and requirements for video processing dependencies; enhance preprocess_video function

Browse files

Files changed (3) hide show

Dockerfile +9 -1
model.py +16 -19
requirements.txt +3 -2

Dockerfile CHANGED Viewed

@@ -1,9 +1,17 @@
 FROM python:3.10
 WORKDIR /app
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
 COPY . .

 FROM python:3.10
+# System dependencies for video/image processing
+RUN apt-get update && apt-get install -y \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 COPY requirements.txt .
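+# --upgrade installs the newest versions allowed by requirements.txt; the numpy<2.0.0 and torch>=2.4.0 constraints there keep numpy on 1.x and set the minimum torch version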
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
 COPY . .

model.py CHANGED Viewed

@@ -65,17 +65,12 @@ def load_model():
     model.eval()
     return model
-def preprocess_video(video_bytes: bytes):
-    """Preprocessing logic utilizing VivitImageProcessor and Decord"""
     set_bridge("torch")
-    # Save bytes to temporary file for decord VideoReader
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
         f.write(video_bytes)
         tmp_path = f.name
     try:
-        # Manual processor configuration from your notebook
         image_processor = VivitImageProcessor(
             do_resize=True,
             size={"shortest_edge": 224},
@@ -87,24 +82,26 @@ def preprocess_video(video_bytes: bytes):
             image_mean=[0.5, 0.5, 0.5],
             image_std=[0.5, 0.5, 0.5],
         )
         vr = VideoReader(tmp_path)
-        # Ensure we get exactly CLIP_LENGTH frames
         total_frames = len(vr)
-        indices = list(range(min(total_frames, CLIP_LENGTH)))
-        if len(indices) < CLIP_LENGTH:
-            # Pad if video is too short
-            indices += [indices[-1]] * (CLIP_LENGTH - len(indices))
-        video = vr.get_batch(indices)
-        # Format: (C, T, H, W) as required by Swin3D
-        video = v2.functional.to_dtype(video.permute(0, 3, 1, 2), torch.uint8, scale=False)
-        processed = image_processor(list(video), return_tensors='pt', input_data_format='channels_first')
-        pixel_values = processed['pixel_values'].squeeze(0)
-        pixel_values = pixel_values.permute(1, 0, 2, 3) # Permute to (C, T, H, W)
-        return pixel_values.unsqueeze(0) # Add batch dimension
     finally:
         if os.path.exists(tmp_path):
             os.remove(tmp_path)

     model.eval()
     return model
+def preprocess_video(video_bytes: bytes, clip_length: int = 16):
     set_bridge("torch")
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
         f.write(video_bytes)
         tmp_path = f.name
     try:
         image_processor = VivitImageProcessor(
             do_resize=True,
             size={"shortest_edge": 224},
             image_mean=[0.5, 0.5, 0.5],
             image_std=[0.5, 0.5, 0.5],
         )
         vr = VideoReader(tmp_path)
         total_frames = len(vr)
+        indices = list(range(min(total_frames, clip_length)))
+        if len(indices) < clip_length:
+            indices += [indices[-1]] * (clip_length - len(indices))
+        # Ensure video is a torch tensor in (Frames, Channels, Height, Width)
+        video = vr.get_batch(indices)
+        video = video.permute(0, 3, 1, 2).float() # Convert to Float for the processor
+        # Pass as a list of Tensors
+        processed = image_processor(
+            list(video),
+            return_tensors='pt'
+        )
+        pixel_values = processed['pixel_values'].squeeze(0) # (T, C, H, W)
+        pixel_values = pixel_values.permute(1, 0, 2, 3)    # (C, T, H, W) for Swin3D
+        return pixel_values.unsqueeze(0)
     finally:
         if os.path.exists(tmp_path):
             os.remove(tmp_path)

requirements.txt CHANGED Viewed

@@ -1,7 +1,8 @@
 fastapi
 uvicorn
-torch==2.2.0
-torchvision==0.17.0
 transformers
 decord
 huggingface_hub

 fastapi
 uvicorn
+numpy<2.0.0
+torch>=2.4.0
+torchvision
 transformers
 decord
 huggingface_hub