Creator-090 committed on
Commit
d0dc8d3
·
1 Parent(s): b7dcf66

fix: update Dockerfile and requirements for video processing dependencies; enhance preprocess_video function

Browse files
Files changed (3) hide show
  1. Dockerfile +9 -1
  2. model.py +16 -19
  3. requirements.txt +3 -2
Dockerfile CHANGED
@@ -1,9 +1,17 @@
1
  FROM python:3.10
2
 
 
 
 
 
 
 
 
3
  WORKDIR /app
4
 
5
  COPY requirements.txt .
6
- RUN pip install --no-cache-dir -r requirements.txt
 
7
 
8
  COPY . .
9
 
 
1
  FROM python:3.10
2
 
3
+ # System dependencies for video/image processing
4
+ RUN apt-get update && apt-get install -y \
5
+ libgl1-mesa-glx \
6
+ libglib2.0-0 \
7
+ ffmpeg \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
  WORKDIR /app
11
 
12
  COPY requirements.txt .
13
+ # Use --upgrade so pip installs the newest versions allowed by requirements.txt (numpy stays below 2.0.0 via its pin; torch moves to >=2.4.0)
14
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
15
 
16
  COPY . .
17
 
model.py CHANGED
@@ -65,17 +65,12 @@ def load_model():
65
  model.eval()
66
  return model
67
 
68
- def preprocess_video(video_bytes: bytes):
69
- """Preprocessing logic utilizing VivitImageProcessor and Decord"""
70
  set_bridge("torch")
71
-
72
- # Save bytes to temporary file for decord VideoReader
73
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
74
  f.write(video_bytes)
75
  tmp_path = f.name
76
-
77
  try:
78
- # Manual processor configuration from your notebook
79
  image_processor = VivitImageProcessor(
80
  do_resize=True,
81
  size={"shortest_edge": 224},
@@ -87,24 +82,26 @@ def preprocess_video(video_bytes: bytes):
87
  image_mean=[0.5, 0.5, 0.5],
88
  image_std=[0.5, 0.5, 0.5],
89
  )
90
-
91
  vr = VideoReader(tmp_path)
92
- # Ensure we get exactly CLIP_LENGTH frames
93
  total_frames = len(vr)
94
- indices = list(range(min(total_frames, CLIP_LENGTH)))
95
- if len(indices) < CLIP_LENGTH:
96
- # Pad if video is too short
97
- indices += [indices[-1]] * (CLIP_LENGTH - len(indices))
98
 
99
- video = vr.get_batch(indices)
100
- # Format: (C, T, H, W) as required by Swin3D
101
- video = v2.functional.to_dtype(video.permute(0, 3, 1, 2), torch.uint8, scale=False)
 
 
 
 
 
 
102
 
103
- processed = image_processor(list(video), return_tensors='pt', input_data_format='channels_first')
104
- pixel_values = processed['pixel_values'].squeeze(0)
105
- pixel_values = pixel_values.permute(1, 0, 2, 3) # Permute to (C, T, H, W)
106
 
107
- return pixel_values.unsqueeze(0) # Add batch dimension
108
  finally:
109
  if os.path.exists(tmp_path):
110
  os.remove(tmp_path)
 
65
  model.eval()
66
  return model
67
 
68
+ def preprocess_video(video_bytes: bytes, clip_length: int = 16):
 
69
  set_bridge("torch")
 
 
70
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
71
  f.write(video_bytes)
72
  tmp_path = f.name
 
73
  try:
 
74
  image_processor = VivitImageProcessor(
75
  do_resize=True,
76
  size={"shortest_edge": 224},
 
82
  image_mean=[0.5, 0.5, 0.5],
83
  image_std=[0.5, 0.5, 0.5],
84
  )
 
85
  vr = VideoReader(tmp_path)
 
86
  total_frames = len(vr)
87
+ indices = list(range(min(total_frames, clip_length)))
88
+ if len(indices) < clip_length:
89
+ indices += [indices[-1]] * (clip_length - len(indices))
 
90
 
91
+ # Ensure video is a torch tensor in (Frames, Channels, Height, Width)
92
+ video = vr.get_batch(indices)
93
+ video = video.permute(0, 3, 1, 2).float() # Convert to Float for the processor
94
+
95
+ # Pass as a list of Tensors
96
+ processed = image_processor(
97
+ list(video),
98
+ return_tensors='pt'
99
+ )
100
 
101
+ pixel_values = processed['pixel_values'].squeeze(0) # (T, C, H, W)
102
+ pixel_values = pixel_values.permute(1, 0, 2, 3) # (C, T, H, W) for Swin3D
 
103
 
104
+ return pixel_values.unsqueeze(0)
105
  finally:
106
  if os.path.exists(tmp_path):
107
  os.remove(tmp_path)
requirements.txt CHANGED
@@ -1,7 +1,8 @@
1
  fastapi
2
  uvicorn
3
- torch==2.2.0
4
- torchvision==0.17.0
 
5
  transformers
6
  decord
7
  huggingface_hub
 
1
  fastapi
2
  uvicorn
3
+ numpy<2.0.0
4
+ torch>=2.4.0
5
+ torchvision
6
  transformers
7
  decord
8
  huggingface_hub