File size: 3,099 Bytes
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from moviepy.editor import VideoFileClip
import logging
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class BMTPreprocessor:
    """
    Handles video and audio preprocessing for BMT feature extraction.

    The instance only stores two configuration values: ``target_sr`` (passed
    to MoviePy as the audio write rate) and ``target_fps`` (used to derive
    the frame stride when sampling video frames).
    """

    def __init__(self, target_sr=16000, target_fps=25):
        """
        Stores the sampling configuration.

        Args:
            target_sr: Rate handed to ``write_audiofile`` as its ``fps``.
            target_fps: Frame rate that frame sampling aims for.
        """
        self.target_sr = target_sr
        self.target_fps = target_fps

    def extract_audio(self, video_path, output_audio_path):
        """
        Extracts the audio track of a video and writes it to disk.

        Callers are expected to pass a WAV path as ``output_audio_path``
        (assumption: the container format follows the file extension).

        Args:
            video_path: Path of the video to read.
            output_audio_path: Path the audio file is written to.

        Returns:
            True if an audio track was found and written. False if the video
            has no audio track or if any exception occurred; both cases are
            logged and no exception is propagated.
        """
        video = None
        try:
            video = VideoFileClip(video_path)
            if video.audio:
                video.audio.write_audiofile(
                    output_audio_path, fps=self.target_sr, logger=None
                )
                return True
            logger.warning(f"No audio track found in {video_path}")
            return False
        except Exception as e:
            logger.error(f"Audio extraction failed: {e}")
            return False
        finally:
            # Always release the clip's readers; the original leaked them.
            if video is not None:
                try:
                    video.close()
                except Exception as e:
                    # Keep the "never raises" contract of this method.
                    logger.warning(f"Failed to close video clip: {e}")

    def sample_video_frames(self, video_path, resize=(224, 224)):
        """
        Samples frames from video at (approximately) the target FPS.

        Every ``step``-th decoded frame is kept, where ``step`` is the source
        FPS divided by ``self.target_fps``, truncated to an integer and
        clamped to at least 1. Because of the truncation the effective rate
        can be higher than ``target_fps`` (e.g. 30 FPS source, 25 FPS target
        keeps every frame).

        Args:
            video_path: Path of the video to read.
            resize: Size passed to ``cv2.resize`` as ``dsize``.

        Returns:
            The kept frames, converted from BGR to RGB and resized, stacked
            along a new leading axis with ``np.stack``; ``None`` if the video
            could not be opened or yielded no frames.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                logger.warning(f"Could not open video {video_path}")
                return None

            fps = cap.get(cv2.CAP_PROP_FPS)
            # The backend may report 0 (or a non-finite value) when the FPS
            # is unknown; fall back to keeping every frame in that case.
            if 0 < fps < float("inf"):
                step = max(1, int(fps / self.target_fps))
            else:
                step = 1

            count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if count % step == 0:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = cv2.resize(frame, resize)
                    frames.append(frame)
                count += 1
        finally:
            # Release the capture even if decoding/conversion raised.
            cap.release()
        return np.stack(frames) if frames else None
class I3D(nn.Module):
    """
    Skeleton of the Inflated 3D ConvNet (I3D).

    Only the stem convolution, a global average pool and the logits
    convolution are declared here; the Inception modules of the real network
    are omitted. The class is a shell intended to wrap loaded .pt weights.
    """

    def __init__(self, num_classes=400, in_channels=3):
        """
        Declares the placeholder layers.

        Args:
            num_classes: Output channels of the ``logits`` convolution.
            in_channels: Input channels of the stem convolution.
        """
        super().__init__()
        # Stem: 7x7x7 convolution, stride 2 and padding 3 on every axis
        # (scalar arguments are expanded to 3-tuples by nn.Conv3d).
        self.conv3d_1a_7x7 = nn.Conv3d(
            in_channels, 64, kernel_size=7, stride=2, padding=3
        )
        # ... more layers ...
        self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        # 1x1x1 convolution acting as the classifier over 1024 channels.
        self.logits = nn.Conv3d(1024, num_classes, kernel_size=1)

    def forward(self, x):
        """
        Placeholder forward pass; currently does nothing and returns None.

        Intended behaviour (not implemented): return the 1024-dim feature
        vector before the logits layer, i.e.
        [Batch, 1024, T, H, W] -> AvgPool -> [Batch, 1024, 1, 1, 1].
        """
        return None
class VGGish(nn.Module):
    """
    Skeleton of the VGGish network.

    No layers are declared yet; the class only registers itself as an
    ``nn.Module`` so real layer definitions can be added later.
    """

    def __init__(self):
        """Initialises the bare module (no layers are defined)."""
        super().__init__()
        # ... layer definitions ...

    def forward(self, x):
        """
        Placeholder forward pass; currently does nothing and returns None.

        Intended behaviour (not implemented): return a 128-dim embedding.
        """
        return None
|