import logging
import math
import os
from typing import Optional, Tuple

import cv2
import numpy as np
import torch
import torch.nn as nn
from moviepy.editor import VideoFileClip
from PIL import Image

logger = logging.getLogger(__name__)


class BMTPreprocessor:
    """Handles video and audio preprocessing for BMT feature extraction.

    Attributes:
        target_sr: Sample rate passed as ``fps`` when writing extracted audio.
        target_fps: Frame rate used to derive the frame-sampling step.
    """

    def __init__(self, target_sr=16000, target_fps=25):
        self.target_sr = target_sr
        self.target_fps = target_fps

    def extract_audio(self, video_path: str, output_audio_path: str) -> bool:
        """Extract the audio track of a video and write it to a file.

        Args:
            video_path: Path of the video to read.
            output_audio_path: Destination path of the audio file.

        Returns:
            True when the audio was written; False when the video has no
            audio track or when any error occurred (the error is logged, not
            raised).
        """
        video = None
        try:
            video = VideoFileClip(video_path)
            if not video.audio:
                logger.warning("No audio track found in %s", video_path)
                return False
            video.audio.write_audiofile(
                output_audio_path, fps=self.target_sr, logger=None
            )
            return True
        except Exception as e:
            # Best-effort API: callers only look at the boolean result.
            logger.error("Audio extraction failed: %s", e)
            return False
        finally:
            # Always release the clip; otherwise its reader resources stay
            # open for every processed video.
            if video is not None:
                try:
                    video.close()
                except Exception as close_err:
                    # A failed cleanup must not override the result above.
                    logger.debug(
                        "Closing clip %s failed: %s", video_path, close_err
                    )

    def _frame_step(self, native_fps: float) -> int:
        """Return how many decoded frames to advance per kept frame (>= 1).

        Falls back to 1 (keep every frame) when either frame rate is
        non-positive or not finite, instead of dividing by zero or
        converting NaN to int.
        """
        target_fps = self.target_fps
        usable = (
            math.isfinite(native_fps) and native_fps > 0 and target_fps > 0
        )
        if not usable:
            return 1
        # Truncation (not rounding) is kept on purpose: the effective rate
        # can therefore be higher than target_fps, never lower than it.
        return max(1, int(native_fps / target_fps))

    def sample_video_frames(
        self, video_path: str, resize: Tuple[int, int] = (224, 224)
    ) -> Optional[np.ndarray]:
        """Sample frames from a video at approximately the target FPS.

        Every ``step``-th decoded frame is converted from BGR to RGB and
        resized, where ``step`` is derived from the video's reported FPS and
        ``self.target_fps``.

        Args:
            video_path: Path of the video to read.
            resize: Size handed to ``cv2.resize`` as its ``dsize`` argument.

        Returns:
            The kept frames stacked along a new leading axis, or None when
            the video could not be opened or yielded no frames.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                logger.warning("Could not open video %s", video_path)
                return None

            step = self._frame_step(cap.get(cv2.CAP_PROP_FPS))
            index = 0
            while True:
                ok, frame = cap.read()
                if not ok:
                    break
                if index % step == 0:
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(cv2.resize(rgb, resize))
                index += 1
        finally:
            # Release the capture handle even if a conversion step raised.
            cap.release()

        return np.stack(frames) if frames else None


class I3D(nn.Module):
    """Simplified Inflated 3D ConvNet (I3D) architecture shell.

    This is a placeholder: only the first convolution, the pooling layer and
    the logits layer are declared. The actual model has many Inception
    modules in between; the shell exists to wrap the loaded .pt weights.
    """

    def __init__(self, num_classes=400, in_channels=3):
        super().__init__()
        self.conv3d_1a_7x7 = nn.Conv3d(
            in_channels,
            64,
            kernel_size=(7, 7, 7),
            stride=(2, 2, 2),
            padding=(3, 3, 3),
        )
        # ... more layers ...
        self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.logits = nn.Conv3d(1024, num_classes, kernel_size=(1, 1, 1))

    def forward(self, x):
        """Stub forward pass; currently returns None.

        Intended behaviour once the full architecture is in place: return the
        1024-dim feature vector before the logits layer, i.e.
        [Batch, 1024, T, H, W] -> AvgPool -> [Batch, 1024, 1, 1, 1].
        """
        # TODO: implement once the intermediate layers are defined.
        return None


class VGGish(nn.Module):
    """Simplified VGGish shell (no layers are defined yet)."""

    def __init__(self):
        super().__init__()
        # ... layer definitions ...

    def forward(self, x):
        """Stub forward pass; currently returns None.

        Intended to return a 128-dim embedding once layers are defined.
        """
        # TODO: implement once the layers are defined.
        return None