# Toun / bmt_utils.py
# Uploaded by babaTEEpe ("Upload 17 files", commit 513d6d1, verified)
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from moviepy.editor import VideoFileClip
import logging
logger = logging.getLogger(__name__)
class BMTPreprocessor:
    """
    Handles video and audio preprocessing for BMT feature extraction.

    The constructor stores two settings:
    ``target_sr`` is passed as the ``fps`` argument when the audio track is
    written out, and ``target_fps`` drives the frame-sampling stride.
    """

    def __init__(self, target_sr=16000, target_fps=25):
        self.target_sr = target_sr
        self.target_fps = target_fps

    def extract_audio(self, video_path, output_audio_path):
        """
        Extracts audio from video and saves it to ``output_audio_path``.

        Returns True when the audio track was written, False when the video
        has no audio track or when any exception occurred (the exception is
        logged, not re-raised).
        """
        try:
            video = VideoFileClip(video_path)
            try:
                if not video.audio:
                    logger.warning(f"No audio track found in {video_path}")
                    return False
                video.audio.write_audiofile(
                    output_audio_path, fps=self.target_sr, logger=None
                )
                return True
            finally:
                # Always release the clip's underlying readers, whichever
                # branch above was taken.
                video.close()
        except Exception as e:
            logger.error(f"Audio extraction failed: {e}")
            return False

    def sample_video_frames(self, video_path, resize=(224, 224)):
        """
        Samples frames from video, keeping every ``step``-th frame.

        ``step`` is ``max(1, int(source_fps / target_fps))``. Because the
        ratio is truncated, a source whose FPS is not an integer multiple of
        ``target_fps`` is sampled more densely than the target (for example
        30 fps with a target of 25 keeps every frame).

        Each kept frame is converted from BGR to RGB and resized with
        ``cv2.resize(frame, resize)``.

        Returns the kept frames stacked along a new first axis, or None when
        no frame could be read.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                logger.warning(f"Could not open video {video_path}")
                return None

            fps = cap.get(cv2.CAP_PROP_FPS)
            # Calculate step to match target FPS (never below 1, so a
            # reported FPS of 0 still yields every frame).
            step = max(1, int(fps / self.target_fps))

            count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if count % step == 0:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = cv2.resize(frame, resize)
                    frames.append(frame)
                count += 1
        finally:
            # Release the capture even if decoding or resizing raised.
            cap.release()

        return np.stack(frames) if frames else None
class I3D(nn.Module):
    """
    Skeleton of the Inflated 3D ConvNet (I3D).

    Only the stem convolution, a global average pool and the logits
    convolution are declared here; the intermediate layers are not defined
    and ``forward`` is a stub that returns None.
    """

    def __init__(self, num_classes=400, in_channels=3):
        super().__init__()
        # Placeholder layout: the real network contains many Inception
        # modules between the stem and the pooling layer. This shell is
        # intended to wrap weights loaded from a .pt file.
        self.conv3d_1a_7x7 = nn.Conv3d(
            in_channels,
            64,
            kernel_size=7,
            stride=2,
            padding=3,
        )
        # ... more layers ...
        self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.logits = nn.Conv3d(1024, num_classes, kernel_size=1)

    def forward(self, x):
        """
        Stub forward pass; currently performs no computation.

        Intended (not yet implemented) behaviour: return the 1024-dim
        feature vector taken before the logits layer, i.e.
        [Batch, 1024, T, H, W] -> AvgPool -> [Batch, 1024, 1, 1, 1].
        """
        return None
class VGGish(nn.Module):
    """
    Skeleton of the VGGish audio model.

    No layers are declared yet, and ``forward`` is a stub that returns None.
    """

    def __init__(self):
        super().__init__()
        # ... layer definitions ...

    def forward(self, x):
        """
        Stub forward pass; intended to return a 128-dim embedding once the
        layers are implemented.
        """
        return None