Spaces:

babaTEEpe
/

Toun

Sleeping

App Files Files Community

Toun / bmt_utils.py

babaTEEpe

Upload 17 files

513d6d1 verified about 1 month ago

raw

history blame contribute delete

3.1 kB

	import os
	import cv2
	import numpy as np
	import torch
	import torch.nn as nn
	from PIL import Image
	from moviepy.editor import VideoFileClip
	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	class BMTPreprocessor:
	    """
	    Handles video and audio preprocessing for BMT feature extraction.

	    ``target_sr`` is passed as the sample rate when extracted audio is
	    written; ``target_fps`` drives the frame stride used by
	    ``sample_video_frames``.
	    """

	    def __init__(self, target_sr=16000, target_fps=25):
	        self.target_sr = target_sr
	        self.target_fps = target_fps

	    def extract_audio(self, video_path, output_audio_path):
	        """
	        Extracts the audio track of a video and writes it to a file.

	        Args:
	            video_path: Path of the source video.
	            output_audio_path: Destination audio file. NOTE(review): the
	                original docstring says WAV; the container presumably
	                follows this path's extension - confirm against moviepy.

	        Returns:
	            True if an audio track was written. False if the video has no
	            audio track or if any error occurred (errors are logged, never
	            raised).
	        """
	        try:
	            video = VideoFileClip(video_path)
	            try:
	                if not video.audio:
	                    logger.warning(f"No audio track found in {video_path}")
	                    return False
	                video.audio.write_audiofile(
	                    output_audio_path, fps=self.target_sr, logger=None
	                )
	                return True
	            finally:
	                # Always release the clip's readers; the original leaked
	                # them on every call.
	                video.close()
	        except Exception as e:
	            logger.error(f"Audio extraction failed: {e}")
	            return False

	    def _frame_step(self, source_fps):
	        """
	        Returns how many decoded frames to advance per sampled frame.

	        The result is ``int(source_fps / target_fps)`` clamped to at least 1.
	        An unusable ratio (zero/negative/NaN/infinite FPS, or a zero
	        ``target_fps``) falls back to 1, i.e. every frame is kept.
	        """
	        try:
	            ratio = source_fps / self.target_fps
	        except ZeroDivisionError:
	            return 1
	        # The chained comparison is False for NaN, so NaN also yields 1.
	        if 1 <= ratio < float("inf"):
	            return int(ratio)
	        return 1

	    def sample_video_frames(self, video_path, resize=(224, 224)):
	        """
	        Samples frames from a video at approximately the target FPS.

	        Every ``step``-th decoded frame is kept, where ``step`` is the
	        integer ratio of the source FPS to ``target_fps``. Because the ratio
	        is truncated, the effective rate can exceed the target (a 30 FPS
	        source with a 25 FPS target keeps every frame).

	        Args:
	            video_path: Path of the source video.
	            resize: Size passed to ``cv2.resize`` for each kept frame.

	        Returns:
	            The kept frames, converted from BGR to RGB, resized and stacked
	            along a new first axis; or None if no frame could be read.
	        """
	        frames = []
	        cap = cv2.VideoCapture(video_path)
	        try:
	            if not cap.isOpened():
	                logger.warning(f"Could not open video {video_path}")
	                return None

	            step = self._frame_step(cap.get(cv2.CAP_PROP_FPS))

	            count = 0
	            while True:
	                ret, frame = cap.read()
	                if not ret:
	                    break

	                if count % step == 0:
	                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	                    frames.append(cv2.resize(frame, resize))
	                count += 1
	        finally:
	            # Release the capture even if a conversion/resize call raises.
	            cap.release()

	        return np.stack(frames) if frames else None

	class I3D(nn.Module):
	    """
	    Skeleton of an Inflated 3D ConvNet (I3D).

	    Only the stem convolution, a global average pool and the logits
	    convolution are declared; the Inception modules of the full network are
	    not defined here. The forward pass is not implemented and currently
	    returns None.
	    """

	    def __init__(self, num_classes=400, in_channels=3):
	        super().__init__()
	        # Placeholder layers only: the real architecture has many Inception
	        # modules between the stem and the pooling stage. The shell exists
	        # so that loaded .pt weights have a module to be wrapped by.
	        self.conv3d_1a_7x7 = nn.Conv3d(
	            in_channels,
	            64,
	            kernel_size=7,
	            stride=2,
	            padding=3,
	        )
	        # ... more layers ...
	        self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
	        self.logits = nn.Conv3d(1024, num_classes, kernel_size=1)

	    def forward(self, x):
	        # Intended behaviour (not implemented): yield the 1024-dim feature
	        # vector that precedes the logits layer, i.e.
	        # [Batch, 1024, T, H, W] -> AvgPool -> [Batch, 1024, 1, 1, 1].
	        return None

	class VGGish(nn.Module):
	    """
	    Skeleton of the VGGish audio model.

	    No layers are declared and the forward pass is not implemented; it
	    currently returns None.
	    """

	    def __init__(self):
	        super().__init__()
	        # ... layer definitions ...

	    def forward(self, x):
	        # Intended behaviour (not implemented): return a 128-dim embedding.
	        return None