Add inference source

- Source/inference.py +105 -0
- Source/lstm.py +41 -0
- Source/preprocessing.py +10 -0
Source/inference.py
ADDED
@@ -0,0 +1,105 @@
+import os
+import json
+import argparse
+import numpy as np
+import torch
+import cv2
+from torchvision import models
+from torchvision.models import ResNet50_Weights
+from lstm import MultiLayerBiLSTMClassifier
+from preprocessing import preprocessingData
+
+
+def load_label_map(dataset):
+    # Resolve label map relative to this file
+    base = os.path.dirname(__file__)
+    label_path = os.path.join(base, f"label_map_idx2label_{dataset}.json")
+    if not os.path.exists(label_path):
+        raise FileNotFoundError(f"Label map not found: {label_path}")
+    with open(label_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def read_video_frames(video_path, num_frames=16):
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise RuntimeError(f"Cannot open video file: {video_path}")
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if total_frames == 0:
+        raise RuntimeError(f"Video contains no frames: {video_path}")
+
+    frame_indices = np.linspace(0, total_frames - 1, num_frames).astype(int)
+    frames = []
+    for idx in range(total_frames):
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if idx in frame_indices:
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(frame_rgb)
+    cap.release()
+
+    if len(frames) == 0:
+        raise RuntimeError("No frames extracted from video.")
+    while len(frames) < num_frames:
+        frames.append(frames[-1])
+
+    return frames[:num_frames]
+
+
+def load_model(model_path, input_size, hidden_size, num_layers, num_classes):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = MultiLayerBiLSTMClassifier(input_size, hidden_size, num_layers, num_classes).to(device)
+    model.load_state_dict(torch.load(model_path, map_location=device))
+    model.eval()
+    return model
+
+
+def predict_activity(dataset, video_path, model_path, num_frames=32, hidden_size=256, num_layers=2):
+    """
+    Run inference on a single video and return (predicted_class_index, predicted_label).
+    This function is import-friendly for web apps (Gradio/Streamlit).
+    """
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Load label map and number of classes
+    label_map = load_label_map(dataset)
+    num_classes = len(label_map)
+
+    # Step 1: Read and process video
+    frames = read_video_frames(video_path, num_frames)
+    transform = preprocessingData()
+    transformed_frames = [transform(frame) for frame in frames]
+    frames_tensor = torch.stack(transformed_frames, dim=0).to(device)
+
+    # Step 2: Extract features
+    resnet = models.resnet50(weights=ResNet50_Weights.DEFAULT).to(device)
+    resnet_feat = torch.nn.Sequential(*list(resnet.children())[:-1])
+    resnet.eval()
+    with torch.no_grad():
+        features_tensor = resnet_feat(frames_tensor)
+        features = torch.flatten(features_tensor, start_dim=1).cpu().numpy()
+
+    # Step 3: Load model
+    input_size = features.shape[1]
+    model = load_model(model_path, input_size, hidden_size, num_layers, num_classes)
+
+    # Step 4: Predict
+    with torch.no_grad():
+        input_seq = torch.from_numpy(features).unsqueeze(0).float().to(device)
+        outputs = model(input_seq)
+        predicted_class = torch.argmax(outputs, dim=1).item()
+        predicted_label = label_map[str(predicted_class)]
+
+    return predicted_class, predicted_label
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Inference on a single video using trained HAR model")
+    parser.add_argument("dataset", type=str, help="Dataset used to train model (ucf11 or ucf50)")
+    parser.add_argument("video_path", type=str, help="Path to input video file")
+    parser.add_argument("model_path", type=str, help="Path to trained model (.pt)")
+    args = parser.parse_args()
+
+    cls, lbl = predict_activity(args.dataset.lower(), args.video_path, args.model_path)
+    print(f"Predicted class index: {cls} ({lbl})")
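
A quick usage sketch for this entry point (the checkpoint and video paths below are placeholders, not files added by this commit):

# From the command line:
#   python Source/inference.py ucf11 path/to/clip.avi path/to/model.pt
# Or imported from a web app (Gradio/Streamlit), as the docstring suggests:
from inference import predict_activity

cls_idx, label = predict_activity(
    dataset="ucf11",                # needs label_map_idx2label_ucf11.json next to inference.py
    video_path="path/to/clip.avi",  # placeholder path
    model_path="path/to/model.pt",  # placeholder path
)
print(f"{cls_idx}: {label}")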
Source/lstm.py
ADDED
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+
+class LSTMClassifier(nn.Module):
+    def __init__(self, input_size, hidden_size, num_classes):
+        super(LSTMClassifier, self).__init__()
+        self.hidden_size = hidden_size
+        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
+        self.fc = nn.Linear(hidden_size, num_classes)
+
+    def forward(self, x):
+        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
+        c0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)
+        # Forward propagate LSTM
+        out, _ = self.lstm(x, (h0, c0))
+        # Decode the hidden state of the last time step
+        out = self.fc(out[:, -1, :])
+        out = nn.functional.softmax(out, dim=1)
+        return out
+
+class MultiLayerBiLSTMClassifier(nn.Module):
+    def __init__(self, input_size, hidden_size, num_layers, num_classes):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=0.2)
+        self.fc = nn.Linear(hidden_size*2, num_classes)  # *2 to account for bidirectional LSTM
+        self.dropout = nn.Dropout(0.2)
+
+    def forward(self, x):
+        # Initialize hidden state and cell state with zeros
+        h0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size).to(x.device)  # *2 to account for bidirectional LSTM
+        c0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size).to(x.device)  # *2 to account for bidirectional LSTM
+        # Forward propagate bidirectional LSTM
+        out, _ = self.lstm(x, (h0, c0))
+        out = self.dropout(out[:, -1, :])  # Apply dropout before FC layer
+        # Decode the hidden state of the last time step
+        out = self.fc(out)
+        #out = nn.functional.softmax(out, dim=1)
+        return out
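
A minimal shape check for the bidirectional classifier. The sizes are illustrative: 2048 matches the flattened ResNet50 features produced in inference.py, hidden_size=256 and num_layers=2 mirror its defaults, and num_classes=11 assumes the 11-class UCF11 setting:

import torch
from lstm import MultiLayerBiLSTMClassifier

model = MultiLayerBiLSTMClassifier(input_size=2048, hidden_size=256, num_layers=2, num_classes=11)
x = torch.randn(4, 16, 2048)  # (batch, time steps, feature size), since batch_first=True
logits = model(x)             # raw logits; the softmax is intentionally commented out above
print(logits.shape)           # torch.Size([4, 11]) -- fc maps the 2*hidden_size output to num_classes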
Source/preprocessing.py
ADDED
@@ -0,0 +1,10 @@
+import torchvision.transforms as transforms
+
+def preprocessingData():
+    transform = transforms.Compose([
+        transforms.ToPILImage(),  # Convert the frame from a NumPy array to a PIL Image, as required by the transforms below.
+        transforms.Resize((224, 224)),  # Resize the frame to 224x224 pixels, the input size expected by ResNet50.
+        transforms.ToTensor(),  # Convert the PIL Image to a PyTorch tensor and scale pixel values to [0, 1].
+        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with the ImageNet mean and standard deviation, which ResNet50 was trained on.
+    ])
+    return transform
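
And a sketch of the transform in isolation (the random array stands in for one RGB frame as read_video_frames() returns it):

import numpy as np
from preprocessing import preprocessingData

transform = preprocessingData()
frame = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)  # stand-in H x W x 3 uint8 frame
tensor = transform(frame)
print(tensor.shape)  # torch.Size([3, 224, 224]), ImageNet-normalized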