SigMamba: Unified Video Anomaly Detection
Weakly Supervised Video Anomaly Detection using SigLIP 2 + Mamba SSM
A unified architecture that combines SigLIP 2 (Google's SOTA vision encoder) with Mamba (Linear-complexity State Space Model) for detecting anomalies in surveillance videos. The system achieves linear O(N) scaling, enabling processing of long-form video content that was previously impractical with quadratic-cost Transformers.
Key Features
- Linear Complexity: O(N) scaling via Mamba SSM (vs O(N²) for Transformers)
- Dual Input Modes: Accepts raw pixels or pre-extracted features
Architecture
The model operates in two modes:
| Mode | Input | Use Case |
|---|---|---|
| Unified | pixel_values (B, T, 3, 384, 384) | End-to-end inference |
| Modular | features (B, T, 1024) | Training / batch processing |
Hyperparameters
| Parameter | Value | Description |
|---|---|---|
| Feature Dim | 1024 | SigLIP output dimension |
| Mamba d_model | 768 | Internal hidden dimension |
| Mamba Depth | 8 | Number of stacked layers |
Usage
Prerequisites
pip install opencv-python
pip install transformers==4.57.3
It is recommended to use `num_frames=32`, since the model was trained with 32-frame clips.
Loading the Model
from transformers import AutoModel, AutoProcessor
import torch
# Load the unified exported model
model = AutoModel.from_pretrained(
"VINAY-UMRETHE/SigMamba-V1",
trust_remote_code=True
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()
# Load vision processor for pixel preprocessing
processor = AutoProcessor.from_pretrained(model.config.vision_model_id)
Inference Mode 1: Unified (Raw Pixels → Scores)
Use this when you have raw video frames. The model handles feature extraction internally.
Input Shape
pixel_values: (Batch, Time, Channels, Height, Width)
(B, T, 3, 384, 384)
Example: Single Video
import cv2
import numpy as np
def load_video_frames(video_path, num_frames=32):
    """Sample `num_frames` frames uniformly from a video.

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_frames: Number of frames to sample (default 32, matching the
            model's training clip length).

    Returns:
        List of RGB frames as numpy arrays (H, W, 3). May contain fewer
        than `num_frames` entries if some frames fail to decode.

    Raises:
        ValueError: If the video cannot be opened or reports no frames.
    """
    cap = cv2.VideoCapture(video_path)
    # Fail loudly instead of silently sampling from a broken capture:
    # an unopened capture reports 0 frames, and np.linspace(0, -1, ...)
    # would otherwise produce negative seek indices.
    if not cap.isOpened():
        cap.release()
        raise ValueError(f"Could not open video: {video_path}")
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames <= 0:
        cap.release()
        raise ValueError(f"Video reports no frames: {video_path}")
    # Uniform temporal sampling across the whole clip.
    indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    frames = []
    try:
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if ret:
                # OpenCV decodes as BGR; the processor expects RGB.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        cap.release()
    return frames
# Load and preprocess
frames = load_video_frames("test_video.mp4", num_frames=32)
inputs = processor(images=frames, return_tensors="pt")
pixel_values = inputs.pixel_values.to(device) # (32, 3, 384, 384)
# Add batch dimension: (1, 32, 3, 384, 384)
pixel_values = pixel_values.unsqueeze(0)
# Inference
with torch.no_grad():
scores = model(pixel_values=pixel_values)
# scores shape: (1, 32, 1)
# Get results
anomaly_scores = scores.squeeze().cpu().numpy() # (32,)
max_score = anomaly_scores.max()
print(f"Max Anomaly Score: {max_score:.4f}")
Inference Mode 2: Modular (Pre-Extracted Features → Scores)
Use this when you've already extracted features (e.g., from batch processing).
Input Shape
features: (Batch, Time, FeatureDim)
(B, T, 1024)
Example: From Feature File
def load_features_from_txt(feature_path):
    """Load per-segment features from a whitespace-separated text file.

    Each non-empty line holds one segment's feature vector (floats
    separated by whitespace).

    Args:
        feature_path: Path to the feature text file.

    Returns:
        torch.Tensor of shape (T, D) with dtype float32, where T is the
        number of non-empty lines and D the per-line value count.
    """
    with open(feature_path, 'r') as f:
        # Skip blank lines (e.g. a trailing newline) — an empty row would
        # make torch.tensor fail on a ragged nested list.
        features = [
            [float(v) for v in line.split()]
            for line in f
            if line.strip()
        ]
    return torch.tensor(features, dtype=torch.float32)
# Load features
features = load_features_from_txt("video_features.txt") # (T, 1024)
features = features.unsqueeze(0).to(device) # (1, T, 1024)
# Inference
with torch.no_grad():
scores = model(features=features)
# scores shape: (1, T, 1)
print(f"Anomaly Scores: {scores.squeeze().cpu().numpy()}")
Batch Processing Multiple Videos
Process multiple videos in a single forward pass for efficiency.
# Load multiple videos
video_paths = ["video1.mp4", "video2.mp4", "video3.mp4"]
batch_frames = []
for path in video_paths:
frames = load_video_frames(path, num_frames=32)
inputs = processor(images=frames, return_tensors="pt")
batch_frames.append(inputs.pixel_values)
# Stack into batch: (3, 32, 3, 384, 384)
pixel_values = torch.stack(batch_frames).to(device)
# Single forward pass for all videos
with torch.no_grad():
scores = model(pixel_values=pixel_values)
# scores shape: (3, 32, 1)
# Per-video max scores
for i, path in enumerate(video_paths):
max_score = scores[i].max().item()
print(f"{path}: {max_score:.4f}")
Single Frame Analysis
For quick spot-checks on individual frames.
from PIL import Image
# Load single image
image = Image.open("suspicious_frame.jpg")
inputs = processor(images=image, return_tensors="pt")
pixel_values = inputs.pixel_values.to(device) # (1, 3, 384, 384)
# Reshape: (1, 1, 3, 384, 384) - batch=1, time=1
pixel_values = pixel_values.unsqueeze(0)
with torch.no_grad():
score = model(pixel_values=pixel_values)
print(f"Frame Anomaly Score: {score.item():.4f}")
Extract Features Only (No Classification)
Access the Mamba encoder output directly for custom downstream tasks.
# Load frames
frames = load_video_frames("video.mp4", num_frames=32)
inputs = processor(images=frames, return_tensors="pt")
pixel_values = inputs.pixel_values.unsqueeze(0).to(device)
# Access internal components
with torch.no_grad():
# Step 1: Extract vision features
b, t, c, h, w = pixel_values.shape
flat_pixels = pixel_values.view(b * t, c, h, w)
vision_features = model.vision_model.get_image_features(pixel_values=flat_pixels)
vision_features = vision_features / vision_features.norm(dim=-1, keepdim=True)
vision_features = vision_features.view(b, t, -1) # (1, 32, 1024)
# Step 2: Get Mamba-encoded features
mamba_features = model.mamba_encoder(vision_features) # (1, 32, 512)
print(f"Vision Features: {vision_features.shape}")
print(f"Mamba Features: {mamba_features.shape}")
Threshold-Based Detection
Apply a threshold to convert scores into binary predictions.
def detect_anomalies(video_path, threshold=0.5):
    """Score a video and flag segments whose anomaly score exceeds `threshold`.

    Returns a dict with per-segment scores, the peak score, an overall
    anomaly flag, and the indices of segments above the threshold.
    """
    sampled = load_video_frames(video_path, num_frames=32)
    batch = processor(images=sampled, return_tensors="pt").pixel_values
    batch = batch.unsqueeze(0).to(device)  # (1, T, 3, H, W)
    with torch.no_grad():
        raw = model(pixel_values=batch)
    segment_scores = raw.squeeze().cpu().numpy()
    peak = segment_scores.max()
    flagged = np.flatnonzero(segment_scores > threshold)
    return {
        "scores": segment_scores,
        "max_score": peak,
        "is_anomalous": peak > threshold,
        "anomalous_segments": flagged.tolist(),
    }
# Usage
result = detect_anomalies("test.mp4", threshold=0.5)
print(f"Anomalous: {result['is_anomalous']}")
print(f"Segments: {result['anomalous_segments']}")
Output Reference
| Method | Input | Output Shape | Description |
|---|---|---|---|
| model(pixel_values=...) | (B, T, 3, 384, 384) | (B, T, 1) | End-to-end inference |
| model(features=...) | (B, T, 1024) | (B, T, 1) | Feature-based inference |
| model.mamba_encoder(...) | (B, T, 1024) | (B, T, 512) | Encoded temporal features |
| model.vision_model.get_image_features(...) | (N, 3, 384, 384) | (N, 1024) | Raw vision embeddings |
License
This model is licensed under the MIT License.
- Downloads last month
- 61