#!/usr/bin/env python3
"""
Outlier removal algorithm for video frame embeddings using DBSCAN.

Reads embeddings, detects outliers, and exports predictions to CSV files.
GPU acceleration is automatically detected and used if available.

Usage:
    # Process CLIP embeddings from outlier_artifacts
    python outliers_removal_algorithm.py --embeddings-dir ./outlier_artifacts/embeddings --output-dir ./outlier_artifacts/cleaned_CSVs --model-type clip

    # Process DINOv2 embeddings
    python outliers_removal_algorithm.py --embeddings-dir ./outlier_artifacts/embeddings --output-dir ./outlier_artifacts/cleaned_CSVs --model-type dinov2

    # Process ResNet18 embeddings
    python outliers_removal_algorithm.py --embeddings-dir ./outlier_artifacts/embeddings --output-dir ./outlier_artifacts/cleaned_CSVs --model-type resnet18

    # Custom DBSCAN parameters with CLIP embeddings
    python outliers_removal_algorithm.py --embeddings-dir ./outlier_artifacts/embeddings --output-dir ./outlier_artifacts/cleaned_CSVs --model-type clip --eps 0.45 --min-samples 50

    # Filter to specific action category
    python outliers_removal_algorithm.py --embeddings-dir ./outlier_artifacts/embeddings --output-dir ./outlier_artifacts/cleaned_CSVs --model-type clip --action-filter Crawling

    # Limit processing to first 10 videos
    python outliers_removal_algorithm.py --embeddings-dir ./outlier_artifacts/embeddings --output-dir ./outlier_artifacts/cleaned_CSVs --model-type clip --max-videos 10

Note: To generate cleaned videos from predictions, use
generate_cleaned_videos_from_predictions.py
"""

import argparse
import csv
import glob
import os
from pathlib import Path

import numpy as np
import torch

# Optional GPU stack (RAPIDS). If it is not installed we fall back to the
# CPU path, which imports sklearn lazily inside dbscan_outliers().
try:
    import cupy as cp
    from cuml.cluster import DBSCAN as cuDBSCAN
    CUML_AVAILABLE = True
except ImportError:
    CUML_AVAILABLE = False

# GPU is used only when cuML is importable AND a CUDA device is visible.
USE_GPU = CUML_AVAILABLE and torch.cuda.is_available()


def to_numpy(x):
    """Convert a torch tensor or array-like to a float32 numpy array.

    Args:
        x: torch.Tensor or anything np.asarray accepts.

    Returns:
        np.ndarray of dtype float32.
    """
    if isinstance(x, torch.Tensor):
        # detach() in case the tensor requires grad; cpu() in case it is on GPU.
        x = x.detach().cpu().numpy()
    return np.asarray(x, dtype=np.float32)


def dbscan_outliers(X, eps=0.55, min_samples=10):
    """
    Detect outliers using DBSCAN (noise points).

    Args:
        X: Feature matrix (N, D); higher-rank inputs are flattened to (N, -1).
        eps: DBSCAN epsilon parameter
        min_samples: DBSCAN minimum samples parameter

    Returns:
        Boolean array of shape (N,) where True = outlier
    """
    X = to_numpy(X)

    # Ensure X is 2D: (n_samples, n_features)
    if X.ndim > 2:
        X = X.reshape(X.shape[0], -1)

    if USE_GPU:
        labels = cuDBSCAN(eps=eps, min_samples=min_samples).fit_predict(cp.asarray(X)).get()
    else:
        # BUG FIX: sklearn was previously imported only when the cuML import
        # failed, so a machine with cuML installed but no visible GPU reached
        # this branch and crashed with NameError. Import it lazily here so the
        # CPU path always works (and the GPU path never requires sklearn).
        from sklearn.cluster import DBSCAN as skDBSCAN
        labels = skDBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1).fit_predict(X)

    # DBSCAN labels noise points as -1; those are our outliers.
    return labels == -1


def extract_action_name(filename, model_type):
    """Extract the action category from an embedding filename.

    E.g. 'Crawling_clip_embeddings.pt' with model_type 'clip' -> 'Crawling'.

    Args:
        filename: Path to the embedding file.
        model_type: Model identifier embedded in the filename
            ('clip', 'dinov2', or 'resnet18').

    Returns:
        The action name with the '_<model_type>_embeddings.<ext>' suffix removed.
    """
    name = os.path.basename(filename)
    suffix = f'_{model_type}_embeddings'
    # BUG FIX: the previous chained str.replace() turned
    # 'X_clip_embeddings.pth' into 'Xh' (the '.pt' replacement consumed the
    # prefix of '.pth') and could also delete accidental matches in the middle
    # of the name. Strip the suffix only when it terminates the filename.
    for ext in ('.pt', '.pth'):
        trailer = suffix + ext
        if name.endswith(trailer):
            return name[:-len(trailer)]
    return name


def process_all_embeddings(emb_dir, eps, min_samples, output_dir, model_type='clip',
                           max_videos=None, action_filter=None):
    """
    Process all embeddings and export predictions to CSV files.

    One CSV is written per action, with a row per video:
    'video_id, predicted_outliers_list' where the list is a comma-joined
    string of outlier frame indices.

    Args:
        emb_dir: Directory containing embedding .pt files
        eps: DBSCAN epsilon parameter
        min_samples: DBSCAN minimum samples parameter
        output_dir: Directory to save CSV predictions
        model_type: Model type to load ('clip', 'dinov2', or 'resnet18')
        max_videos: Limit processing to first N videos
        action_filter: Filter to specific action category
    """
    # Filter files by model type (e.g., *_clip_embeddings.pt,
    # *_dinov2_embeddings.pt, or *_resnet18_embeddings.pt)
    pattern = f"*_{model_type}_embeddings.pt"
    pt_files = sorted(glob.glob(os.path.join(emb_dir, pattern)))

    if action_filter:
        pt_files = [f for f in pt_files if action_filter.lower() in os.path.basename(f).lower()]
        print(f"Filtering to action: {action_filter}")
        print(f"Found {len(pt_files)} matching file(s)")

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True, parents=True)

    print("=" * 80)
    print("OUTLIER REMOVAL ALGORITHM - DBSCAN")
    print("=" * 80)
    print(f"Model type: {model_type.upper()}")
    print(f"GPU Acceleration: {'Enabled (cuML)' if USE_GPU else 'Disabled (CPU/sklearn)'}")
    print(f"Embeddings dir: {emb_dir}")
    print(f"Output dir: {output_dir}")
    print(f"DBSCAN parameters: eps={eps}, min_samples={min_samples}")
    print(f"Total embedding files: {len(pt_files)}")
    print("=" * 80)

    total_videos = 0

    for pt_path in pt_files:
        # Each .pt file maps video_name -> {"embeddings": ...} for one action.
        # NOTE(review): torch.load on untrusted files executes pickle; these
        # artifacts are assumed to be locally generated.
        data = torch.load(pt_path, map_location="cpu")
        action_name = extract_action_name(pt_path, model_type)
        print(f"Processing action: {action_name}")

        # Create CSV for this action
        csv_path = output_path / f"{action_name}.csv"
        with open(csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['video_id', 'predicted_outliers_list'])

            for video_name, video_data in data.items():
                if max_videos and total_videos >= max_videos:
                    break
                total_videos += 1

                embeddings = video_data["embeddings"]

                # Run DBSCAN outlier detection
                predictions = dbscan_outliers(embeddings, eps=eps, min_samples=min_samples)

                # Convert boolean array to a comma-joined list of outlier indices
                outlier_indices = np.where(predictions)[0].tolist()
                outliers_str = ",".join(map(str, outlier_indices))

                # Write to CSV
                writer.writerow([video_name, outliers_str])

        # Stop scanning further action files once the video budget is spent.
        if max_videos and total_videos >= max_videos:
            break

    print("\n" + "=" * 80)
    print("PROCESSING COMPLETE")
    print("=" * 80)
    print(f"Total videos processed: {total_videos}")
    print(f"CSV files saved to: {output_path.absolute()}")
    print("\nNext step: Use generate_cleaned_videos_from_predictions.py to create cleaned videos")
    print("=" * 80)


def main():
    """Parse CLI arguments and run the outlier-removal pipeline."""
    parser = argparse.ArgumentParser(
        description="Outlier removal algorithm using DBSCAN: detect outliers and export predictions to CSV"
    )
    parser.add_argument("--embeddings-dir", required=True,
                        help="Directory containing embedding .pt files")
    parser.add_argument("--output-dir", default="./outlier_artifacts/cleaned_CSVs",
                        help="Directory to save prediction CSV files")
    parser.add_argument("--model-type", type=str, choices=['clip', 'dinov2', 'resnet18'],
                        default='clip',
                        help="Model type to load: 'clip', 'dinov2', or 'resnet18' (default: clip)")
    parser.add_argument("--max-videos", type=int,
                        help="Limit processing to first N videos")
    parser.add_argument("--action-filter",
                        help="Filter to specific action category (e.g., 'Crawling')")

    # DBSCAN parameters
    parser.add_argument("--eps", type=float, default=0.5,
                        help="DBSCAN: Epsilon parameter")
    parser.add_argument("--min-samples", type=int, default=40,
                        help="DBSCAN: Minimum samples parameter")

    args = parser.parse_args()

    process_all_embeddings(
        emb_dir=args.embeddings_dir,
        eps=args.eps,
        min_samples=args.min_samples,
        output_dir=args.output_dir,
        model_type=args.model_type,
        max_videos=args.max_videos,
        action_filter=args.action_filter
    )


if __name__ == "__main__":
    main()