Merge pull request #1 from bawolf/blog_dataset

Browse files

Files changed (5) hide show

.gitignore +2 -0
script/hyperparameter_tuning.py +204 -38
script/train.py +278 -211
script/visualization/visualize.py +22 -13
src/dataset/dataset.py +3 -1

.gitignore CHANGED Viewed

@@ -35,6 +35,8 @@ ENV/
 # Project specific
 runs/
 checkpoints/
 *.pth
 *.ckpt

 # Project specific
 runs/
+outputs/
+runs_hyperparam/
 checkpoints/
 *.pth
 *.ckpt

script/hyperparameter_tuning.py CHANGED Viewed

@@ -1,24 +1,39 @@
 import optuna
 import os
-import os
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 from script.train import train_and_evaluate
 from src.utils.utils import create_run_directory
-def objective(trial, hyperparam_run_dir):
     config = {
-        "clip_model": trial.suggest_categorical("clip_model", ["openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14"]),
-        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 1e-4),
-        "weight_decay": trial.suggest_loguniform("weight_decay", 1e-8, 1e-1),
-        "unfreeze_layers": trial.suggest_int("unfreeze_layers", 1, 6),
-        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
-        "gradient_clip_max_norm": trial.suggest_uniform("gradient_clip_max_norm", 0.1, 1.0),
         "augmentation_strength": trial.suggest_float("augmentation_strength", 0.0, 1.0),
         "crop_scale_min": trial.suggest_float("crop_scale_min", 0.6, 0.9),
         "max_frames": trial.suggest_int("max_frames", 5, 15),
-        "sigma": trial.suggest_uniform("sigma", 0.1, 0.5),
     }
     class_labels = ["windmill", "halo", "swipe", "baby_mill"][:3]
@@ -27,9 +42,9 @@ def objective(trial, hyperparam_run_dir):
     config.update({
         "class_labels": class_labels,
         "num_classes": len(class_labels),
-        "data_path": '../finetune/3moves_test',
-        "num_epochs": 50,  # Reduced for faster trials
-        "patience": 10,    # Adjusted for faster trials
         "image_size": 224,
         "crop_scale_max": 1.0,
         "normalization_mean": [0.485, 0.456, 0.406],
@@ -37,7 +52,7 @@ def objective(trial, hyperparam_run_dir):
         "overfitting_threshold": 10,
     })
-    # Derive augmentation parameters from augmentation_strength
     config.update({
         "flip_probability": 0.5 * config["augmentation_strength"],
         "rotation_degrees": int(15 * config["augmentation_strength"]),
@@ -47,33 +62,184 @@ def objective(trial, hyperparam_run_dir):
         "hue_jitter": 0.1 * config["augmentation_strength"],
     })
-    # Create a unique run directory for this trial
-    config["run_dir"] = create_run_directory(prefix=f"trial", parent_dir=hyperparam_run_dir)
-    # Run training and evaluation
-    val_accuracy = train_and_evaluate(config)
-    return val_accuracy
-def main():
-    # Set up the study and optimize
-    hyperparam_run_dir = create_run_directory(suffix='_hyperparam')
-    study = optuna.create_study(direction="maximize")
-    study.optimize(lambda trial: objective(trial, hyperparam_run_dir), n_trials=100)  # Adjust the number of trials as needed
-    # Save the study results
-    study.trials_dataframe().to_csv(os.path.join(hyperparam_run_dir, 'study_results.csv'))
-    print("Best trial:")
-    trial = study.best_trial
-    print("  Value: ", trial.value)
-    print("  Params: ")
-    for key, value in trial.params.items():
-        print("    {}: {}".format(key, value))
-    # Save the best trial parameters
-    with open(os.path.join(hyperparam_run_dir, 'best_params.txt'), 'w') as f:
-        for key, value in trial.params.items():
-            f.write(f"{key}: {value}\n")
 if __name__ == "__main__":
-    main()

 import optuna
 import os
+from datetime import datetime
+import pandas as pd
+from pathlib import Path
+import json
+import math
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 from script.train import train_and_evaluate
 from src.utils.utils import create_run_directory
+def create_hyperparam_directory():
+    """Create a parent directory for all hyperparameter searches"""
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    base_dir = "runs_hyperparam"
+    hyperparam_dir = os.path.join(base_dir, f"hyperparam_{timestamp}")
+    os.makedirs(hyperparam_dir, exist_ok=True)
+    return hyperparam_dir
+def objective(trial, hyperparam_run_dir, data_path):
+    """Objective function for a single dataset"""
+    # Samples one configuration from `trial`, trains on `data_path` through
+    # train_and_evaluate and returns the resulting validation accuracy.
+    # Failures inside the try block below are appended to error_log.txt in
+    # `hyperparam_run_dir` and reported as -inf instead of being raised.
+    # Search space for this trial; learning_rate and weight_decay are sampled on a log scale
     config = {
+        "clip_model":  trial.suggest_categorical("clip_model", ["openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14"]),
+        "batch_size": trial.suggest_categorical("batch_size", [8,16,32]),
+        "unfreeze_layers": trial.suggest_int("unfreeze_layers", 1, 4),
+        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
+        "weight_decay": trial.suggest_float("weight_decay", 1e-8, 1e-1, log=True),
+        "gradient_clip_max_norm": trial.suggest_float("gradient_clip_max_norm", 0.1, 1.0),
         "augmentation_strength": trial.suggest_float("augmentation_strength", 0.0, 1.0),
         "crop_scale_min": trial.suggest_float("crop_scale_min", 0.6, 0.9),
         "max_frames": trial.suggest_int("max_frames", 5, 15),
+        "sigma": trial.suggest_float("sigma", 0.1, 0.5),
     }
+    # The [:3] slice keeps only the first three labels, so "baby_mill" is excluded
     class_labels = ["windmill", "halo", "swipe", "baby_mill"][:3]
+    # Fixed (non-searched) settings shared by every trial
     config.update({
         "class_labels": class_labels,
         "num_classes": len(class_labels),
+        "data_path": data_path,
+        "num_epochs": 50,
+        "patience": 10,
         "image_size": 224,
         "crop_scale_max": 1.0,
         "normalization_mean": [0.485, 0.456, 0.406],
         "overfitting_threshold": 10,
     })
+    # Derive augmentation parameters by scaling fixed maxima with augmentation_strength
     config.update({
         "flip_probability": 0.5 * config["augmentation_strength"],
         "rotation_degrees": int(15 * config["augmentation_strength"]),
         "hue_jitter": 0.1 * config["augmentation_strength"],
     })
+    # Create dataset-specific run directory
+    # NOTE: this happens before the try block, so an error raised here propagates to the caller
+    dataset_label = '_'.join(Path(data_path).parts[-2:])  # Get last two parts of path
+    trial_dir = create_run_directory(
+        prefix=f"trial_{dataset_label}",
+        parent_dir=hyperparam_run_dir
+    )
+    config["run_dir"] = trial_dir
+    # Run training and evaluation; any exception is logged below and converted to -inf
+    try:
+        # The result is unpacked as (validation accuracy, visualization directory)
+        val_accuracy, vis_dir = train_and_evaluate(config)
+        # Reject None/NaN/inf so an unusable score is handled as a failed trial by the except branch
+        if val_accuracy is None or math.isnan(val_accuracy) or math.isinf(val_accuracy):
+            raise ValueError(f"Invalid accuracy value: {val_accuracy}")
+        # Save trial info next to the trial's other outputs
+        trial_info = {
+            'dataset': data_path,
+            'dataset_label': dataset_label,
+            'trial_number': trial.number,
+            'parameters': trial.params,
+            'value': val_accuracy,
+            'visualization_dir': vis_dir
+        }
+        with open(os.path.join(trial_dir, 'trial_info.json'), 'w') as f:
+            json.dump(trial_info, f, indent=4)
+        return val_accuracy
+    except Exception as e:
+        print(f"Error in trial for {data_path}: {str(e)}")
+        # Append (mode 'a') so errors from all trials accumulate in one log file
+        error_log_path = os.path.join(hyperparam_run_dir, 'error_log.txt')
+        with open(error_log_path, 'a') as f:
+            f.write(f"\nError in trial at {datetime.now()}:\n")
+            f.write(f"Dataset: {data_path}\n")
+            f.write(f"Error: {str(e)}\n")
+            f.write(f"Trial params: {trial.params}\n")
+            f.write("Stack trace:\n")
+            import traceback
+            f.write(traceback.format_exc())
+            f.write("\n" + "="*50 + "\n")
+        # -inf is the failure sentinel: the lowest possible score
+        return float('-inf')
+def run_hyperparameter_search(data_paths, n_trials=100):
+    """Run hyperparameter search for multiple datasets"""
+    # Create parent directory for all searches
+    parent_hyperparam_dir = create_hyperparam_directory()
+    # Store results for all datasets
+    all_results = {}
+    for data_path in data_paths:
+        print(f"\nStarting hyperparameter search for dataset: {data_path}")
+        # Create dataset-specific directory
+        dataset_label = '_'.join(Path(data_path).parts[-2:])
+        dataset_dir = os.path.join(parent_hyperparam_dir, f"search_{dataset_label}")
+        os.makedirs(dataset_dir, exist_ok=True)
+        # Create and run study with explicit trial count tracking
+        study = optuna.create_study(direction="maximize")
+        completed_trials = 0
+        failed_trials = []
+        total_attempts = 0
+        max_attempts =  n_trials * 2
+        while completed_trials < n_trials and total_attempts < max_attempts:
+            try:
+                total_attempts += 1
+                study.optimize(
+                    lambda trial: objective(trial, dataset_dir, data_path),
+                    n_trials=1
+                )
+                # Only increment if the trial actually succeeded
+                if study.trials[-1].value != float('-inf'):
+                    completed_trials += 1
+                    print(f"Completed trial {completed_trials}/{n_trials} for {dataset_label}")
+                else:
+                    error_info = {
+                        'trial_number': completed_trials + len(failed_trials) + 1,
+                        'error': "Trial returned -inf",
+                        'timestamp': datetime.now().isoformat()
+                    }
+                    failed_trials.append(error_info)
+                    print(f"Failed trial for {dataset_label}: returned -inf")
+            except Exception as e:
+                error_info = {
+                    'trial_number': completed_trials + len(failed_trials) + 1,
+                    'error': str(e),
+                    'timestamp': datetime.now().isoformat()
+                }
+                failed_trials.append(error_info)
+                print(f"Error in trial for {dataset_label}: {str(e)}")
+                # Log the error
+                with open(os.path.join(dataset_dir, 'failed_trials.json'), 'w') as f:
+                    json.dump(failed_trials, f, indent=4)
+            if total_attempts >= max_attempts:
+                print(f"Warning: Reached maximum attempts ({max_attempts}) for {dataset_label}")
+        # Save study results
+        results_df = study.trials_dataframe()
+        results_df.to_csv(os.path.join(dataset_dir, 'study_results.csv'))
+        # Save trial statistics
+        trial_stats = {
+            'completed_trials': completed_trials,
+            'failed_trials': len(failed_trials),
+            'total_attempts': completed_trials + len(failed_trials)
+        }
+        with open(os.path.join(dataset_dir, 'trial_statistics.json'), 'w') as f:
+            json.dump(trial_stats, f, indent=4)
+        # Save best trial info
+        best_trial = study.best_trial
+        best_params_path = os.path.join(dataset_dir, 'best_params.txt')
+        with open(best_params_path, 'w') as f:
+            f.write(f"Best trial value: {best_trial.value}\n\n")
+            f.write("Best parameters:\n")
+            for key, value in best_trial.params.items():
+                f.write(f"{key}: {value}\n")
+        # Store results
+        all_results[data_path] = {
+            'best_value': best_trial.value,
+            'best_params': best_trial.params,
+            'study': study,
+            'results_df': results_df,
+            'failed_trials': failed_trials,
+            'trial_stats': trial_stats
+        }
+        print(f"\nResults for {data_path}:")
+        print(f"Completed trials: {completed_trials}")
+        print(f"Failed trials: {len(failed_trials)}")
+        print(f"Best trial value: {best_trial.value}")
+        print("Best parameters:")
+        for key, value in best_trial.params.items():
+            print(f"    {key}: {value}")
+    # Create overall summary with additional statistics
+    summary_data = []
+    for data_path, result in all_results.items():
+        summary_data.append({
+            'dataset': data_path,
+            'best_accuracy': result['best_value'],
+            'completed_trials': result['trial_stats']['completed_trials'],
+            'failed_trials': result['trial_stats']['failed_trials'],
+            **result['best_params']
+        })
+    summary_df = pd.DataFrame(summary_data)
+    summary_df.to_csv(os.path.join(parent_hyperparam_dir, 'overall_summary.csv'), index=False)
+    return parent_hyperparam_dir, all_results
 if __name__ == "__main__":
+    # List of dataset paths to optimize
+    data_paths = [
+        '../finetune/blog/bryant/random',
+        '../finetune/blog/bryant/adjusted',
+        '../finetune/blog/youtube/random',
+        '../finetune/blog/youtube/adjusted',
+        '../finetune/blog/combined/random',
+        '../finetune/blog/combined/adjusted',
+        '../finetune/blog/bryant_train_youtube_val/default'
+    ]
+    # Run hyperparameter search
+    hyperparam_dir, results = run_hyperparameter_search(
+        data_paths,
+        n_trials=8  # Adjust as needed
+    )
+    print(f"\nHyperparameter search complete!")
+    print(f"Results are saved in: {hyperparam_dir}")

script/train.py CHANGED Viewed

@@ -7,6 +7,7 @@ import logging
 import csv
 import json
 from torch.optim.lr_scheduler import CosineAnnealingLR
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(__file__)))
@@ -15,209 +16,253 @@ from src.utils.utils import create_run_directory
 from src.dataset.dataset import VideoDataset
 from src.models.model import create_model
 from src.dataset.video_utils import create_transform
 def train_and_evaluate(config):
-    # Create a run directory if it doesn't exist
-    if "run_dir" not in config:
-        config["run_dir"] = create_run_directory()
-    # Update paths based on run_dir
-    config.update({
-        "best_model_path": os.path.join(config["run_dir"], 'best_model.pth'),
-        "final_model_path": os.path.join(config["run_dir"], 'final_model.pth'),
-        "csv_path": os.path.join(config["run_dir"], 'training_log.csv'),
-        "misclassifications_dir": os.path.join(config["run_dir"], 'misclassifications'),
-    })
-    config_path = os.path.join(config["run_dir"], 'config.json')
-    with open(config_path, 'w') as f:
-        json.dump(config, f, indent=2)
-    # Set up logging
-    logging.basicConfig(level=logging.INFO,
-                        format='%(asctime)s - %(levelname)s - %(message)s',
-                        handlers=[logging.FileHandler(os.path.join(config["run_dir"], 'training.log')),
-                                logging.StreamHandler()])
-    logger = logging.getLogger(__name__)
-    # Set device
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    logger.info(f"Using device: {device}")
-    # Initialize variables
-    best_val_loss = float('inf')
-    epochs_without_improvement = 0
-    model = create_model(config["num_classes"], config["clip_model"])
-    # Unfreeze the last 2 layers of the vision encoder
-    model.unfreeze_vision_encoder(num_layers=config["unfreeze_layers"])
-    # Move model to device
-    model = model.to(device)
-    logger.info(f"Model architecture:\n{model}")
-    # Load datasets
-    train_dataset = VideoDataset(
-        os.path.join(config['data_path'], 'train.csv'),
-        config=config
-    )
-    # For validation, create a new config with training=False for transforms
-    val_config = config.copy()
-    val_dataset = VideoDataset(
-        os.path.join(config['data_path'], 'val.csv'),
-        config=val_config,
-        transform=create_transform(config, training=False)
-    )
-    # Create data loaders
-    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
-    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)
-    # Define optimizer and learning rate scheduler
-    optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
-    scheduler = CosineAnnealingLR(optimizer, T_max=config["num_epochs"])
-    # Open a CSV file to log training progress
-    with open(config["csv_path"], 'w', newline='') as file:
-        writer = csv.writer(file)
-        writer.writerow(["epoch", "train_loss", "train_accuracy", "val_loss", "val_accuracy"])
-    # Function to calculate accuracy
-    def calculate_accuracy(outputs, labels):
-        _, predicted = torch.max(outputs, 1)
-        correct = (predicted == labels).sum().item()
-        total = labels.size(0)
-        return correct / total
-    def log_misclassifications(outputs, labels, video_paths, dataset, misclassified_videos):
-        _, predicted = torch.max(outputs, 1)
-        for pred, label, video_path in zip(predicted, labels, video_paths):
-            if pred != label:
-                true_label = dataset.label_map[label.item()]
-                predicted_label = dataset.label_map[pred.item()]
-                misclassified_videos.append({
-                    'video_path': video_path,
-                    'true_label': true_label,
-                    'predicted_label': predicted_label
-                })
-    # Create a subfolder for misclassification logs
-    os.makedirs(config["misclassifications_dir"], exist_ok=True)
-    # Training loop
-    for epoch in range(config["num_epochs"]):
-        model.train()
-        total_loss = 0
-        total_accuracy = 0
-        for frames, labels, video_paths in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{config['num_epochs']}"):
-            frames = frames.to(device)
-            labels = labels.to(device)
-            logits = model(frames)
-            loss = torch.nn.functional.cross_entropy(logits, labels)
-            accuracy = calculate_accuracy(logits, labels)
-            optimizer.zero_grad()
-            loss.backward()
-            clip_grad_norm_(model.parameters(), max_norm=config["gradient_clip_max_norm"])
-            optimizer.step()
-            total_loss += loss.item()
-            total_accuracy += accuracy
-        avg_train_loss = total_loss / len(train_loader)
-        avg_train_accuracy = total_accuracy / len(train_loader)
-        # Validation
-        model.eval()
-        val_loss = 0
-        val_accuracy = 0
-        misclassified_videos = []
-        with torch.no_grad():
-            for frames, labels, video_paths in val_loader:
                 frames = frames.to(device)
                 labels = labels.to(device)
                 logits = model(frames)
-                loss = torch.nn.functional.cross_entropy(logits, labels)
                 accuracy = calculate_accuracy(logits, labels)
-                val_loss += loss.item()
-                val_accuracy += accuracy
-                # Log misclassifications
-                log_misclassifications(logits, labels, video_paths, val_dataset, misclassified_videos)
-        avg_val_loss = val_loss / len(val_loader)
-        avg_val_accuracy = val_accuracy / len(val_loader)
-        # Log misclassified videos
-        if misclassified_videos:
-            misclassified_log_path = os.path.join(config["misclassifications_dir"], f'epoch_{epoch+1}.json')
-            with open(misclassified_log_path, 'w') as f:
-                json.dump(misclassified_videos, f, indent=2)
-            logger.info(f"Logged {len(misclassified_videos)} misclassified videos to {misclassified_log_path}")
-        # Log the metrics
-        logger.info(f"Epoch [{epoch+1}/{config['num_epochs']}], "
-                    f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_accuracy*100:.2f}%, "
-                    f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {avg_val_accuracy*100:.2f}%")
-        # Write to CSV
-        with open(config["csv_path"], 'a', newline='') as file:
-            writer = csv.writer(file)
-            writer.writerow([epoch+1, avg_train_loss, avg_train_accuracy*100, avg_val_loss, avg_val_accuracy*100])
-        # Learning rate scheduling
-        scheduler.step()
-        # Save the best model and check for early stopping
-        if avg_val_loss < best_val_loss:
-            best_val_loss = avg_val_loss
-            torch.save(model.state_dict(), config["best_model_path"])
-            logger.info(f"Saved best model to {config['best_model_path']}")
-            epochs_without_improvement = 0
-        else:
-            epochs_without_improvement += 1
-        # Early stopping check
-        if epochs_without_improvement >= config["patience"]:
-            logger.info(f"Early stopping triggered after {config['patience']} epochs without improvement")
-            break
-        # Overfitting detection
-        if avg_train_accuracy - avg_val_accuracy > config["overfitting_threshold"]:
-            logger.warning("Possible overfitting detected")
-    logger.info("Training finished!")
-    # Save the final model
-    torch.save(model.state_dict(), config["final_model_path"])
-    logger.info(f"Saved final model to {config['final_model_path']}")
-    # Save run information
-    with open(os.path.join(config["run_dir"], 'run_info.txt'), 'w') as f:
-        for key, value in config.items():
-            f.write(f"{key}: {value}\n")
-        f.write(f"Device: {device}\n")
-        f.write(f"Model: {model.__class__.__name__}\n")
-        f.write(f"Optimizer: {optimizer.__class__.__name__}\n")
-        f.write(f"Scheduler: {scheduler.__class__.__name__}\n")
-        f.write(f"Loss function: CrossEntropyLoss\n")
-        f.write(f"Data augmentation: RandomHorizontalFlip, RandomRotation(5), ColorJitter\n")
-        f.write(f"Mixed precision training: {'Enabled' if 'scaler' in locals() else 'Disabled'}\n")
-        f.write(f"Train dataset size: {len(train_dataset)}\n")
-        f.write(f"Validation dataset size: {len(val_dataset)}\n")
-        f.write(f"Vision encoder frozen: {'Partially' if hasattr(model, 'unfreeze_vision_encoder') else 'Unknown'}\n")
-    print("Script finished.")
-    return avg_val_accuracy
 def main():
     # Create run directory
@@ -228,35 +273,57 @@ def main():
     config = {
         "class_labels": class_labels,
         "num_classes": len(class_labels),
-        "data_path": '../finetune/3moves_otherpeopleval',
         "batch_size": 32,
-        "learning_rate": 2e-6,
-        "weight_decay": 0.007,
         "num_epochs": 50,
-        "patience": 10,  # for early stopping
-        "max_frames": 10,
-        "sigma": 0.3,
         "image_size": 224,
-        "flip_probability": 0.5,
-        "rotation_degrees": 15,
-        "brightness_jitter": 0.2,
-        "contrast_jitter": 0.2,
-        "saturation_jitter": 0.2,
-        "hue_jitter": 0.1,
-        "crop_scale_min": 0.8,
         "crop_scale_max": 1.0,
-        "normalization_mean": [0.485, 0.456, 0.406],
-        "normalization_std": [0.229, 0.224, 0.225],
-        "unfreeze_layers": 3,
-        "clip_model": "openai/clip-vit-large-patch14",
-        # "clip_model": "openai/clip-vit-base-patch32",
-        "gradient_clip_max_norm": 1.0,
         "overfitting_threshold": 10,
         "run_dir": run_dir,
-        "best_model_path": os.path.join(run_dir, 'best_model.pth'),
-        "final_model_path": os.path.join(run_dir, 'final_model.pth'),
-        "csv_path": os.path.join(run_dir, 'training_log.csv'),
-        "misclassifications_dir": os.path.join(run_dir, 'misclassifications'),
     }
     train_and_evaluate(config)

 import csv
 import json
 from torch.optim.lr_scheduler import CosineAnnealingLR
+import math
 import sys
 sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 from src.dataset.dataset import VideoDataset
 from src.models.model import create_model
 from src.dataset.video_utils import create_transform
+from visualization.visualize import run_visualization
+from visualization.miscalculations_report import analyze_misclassifications
 def train_and_evaluate(config):
+    try:
+        # Create a run directory if it doesn't exist
+        if "run_dir" not in config:
+            config["run_dir"] = create_run_directory()
+        # Update paths based on run_dir
+        config.update({
+            "best_model_path": os.path.join(config["run_dir"], 'best_model.pth'),
+            "final_model_path": os.path.join(config["run_dir"], 'final_model.pth'),
+            "csv_path": os.path.join(config["run_dir"], 'training_log.csv'),
+            "misclassifications_dir": os.path.join(config["run_dir"], 'misclassifications'),
+        })
+        config_path = os.path.join(config["run_dir"], 'config.json')
+        with open(config_path, 'w') as f:
+            json.dump(config, f, indent=2)
+        # Set up logging
+        logging.basicConfig(level=logging.INFO,
+                            format='%(asctime)s - %(levelname)s - %(message)s',
+                            handlers=[logging.FileHandler(os.path.join(config["run_dir"], 'training.log')),
+                                    logging.StreamHandler()])
+        logger = logging.getLogger(__name__)
+        # Use CUDA when it is available, otherwise fall back to CPU
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        logger.info(f"Using device: {device}")
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        # Initialize variables
+        best_val_loss = float('inf')
+        epochs_without_improvement = 0
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f}GB")
+            print(f"Currently allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")
+        model = create_model(config["num_classes"], config["clip_model"])
+        # Unfreeze the last config["unfreeze_layers"] layers of the vision encoder
+        model.unfreeze_vision_encoder(num_layers=config["unfreeze_layers"])
+        model = model.to(device)
+        # Ensure criterion is on the same device
+        criterion = torch.nn.CrossEntropyLoss().to(device)
+        # logger.info(f"Model architecture:\n{model}")
+        # Load datasets
+        train_dataset = VideoDataset(
+            os.path.join(config['data_path'], 'train.csv'),
+            config=config
+        )
+        # For validation, pass a copy of the config and an explicit transform built with training=False
+        val_config = config.copy()
+        val_dataset = VideoDataset(
+            os.path.join(config['data_path'], 'val.csv'),
+            config=val_config,
+            transform=create_transform(config, training=False)
+        )
+        # Create data loaders
+        train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
+        val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)
+        # Define optimizer and learning rate scheduler
+        optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
+        scheduler = CosineAnnealingLR(optimizer, T_max=config["num_epochs"])
+        # Open a CSV file to log training progress
+        with open(config["csv_path"], 'w', newline='') as file:
+            writer = csv.writer(file)
+            writer.writerow(["epoch", "train_loss", "train_accuracy", "val_loss", "val_accuracy"])
+        # Function to calculate accuracy
+        def calculate_accuracy(outputs, labels):
+            _, predicted = torch.max(outputs, 1)
+            correct = (predicted == labels).sum().item()
+            total = labels.size(0)
+            return correct / total
+        def log_misclassifications(outputs, labels, video_paths, dataset, misclassified_videos):
+            _, predicted = torch.max(outputs, 1)
+            for pred, label, video_path in zip(predicted, labels, video_paths):
+                if pred != label:
+                    true_label = dataset.label_map[label.item()]
+                    predicted_label = dataset.label_map[pred.item()]
+                    misclassified_videos.append({
+                        'video_path': video_path,
+                        'true_label': true_label,
+                        'predicted_label': predicted_label
+                    })
+        # Create a subfolder for misclassification logs
+        os.makedirs(config["misclassifications_dir"], exist_ok=True)
+        # Training loop
+        for epoch in range(config["num_epochs"]):
+            model.train()
+            total_loss = 0
+            total_accuracy = 0
+            for frames, labels, video_paths in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{config['num_epochs']}"):
                 frames = frames.to(device)
                 labels = labels.to(device)
                 logits = model(frames)
+                loss = criterion(logits, labels)
                 accuracy = calculate_accuracy(logits, labels)
+                optimizer.zero_grad()
+                loss.backward()
+                clip_grad_norm_(model.parameters(), max_norm=config["gradient_clip_max_norm"])
+                optimizer.step()
+                total_loss += loss.item()
+                total_accuracy += accuracy
+            avg_train_loss = total_loss / len(train_loader)
+            avg_train_accuracy = total_accuracy / len(train_loader)
+            # Validation
+            model.eval()
+            val_loss = 0
+            val_accuracy = 0
+            misclassified_videos = []
+            with torch.no_grad():
+                for frames, labels, video_paths in val_loader:
+                    frames = frames.to(device)
+                    labels = labels.to(device)
+                    logits = model(frames)
+                    loss = criterion(logits, labels)
+                    accuracy = calculate_accuracy(logits, labels)
+                    val_loss += loss.item()
+                    val_accuracy += accuracy
+                    # Log misclassifications
+                    log_misclassifications(logits, labels, video_paths, val_dataset, misclassified_videos)
+            avg_val_loss = val_loss / len(val_loader)
+            avg_val_accuracy = val_accuracy / len(val_loader)
+            # Log misclassified videos
+            if misclassified_videos:
+                misclassified_log_path = os.path.join(config["misclassifications_dir"], f'epoch_{epoch+1}.json')
+                with open(misclassified_log_path, 'w') as f:
+                    json.dump(misclassified_videos, f, indent=2)
+                logger.info(f"Logged {len(misclassified_videos)} misclassified videos to {misclassified_log_path}")
+            # Log the metrics
+            logger.info(f"Epoch [{epoch+1}/{config['num_epochs']}], "
+                        f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_accuracy*100:.2f}%, "
+                        f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {avg_val_accuracy*100:.2f}%")
+            # Write to CSV
+            with open(config["csv_path"], 'a', newline='') as file:
+                writer = csv.writer(file)
+                writer.writerow([epoch+1, avg_train_loss, avg_train_accuracy*100, avg_val_loss, avg_val_accuracy*100])
+            # Learning rate scheduling
+            scheduler.step()
+            # Save the best model and check for early stopping
+            if avg_val_loss < best_val_loss:
+                best_val_loss = avg_val_loss
+                torch.save(model.state_dict(), config["best_model_path"])
+                logger.info(f"Saved best model to {config['best_model_path']}")
+                epochs_without_improvement = 0
+            else:
+                epochs_without_improvement += 1
+            # Early stopping check
+            if epochs_without_improvement >= config["patience"]:
+                logger.info(f"Early stopping triggered after {config['patience']} epochs without improvement")
+                break
+            # Overfitting detection
+            if avg_train_accuracy - avg_val_accuracy > config["overfitting_threshold"]:
+                logger.warning("Possible overfitting detected")
+        logger.info("Training finished!")
+        # Save the final model
+        torch.save(model.state_dict(), config["final_model_path"])
+        logger.info(f"Saved final model to {config['final_model_path']}")
+        # Save run information
+        with open(os.path.join(config["run_dir"], 'run_info.txt'), 'w') as f:
+            for key, value in config.items():
+                f.write(f"{key}: {value}\n")
+            f.write(f"Device: {device}\n")
+            f.write(f"Model: {model.__class__.__name__}\n")
+            f.write(f"Optimizer: {optimizer.__class__.__name__}\n")
+            f.write(f"Scheduler: {scheduler.__class__.__name__}\n")
+            f.write(f"Loss function: CrossEntropyLoss\n")
+            f.write(f"Data augmentation: RandomHorizontalFlip, RandomRotation(5), ColorJitter\n")
+            f.write(f"Mixed precision training: {'Enabled' if 'scaler' in locals() else 'Disabled'}\n")
+            f.write(f"Train dataset size: {len(train_dataset)}\n")
+            f.write(f"Validation dataset size: {len(val_dataset)}\n")
+            f.write(f"Vision encoder frozen: {'Partially' if hasattr(model, 'unfreeze_vision_encoder') else 'Unknown'}\n")
+        # Run visualization
+        try:
+            logger.info("Running visualization...")
+            vis_dir, confusion_matrix = run_visualization(config["run_dir"])
+            logger.info(f"Visualization complete! Check the output directory: {vis_dir}")
+            # Log confusion matrix results
+            class_accuracies = confusion_matrix.diagonal() / confusion_matrix.sum(axis=1)
+            overall_accuracy = confusion_matrix.diagonal().sum() / confusion_matrix.sum()
+            logger.info("\nConfusion Matrix Results:")
+            for i, (label, accuracy) in enumerate(zip(config['class_labels'], class_accuracies)):
+                logger.info(f"{label}: {accuracy:.2%}")
+            logger.info(f"Overall Accuracy: {overall_accuracy:.2%}")
+        except Exception as e:
+            logger.error(f"Error running visualization: {str(e)}")
+        # Run misclassification analysis
+        try:
+            analyze_misclassifications(config["run_dir"])
+            logger.info(f"Misclassification analysis complete! Check the output directory: {config['run_dir']}")
+        except Exception as e:
+            logger.error(f"Error running misclassification analysis: {str(e)}")
+        if math.isnan(avg_val_accuracy) or math.isinf(avg_val_accuracy):
+                raise ValueError(f"Invalid validation accuracy: {avg_val_accuracy}")
+        print("Script finished.")
+        return avg_val_accuracy, vis_dir
+    except Exception as e:
+        logger.error(f"Training error: {str(e)}")
+        raise  # Re-raise the exception to be caught by the hyperparameter tuning
 def main():
     # Entry point for a single training run: builds one fixed config dict and
     # hands it to train_and_evaluate.
     # NOTE(review): class_labels and run_dir are read below but are not assigned
     # in the lines shown here — presumably set in lines this diff view elides; confirm.
     # Create run directory
     config = {
         "class_labels": class_labels,
         "num_classes": len(class_labels),
+        "clip_model": "openai/clip-vit-large-patch14",
         "batch_size": 32,
+        "unfreeze_layers": 4,
         # NOTE(review): the high-precision values below look like they were copied
         # from a hyperparameter-search result — confirm provenance.
+        "learning_rate": 5.305885796107412e-06,
+        "weight_decay": 4.543630233732527e-07,
+        "gradient_clip_max_norm": 0.6446650879658523,
+        "augmentation_strength": 0.5827616006715585,
+        "crop_scale_min": 0.7872781274088598,
+        "max_frames": 15,
+        "sigma": 0.286510943464138,
+        "data_path": "../finetune/blog/bryant/random",
         "num_epochs": 50,
+        "patience": 10,
         "image_size": 224,
         "crop_scale_max": 1.0,
+        "normalization_mean": [
+            0.485,
+            0.456,
+            0.406
+        ],
+        "normalization_std": [
+            0.229,
+            0.224,
+            0.225
+        ],
         # NOTE(review): the training loop compares (avg_train_accuracy - avg_val_accuracy)
         # against this value, and multiplies those accuracies by 100 only when
         # printing them. If they are fractions in [0, 1], a threshold of 10 can
         # never be exceeded — confirm the intended units.
         "overfitting_threshold": 10,
         # Everything from here down to "run_dir" is commented out and therefore
         # inactive; presumably an earlier configuration kept for reference.
+        # "data_path": '../finetune/blog/bryant/random',
+        # "batch_size": 8,
+        # "learning_rate": 2e-6,
+        # "weight_decay": 0.007,
+        # "num_epochs": 2,
+        # "patience": 10,  # for early stopping
+        # "max_frames": 10,
+        # "sigma": 0.3,
+        # "image_size": 224,
+        # "flip_probability": 0.5,
+        # "rotation_degrees": 15,
+        # "brightness_jitter": 0.2,
+        # "contrast_jitter": 0.2,
+        # "saturation_jitter": 0.2,
+        # "hue_jitter": 0.1,
+        # "crop_scale_min": 0.8,
+        # "crop_scale_max": 1.0,
+        # "normalization_mean": [0.485, 0.456, 0.406],
+        # "normalization_std": [0.229, 0.224, 0.225],
+        # "unfreeze_layers": 3,
+        # # "clip_model": "openai/clip-vit-large-patch14",
+        # "clip_model": "openai/clip-vit-base-patch32",
+        # "gradient_clip_max_norm": 1.0,
+        # "overfitting_threshold": 10,
         "run_dir": run_dir,
     }
     train_and_evaluate(config)

script/visualization/visualize.py CHANGED Viewed

@@ -110,28 +110,28 @@ def generate_evaluation_metrics(model, data_loader, device, output_dir, class_la
     return cm
-if __name__ == "__main__":
-    # Find the most recent run directory
-    #
-    run_dir = get_latest_run_dir()
-    # run_dir= "/home/bawolf/workspace/break/clip/runs/run_20241024-150232_otherpeopleval_large_model"
-    # run_dir = "/home/bawolf/workspace/break/clip/runs/run_20241022-122939_3moves_balanced"
     # Load configuration
     config = get_config(run_dir)
     class_labels = config['class_labels']
     num_classes = config['num_classes']
-    data_path = config['data_path']
-    # data_path= '../finetune/3moves_otherpeopleval'
-    # data_path = '../finetune/otherpeople3moves'
     # Paths
     log_file = os.path.join(run_dir, 'training_log.csv')
     model_path = get_latest_model_path(run_dir)
-    test_csv = os.path.join(data_path, 'test.csv')
-    # test_csv = os.path.join(data_path, 'val.csv')
-    # test_csv = os.path.join(data_path, 'train.csv')
     # Get the last directory of data_path and the file name
     last_dir = os.path.basename(os.path.normpath(data_path))
@@ -160,3 +160,12 @@ if __name__ == "__main__":
     cm = generate_evaluation_metrics(model, test_loader, device, vis_dir, class_labels, data_info)
     print(f"Visualization complete! Check the output directory: {vis_dir}")

     return cm
+def run_visualization(run_dir, data_path=None, test_csv=None):
+    """
+    Run visualization for a specific training run
+    Args:
+        run_dir (str): Path to the run directory
+        data_path (str, optional): Override the data path from config
+        test_csv (str, optional): Override the test CSV path
+    """
     # Returns a (vis_dir, cm) tuple, where cm is whatever
     # generate_evaluation_metrics returns.
     # Load configuration
     config = get_config(run_dir)
     class_labels = config['class_labels']
     num_classes = config['num_classes']
     # Any falsy data_path (None or "") falls back to the path stored in the run's config.
+    data_path = data_path or config['data_path']
     # Paths
     log_file = os.path.join(run_dir, 'training_log.csv')
     model_path = get_latest_model_path(run_dir)
     # Default to test.csv inside data_path unless the caller passed an explicit CSV.
+    if test_csv is None:
+        test_csv = os.path.join(data_path, 'test.csv')
     # Get the last directory of data_path and the file name
     last_dir = os.path.basename(os.path.normpath(data_path))
     # NOTE(review): model, test_loader, device, vis_dir and data_info are not
     # assigned in the lines shown here — presumably built in lines this diff
     # view elides; confirm.
     cm = generate_evaluation_metrics(model, test_loader, device, vis_dir, class_labels, data_info)
     print(f"Visualization complete! Check the output directory: {vis_dir}")
+    return vis_dir, cm
+if __name__ == "__main__":
+    # Find the most recent run directory
+    run_dir = get_latest_run_dir()
     # The commented-out assignments below are hard-coded, machine-specific
     # alternatives for pointing the script at one particular past run.
+    # run_dir = "/home/bawolf/workspace/break/clip/runs/run_20241024-150232_otherpeopleval_large_model"
+    # run_dir = "/home/bawolf/workspace/break/clip/runs/run_20241022-122939_3moves_balanced"
+    run_visualization(run_dir)

src/dataset/dataset.py CHANGED Viewed

@@ -2,6 +2,7 @@ import torch
 from torch.utils.data import Dataset
 import csv
 from .video_utils import create_transform, extract_frames
 class VideoDataset(Dataset):
     def __init__(self, file_path, config, transform=None):
@@ -29,7 +30,8 @@ class VideoDataset(Dataset):
                 if len(row) != 2:
                     print(f"Skipping invalid row: {row}")
                     continue
-                video_path, label = row
                 try:
                     label = int(label)
                 except ValueError:

 from torch.utils.data import Dataset
 import csv
 from .video_utils import create_transform, extract_frames
+import os
 class VideoDataset(Dataset):
     def __init__(self, file_path, config, transform=None):
                 if len(row) != 2:
                     print(f"Skipping invalid row: {row}")
                     continue
+                relative_video_path, label = row
+                video_path = os.path.join(config['data_path'], relative_video_path)
                 try:
                     label = int(label)
                 except ValueError: