File size: 6,801 Bytes

08bbb4c

#!/usr/bin/env python3
"""
Clean up script for removing training runs without exported models.
Removes all directories in runs/ folder that don't have a corresponding exported model file.
"""

import argparse
import os
import shutil
from pathlib import Path
import glob


def find_exported_models():
    """Find all exported model files in the current directory.

    An exported model is any file in the current working directory whose name
    ends in ``_YYYYMMDD_HHMMSS.joblib`` (for example
    ``dataset_sentiment_20240101_120000.joblib``). The part before the
    timestamp may itself contain any number of underscores, or none.

    Returns:
        A list of dicts, one per file, with the keys:
            "file": the path as returned by ``glob.glob``
            "timestamp": the trailing ``YYYYMMDD_HHMMSS`` part of the name
        Entries are ordered by file path.
    """
    suffix = ".joblib"

    # Look for pattern: *_YYYYMMDD_HHMMSS.joblib
    # This matches any exported model with timestamp format
    patterns = [
        "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
    ]

    exported_models = []
    seen_files = set()  # Guards against duplicates if more patterns are added

    for pattern in patterns:
        # Sorted so the reported order does not depend on directory order
        for filepath in sorted(glob.glob(pattern)):
            if filepath in seen_files:
                continue
            seen_files.add(filepath)

            # Strip the extension by length rather than by text replacement so
            # that only the trailing suffix is removed.
            stem = os.path.basename(filepath)[:-len(suffix)]

            # The glob guarantees the name ends in "_<date>_<time>", so the
            # last two underscore-separated fields are always the timestamp,
            # however many underscores the model name in front contains.
            fields = stem.rsplit("_", 2)
            if len(fields) < 3:
                continue
            exported_models.append({
                "file": filepath,
                "timestamp": "_".join(fields[-2:])
            })

    return exported_models


def find_all_runs():
    """Collect every sub-directory of the ``runs`` folder as a run record.

    Returns:
        A list of dicts with the keys "path" (the directory as a ``Path``)
        and "timestamp" (the directory name; run directories are expected to
        be named ``YYYYMMDD_HHMMSS``). Plain files inside ``runs`` are
        ignored. The list is empty when ``runs`` does not exist.
    """
    root = Path("runs")
    if root.exists():
        # The directory name doubles as the run's timestamp identifier.
        return [
            {"path": entry, "timestamp": entry.name}
            for entry in root.iterdir()
            if entry.is_dir()
        ]
    return []


def _run_metadata_summary(run_path):
    """Return a one-line summary of ``run_path/metadata.json``, or None if unavailable."""
    import json  # only needed for this verbose-mode preview

    metadata_path = run_path / "metadata.json"
    if not metadata_path.exists():
        return None

    try:
        with open(metadata_path, encoding="utf-8") as f:
            metadata = json.load(f)
    except (OSError, ValueError):
        # Unreadable or malformed metadata must never abort the cleanup;
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        return None

    if not isinstance(metadata, dict):
        return None

    accuracy = metadata.get("test_accuracy")
    if accuracy is None:
        accuracy_text = "N/A"
    else:
        try:
            accuracy_text = f"{accuracy:.4f}"
        except (TypeError, ValueError):
            # Non-numeric value: show it as stored instead of crashing.
            accuracy_text = str(accuracy)

    return (f"    Model: {metadata.get('model_name', 'unknown')}, "
            f"Dataset: {metadata.get('dataset', 'unknown')}, "
            f"Accuracy: {accuracy_text}")


def clean_runs(dry_run=False, verbose=False):
    """
    Remove all run directories that don't have exported models.

    A run is kept when its timestamp (its directory name) equals the timestamp
    of one of the exported models reported by find_exported_models(). Every
    other directory reported by find_all_runs() is listed and, unless
    dry_run is set, deleted recursively. A summary is printed in all cases.

    Args:
        dry_run: If True, only show what would be deleted without actually deleting
        verbose: If True, show detailed information

    Returns:
        Tuple of (runs_to_keep, runs_to_delete). runs_to_delete lists every
        run selected for deletion, including any whose removal failed.
    """
    # Find all exported models
    exported_models = find_exported_models()
    exported_timestamps = {model["timestamp"] for model in exported_models}

    # Find all runs
    all_runs = find_all_runs()

    # Categorize runs
    runs_to_keep = []
    runs_to_delete = []

    for run in all_runs:
        if run["timestamp"] in exported_timestamps:
            runs_to_keep.append(run)
        else:
            runs_to_delete.append(run)

    # Show summary
    print(f"Found {len(all_runs)} total runs")
    print(f"Found {len(exported_models)} exported models")
    print(f"Runs to keep: {len(runs_to_keep)}")
    print(f"Runs to delete: {len(runs_to_delete)}")

    if verbose and exported_models:
        print("\nExported models found:")
        for model in exported_models:
            print(f"  - {model['file']} (timestamp: {model['timestamp']})")

    if verbose and runs_to_keep:
        print("\nRuns with exported models (will be kept):")
        for run in runs_to_keep:
            print(f"  - {run['path']}")

    if runs_to_delete:
        print("\nRuns without exported models (will be deleted):")
        for run in runs_to_delete:
            print(f"  - {run['path']}")
            if verbose:
                summary = _run_metadata_summary(run["path"])
                if summary is not None:
                    print(summary)

    # Calculate space to be freed
    total_size = 0
    for run in runs_to_delete:
        total_size += sum(f.stat().st_size for f in run["path"].rglob("*") if f.is_file())

    if total_size > 0:
        size_mb = total_size / (1024 * 1024)
        print(f"\nTotal space to be freed: {size_mb:.2f} MB")

    # Perform deletion if not dry run
    if not runs_to_delete:
        print("\nNo runs to delete - all runs have exported models or no runs found")
    elif dry_run:
        print("\nDry run mode - no files were deleted")
        print("Run without --dry-run to actually delete these directories")
    else:
        deleted_count = 0
        for run in runs_to_delete:
            try:
                shutil.rmtree(run["path"])
            except OSError as e:
                # Keep going: one undeletable run should not block the rest.
                print(f"Error deleting {run['path']}: {e}")
            else:
                deleted_count += 1
                if verbose:
                    print(f"Deleted: {run['path']}")

        print(f"\nSuccessfully deleted {deleted_count} run(s)")

    return runs_to_keep, runs_to_delete


def main():
    """Command-line entry point.

    Parses --dry-run, --verbose/-v and --yes/-y, prints a preview of the runs
    that would be removed, and then deletes them after confirmation. Nothing
    is deleted when --dry-run is given, when there is nothing to delete, or
    when the user declines (or cannot answer) the confirmation prompt.
    """
    parser = argparse.ArgumentParser(
        description="Clean up training runs without exported models"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed information about runs",
    )
    parser.add_argument(
        "--yes",
        "-y",
        action="store_true",
        help="Skip confirmation prompt",
    )

    args = parser.parse_args()

    # Check if runs directory exists
    if not Path("runs").exists():
        print("No 'runs' directory found. Nothing to clean.")
        return

    # Find runs to delete
    print("Analyzing runs directory...\n")

    # Do a dry run first to show what will be deleted
    _, runs_to_delete = clean_runs(dry_run=True, verbose=args.verbose)

    # The preview is all there is to do in dry-run mode or with nothing to delete
    if args.dry_run or not runs_to_delete:
        return

    # Ask for confirmation unless auto-yes was requested
    if not args.yes:
        print("\n" + "=" * 60)
        try:
            response = input(f"Are you sure you want to delete {len(runs_to_delete)} run(s)? [y/N]: ")
        except EOFError:
            # stdin is closed or not interactive: treat it as "no" rather
            # than crashing with a traceback.
            print("\nNo input available - cleanup cancelled (use --yes to skip confirmation)")
            return
        # Tolerate surrounding whitespace and the spelled-out answer
        if response.strip().lower() not in ("y", "yes"):
            print("Cleanup cancelled")
            return

    # Perform actual cleanup
    print("\nPerforming cleanup...")
    clean_runs(dry_run=False, verbose=args.verbose)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()