File size: 6,801 Bytes
08bbb4c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
#!/usr/bin/env python3
"""
Clean up script for removing training runs without exported models.
Removes all directories in runs/ folder that don't have a corresponding exported model file.
"""
import argparse
import os
import shutil
from pathlib import Path
import glob
def find_exported_models():
    """Find all exported model files in the current directory.

    A file counts as an exported model when its name ends in
    ``_YYYYMMDD_HHMMSS.joblib`` (years 20xx), for example
    ``dataset_sentiment_20240131_154500.joblib``. Any prefix is accepted,
    including one without underscores such as ``model_20240131_154500.joblib``.

    Returns:
        A list of dicts, one per matching file, with the keys ``"file"``
        (the path as returned by ``glob``) and ``"timestamp"`` (the
        ``YYYYMMDD_HHMMSS`` part of the file name). Matches of each pattern
        are listed in sorted order.
    """
    # Look for pattern: *_YYYYMMDD_HHMMSS.joblib
    patterns = [
        "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
    ]

    exported_models = []
    seen_files = set()  # Guards against duplicates if several patterns overlap

    for pattern in patterns:
        # glob returns files in arbitrary order; sort for reproducible output.
        for filepath in sorted(glob.glob(pattern)):
            if filepath in seen_files:
                continue
            seen_files.add(filepath)

            # The timestamp is always the last two "_"-separated fields of the
            # stem. Splitting from the right keeps this independent of how
            # many underscores the model-name prefix contains.
            parts = Path(filepath).stem.rsplit("_", 2)
            if len(parts) != 3:
                continue

            exported_models.append({
                "file": filepath,
                "timestamp": "_".join(parts[1:]),
            })

    return exported_models
def find_all_runs():
    """Collect every run directory located directly under ``runs/``.

    Returns:
        A list of dicts, one per sub-directory of ``runs``, with the keys
        ``"path"`` (the directory as a ``Path``) and ``"timestamp"`` (the
        directory name). The list is empty when ``runs`` does not exist.
    """
    root = Path("runs")
    if not root.exists():
        return []

    # Run directories are named after their timestamp (YYYYMMDD_HHMMSS), so
    # the directory name is used as the timestamp as-is.
    return [
        {"path": entry, "timestamp": entry.name}
        for entry in root.iterdir()
        if entry.is_dir()
    ]
def _summarize_run_metadata(metadata_path):
    """Return a one-line summary of a run's metadata.json, or None if unusable."""
    import json  # Local import: only needed for verbose output.

    try:
        with open(metadata_path, encoding="utf-8") as f:
            metadata = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file or invalid JSON: the summary is purely
        # informational, so skip it rather than abort the cleanup.
        return None

    if not isinstance(metadata, dict):
        return None

    # Only apply the float format to real numbers; the "N/A" placeholder (or
    # any other non-numeric value) would raise ValueError with ":.4f".
    accuracy = metadata.get("test_accuracy", "N/A")
    if isinstance(accuracy, (int, float)) and not isinstance(accuracy, bool):
        accuracy = f"{accuracy:.4f}"

    return (f" Model: {metadata.get('model_name', 'unknown')}, "
            f"Dataset: {metadata.get('dataset', 'unknown')}, "
            f"Accuracy: {accuracy}")


def _directory_size(directory):
    """Return the total size in bytes of all regular files below *directory*."""
    return sum(f.stat().st_size for f in directory.rglob("*") if f.is_file())


def clean_runs(dry_run=False, verbose=False):
    """
    Remove all run directories that don't have exported models.

    A run is kept when its timestamp (the directory name) equals the
    timestamp of at least one exported model file; every other directory
    under ``runs/`` is scheduled for deletion. A summary is always printed.

    Args:
        dry_run: If True, only show what would be deleted without actually deleting
        verbose: If True, show detailed information (exported models, kept
            runs, per-run metadata and each deleted path)

    Returns:
        Tuple of (runs_to_keep, runs_to_delete); each is a list of the run
        dicts produced by find_all_runs().
    """
    exported_models = find_exported_models()
    exported_timestamps = {model["timestamp"] for model in exported_models}

    all_runs = find_all_runs()

    # Categorize runs
    runs_to_keep = []
    runs_to_delete = []
    for run in all_runs:
        if run["timestamp"] in exported_timestamps:
            runs_to_keep.append(run)
        else:
            runs_to_delete.append(run)

    # Show summary
    print(f"Found {len(all_runs)} total runs")
    print(f"Found {len(exported_models)} exported models")
    print(f"Runs to keep: {len(runs_to_keep)}")
    print(f"Runs to delete: {len(runs_to_delete)}")

    if verbose and exported_models:
        print("\nExported models found:")
        for model in exported_models:
            print(f" - {model['file']} (timestamp: {model['timestamp']})")

    if verbose and runs_to_keep:
        print("\nRuns with exported models (will be kept):")
        for run in runs_to_keep:
            print(f" - {run['path']}")

    if runs_to_delete:
        print("\nRuns without exported models (will be deleted):")
        for run in runs_to_delete:
            print(f" - {run['path']}")
            if verbose:
                summary = _summarize_run_metadata(run["path"] / "metadata.json")
                if summary is not None:
                    print(summary)

        # Calculate space to be freed
        total_size = sum(_directory_size(run["path"]) for run in runs_to_delete)
        if total_size > 0:
            size_mb = total_size / (1024 * 1024)
            print(f"\nTotal space to be freed: {size_mb:.2f} MB")

    if not runs_to_delete:
        print("\nNo runs to delete - all runs have exported models or no runs found")
    elif dry_run:
        print("\nDry run mode - no files were deleted")
        print("Run without --dry-run to actually delete these directories")
    else:
        deleted_count = 0
        for run in runs_to_delete:
            try:
                shutil.rmtree(run["path"])
            except OSError as e:
                # Report and continue so one failure doesn't block the rest.
                print(f"Error deleting {run['path']}: {e}")
            else:
                deleted_count += 1
                if verbose:
                    print(f"Deleted: {run['path']}")
        print(f"\nSuccessfully deleted {deleted_count} run(s)")

    return runs_to_keep, runs_to_delete
def main():
    """Command-line entry point: preview the cleanup, confirm, then delete.

    Options:
        --dry-run      Show what would be deleted without deleting anything.
        --verbose/-v   Show detailed information about runs.
        --yes/-y       Skip the confirmation prompt.

    The runs are always analyzed with a dry run first so the user sees what
    is affected before being asked to confirm. Answering anything other than
    "y" or "yes" (case-insensitive, surrounding whitespace ignored) cancels;
    end-of-input on stdin is treated as a cancel as well.
    """
    parser = argparse.ArgumentParser(
        description="Clean up training runs without exported models"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed information about runs",
    )
    parser.add_argument(
        "--yes",
        "-y",
        action="store_true",
        help="Skip confirmation prompt",
    )
    args = parser.parse_args()

    if not Path("runs").exists():
        print("No 'runs' directory found. Nothing to clean.")
        return

    print("Analyzing runs directory...\n")

    # Do a dry run first to show what will be deleted
    _, runs_to_delete = clean_runs(dry_run=True, verbose=args.verbose)

    # Nothing to do, or the user only asked for the preview.
    if not runs_to_delete or args.dry_run:
        return

    if not args.yes:
        print("\n" + "=" * 60)
        try:
            response = input(f"Are you sure you want to delete {len(runs_to_delete)} run(s)? [y/N]: ")
        except EOFError:
            # No interactive stdin (e.g. piped or closed): default to "No".
            print()
            response = ""
        if response.strip().lower() not in ("y", "yes"):
            print("Cleanup cancelled")
            return

    print("\nPerforming cleanup...")
    clean_runs(dry_run=False, verbose=args.verbose)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()