File size: 6,801 Bytes
08bbb4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/usr/bin/env python3
"""
Clean up script for removing training runs without exported models.
Removes all directories in runs/ folder that don't have a corresponding exported model file.
"""

import argparse
import os
import shutil
from pathlib import Path
import glob


def find_exported_models():
    """Find all exported model files in the current working directory.

    Scans for joblib files whose names end in a ``_YYYYMMDD_HHMMSS``
    timestamp (year constrained to 20xx by the glob pattern).

    Returns:
        List of dicts, one per model file, with keys:
            "file": the path as returned by glob
            "timestamp": the "YYYYMMDD_HHMMSS" suffix parsed from the name
    """
    exported_models = []
    seen_files = set()  # Guard against duplicates if patterns ever overlap

    # Look for pattern: *_YYYYMMDD_HHMMSS.joblib
    # This matches any exported model with timestamp format
    patterns = [
        "*_20[0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].joblib"
    ]

    for pattern in patterns:
        for filepath in glob.glob(pattern):
            # Skip if we've already seen this file
            if filepath in seen_files:
                continue
            seen_files.add(filepath)

            # Strip the extension and split on "_"; the last two parts are
            # the date and time components of the timestamp.
            filename = os.path.basename(filepath)
            parts = filename.replace(".joblib", "").split("_")
            # BUG FIX: was `len(parts) >= 4`, which silently skipped files
            # whose prefix is a single token (e.g. "imdb_20240101_120000.joblib")
            # even though they match the glob pattern above. Three parts
            # (prefix, date, time) is the true minimum.
            if len(parts) >= 3:
                timestamp = "_".join(parts[-2:])
                exported_models.append({
                    "file": filepath,
                    "timestamp": timestamp,
                })

    return exported_models


def find_all_runs():
    """Return one record per run directory under the local ``runs`` folder.

    Each record is a dict with the directory's path and its name, which by
    convention is a YYYYMMDD_HHMMSS timestamp.
    """
    runs_root = Path("runs")
    if not runs_root.exists():
        return []

    # Keep only subdirectories; their names double as run timestamps.
    return [
        {"path": entry, "timestamp": entry.name}
        for entry in runs_root.iterdir()
        if entry.is_dir()
    ]


def clean_runs(dry_run=False, verbose=False):
    """
    Remove all run directories that don't have exported models.

    A run is kept when its directory name (a YYYYMMDD_HHMMSS timestamp)
    matches the timestamp embedded in some exported model filename.

    Args:
        dry_run: If True, only show what would be deleted without actually deleting
        verbose: If True, show detailed information

    Returns:
        Tuple of (runs_to_keep, runs_to_delete)
    """
    # Find all exported models; index timestamps in a set for O(1) lookup.
    exported_models = find_exported_models()
    exported_timestamps = {model["timestamp"] for model in exported_models}

    # Find all runs
    all_runs = find_all_runs()

    # Categorize runs by whether their timestamp has a matching export
    runs_to_keep = []
    runs_to_delete = []

    for run in all_runs:
        if run["timestamp"] in exported_timestamps:
            runs_to_keep.append(run)
        else:
            runs_to_delete.append(run)

    # Show summary
    print(f"Found {len(all_runs)} total runs")
    print(f"Found {len(exported_models)} exported models")
    print(f"Runs to keep: {len(runs_to_keep)}")
    print(f"Runs to delete: {len(runs_to_delete)}")

    if verbose and exported_models:
        print("\nExported models found:")
        for model in exported_models:
            print(f"  - {model['file']} (timestamp: {model['timestamp']})")

    if verbose and runs_to_keep:
        print("\nRuns with exported models (will be kept):")
        for run in runs_to_keep:
            print(f"  - {run['path']}")

    if runs_to_delete:
        print("\nRuns without exported models (will be deleted):")
        for run in runs_to_delete:
            print(f"  - {run['path']}")
            if verbose:
                _print_run_metadata(run["path"])

    # Calculate space to be freed
    total_size = 0
    for run in runs_to_delete:
        total_size += sum(f.stat().st_size for f in run["path"].rglob("*") if f.is_file())

    if total_size > 0:
        size_mb = total_size / (1024 * 1024)
        print(f"\nTotal space to be freed: {size_mb:.2f} MB")

    # Perform deletion if not dry run
    if not dry_run and runs_to_delete:
        deleted_count = 0
        for run in runs_to_delete:
            try:
                shutil.rmtree(run["path"])
                deleted_count += 1
                if verbose:
                    print(f"Deleted: {run['path']}")
            except Exception as e:
                # Best-effort: report and continue with the remaining runs.
                print(f"Error deleting {run['path']}: {e}")

        print(f"\nSuccessfully deleted {deleted_count} run(s)")
    elif dry_run and runs_to_delete:
        print("\nDry run mode - no files were deleted")
        print("Run without --dry-run to actually delete these directories")
    elif not runs_to_delete:
        print("\nNo runs to delete - all runs have exported models or no runs found")

    return runs_to_keep, runs_to_delete


def _print_run_metadata(run_path):
    """Best-effort print of model/dataset/accuracy from a run's metadata.json.

    Silently does nothing when the file is absent, unreadable, or malformed.
    """
    metadata_path = run_path / "metadata.json"
    if not metadata_path.exists():
        return
    import json
    try:
        with open(metadata_path) as f:
            metadata = json.load(f)
        acc = metadata.get("test_accuracy")
        # BUG FIX: the original applied the :.4f format spec directly to the
        # 'N/A' string fallback, raising an uncaught ValueError whenever
        # test_accuracy was missing or non-numeric. Format only real numbers.
        acc_str = f"{acc:.4f}" if isinstance(acc, (int, float)) else "N/A"
        print(f"    Model: {metadata.get('model_name', 'unknown')}, "
              f"Dataset: {metadata.get('dataset', 'unknown')}, "
              f"Accuracy: {acc_str}")
    except (json.JSONDecodeError, KeyError, OSError):
        # OSError added: the file may disappear between exists() and open().
        pass


def main():
    """Command-line entry point: analyze, confirm, then delete stale runs."""
    parser = argparse.ArgumentParser(
        description="Clean up training runs without exported models"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed information about runs",
    )
    parser.add_argument(
        "--yes",
        "-y",
        action="store_true",
        help="Skip confirmation prompt",
    )

    opts = parser.parse_args()

    # Nothing to do when there is no runs directory at all.
    if not Path("runs").exists():
        print("No 'runs' directory found. Nothing to clean.")
        return

    print("Analyzing runs directory...\n")

    # First pass is always a dry run so the user sees the plan up front.
    _, stale_runs = clean_runs(dry_run=True, verbose=opts.verbose)
    if not stale_runs:
        return

    # Interactive confirmation, unless suppressed via --yes or --dry-run.
    if not opts.dry_run and not opts.yes and stale_runs:
        print("\n" + "=" * 60)
        answer = input(f"Are you sure you want to delete {len(stale_runs)} run(s)? [y/N]: ")
        if answer.lower() != 'y':
            print("Cleanup cancelled")
            return

    # Second pass performs the actual deletion.
    if not opts.dry_run:
        print("\nPerforming cleanup...")
        clean_runs(dry_run=False, verbose=opts.verbose)


# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()