Remove LFS storage cleanup configuration and refactor pre-upload checking and cleanup logic
- Remove LFS cleanup related settings from the configuration file.
- Rename and refactor the `manage_archives` method to `check_and_cleanup_before_upload` to check the archive file count and perform the necessary cleanup before upload.
- Update scripts to invoke the new cleanup logic before upload to ensure more efficient archive management.
- configs/persistence.conf +0 -16
- scripts/utils/hf_persistence.py +45 -38
- scripts/utils/persistence.sh +3 -8
configs/persistence.conf
CHANGED
|
@@ -58,22 +58,6 @@ FORCE_SYNC_RESTORE=true
|
|
| 58 |
# Enable data integrity verification after restore
|
| 59 |
ENABLE_INTEGRITY_CHECK=true
|
| 60 |
|
| 61 |
-
# =============================================================================
|
| 62 |
-
# LFS Storage Cleanup Configuration (NEW)
|
| 63 |
-
# =============================================================================
|
| 64 |
-
|
| 65 |
-
# LFS Cleanup Method Options:
|
| 66 |
-
# - 'squash': Use super_squash_history API (gentle, may not always work)
|
| 67 |
-
# - 'recreate': Delete and recreate dataset (aggressive, always works)
|
| 68 |
-
# - 'none': No LFS cleanup (default old behavior)
|
| 69 |
-
LFS_CLEANUP_METHOD=squash
|
| 70 |
-
|
| 71 |
-
# Force LFS cleanup even if only 1 file was deleted (be careful with 'recreate')
|
| 72 |
-
FORCE_LFS_CLEANUP=false
|
| 73 |
-
|
| 74 |
-
# Enable LFS storage monitoring and warnings
|
| 75 |
-
ENABLE_LFS_MONITORING=true
|
| 76 |
-
|
| 77 |
# Log Configuration
|
| 78 |
# Log file path (using user directory for better permissions)
|
| 79 |
LOG_FILE="/home/user/logs/persistence.log"
|
|
|
|
| 58 |
# Enable data integrity verification after restore
|
| 59 |
ENABLE_INTEGRITY_CHECK=true
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
# Log Configuration
|
| 62 |
# Log file path (using user directory for better permissions)
|
| 63 |
LOG_FILE="/home/user/logs/persistence.log"
|
scripts/utils/hf_persistence.py
CHANGED
|
@@ -81,9 +81,9 @@ class HFPersistenceManager:
|
|
| 81 |
traceback.print_exc()
|
| 82 |
return False
|
| 83 |
|
| 84 |
-
def
|
| 85 |
"""
|
| 86 |
-
|
| 87 |
|
| 88 |
Args:
|
| 89 |
archive_prefix: Archive file prefix
|
|
@@ -97,51 +97,37 @@ class HFPersistenceManager:
|
|
| 97 |
api = self._get_api()
|
| 98 |
files = api.list_repo_files(repo_id=self.dataset_id, repo_type='dataset')
|
| 99 |
archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
|
| 100 |
-
archive_files.sort()
|
| 101 |
|
| 102 |
-
|
| 103 |
-
if len(archive_files) >
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
# LFS cleanup options
|
| 113 |
-
cleanup_method = os.environ.get('LFS_CLEANUP_METHOD', 'none')
|
| 114 |
-
|
| 115 |
-
if cleanup_method == 'squash' and len(files_to_delete) > 0:
|
| 116 |
-
self._cleanup_lfs_squash(api)
|
| 117 |
-
elif cleanup_method == 'recreate' and len(files_to_delete) > 0:
|
| 118 |
-
if not self._cleanup_lfs_recreate(api, archive_files, max_files, archive_prefix, archive_extension):
|
| 119 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
-
print(f'β Archive management completed, currently keeping {min(len(archive_files), max_files)} archives')
|
| 122 |
return True
|
| 123 |
except Exception as e:
|
| 124 |
-
print(f'β
|
| 125 |
return False
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
try:
|
| 130 |
-
print('π Attempting to clean LFS history with super_squash_history...')
|
| 131 |
-
api.super_squash_history(repo_id=self.dataset_id, repo_type='dataset')
|
| 132 |
-
print('β
LFS history cleanup attempted - storage may take time to reflect changes')
|
| 133 |
-
except Exception as e:
|
| 134 |
-
print(f'β οΈ LFS history cleanup failed: {str(e)}')
|
| 135 |
-
print('π‘ Consider setting LFS_CLEANUP_METHOD=recreate for stronger cleanup')
|
| 136 |
-
|
| 137 |
-
def _cleanup_lfs_recreate(self, api: HfApi, archive_files: list, max_files: int,
|
| 138 |
archive_prefix: str, archive_extension: str) -> bool:
|
| 139 |
"""Force delete and recreate dataset to clean LFS storage"""
|
| 140 |
print('π¨ WARNING: Force recreate mode enabled')
|
| 141 |
print('π This will delete and recreate the entire dataset to clean LFS storage')
|
| 142 |
|
| 143 |
# Backup files that need to be preserved
|
| 144 |
-
remaining_files =
|
| 145 |
if remaining_files:
|
| 146 |
print(f'π¦ Backing up {len(remaining_files)} files for restoration...')
|
| 147 |
backup_data = []
|
|
@@ -204,8 +190,28 @@ class HFPersistenceManager:
|
|
| 204 |
print('β οΈ Manual intervention may be required')
|
| 205 |
return False
|
| 206 |
else:
|
| 207 |
-
print('π No files to preserve,
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
def list_available_archives(self, archive_prefix: str, archive_extension: str) -> tuple[bool, str]:
|
| 211 |
"""
|
|
@@ -328,9 +334,10 @@ def main():
|
|
| 328 |
print('β upload action requires --archive-file and --filename parameters')
|
| 329 |
sys.exit(1)
|
| 330 |
|
| 331 |
-
|
|
|
|
| 332 |
if success:
|
| 333 |
-
success = manager.
|
| 334 |
|
| 335 |
sys.exit(0 if success else 1)
|
| 336 |
|
|
|
|
| 81 |
traceback.print_exc()
|
| 82 |
return False
|
| 83 |
|
| 84 |
+
def check_and_cleanup_before_upload(self, archive_prefix: str, archive_extension: str, max_files: int) -> bool:
|
| 85 |
"""
|
| 86 |
+
Check if adding new archive would exceed limit, if so, force recreate dataset first
|
| 87 |
|
| 88 |
Args:
|
| 89 |
archive_prefix: Archive file prefix
|
|
|
|
| 97 |
api = self._get_api()
|
| 98 |
files = api.list_repo_files(repo_id=self.dataset_id, repo_type='dataset')
|
| 99 |
archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
|
| 100 |
+
archive_files.sort(reverse=True) # Sort newest first
|
| 101 |
|
| 102 |
+
# Check if adding 1 new file would exceed the limit
|
| 103 |
+
if len(archive_files) + 1 > max_files:
|
| 104 |
+
print(f'π¨ Adding new archive would exceed limit ({len(archive_files)} + 1 > {max_files})')
|
| 105 |
+
print('π¨ Starting force dataset recreation to clean up old archives')
|
| 106 |
+
|
| 107 |
+
# Keep only the newest (max_files - 1) files to make room for the new one
|
| 108 |
+
files_to_keep = archive_files[:max_files-1] if max_files > 1 else []
|
| 109 |
+
|
| 110 |
+
if not self._cleanup_lfs_recreate(api, files_to_keep, max_files, archive_prefix, archive_extension):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
return False
|
| 112 |
+
|
| 113 |
+
print(f'β Dataset recreation completed, ready for new archive upload')
|
| 114 |
+
else:
|
| 115 |
+
print(f'β Archive count check passed ({len(archive_files)} + 1 <= {max_files})')
|
| 116 |
|
|
|
|
| 117 |
return True
|
| 118 |
except Exception as e:
|
| 119 |
+
print(f'β Pre-upload cleanup failed: {str(e)}')
|
| 120 |
return False
|
| 121 |
|
| 122 |
+
|
| 123 |
+
def _cleanup_lfs_recreate(self, api: HfApi, files_to_keep: list, max_files: int,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
archive_prefix: str, archive_extension: str) -> bool:
|
| 125 |
"""Force delete and recreate dataset to clean LFS storage"""
|
| 126 |
print('π¨ WARNING: Force recreate mode enabled')
|
| 127 |
print('π This will delete and recreate the entire dataset to clean LFS storage')
|
| 128 |
|
| 129 |
# Backup files that need to be preserved
|
| 130 |
+
remaining_files = files_to_keep
|
| 131 |
if remaining_files:
|
| 132 |
print(f'π¦ Backing up {len(remaining_files)} files for restoration...')
|
| 133 |
backup_data = []
|
|
|
|
| 190 |
print('β οΈ Manual intervention may be required')
|
| 191 |
return False
|
| 192 |
else:
|
| 193 |
+
print('π No files to preserve, proceeding with dataset cleanup')
|
| 194 |
+
# Delete dataset even when no files need to be preserved
|
| 195 |
+
try:
|
| 196 |
+
print('ποΈ Deleting dataset to clean LFS storage...')
|
| 197 |
+
api.delete_repo(repo_id=self.dataset_id, repo_type='dataset')
|
| 198 |
+
print('β Dataset deleted successfully')
|
| 199 |
+
|
| 200 |
+
# Wait for deletion to complete
|
| 201 |
+
time.sleep(10)
|
| 202 |
+
|
| 203 |
+
# Recreate dataset
|
| 204 |
+
print('π¨ Recreating dataset...')
|
| 205 |
+
api.create_repo(repo_id=self.dataset_id, repo_type='dataset', exist_ok=True)
|
| 206 |
+
print('β Dataset recreated successfully')
|
| 207 |
+
|
| 208 |
+
print('π Dataset recreation and LFS cleanup completed!')
|
| 209 |
+
return True
|
| 210 |
+
|
| 211 |
+
except Exception as e:
|
| 212 |
+
print(f'β Dataset recreation failed: {str(e)}')
|
| 213 |
+
print('β οΈ Manual intervention may be required')
|
| 214 |
+
return False
|
| 215 |
|
| 216 |
def list_available_archives(self, archive_prefix: str, archive_extension: str) -> tuple[bool, str]:
|
| 217 |
"""
|
|
|
|
| 334 |
print('β upload action requires --archive-file and --filename parameters')
|
| 335 |
sys.exit(1)
|
| 336 |
|
| 337 |
+
# First check and cleanup if needed BEFORE uploading
|
| 338 |
+
success = manager.check_and_cleanup_before_upload(args.archive_prefix, args.archive_extension, args.max_archives)
|
| 339 |
if success:
|
| 340 |
+
success = manager.upload_archive(args.archive_file, args.filename)
|
| 341 |
|
| 342 |
sys.exit(0 if success else 1)
|
| 343 |
|
scripts/utils/persistence.sh
CHANGED
|
@@ -67,11 +67,6 @@ set_default_configuration() {
|
|
| 67 |
# Logging configuration
|
| 68 |
export LOG_FILE="${LOG_FILE:-}"
|
| 69 |
export LOG_LEVEL="${LOG_LEVEL:-}"
|
| 70 |
-
|
| 71 |
-
# LFS Storage Cleanup configuration
|
| 72 |
-
export LFS_CLEANUP_METHOD="${LFS_CLEANUP_METHOD:-none}"
|
| 73 |
-
export FORCE_LFS_CLEANUP="${FORCE_LFS_CLEANUP:-false}"
|
| 74 |
-
export ENABLE_LFS_MONITORING="${ENABLE_LFS_MONITORING:-false}"
|
| 75 |
}
|
| 76 |
|
| 77 |
# Validate required environment variables
|
|
@@ -113,7 +108,7 @@ create_archive() {
|
|
| 113 |
|
| 114 |
# Build exclude arguments - only exclude files that would negatively impact HuggingFace datasets backup
|
| 115 |
local exclude_args=""
|
| 116 |
-
local default_excludes="__pycache__,*.tmp"
|
| 117 |
local combined_patterns="${EXCLUDE_PATTERNS:-},${default_excludes}"
|
| 118 |
|
| 119 |
if [[ -n "$combined_patterns" ]]; then
|
|
@@ -183,7 +178,7 @@ create_archive() {
|
|
| 183 |
fi
|
| 184 |
}
|
| 185 |
|
| 186 |
-
# Call Python upload handler
|
| 187 |
run_upload_handler() {
|
| 188 |
local archive_file="$1"
|
| 189 |
local filename="$2"
|
|
@@ -196,7 +191,7 @@ run_upload_handler() {
|
|
| 196 |
# Get script directory for relative imports
|
| 197 |
local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 198 |
|
| 199 |
-
# Call the standalone Python module
|
| 200 |
python3 "${script_dir}/hf_persistence.py" upload \
|
| 201 |
--token "$token" \
|
| 202 |
--dataset-id "$dataset_id" \
|
|
|
|
| 67 |
# Logging configuration
|
| 68 |
export LOG_FILE="${LOG_FILE:-}"
|
| 69 |
export LOG_LEVEL="${LOG_LEVEL:-}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
}
|
| 71 |
|
| 72 |
# Validate required environment variables
|
|
|
|
| 108 |
|
| 109 |
# Build exclude arguments - only exclude files that would negatively impact HuggingFace datasets backup
|
| 110 |
local exclude_args=""
|
| 111 |
+
local default_excludes="__pycache__,*.tmp,*/temp,*/cache,*/.cache,*/log,*/logs"
|
| 112 |
local combined_patterns="${EXCLUDE_PATTERNS:-},${default_excludes}"
|
| 113 |
|
| 114 |
if [[ -n "$combined_patterns" ]]; then
|
|
|
|
| 178 |
fi
|
| 179 |
}
|
| 180 |
|
| 181 |
+
# Call Python upload handler with pre-upload cleanup
|
| 182 |
run_upload_handler() {
|
| 183 |
local archive_file="$1"
|
| 184 |
local filename="$2"
|
|
|
|
| 191 |
# Get script directory for relative imports
|
| 192 |
local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 193 |
|
| 194 |
+
# Call the standalone Python module (now with pre-upload cleanup logic)
|
| 195 |
python3 "${script_dir}/hf_persistence.py" upload \
|
| 196 |
--token "$token" \
|
| 197 |
--dataset-id "$dataset_id" \
|