autoface committed on
Commit
f69f744
·
1 Parent(s): fa446dc

Remove LFS storage cleanup configuration and refactor pre-upload checking and cleanup logic

Browse files

- Remove LFS cleanup related settings from the configuration file.
- Rename and refactor the `manage_archives` method to `check_and_cleanup_before_upload` to check the archive file count and perform the necessary cleanup before upload.
- Update scripts to invoke the new cleanup logic before upload to ensure more efficient archive management.

configs/persistence.conf CHANGED
@@ -58,22 +58,6 @@ FORCE_SYNC_RESTORE=true
58
  # Enable data integrity verification after restore
59
  ENABLE_INTEGRITY_CHECK=true
60
 
61
- # =============================================================================
62
- # LFS Storage Cleanup Configuration (NEW)
63
- # =============================================================================
64
-
65
- # LFS Cleanup Method Options:
66
- # - 'squash': Use super_squash_history API (gentle, may not always work)
67
- # - 'recreate': Delete and recreate dataset (aggressive, always works)
68
- # - 'none': No LFS cleanup (default old behavior)
69
- LFS_CLEANUP_METHOD=squash
70
-
71
- # Force LFS cleanup even if only 1 file was deleted (be careful with 'recreate')
72
- FORCE_LFS_CLEANUP=false
73
-
74
- # Enable LFS storage monitoring and warnings
75
- ENABLE_LFS_MONITORING=true
76
-
77
  # Log Configuration
78
  # Log file path (using user directory for better permissions)
79
  LOG_FILE="/home/user/logs/persistence.log"
 
58
  # Enable data integrity verification after restore
59
  ENABLE_INTEGRITY_CHECK=true
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  # Log Configuration
62
  # Log file path (using user directory for better permissions)
63
  LOG_FILE="/home/user/logs/persistence.log"
scripts/utils/hf_persistence.py CHANGED
@@ -81,9 +81,9 @@ class HFPersistenceManager:
81
  traceback.print_exc()
82
  return False
83
 
84
- def manage_archives(self, archive_prefix: str, archive_extension: str, max_files: int) -> bool:
85
  """
86
- Manage archive file count, delete old archives exceeding the limit
87
 
88
  Args:
89
  archive_prefix: Archive file prefix
@@ -97,51 +97,37 @@ class HFPersistenceManager:
97
  api = self._get_api()
98
  files = api.list_repo_files(repo_id=self.dataset_id, repo_type='dataset')
99
  archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
100
- archive_files.sort()
101
 
102
- files_to_delete = []
103
- if len(archive_files) >= max_files:
104
- files_to_delete = archive_files[:(len(archive_files) - max_files)]
105
- for file_to_delete in files_to_delete:
106
- try:
107
- api.delete_file(path_in_repo=file_to_delete, repo_id=self.dataset_id, repo_type='dataset')
108
- print(f'βœ“ Deleted old archive: {file_to_delete}')
109
- except Exception as e:
110
- print(f'βœ— Deletion failed {file_to_delete}: {str(e)}')
111
-
112
- # LFS cleanup options
113
- cleanup_method = os.environ.get('LFS_CLEANUP_METHOD', 'none')
114
-
115
- if cleanup_method == 'squash' and len(files_to_delete) > 0:
116
- self._cleanup_lfs_squash(api)
117
- elif cleanup_method == 'recreate' and len(files_to_delete) > 0:
118
- if not self._cleanup_lfs_recreate(api, archive_files, max_files, archive_prefix, archive_extension):
119
  return False
 
 
 
 
120
 
121
- print(f'βœ“ Archive management completed, currently keeping {min(len(archive_files), max_files)} archives')
122
  return True
123
  except Exception as e:
124
- print(f'βœ— Archive management failed: {str(e)}')
125
  return False
126
 
127
- def _cleanup_lfs_squash(self, api: HfApi) -> None:
128
- """Clean LFS history using super_squash_history"""
129
- try:
130
- print('πŸ”„ Attempting to clean LFS history with super_squash_history...')
131
- api.super_squash_history(repo_id=self.dataset_id, repo_type='dataset')
132
- print('βœ… LFS history cleanup attempted - storage may take time to reflect changes')
133
- except Exception as e:
134
- print(f'⚠️ LFS history cleanup failed: {str(e)}')
135
- print('πŸ’‘ Consider setting LFS_CLEANUP_METHOD=recreate for stronger cleanup')
136
-
137
- def _cleanup_lfs_recreate(self, api: HfApi, archive_files: list, max_files: int,
138
  archive_prefix: str, archive_extension: str) -> bool:
139
  """Force delete and recreate dataset to clean LFS storage"""
140
  print('🚨 WARNING: Force recreate mode enabled')
141
  print('πŸ“‹ This will delete and recreate the entire dataset to clean LFS storage')
142
 
143
  # Backup files that need to be preserved
144
- remaining_files = archive_files[-(max_files-1):] if max_files > 1 else []
145
  if remaining_files:
146
  print(f'πŸ“¦ Backing up {len(remaining_files)} files for restoration...')
147
  backup_data = []
@@ -204,8 +190,28 @@ class HFPersistenceManager:
204
  print('⚠️ Manual intervention may be required')
205
  return False
206
  else:
207
- print('πŸ“ No files to preserve, skipping backup')
208
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
210
  def list_available_archives(self, archive_prefix: str, archive_extension: str) -> tuple[bool, str]:
211
  """
@@ -328,9 +334,10 @@ def main():
328
  print('βœ— upload action requires --archive-file and --filename parameters')
329
  sys.exit(1)
330
 
331
- success = manager.upload_archive(args.archive_file, args.filename)
 
332
  if success:
333
- success = manager.manage_archives(args.archive_prefix, args.archive_extension, args.max_archives)
334
 
335
  sys.exit(0 if success else 1)
336
 
 
81
  traceback.print_exc()
82
  return False
83
 
84
+ def check_and_cleanup_before_upload(self, archive_prefix: str, archive_extension: str, max_files: int) -> bool:
85
  """
86
+ Check if adding new archive would exceed limit, if so, force recreate dataset first
87
 
88
  Args:
89
  archive_prefix: Archive file prefix
 
97
  api = self._get_api()
98
  files = api.list_repo_files(repo_id=self.dataset_id, repo_type='dataset')
99
  archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
100
+ archive_files.sort(reverse=True) # Sort newest first
101
 
102
+ # Check if adding 1 new file would exceed the limit
103
+ if len(archive_files) + 1 > max_files:
104
+ print(f'🚨 Adding new archive would exceed limit ({len(archive_files)} + 1 > {max_files})')
105
+ print('🚨 Starting force dataset recreation to clean up old archives')
106
+
107
+ # Keep only the newest (max_files - 1) files to make room for the new one
108
+ files_to_keep = archive_files[:max_files-1] if max_files > 1 else []
109
+
110
+ if not self._cleanup_lfs_recreate(api, files_to_keep, max_files, archive_prefix, archive_extension):
 
 
 
 
 
 
 
 
111
  return False
112
+
113
+ print(f'βœ“ Dataset recreation completed, ready for new archive upload')
114
+ else:
115
+ print(f'βœ“ Archive count check passed ({len(archive_files)} + 1 <= {max_files})')
116
 
 
117
  return True
118
  except Exception as e:
119
+ print(f'βœ— Pre-upload cleanup failed: {str(e)}')
120
  return False
121
 
122
+
123
+ def _cleanup_lfs_recreate(self, api: HfApi, files_to_keep: list, max_files: int,
 
 
 
 
 
 
 
 
 
124
  archive_prefix: str, archive_extension: str) -> bool:
125
  """Force delete and recreate dataset to clean LFS storage"""
126
  print('🚨 WARNING: Force recreate mode enabled')
127
  print('πŸ“‹ This will delete and recreate the entire dataset to clean LFS storage')
128
 
129
  # Backup files that need to be preserved
130
+ remaining_files = files_to_keep
131
  if remaining_files:
132
  print(f'πŸ“¦ Backing up {len(remaining_files)} files for restoration...')
133
  backup_data = []
 
190
  print('⚠️ Manual intervention may be required')
191
  return False
192
  else:
193
+ print('πŸ“ No files to preserve, proceeding with dataset cleanup')
194
+ # Delete dataset even when no files need to be preserved
195
+ try:
196
+ print('πŸ—‘οΈ Deleting dataset to clean LFS storage...')
197
+ api.delete_repo(repo_id=self.dataset_id, repo_type='dataset')
198
+ print('βœ“ Dataset deleted successfully')
199
+
200
+ # Wait for deletion to complete
201
+ time.sleep(10)
202
+
203
+ # Recreate dataset
204
+ print('πŸ”¨ Recreating dataset...')
205
+ api.create_repo(repo_id=self.dataset_id, repo_type='dataset', exist_ok=True)
206
+ print('βœ“ Dataset recreated successfully')
207
+
208
+ print('πŸŽ‰ Dataset recreation and LFS cleanup completed!')
209
+ return True
210
+
211
+ except Exception as e:
212
+ print(f'βœ— Dataset recreation failed: {str(e)}')
213
+ print('⚠️ Manual intervention may be required')
214
+ return False
215
 
216
  def list_available_archives(self, archive_prefix: str, archive_extension: str) -> tuple[bool, str]:
217
  """
 
334
  print('βœ— upload action requires --archive-file and --filename parameters')
335
  sys.exit(1)
336
 
337
+ # First check and cleanup if needed BEFORE uploading
338
+ success = manager.check_and_cleanup_before_upload(args.archive_prefix, args.archive_extension, args.max_archives)
339
  if success:
340
+ success = manager.upload_archive(args.archive_file, args.filename)
341
 
342
  sys.exit(0 if success else 1)
343
 
scripts/utils/persistence.sh CHANGED
@@ -67,11 +67,6 @@ set_default_configuration() {
67
  # Logging configuration
68
  export LOG_FILE="${LOG_FILE:-}"
69
  export LOG_LEVEL="${LOG_LEVEL:-}"
70
-
71
- # LFS Storage Cleanup configuration
72
- export LFS_CLEANUP_METHOD="${LFS_CLEANUP_METHOD:-none}"
73
- export FORCE_LFS_CLEANUP="${FORCE_LFS_CLEANUP:-false}"
74
- export ENABLE_LFS_MONITORING="${ENABLE_LFS_MONITORING:-false}"
75
  }
76
 
77
  # Validate required environment variables
@@ -113,7 +108,7 @@ create_archive() {
113
 
114
  # Build exclude arguments - only exclude files that would negatively impact HuggingFace datasets backup
115
  local exclude_args=""
116
- local default_excludes="__pycache__,*.tmp"
117
  local combined_patterns="${EXCLUDE_PATTERNS:-},${default_excludes}"
118
 
119
  if [[ -n "$combined_patterns" ]]; then
@@ -183,7 +178,7 @@ create_archive() {
183
  fi
184
  }
185
 
186
- # Call Python upload handler
187
  run_upload_handler() {
188
  local archive_file="$1"
189
  local filename="$2"
@@ -196,7 +191,7 @@ run_upload_handler() {
196
  # Get script directory for relative imports
197
  local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
198
 
199
- # Call the standalone Python module
200
  python3 "${script_dir}/hf_persistence.py" upload \
201
  --token "$token" \
202
  --dataset-id "$dataset_id" \
 
67
  # Logging configuration
68
  export LOG_FILE="${LOG_FILE:-}"
69
  export LOG_LEVEL="${LOG_LEVEL:-}"
 
 
 
 
 
70
  }
71
 
72
  # Validate required environment variables
 
108
 
109
  # Build exclude arguments - only exclude files that would negatively impact HuggingFace datasets backup
110
  local exclude_args=""
111
+ local default_excludes="__pycache__,*.tmp,*/temp,*/cache,*/.cache,*/log,*/logs"
112
  local combined_patterns="${EXCLUDE_PATTERNS:-},${default_excludes}"
113
 
114
  if [[ -n "$combined_patterns" ]]; then
 
178
  fi
179
  }
180
 
181
+ # Call Python upload handler with pre-upload cleanup
182
  run_upload_handler() {
183
  local archive_file="$1"
184
  local filename="$2"
 
191
  # Get script directory for relative imports
192
  local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
193
 
194
+ # Call the standalone Python module (now with pre-upload cleanup logic)
195
  python3 "${script_dir}/hf_persistence.py" upload \
196
  --token "$token" \
197
  --dataset-id "$dataset_id" \