autoface committed
Commit fa446dc · 1 Parent(s): e070be5

Adding Hugging Face data persistence tool and related configurations


- Add a rules file defining writing requirements for project code and documentation.
- Add LFS storage cleanup options to the configuration file.
- Add Python scripts implementing Hugging Face data persistence features: uploading, managing, and restoring archived files (see the example invocations below).
- Update shell scripts to call the new Python module, streamlining archive processing.
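
For reference, here is a hypothetical invocation of the new tool; the token and dataset values are placeholders, and the flags follow the argparse interface defined in scripts/utils/hf_persistence.py:

    # Upload an archive, then prune old ones (default: keep at most 5)
    python3 scripts/utils/hf_persistence.py upload \
        --token "$HF_TOKEN" --dataset-id "user/my-backups" \
        --archive-file ./backup-20240101.tar.gz --filename backup-20240101.tar.gz

    # List available archives; prints LATEST_BACKUP:<name> for scripting
    python3 scripts/utils/hf_persistence.py list \
        --token "$HF_TOKEN" --dataset-id "user/my-backups"

    # Restore a named archive into a target directory
    python3 scripts/utils/hf_persistence.py restore \
        --token "$HF_TOKEN" --dataset-id "user/my-backups" \
        --archive-name backup-20240101.tar.gz --restore-path /home/user/data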

.cursor/rules/rule.mdc ADDED
@@ -0,0 +1,7 @@
+ ---
+ description:
+ globs:
+ alwaysApply: true
+ ---
+ - All project code must be written in English.
+ - Documentation should only be written when I request it.
configs/persistence.conf CHANGED
@@ -58,6 +58,22 @@ FORCE_SYNC_RESTORE=true
  # Enable data integrity verification after restore
  ENABLE_INTEGRITY_CHECK=true
 
+ # =============================================================================
+ # LFS Storage Cleanup Configuration (NEW)
+ # =============================================================================
+
+ # LFS Cleanup Method Options:
+ # - 'squash': Use the super_squash_history API (gentle; may not always work)
+ # - 'recreate': Delete and recreate the dataset (aggressive; always works)
+ # - 'none': No LFS cleanup (the previous default behavior)
+ LFS_CLEANUP_METHOD=squash
+
+ # Force LFS cleanup even if only 1 file was deleted (be careful with 'recreate')
+ FORCE_LFS_CLEANUP=false
+
+ # Enable LFS storage monitoring and warnings
+ ENABLE_LFS_MONITORING=true
+
  # Log Configuration
  # Log file path (using user directory for better permissions)
  LOG_FILE="/home/user/logs/persistence.log"
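
Under the hood, the 'squash' method maps to a single huggingface_hub call; a minimal sketch, assuming a valid token and an existing dataset repo (the token and repo id below are placeholders):

    from huggingface_hub import HfApi

    api = HfApi(token='hf_...')  # placeholder token
    # Collapse the repo history into a single commit so LFS blobs referenced
    # only by old commits become eligible for server-side garbage collection
    api.super_squash_history(repo_id='user/my-backups', repo_type='dataset')

Note that only LFS_CLEANUP_METHOD appears to be consumed by the new Python module; FORCE_LFS_CLEANUP and ENABLE_LFS_MONITORING are exported by persistence.sh but are not read anywhere in this diff.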
scripts/utils/hf_persistence.py ADDED
@@ -0,0 +1,351 @@
+ #!/usr/bin/env python3
+ """
+ Hugging Face Data Persistence Tool
+
+ This module provides functionality for interacting with Hugging Face Datasets, including:
+ - Uploading archive files
+ - Managing archive file count
+ - LFS storage cleanup
+ - Downloading and restoring archive files
+ - Listing available archive files
+ """
+
+ import sys
+ import os
+ import traceback
+ import time
+ import tempfile
+ import subprocess
+ import argparse
+ from pathlib import Path
+ from huggingface_hub import HfApi
+
+ # Set UTC timezone
+ os.environ['TZ'] = 'UTC'
+ time.tzset()
+
+ # Set Hugging Face cache directory
+ cache_dir = '/home/user/.cache/huggingface'
+ os.makedirs(cache_dir, exist_ok=True)
+ os.environ['HF_HOME'] = cache_dir
+ os.environ['HUGGINGFACE_HUB_CACHE'] = cache_dir
+
+
+ class HFPersistenceManager:
+     """Hugging Face Data Persistence Manager"""
+
+     def __init__(self, token: str, dataset_id: str):
+         """
+         Initialize the manager
+
+         Args:
+             token: Hugging Face access token
+             dataset_id: Dataset ID
+         """
+         self.token = token
+         self.dataset_id = dataset_id
+         self.api = None
+
+         # Set authentication
+         os.environ['HUGGING_FACE_HUB_TOKEN'] = token
+
+     def _get_api(self) -> HfApi:
+         """Get HfApi instance"""
+         if self.api is None:
+             self.api = HfApi()
+         return self.api
+
+     def upload_archive(self, local_path: str, remote_path: str) -> bool:
+         """
+         Upload an archive file to the Hugging Face Dataset
+
+         Args:
+             local_path: Local file path
+             remote_path: Remote file path
+
+         Returns:
+             bool: Whether the upload was successful
+         """
+         try:
+             api = self._get_api()
+             api.upload_file(
+                 path_or_fileobj=local_path,
+                 path_in_repo=remote_path,
+                 repo_id=self.dataset_id,
+                 repo_type='dataset'
+             )
+             print(f'✓ Archive uploaded successfully: {remote_path}')
+             return True
+         except Exception as e:
+             print(f'✗ Archive upload failed: {str(e)}')
+             traceback.print_exc()
+             return False
+
+     def manage_archives(self, archive_prefix: str, archive_extension: str, max_files: int) -> bool:
+         """
+         Manage the archive file count, deleting old archives that exceed the limit
+
+         Args:
+             archive_prefix: Archive file prefix
+             archive_extension: Archive file extension
+             max_files: Maximum number of files to keep
+
+         Returns:
+             bool: Whether the operation was successful
+         """
+         try:
+             api = self._get_api()
+             files = api.list_repo_files(repo_id=self.dataset_id, repo_type='dataset')
+             archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
+             archive_files.sort()
+
+             files_to_delete = []
+             if len(archive_files) >= max_files:
+                 files_to_delete = archive_files[:(len(archive_files) - max_files)]
+                 for file_to_delete in files_to_delete:
+                     try:
+                         api.delete_file(path_in_repo=file_to_delete, repo_id=self.dataset_id, repo_type='dataset')
+                         print(f'✓ Deleted old archive: {file_to_delete}')
+                     except Exception as e:
+                         print(f'✗ Deletion failed {file_to_delete}: {str(e)}')
+
+             # LFS cleanup options
+             cleanup_method = os.environ.get('LFS_CLEANUP_METHOD', 'none')
+
+             if cleanup_method == 'squash' and len(files_to_delete) > 0:
+                 self._cleanup_lfs_squash(api)
+             elif cleanup_method == 'recreate' and len(files_to_delete) > 0:
+                 if not self._cleanup_lfs_recreate(api, archive_files, max_files, archive_prefix, archive_extension):
+                     return False
+
+             print(f'✓ Archive management completed, currently keeping {min(len(archive_files), max_files)} archives')
+             return True
+         except Exception as e:
+             print(f'✗ Archive management failed: {str(e)}')
+             return False
+
+     def _cleanup_lfs_squash(self, api: HfApi) -> None:
+         """Clean LFS history using super_squash_history"""
+         try:
+             print('🔄 Attempting to clean LFS history with super_squash_history...')
+             api.super_squash_history(repo_id=self.dataset_id, repo_type='dataset')
+             print('✅ LFS history cleanup attempted - storage may take time to reflect changes')
+         except Exception as e:
+             print(f'⚠️ LFS history cleanup failed: {str(e)}')
+             print('💡 Consider setting LFS_CLEANUP_METHOD=recreate for stronger cleanup')
+
+     def _cleanup_lfs_recreate(self, api: HfApi, archive_files: list, max_files: int,
+                               archive_prefix: str, archive_extension: str) -> bool:
+         """Force delete and recreate the dataset to clean LFS storage"""
+         print('🚨 WARNING: Force recreate mode enabled')
+         print('📋 This will delete and recreate the entire dataset to clean LFS storage')
+
+         # Back up files that need to be preserved
+         remaining_files = archive_files[-(max_files - 1):] if max_files > 1 else []
+         if remaining_files:
+             print(f'📦 Backing up {len(remaining_files)} files for restoration...')
+             backup_data = []
+             for file_name in remaining_files:
+                 try:
+                     # Download file content
+                     file_path = api.hf_hub_download(
+                         repo_id=self.dataset_id,
+                         filename=file_name,
+                         repo_type='dataset'
+                     )
+                     with open(file_path, 'rb') as f:
+                         backup_data.append((file_name, f.read()))
+                     print(f'✓ Backed up: {file_name}')
+                 except Exception as e:
+                     print(f'✗ Backup failed for {file_name}: {str(e)}')
+
+             # Delete dataset
+             try:
+                 print('🗑️ Deleting dataset to clean LFS storage...')
+                 api.delete_repo(repo_id=self.dataset_id, repo_type='dataset')
+                 print('✓ Dataset deleted successfully')
+
+                 # Wait for deletion to complete
+                 time.sleep(10)
+
+                 # Recreate dataset
+                 print('🔨 Recreating dataset...')
+                 api.create_repo(repo_id=self.dataset_id, repo_type='dataset', exist_ok=True)
+                 print('✓ Dataset recreated successfully')
+
+                 # Restore backed-up files
+                 print('📤 Restoring backed up files...')
+                 for file_name, file_content in backup_data:
+                     try:
+                         # Create temporary file
+                         with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+                             temp_file.write(file_content)
+                             temp_path = temp_file.name
+
+                         # Re-upload
+                         api.upload_file(
+                             path_or_fileobj=temp_path,
+                             path_in_repo=file_name,
+                             repo_id=self.dataset_id,
+                             repo_type='dataset'
+                         )
+
+                         # Clean up temporary file
+                         os.unlink(temp_path)
+                         print(f'✓ Restored: {file_name}')
+                     except Exception as e:
+                         print(f'✗ Restore failed for {file_name}: {str(e)}')
+
+                 print('🎉 Dataset recreation and LFS cleanup completed!')
+                 return True
+
+             except Exception as e:
+                 print(f'✗ Dataset recreation failed: {str(e)}')
+                 print('⚠️ Manual intervention may be required')
+                 return False
+         else:
+             print('📝 No files to preserve, skipping backup')
+             return True
+
+     def list_available_archives(self, archive_prefix: str, archive_extension: str) -> tuple[bool, str]:
+         """
+         List available archive files
+
+         Args:
+             archive_prefix: Archive file prefix
+             archive_extension: Archive file extension
+
+         Returns:
+             tuple: (success status, latest archive filename)
+         """
+         try:
+             api = self._get_api()
+             files = api.list_repo_files(self.dataset_id, repo_type='dataset')
+             archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
+             archive_files.sort(reverse=True)
+
+             if archive_files:
+                 print('Available archive list:')
+                 for i, archive in enumerate(archive_files, 1):
+                     print(f'  {i}. {archive}')
+                 # Return the latest archive filename
+                 print(f'LATEST_BACKUP:{archive_files[0]}')
+                 return True, archive_files[0]
+             else:
+                 print('No archive files found')
+                 return False, ""
+         except Exception as e:
+             print(f'Failed to get archive list: {str(e)}')
+             traceback.print_exc()
+             return False, ""
+
+     def restore_from_archive(self, archive_name: str, restore_path: str) -> bool:
+         """
+         Restore an archive from the Hugging Face Dataset
+
+         Args:
+             archive_name: Archive filename
+             restore_path: Restore path
+
+         Returns:
+             bool: Whether the restoration was successful
+         """
+         try:
+             api = self._get_api()
+
+             # Ensure the download directory exists with proper permissions
+             download_dir = '/home/user/downloads'
+             os.makedirs(download_dir, exist_ok=True)
+
+             # Download the archive file to the user directory
+             print(f'Downloading archive: {archive_name}')
+             local_path = api.hf_hub_download(
+                 repo_id=self.dataset_id,
+                 filename=archive_name,
+                 repo_type='dataset',
+                 local_dir=download_dir
+             )
+
+             # Extract the archive with better error handling
+             print(f'Extracting archive to: {restore_path}')
+
+             # Use subprocess for better control and error handling
+             # Set UTC timezone for timestamp consistency
+             env = os.environ.copy()
+             env['TZ'] = 'UTC'
+
+             extract_cmd = [
+                 'tar', '-xzf', local_path, '-C', restore_path,
+                 '--warning=no-timestamp',  # Suppress timestamp warnings
+                 '--warning=no-unknown-keyword',  # Suppress unknown keyword warnings
+                 '--no-same-owner',  # Don't try to restore original ownership
+                 '--no-same-permissions',  # Don't try to restore original permissions
+                 '--touch'  # Set extracted files' timestamps to current UTC time
+             ]
+
+             result = subprocess.run(extract_cmd, capture_output=True, text=True, env=env)
+
+             if result.returncode == 0:
+                 print(f'✓ Archive restored successfully: {archive_name}')
+                 print('✓ Timestamps normalized to UTC timezone')
+
+                 # Clean up the temporary file
+                 os.remove(local_path)
+                 return True
+             else:
+                 print(f'✗ Archive extraction failed with return code: {result.returncode}')
+                 if result.stderr:
+                     print(f'Error output: {result.stderr}')
+                 return False
+
+         except Exception as e:
+             print(f'✗ Archive restoration failed: {str(e)}')
+             traceback.print_exc()
+             return False
+
+
+ def main():
+     """Command line entry point"""
+     parser = argparse.ArgumentParser(description='Hugging Face Data Persistence Tool')
+     parser.add_argument('action', choices=['upload', 'list', 'restore'],
+                         help='Action to perform')
+     parser.add_argument('--token', required=True, help='Hugging Face access token')
+     parser.add_argument('--dataset-id', required=True, help='Dataset ID')
+     parser.add_argument('--archive-file', help='Archive file path (for upload)')
+     parser.add_argument('--filename', help='Remote filename (for upload)')
+     parser.add_argument('--archive-prefix', default='backup', help='Archive file prefix')
+     parser.add_argument('--archive-extension', default='tar.gz', help='Archive file extension')
+     parser.add_argument('--max-archives', type=int, default=5, help='Maximum number of archives to keep')
+     parser.add_argument('--archive-name', help='Archive name to restore (for restore)')
+     parser.add_argument('--restore-path', default='./', help='Restore path (for restore)')
+
+     args = parser.parse_args()
+
+     manager = HFPersistenceManager(args.token, args.dataset_id)
+
+     if args.action == 'upload':
+         if not args.archive_file or not args.filename:
+             print('✗ upload action requires --archive-file and --filename parameters')
+             sys.exit(1)
+
+         success = manager.upload_archive(args.archive_file, args.filename)
+         if success:
+             success = manager.manage_archives(args.archive_prefix, args.archive_extension, args.max_archives)
+
+         sys.exit(0 if success else 1)
+
+     elif args.action == 'list':
+         success, latest = manager.list_available_archives(args.archive_prefix, args.archive_extension)
+         sys.exit(0 if success else 1)
+
+     elif args.action == 'restore':
+         if not args.archive_name:
+             print('✗ restore action requires --archive-name parameter')
+             sys.exit(1)
+
+         success = manager.restore_from_archive(args.archive_name, args.restore_path)
+         sys.exit(0 if success else 1)
+
+
+ if __name__ == '__main__':
+     main()
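
Besides the CLI, the class can be used directly; a minimal library-style sketch of the flow the upload action drives, assuming scripts/utils is on PYTHONPATH (token, dataset id, and file names are placeholders):

    from hf_persistence import HFPersistenceManager

    manager = HFPersistenceManager(token='hf_...', dataset_id='user/my-backups')
    if manager.upload_archive('./backup-20240101.tar.gz', 'backup-20240101.tar.gz'):
        # Prune to the 5 most recent backup-*.tar.gz files; honors LFS_CLEANUP_METHOD
        manager.manage_archives('backup', 'tar.gz', max_files=5)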
scripts/utils/persistence.sh CHANGED
@@ -67,6 +67,11 @@ set_default_configuration() {
      # Logging configuration
      export LOG_FILE="${LOG_FILE:-}"
      export LOG_LEVEL="${LOG_LEVEL:-}"
+
+     # LFS Storage Cleanup configuration
+     export LFS_CLEANUP_METHOD="${LFS_CLEANUP_METHOD:-none}"
+     export FORCE_LFS_CLEANUP="${FORCE_LFS_CLEANUP:-false}"
+     export ENABLE_LFS_MONITORING="${ENABLE_LFS_MONITORING:-false}"
  }
 
  # Validate required environment variables
@@ -178,7 +183,7 @@ create_archive() {
      fi
  }
 
- # Embedded Python upload handler
+ # Call Python upload handler
  run_upload_handler() {
      local archive_file="$1"
      local filename="$2"
@@ -188,74 +193,18 @@ run_upload_handler() {
      local max_backups="$6"
      local token="$7"
 
-     python3 - <<EOF
- import sys
- import os
- import traceback
- import time
- from huggingface_hub import HfApi
-
- # Set UTC timezone for consistent timestamp handling
- os.environ['TZ'] = 'UTC'
- time.tzset()
-
- # Set up Hugging Face cache directory with proper permissions
- cache_dir = '/home/user/.cache/huggingface'
- os.makedirs(cache_dir, exist_ok=True)
- os.environ['HF_HOME'] = cache_dir
- os.environ['HUGGINGFACE_HUB_CACHE'] = cache_dir
- def upload_archive(api, local_path, remote_path, repo_id):
-     """Upload archive file to Hugging Face Dataset"""
-     try:
-         api.upload_file(
-             path_or_fileobj=local_path,
-             path_in_repo=remote_path,
-             repo_id=repo_id,
-             repo_type='dataset'
-         )
-         print(f'✓ Archive uploaded successfully: {remote_path}')
-         return True
-     except Exception as e:
-         print(f'✗ Archive upload failed: {str(e)}')
-         traceback.print_exc()
-         return False
- def manage_archives(api, repo_id, archive_prefix, archive_extension, max_files):
-     """Manage archive file count, delete old archives exceeding limit"""
-     try:
-         files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
-         archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
-         archive_files.sort()
-         if len(archive_files) >= max_files:
-             files_to_delete = archive_files[:(len(archive_files) - max_files)]
-             for file_to_delete in files_to_delete:
-                 try:
-                     api.delete_file(path_in_repo=file_to_delete, repo_id=repo_id, repo_type='dataset')
-                     print(f'✓ Deleted old archive: {file_to_delete}')
-                 except Exception as e:
-                     print(f'✗ Deletion failed {file_to_delete}: {str(e)}')
-         print(f'✓ Archive management completed, currently keeping {min(len(archive_files), max_files)} archives')
-         return True
-     except Exception as e:
-         print(f'✗ Archive management failed: {str(e)}')
-         return False
- # Set authentication
- os.environ['HUGGING_FACE_HUB_TOKEN'] = '$token'
- try:
-     api = HfApi()
-     # Upload file
-     if upload_archive(api, '$archive_file', '$filename', '$dataset_id'):
-         # Manage archive count
-         if manage_archives(api, '$dataset_id', '$backup_prefix', '$backup_extension', $max_backups):
-             sys.exit(0)
-         else:
-             sys.exit(1)
-     else:
-         sys.exit(1)
- except Exception as e:
-     print(f'✗ Upload process error: {str(e)}')
-     traceback.print_exc()
-     sys.exit(1)
- EOF
+     # Get script directory for relative imports
+     local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+     # Call the standalone Python module
+     python3 "${script_dir}/hf_persistence.py" upload \
+         --token "$token" \
+         --dataset-id "$dataset_id" \
+         --archive-file "$archive_file" \
+         --filename "$filename" \
+         --archive-prefix "$backup_prefix" \
+         --archive-extension "$backup_extension" \
+         --max-archives "$max_backups"
  }
 
  # Upload archive to Hugging Face
@@ -319,62 +268,22 @@ sync_daemon() {
      done
  }
 
- # Embedded Python archive lister
+ # Call Python archive lister
  run_archive_lister() {
      local dataset_id="$1"
      local backup_prefix="$2"
      local backup_extension="$3"
      local token="$4"
 
-     python3 - <<EOF
- import sys
- import os
- import traceback
- import time
- from huggingface_hub import HfApi
-
- # Set up Hugging Face cache directory with proper permissions
- cache_dir = '/home/user/.cache/huggingface'
- os.makedirs(cache_dir, exist_ok=True)
- os.environ['HF_HOME'] = cache_dir
- os.environ['HUGGINGFACE_HUB_CACHE'] = cache_dir
-
- # Set UTC timezone for consistent timestamp handling
- os.environ['TZ'] = 'UTC'
- time.tzset()
- def list_available_archives(api, repo_id, archive_prefix, archive_extension):
-     """List available archive files"""
-     try:
-         files = api.list_repo_files(repo_id, repo_type='dataset')
-         archive_files = [f for f in files if f.startswith(archive_prefix) and f.endswith(f'.{archive_extension}')]
-         archive_files.sort(reverse=True)
-         if archive_files:
-             print('Available archive list:')
-             for i, archive in enumerate(archive_files, 1):
-                 print(f'  {i}. {archive}')
-             # Return latest archive filename
-             print(f'LATEST_BACKUP:{archive_files[0]}')
-             return True
-         else:
-             print('No archive files found')
-             return False
-     except Exception as e:
-         print(f'Failed to get archive list: {str(e)}')
-         traceback.print_exc()
-         return False
- # Set authentication
- os.environ['HUGGING_FACE_HUB_TOKEN'] = '$token'
- try:
-     api = HfApi()
-     if list_available_archives(api, '$dataset_id', '$backup_prefix', '$backup_extension'):
-         sys.exit(0)
-     else:
-         sys.exit(1)
- except Exception as e:
-     print(f'List retrieval process error: {str(e)}')
-     traceback.print_exc()
-     sys.exit(1)
- EOF
+     # Get script directory for relative imports
+     local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+     # Call the standalone Python module
+     python3 "${script_dir}/hf_persistence.py" list \
+         --token "$token" \
+         --dataset-id "$dataset_id" \
+         --archive-prefix "$backup_prefix" \
+         --archive-extension "$backup_extension"
  }
 
  # List available archives
@@ -392,97 +301,22 @@ list_archives() {
      run_archive_lister "$DATASET_ID" "$ARCHIVE_PREFIX" "$ARCHIVE_EXTENSION" "$HF_TOKEN"
  }
 
- # Embedded Python download handler
+ # Call Python download handler
  run_download_handler() {
      local backup_name="$1"
      local dataset_id="$2"
      local restore_path="$3"
      local token="$4"
 
-     python3 - <<EOF
- import sys
- import os
- import traceback
- import subprocess
- import time
- from huggingface_hub import HfApi
-
- # Set UTC timezone for consistent timestamp handling
- os.environ['TZ'] = 'UTC'
- time.tzset()
-
- # Set up Hugging Face cache directory with proper permissions
- cache_dir = '/home/user/.cache/huggingface'
- os.makedirs(cache_dir, exist_ok=True)
- os.environ['HF_HOME'] = cache_dir
- os.environ['HUGGINGFACE_HUB_CACHE'] = cache_dir
-
- def restore_from_archive(api, repo_id, archive_name, restore_path):
-     """Restore archive from Hugging Face Dataset"""
-     try:
-         # Ensure download directory exists with proper permissions
-         download_dir = '/home/user/downloads'
-         os.makedirs(download_dir, exist_ok=True)
-
-         # Download archive file to user-owned directory
-         print(f'Downloading archive: {archive_name}')
-         local_path = api.hf_hub_download(
-             repo_id=repo_id,
-             filename=archive_name,
-             repo_type='dataset',
-             local_dir=download_dir
-         )
-
-         # Extract archive with better error handling
-         print(f'Extracting archive to: {restore_path}')
-
-         # Use subprocess for better control and error handling
-         # Set UTC timezone for consistent timestamp handling
-         env = os.environ.copy()
-         env['TZ'] = 'UTC'
-
-         extract_cmd = [
-             'tar', '-xzf', local_path, '-C', restore_path,
-             '--warning=no-timestamp',  # Suppress timestamp warnings
-             '--warning=no-unknown-keyword',  # Suppress unknown keyword warnings
-             '--no-same-owner',  # Don't try to restore original ownership
-             '--no-same-permissions',  # Don't try to restore original permissions
-             '--touch'  # Set extracted files' timestamps to current time in UTC
-         ]
-
-         result = subprocess.run(extract_cmd, capture_output=True, text=True, env=env)
-
-         if result.returncode == 0:
-             print(f'✓ Archive restored successfully: {archive_name}')
-             print('✓ Timestamps normalized to UTC timezone')
-
-             # Clean up temporary files
-             os.remove(local_path)
-             return True
-         else:
-             print(f'✗ Archive extraction failed with return code: {result.returncode}')
-             if result.stderr:
-                 print(f'Error output: {result.stderr}')
-             return False
-
-     except Exception as e:
-         print(f'✗ Archive restoration failed: {str(e)}')
-         traceback.print_exc()
-         return False
-
- # Set authentication
- os.environ['HUGGING_FACE_HUB_TOKEN'] = '$token'
- try:
-     api = HfApi()
-     if restore_from_archive(api, '$dataset_id', '$backup_name', '$restore_path'):
-         sys.exit(0)
-     else:
-         sys.exit(1)
- except Exception as e:
-     print(f'Restoration process error: {str(e)}')
-     traceback.print_exc()
-     sys.exit(1)
- EOF
+     # Get script directory for relative imports
+     local script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+     # Call the standalone Python module
+     python3 "${script_dir}/hf_persistence.py" restore \
+         --token "$token" \
+         --dataset-id "$dataset_id" \
+         --archive-name "$backup_name" \
+         --restore-path "$restore_path"
  }
 
  # Verify data integrity after restoration
@@ -619,8 +453,6 @@ restore_archive() {
      fi
  }
 
-
-
  # Main program entry
  main() {
      local command="start"
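
The refactored wrappers keep their original argument order, so existing call sites such as list_archives continue to work unchanged; a hypothetical direct call, with placeholder values:

    # args: backup_name, dataset_id, restore_path, token
    run_download_handler "backup-20240101.tar.gz" "user/my-backups" "/home/user/data" "$HF_TOKEN"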