nxdev-org commited on
Commit
4560dd6
·
verified ·
1 Parent(s): fcf7edb

Update sync_storage.py

Browse files
Files changed (1) hide show
  1. sync_storage.py +109 -108
sync_storage.py CHANGED
@@ -5,7 +5,7 @@ import json
5
  import hashlib
6
  from pathlib import Path
7
  from datetime import datetime
8
- from huggingface_hub import HfApi, create_repo, list_repo_files
9
  import tarfile
10
  import tempfile
11
  import logging
@@ -15,44 +15,40 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
15
  logger = logging.getLogger(__name__)
16
 
17
  class HFStorageSync:
18
- def __init__(self, repo_id, token=None, data_dir="/tmp/open-webui-data",
19
  max_backups=3, compression_level=6):
20
  self.repo_id = repo_id
21
  self.data_dir = Path(data_dir)
22
  self.token = token
23
  self.max_backups = max_backups
24
  self.compression_level = compression_level
25
-
26
  # Initialize API with token directly
27
  self.api = HfApi(token=token) if token else HfApi()
28
-
29
  # File patterns for better organization
30
  self.archive_pattern = "data-{timestamp}.tar.gz"
31
  self.latest_link = "data-latest.tar.gz"
32
  self.metadata_file = "storage-metadata.json"
33
-
34
  def _get_directory_hash(self):
35
  """Calculate hash of directory contents for change detection"""
36
  hasher = hashlib.sha256()
37
-
38
  if not self.data_dir.exists():
39
  return hasher.hexdigest()
40
-
41
  for item in sorted(self.data_dir.rglob('*')):
42
  if item.is_file() and item.name not in [".gitkeep", "test_write"]:
43
  hasher.update(str(item.relative_to(self.data_dir)).encode())
44
  hasher.update(str(item.stat().st_mtime).encode())
45
  hasher.update(str(item.stat().st_size).encode())
46
-
47
  return hasher.hexdigest()
48
-
49
  def _get_archive_size(self, archive_path):
50
  """Get the size of an archive file"""
51
  try:
52
  return os.path.getsize(archive_path)
53
  except:
54
  return 0
55
-
56
  def _format_size(self, size_bytes):
57
  """Format file size in human readable format"""
58
  for unit in ['B', 'KB', 'MB', 'GB']:
@@ -66,7 +62,6 @@ class HFStorageSync:
66
  if not self.token:
67
  logger.warning("No token provided, cannot create repository")
68
  return False
69
-
70
  try:
71
  # Check if repo exists
72
  repo_info = self.api.repo_info(repo_id=self.repo_id, repo_type="dataset")
@@ -83,49 +78,39 @@ class HFStorageSync:
83
  exist_ok=True
84
  )
85
  logger.info(f"Created repository {self.repo_id}")
86
-
87
  # Create initial README and metadata
88
  self._create_initial_files()
89
  return True
90
  except Exception as create_error:
91
  logger.error(f"Failed to create repository: {create_error}")
92
  return False
93
-
94
  def _create_initial_files(self):
95
  """Create initial repository files"""
96
  readme_content = """# Open WebUI Storage
97
-
98
  This dataset stores persistent data for Open WebUI deployment with automatic cleanup and versioning.
99
-
100
  ## Contents
101
-
102
  - `data-latest.tar.gz`: Latest data archive (symlink)
103
  - `data-YYYYMMDD-HHMMSS.tar.gz`: Timestamped data archives
104
  - `storage-metadata.json`: Metadata about stored archives
105
  - `README.md`: This file
106
-
107
  ## Features
108
-
109
  - Automatic cleanup of old backups
110
  - Change detection to avoid unnecessary uploads
111
  - Compression optimization
112
  - Storage usage monitoring
113
-
114
  This repository is automatically managed by the Open WebUI sync system.
115
  """
116
-
117
  metadata = {
118
  "created": datetime.utcnow().isoformat(),
119
  "max_backups": self.max_backups,
120
  "archives": [],
121
  "total_size": 0
122
  }
123
-
124
  # Upload README
125
  with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as tmp:
126
  tmp.write(readme_content)
127
  tmp.flush()
128
-
129
  self.api.upload_file(
130
  path_or_fileobj=tmp.name,
131
  path_in_repo="README.md",
@@ -135,12 +120,10 @@ This repository is automatically managed by the Open WebUI sync system.
135
  token=self.token
136
  )
137
  os.unlink(tmp.name)
138
-
139
  # Upload metadata
140
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
141
  json.dump(metadata, tmp, indent=2)
142
  tmp.flush()
143
-
144
  self.api.upload_file(
145
  path_or_fileobj=tmp.name,
146
  path_in_repo=self.metadata_file,
@@ -150,7 +133,7 @@ This repository is automatically managed by the Open WebUI sync system.
150
  token=self.token
151
  )
152
  os.unlink(tmp.name)
153
-
154
  def _get_metadata(self):
155
  """Download and parse metadata"""
156
  try:
@@ -160,7 +143,6 @@ This repository is automatically managed by the Open WebUI sync system.
160
  repo_type="dataset",
161
  token=self.token
162
  )
163
-
164
  with open(file_path, 'r') as f:
165
  return json.load(f)
166
  except Exception as e:
@@ -171,14 +153,13 @@ This repository is automatically managed by the Open WebUI sync system.
171
  "archives": [],
172
  "total_size": 0
173
  }
174
-
175
  def _update_metadata(self, metadata):
176
  """Upload updated metadata"""
177
  try:
178
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
179
  json.dump(metadata, tmp, indent=2)
180
  tmp.flush()
181
-
182
  self.api.upload_file(
183
  path_or_fileobj=tmp.name,
184
  path_in_repo=self.metadata_file,
@@ -190,17 +171,15 @@ This repository is automatically managed by the Open WebUI sync system.
190
  os.unlink(tmp.name)
191
  except Exception as e:
192
  logger.error(f"Failed to update metadata: {e}")
193
-
194
  def _cleanup_old_archives(self, metadata):
195
  """Remove old archives beyond max_backups limit"""
196
  if len(metadata["archives"]) <= self.max_backups:
197
  return metadata
198
-
199
  # Sort by timestamp, keep newest
200
  archives = sorted(metadata["archives"], key=lambda x: x["timestamp"], reverse=True)
201
  to_keep = archives[:self.max_backups]
202
  to_delete = archives[self.max_backups:]
203
-
204
  total_deleted_size = 0
205
  for archive in to_delete:
206
  try:
@@ -214,23 +193,18 @@ This repository is automatically managed by the Open WebUI sync system.
214
  logger.info(f"Deleted old archive: {archive['filename']} ({self._format_size(archive['size'])})")
215
  except Exception as e:
216
  logger.warning(f"Failed to delete {archive['filename']}: {e}")
217
-
218
  metadata["archives"] = to_keep
219
  metadata["total_size"] -= total_deleted_size
220
-
221
  if total_deleted_size > 0:
222
  logger.info(f"Cleaned up {self._format_size(total_deleted_size)} of storage")
223
-
224
  return metadata
225
 
226
  def download_data(self):
227
  """Download and extract latest data from HF dataset repo"""
228
  try:
229
  logger.info("Downloading data from Hugging Face...")
230
-
231
  # Ensure data directory exists and is writable
232
  self.data_dir.mkdir(parents=True, exist_ok=True)
233
-
234
  # Test write permissions
235
  test_file = self.data_dir / "test_write"
236
  try:
@@ -240,16 +214,13 @@ This repository is automatically managed by the Open WebUI sync system.
240
  except Exception as e:
241
  logger.warning(f"Data directory may not be writable: {e}")
242
  return
243
-
244
  if not self.token:
245
  logger.warning("No HF_TOKEN provided, skipping download")
246
  return
247
-
248
  # Ensure repository exists
249
  if not self.ensure_repo_exists():
250
  logger.error("Could not access or create repository")
251
  return
252
-
253
  # Try to download the latest data archive
254
  try:
255
  # First try the latest link
@@ -259,52 +230,40 @@ This repository is automatically managed by the Open WebUI sync system.
259
  repo_type="dataset",
260
  token=self.token
261
  )
262
-
263
  with tarfile.open(file_path, 'r:gz') as tar:
264
  tar.extractall(self.data_dir)
265
-
266
  archive_size = self._get_archive_size(file_path)
267
  logger.info(f"Data extracted to {self.data_dir} ({self._format_size(archive_size)})")
268
-
269
  except Exception as e:
270
  logger.info(f"No existing data found (normal for first run): {e}")
271
-
272
  except Exception as e:
273
  logger.error(f"Error during download: {e}")
274
-
275
  def upload_data(self, force=False):
276
  """Compress and upload data to HF dataset repo with change detection"""
277
  try:
278
  if not self.token:
279
  logger.warning("No HF_TOKEN provided, skipping upload")
280
  return
281
-
282
  if not self.data_dir.exists() or not any(self.data_dir.iterdir()):
283
  logger.warning("No data to upload")
284
  return
285
-
286
  # Calculate current directory hash
287
  current_hash = self._get_directory_hash()
288
-
289
  # Get metadata to check for changes
290
  metadata = self._get_metadata()
291
  last_hash = metadata.get("last_hash")
292
-
293
  if not force and current_hash == last_hash:
294
  logger.info("No changes detected, skipping upload")
295
  return
296
-
297
  logger.info("Changes detected, preparing upload...")
298
-
299
  # Ensure repository exists
300
  if not self.ensure_repo_exists():
301
  logger.error("Could not access or create repository")
302
  return
303
-
304
  # Create timestamped filename
305
  timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
306
  archive_filename = self.archive_pattern.format(timestamp=timestamp)
307
-
308
  # Create temporary archive with optimized compression
309
  with tempfile.NamedTemporaryFile(suffix='.tar.gz', delete=False) as tmp:
310
  with tarfile.open(tmp.name, f'w:gz', compresslevel=self.compression_level) as tar:
@@ -315,11 +274,9 @@ This repository is automatically managed by the Open WebUI sync system.
315
  if item.is_file():
316
  total_files += 1
317
  elif item.is_dir():
318
- total_files += sum(1 for _ in item.rglob('*') if _.is_file())
319
-
320
  archive_size = self._get_archive_size(tmp.name)
321
  logger.info(f"Created archive: {self._format_size(archive_size)}, {total_files} files")
322
-
323
  # Upload timestamped archive
324
  self.api.upload_file(
325
  path_or_fileobj=tmp.name,
@@ -329,7 +286,6 @@ This repository is automatically managed by the Open WebUI sync system.
329
  commit_message=f"Update Open WebUI data - {timestamp}",
330
  token=self.token
331
  )
332
-
333
  # Upload as latest (overwrite)
334
  self.api.upload_file(
335
  path_or_fileobj=tmp.name,
@@ -339,10 +295,8 @@ This repository is automatically managed by the Open WebUI sync system.
339
  commit_message=f"Update latest data - {timestamp}",
340
  token=self.token
341
  )
342
-
343
  # Clean up temp file
344
  os.unlink(tmp.name)
345
-
346
  # Update metadata
347
  archive_info = {
348
  "filename": archive_filename,
@@ -351,123 +305,170 @@ This repository is automatically managed by the Open WebUI sync system.
351
  "hash": current_hash,
352
  "files_count": total_files
353
  }
354
-
355
  metadata["archives"].append(archive_info)
356
  metadata["total_size"] += archive_size
357
  metadata["last_hash"] = current_hash
358
  metadata["last_upload"] = datetime.utcnow().isoformat()
359
-
360
  # Cleanup old archives
361
  metadata = self._cleanup_old_archives(metadata)
362
-
363
  # Update metadata
364
  self._update_metadata(metadata)
365
-
366
  logger.info(f"Upload successful: {archive_filename} ({self._format_size(archive_size)})")
367
  logger.info(f"Total storage used: {self._format_size(metadata['total_size'])}")
368
-
 
 
369
  except Exception as e:
370
  logger.error(f"Error uploading data: {e}")
371
-
372
  def list_archives(self):
373
  """List all available archives"""
374
  try:
375
  metadata = self._get_metadata()
376
-
377
  if not metadata["archives"]:
378
  logger.info("No archives found")
379
  return
380
-
381
  logger.info("Available archives:")
382
  logger.info("-" * 60)
383
-
384
  total_size = 0
385
  for archive in sorted(metadata["archives"], key=lambda x: x["timestamp"], reverse=True):
386
  size_str = self._format_size(archive["size"])
387
  files_count = archive.get("files_count", "unknown")
388
  logger.info(f"{archive['filename']:<30} {size_str:>10} {files_count:>6} files")
389
  total_size += archive["size"]
390
-
391
  logger.info("-" * 60)
392
  logger.info(f"Total: {len(metadata['archives'])} archives, {self._format_size(total_size)}")
393
-
394
  except Exception as e:
395
  logger.error(f"Error listing archives: {e}")
396
-
397
  def cleanup_storage(self):
398
  """Force cleanup of old archives"""
399
  try:
400
  metadata = self._get_metadata()
401
  old_size = metadata["total_size"]
402
-
403
  metadata = self._cleanup_old_archives(metadata)
404
  self._update_metadata(metadata)
405
-
406
  saved = old_size - metadata["total_size"]
407
  if saved > 0:
408
  logger.info(f"Cleanup completed. Saved {self._format_size(saved)} of storage")
409
  else:
410
  logger.info("No cleanup needed")
411
-
412
  except Exception as e:
413
  logger.error(f"Error during cleanup: {e}")
414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
  def main():
416
  import sys
417
-
418
  repo_id = os.getenv("HF_STORAGE_REPO", "nxdev-org/open-webui-storage")
419
  token = os.getenv("HF_TOKEN")
420
  data_dir = os.getenv("DATA_DIR", "/tmp/open-webui-data")
421
  max_backups = int(os.getenv("MAX_BACKUPS", "3"))
422
-
423
  sync = HFStorageSync(repo_id, token, data_dir, max_backups=max_backups)
424
-
425
  if len(sys.argv) > 1:
426
  command = sys.argv[1].lower()
427
-
 
428
  if command == "download":
429
  sync.download_data()
430
  elif command == "upload":
431
- force = len(sys.argv) > 2 and sys.argv[2] == "--force"
432
  sync.upload_data(force=force)
433
  elif command == "list":
434
  sync.list_archives()
435
  elif command == "cleanup":
436
  sync.cleanup_storage()
 
 
 
 
 
 
 
 
437
  else:
438
- print("Usage: sync_storage.py [download|upload [--force]|list|cleanup]")
 
439
  else:
440
- print("Usage: sync_storage.py [download|upload [--force]|list|cleanup]")
441
- print("\nCommands:")
442
- print(" download Download and extract latest data")
443
- print(" upload Upload data (only if changed)")
444
- print(" upload --force Force upload even if no changes detected")
445
- print(" list List all available archives")
446
- print(" cleanup Force cleanup of old archives")
447
- print("\nEnvironment variables:")
448
- print(" HF_STORAGE_REPO Hugging Face repository ID")
449
- print(" HF_TOKEN Hugging Face API token")
450
- print(" DATA_DIR Local data directory")
451
- print(" MAX_BACKUPS Maximum number of backups to keep (default: 3)")
452
-
453
- """
454
- # Download latest data
455
- python sync_storage.py download
456
 
457
- # Upload only if changed (default)
458
- python sync_storage.py upload
 
 
 
 
 
 
 
 
 
 
 
459
 
460
- # Force upload regardless of changes
461
- python sync_storage.py upload --force
462
 
463
- # List all archives
464
- python sync_storage.py list
465
-
466
- # Clean up old archives
467
- python sync_storage.py cleanup
468
-
469
- # Set custom backup limit
470
- MAX_BACKUPS=5 python sync_storage.py upload
471
- """
472
  if __name__ == "__main__":
473
  main()
 
5
  import hashlib
6
  from pathlib import Path
7
  from datetime import datetime
8
+ from huggingface_hub import HfApi, create_repo, list_repo_files, delete_repo
9
  import tarfile
10
  import tempfile
11
  import logging
 
15
  logger = logging.getLogger(__name__)
16
 
17
  class HFStorageSync:
18
+ def __init__(self, repo_id, token=None, data_dir="/tmp/open-webui-data",
19
  max_backups=3, compression_level=6):
20
  self.repo_id = repo_id
21
  self.data_dir = Path(data_dir)
22
  self.token = token
23
  self.max_backups = max_backups
24
  self.compression_level = compression_level
 
25
  # Initialize API with token directly
26
  self.api = HfApi(token=token) if token else HfApi()
 
27
  # File patterns for better organization
28
  self.archive_pattern = "data-{timestamp}.tar.gz"
29
  self.latest_link = "data-latest.tar.gz"
30
  self.metadata_file = "storage-metadata.json"
31
+
32
  def _get_directory_hash(self):
33
  """Calculate hash of directory contents for change detection"""
34
  hasher = hashlib.sha256()
 
35
  if not self.data_dir.exists():
36
  return hasher.hexdigest()
37
+
38
  for item in sorted(self.data_dir.rglob('*')):
39
  if item.is_file() and item.name not in [".gitkeep", "test_write"]:
40
  hasher.update(str(item.relative_to(self.data_dir)).encode())
41
  hasher.update(str(item.stat().st_mtime).encode())
42
  hasher.update(str(item.stat().st_size).encode())
 
43
  return hasher.hexdigest()
44
+
45
  def _get_archive_size(self, archive_path):
46
  """Get the size of an archive file"""
47
  try:
48
  return os.path.getsize(archive_path)
49
  except:
50
  return 0
51
+
52
  def _format_size(self, size_bytes):
53
  """Format file size in human readable format"""
54
  for unit in ['B', 'KB', 'MB', 'GB']:
 
62
  if not self.token:
63
  logger.warning("No token provided, cannot create repository")
64
  return False
 
65
  try:
66
  # Check if repo exists
67
  repo_info = self.api.repo_info(repo_id=self.repo_id, repo_type="dataset")
 
78
  exist_ok=True
79
  )
80
  logger.info(f"Created repository {self.repo_id}")
 
81
  # Create initial README and metadata
82
  self._create_initial_files()
83
  return True
84
  except Exception as create_error:
85
  logger.error(f"Failed to create repository: {create_error}")
86
  return False
87
+
88
  def _create_initial_files(self):
89
  """Create initial repository files"""
90
  readme_content = """# Open WebUI Storage
 
91
  This dataset stores persistent data for Open WebUI deployment with automatic cleanup and versioning.
 
92
  ## Contents
 
93
  - `data-latest.tar.gz`: Latest data archive (symlink)
94
  - `data-YYYYMMDD-HHMMSS.tar.gz`: Timestamped data archives
95
  - `storage-metadata.json`: Metadata about stored archives
96
  - `README.md`: This file
 
97
  ## Features
 
98
  - Automatic cleanup of old backups
99
  - Change detection to avoid unnecessary uploads
100
  - Compression optimization
101
  - Storage usage monitoring
 
102
  This repository is automatically managed by the Open WebUI sync system.
103
  """
 
104
  metadata = {
105
  "created": datetime.utcnow().isoformat(),
106
  "max_backups": self.max_backups,
107
  "archives": [],
108
  "total_size": 0
109
  }
 
110
  # Upload README
111
  with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as tmp:
112
  tmp.write(readme_content)
113
  tmp.flush()
 
114
  self.api.upload_file(
115
  path_or_fileobj=tmp.name,
116
  path_in_repo="README.md",
 
120
  token=self.token
121
  )
122
  os.unlink(tmp.name)
 
123
  # Upload metadata
124
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
125
  json.dump(metadata, tmp, indent=2)
126
  tmp.flush()
 
127
  self.api.upload_file(
128
  path_or_fileobj=tmp.name,
129
  path_in_repo=self.metadata_file,
 
133
  token=self.token
134
  )
135
  os.unlink(tmp.name)
136
+
137
  def _get_metadata(self):
138
  """Download and parse metadata"""
139
  try:
 
143
  repo_type="dataset",
144
  token=self.token
145
  )
 
146
  with open(file_path, 'r') as f:
147
  return json.load(f)
148
  except Exception as e:
 
153
  "archives": [],
154
  "total_size": 0
155
  }
156
+
157
  def _update_metadata(self, metadata):
158
  """Upload updated metadata"""
159
  try:
160
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp:
161
  json.dump(metadata, tmp, indent=2)
162
  tmp.flush()
 
163
  self.api.upload_file(
164
  path_or_fileobj=tmp.name,
165
  path_in_repo=self.metadata_file,
 
171
  os.unlink(tmp.name)
172
  except Exception as e:
173
  logger.error(f"Failed to update metadata: {e}")
174
+
175
  def _cleanup_old_archives(self, metadata):
176
  """Remove old archives beyond max_backups limit"""
177
  if len(metadata["archives"]) <= self.max_backups:
178
  return metadata
 
179
  # Sort by timestamp, keep newest
180
  archives = sorted(metadata["archives"], key=lambda x: x["timestamp"], reverse=True)
181
  to_keep = archives[:self.max_backups]
182
  to_delete = archives[self.max_backups:]
 
183
  total_deleted_size = 0
184
  for archive in to_delete:
185
  try:
 
193
  logger.info(f"Deleted old archive: {archive['filename']} ({self._format_size(archive['size'])})")
194
  except Exception as e:
195
  logger.warning(f"Failed to delete {archive['filename']}: {e}")
 
196
  metadata["archives"] = to_keep
197
  metadata["total_size"] -= total_deleted_size
 
198
  if total_deleted_size > 0:
199
  logger.info(f"Cleaned up {self._format_size(total_deleted_size)} of storage")
 
200
  return metadata
201
 
202
  def download_data(self):
203
  """Download and extract latest data from HF dataset repo"""
204
  try:
205
  logger.info("Downloading data from Hugging Face...")
 
206
  # Ensure data directory exists and is writable
207
  self.data_dir.mkdir(parents=True, exist_ok=True)
 
208
  # Test write permissions
209
  test_file = self.data_dir / "test_write"
210
  try:
 
214
  except Exception as e:
215
  logger.warning(f"Data directory may not be writable: {e}")
216
  return
 
217
  if not self.token:
218
  logger.warning("No HF_TOKEN provided, skipping download")
219
  return
 
220
  # Ensure repository exists
221
  if not self.ensure_repo_exists():
222
  logger.error("Could not access or create repository")
223
  return
 
224
  # Try to download the latest data archive
225
  try:
226
  # First try the latest link
 
230
  repo_type="dataset",
231
  token=self.token
232
  )
 
233
  with tarfile.open(file_path, 'r:gz') as tar:
234
  tar.extractall(self.data_dir)
 
235
  archive_size = self._get_archive_size(file_path)
236
  logger.info(f"Data extracted to {self.data_dir} ({self._format_size(archive_size)})")
 
237
  except Exception as e:
238
  logger.info(f"No existing data found (normal for first run): {e}")
 
239
  except Exception as e:
240
  logger.error(f"Error during download: {e}")
241
+
242
  def upload_data(self, force=False):
243
  """Compress and upload data to HF dataset repo with change detection"""
244
  try:
245
  if not self.token:
246
  logger.warning("No HF_TOKEN provided, skipping upload")
247
  return
 
248
  if not self.data_dir.exists() or not any(self.data_dir.iterdir()):
249
  logger.warning("No data to upload")
250
  return
 
251
  # Calculate current directory hash
252
  current_hash = self._get_directory_hash()
 
253
  # Get metadata to check for changes
254
  metadata = self._get_metadata()
255
  last_hash = metadata.get("last_hash")
 
256
  if not force and current_hash == last_hash:
257
  logger.info("No changes detected, skipping upload")
258
  return
 
259
  logger.info("Changes detected, preparing upload...")
 
260
  # Ensure repository exists
261
  if not self.ensure_repo_exists():
262
  logger.error("Could not access or create repository")
263
  return
 
264
  # Create timestamped filename
265
  timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
266
  archive_filename = self.archive_pattern.format(timestamp=timestamp)
 
267
  # Create temporary archive with optimized compression
268
  with tempfile.NamedTemporaryFile(suffix='.tar.gz', delete=False) as tmp:
269
  with tarfile.open(tmp.name, f'w:gz', compresslevel=self.compression_level) as tar:
 
274
  if item.is_file():
275
  total_files += 1
276
  elif item.is_dir():
277
+ total_files += sum(1 for _f in item.rglob('*') if _f.is_file())
 
278
  archive_size = self._get_archive_size(tmp.name)
279
  logger.info(f"Created archive: {self._format_size(archive_size)}, {total_files} files")
 
280
  # Upload timestamped archive
281
  self.api.upload_file(
282
  path_or_fileobj=tmp.name,
 
286
  commit_message=f"Update Open WebUI data - {timestamp}",
287
  token=self.token
288
  )
 
289
  # Upload as latest (overwrite)
290
  self.api.upload_file(
291
  path_or_fileobj=tmp.name,
 
295
  commit_message=f"Update latest data - {timestamp}",
296
  token=self.token
297
  )
 
298
  # Clean up temp file
299
  os.unlink(tmp.name)
 
300
  # Update metadata
301
  archive_info = {
302
  "filename": archive_filename,
 
305
  "hash": current_hash,
306
  "files_count": total_files
307
  }
 
308
  metadata["archives"].append(archive_info)
309
  metadata["total_size"] += archive_size
310
  metadata["last_hash"] = current_hash
311
  metadata["last_upload"] = datetime.utcnow().isoformat()
 
312
  # Cleanup old archives
313
  metadata = self._cleanup_old_archives(metadata)
 
314
  # Update metadata
315
  self._update_metadata(metadata)
 
316
  logger.info(f"Upload successful: {archive_filename} ({self._format_size(archive_size)})")
317
  logger.info(f"Total storage used: {self._format_size(metadata['total_size'])}")
318
+
319
+ # Remove old commit history
320
+ self.prune_repo_history()
321
  except Exception as e:
322
  logger.error(f"Error uploading data: {e}")
323
+
324
  def list_archives(self):
325
  """List all available archives"""
326
  try:
327
  metadata = self._get_metadata()
 
328
  if not metadata["archives"]:
329
  logger.info("No archives found")
330
  return
 
331
  logger.info("Available archives:")
332
  logger.info("-" * 60)
 
333
  total_size = 0
334
  for archive in sorted(metadata["archives"], key=lambda x: x["timestamp"], reverse=True):
335
  size_str = self._format_size(archive["size"])
336
  files_count = archive.get("files_count", "unknown")
337
  logger.info(f"{archive['filename']:<30} {size_str:>10} {files_count:>6} files")
338
  total_size += archive["size"]
 
339
  logger.info("-" * 60)
340
  logger.info(f"Total: {len(metadata['archives'])} archives, {self._format_size(total_size)}")
 
341
  except Exception as e:
342
  logger.error(f"Error listing archives: {e}")
343
+
344
  def cleanup_storage(self):
345
  """Force cleanup of old archives"""
346
  try:
347
  metadata = self._get_metadata()
348
  old_size = metadata["total_size"]
 
349
  metadata = self._cleanup_old_archives(metadata)
350
  self._update_metadata(metadata)
 
351
  saved = old_size - metadata["total_size"]
352
  if saved > 0:
353
  logger.info(f"Cleanup completed. Saved {self._format_size(saved)} of storage")
354
  else:
355
  logger.info("No cleanup needed")
 
356
  except Exception as e:
357
  logger.error(f"Error during cleanup: {e}")
358
 
359
+ # <<< NEW FUNCTION START >>>
360
+ def prune_repo_history(self):
361
+ """
362
+ DANGEROUS: Deletes and recreates the repo to purge git history and reduce storage.
363
+ This will erase all old versions, keeping only the current files.
364
+ """
365
+ logger.warning("="*60)
366
+ logger.warning("!!! DESTRUCTIVE OPERATION INITIATED: PRUNE REPO HISTORY !!!")
367
+ logger.warning(f"This will permanently delete the repository '{self.repo_id}' and its entire Git history.")
368
+ logger.warning("Only the most recent versions of files will be preserved.")
369
+ logger.warning("="*60)
370
+
371
+ try:
372
+ # 1. Get the list of current files to preserve
373
+ logger.info("Step 1/5: Listing current files in the repository...")
374
+ current_files = list_repo_files(self.repo_id, repo_type="dataset", token=self.token)
375
+ if not current_files:
376
+ logger.warning("Repository is empty or inaccessible. Nothing to prune.")
377
+ return
378
+
379
+ logger.info(f"Found {len(current_files)} files to preserve.")
380
+
381
+ # 2. Download all current files to a temporary directory
382
+ with tempfile.TemporaryDirectory() as tmpdir:
383
+ logger.info(f"Step 2/5: Downloading current files to a temporary location...")
384
+ for file_path in current_files:
385
+ self.api.hf_hub_download(
386
+ repo_id=self.repo_id,
387
+ filename=file_path,
388
+ repo_type="dataset",
389
+ token=self.token,
390
+ local_dir=tmpdir,
391
+ local_dir_use_symlinks=False
392
+ )
393
+
394
+ # 3. Delete the entire repository
395
+ logger.warning(f"Step 3/5: Deleting repository '{self.repo_id}'...")
396
+ delete_repo(self.repo_id, repo_type="dataset", token=self.token)
397
+ logger.info("Repository deleted successfully.")
398
+
399
+ # 4. Re-create the repository (now empty with no history)
400
+ logger.info(f"Step 4/5: Re-creating repository '{self.repo_id}'...")
401
+ self.ensure_repo_exists() # This will create it and initial files
402
+ logger.info("Repository re-created successfully.")
403
+
404
+ # 5. Upload the preserved files back to the new repository
405
+ logger.info("Step 5/5: Uploading preserved files to the new repository...")
406
+ self.api.upload_folder(
407
+ folder_path=tmpdir,
408
+ repo_id=self.repo_id,
409
+ repo_type="dataset",
410
+ token=self.token,
411
+ commit_message="Repo history pruned, restoring current files"
412
+ )
413
+
414
+ logger.info("="*60)
415
+ logger.info("Repository history prune complete. Storage usage has been reset.")
416
+ logger.info("="*60)
417
+
418
+ except Exception as e:
419
+ logger.error(f"An error occurred during the prune operation: {e}")
420
+ logger.error("The repository may be in an inconsistent state. Please check the Hugging Face Hub.")
421
+ # <<< NEW FUNCTION END >>>
422
+
423
  def main():
424
  import sys
 
425
  repo_id = os.getenv("HF_STORAGE_REPO", "nxdev-org/open-webui-storage")
426
  token = os.getenv("HF_TOKEN")
427
  data_dir = os.getenv("DATA_DIR", "/tmp/open-webui-data")
428
  max_backups = int(os.getenv("MAX_BACKUPS", "3"))
429
+
430
  sync = HFStorageSync(repo_id, token, data_dir, max_backups=max_backups)
431
+
432
  if len(sys.argv) > 1:
433
  command = sys.argv[1].lower()
434
+ force = "--force" in sys.argv
435
+
436
  if command == "download":
437
  sync.download_data()
438
  elif command == "upload":
 
439
  sync.upload_data(force=force)
440
  elif command == "list":
441
  sync.list_archives()
442
  elif command == "cleanup":
443
  sync.cleanup_storage()
444
+ # <<< NEW COMMAND HANDLING START >>>
445
+ elif command == "prune-history":
446
+ if not force:
447
+ print("ERROR: This is a destructive operation that will delete all backup history.")
448
+ print("Please use 'prune-history --force' to confirm you want to proceed.")
449
+ sys.exit(1)
450
+ sync.prune_repo_history()
451
+ # <<< NEW COMMAND HANDLING END >>>
452
  else:
453
+ print(f"Unknown command: {command}")
454
+ print_usage()
455
  else:
456
+ print_usage()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
+ def print_usage():
459
+ print("Usage: sync_storage.py [command]")
460
+ print("\nCommands:")
461
+ print(" download Download and extract latest data")
462
+ print(" upload [--force] Upload data (use --force to ignore change detection)")
463
+ print(" list List all available archives")
464
+ print(" cleanup Force cleanup of old archives based on MAX_BACKUPS")
465
+ print(" prune-history --force DANGEROUS: Deletes and recreates the repo to clear all git history.")
466
+ print("\nEnvironment variables:")
467
+ print(" HF_STORAGE_REPO Hugging Face repository ID")
468
+ print(" HF_TOKEN Hugging Face API token")
469
+ print(" DATA_DIR Local data directory")
470
+ print(" MAX_BACKUPS Maximum number of backups to keep (default: 3)")
471
 
 
 
472
 
 
 
 
 
 
 
 
 
 
473
  if __name__ == "__main__":
474
  main()