# data-sync.sh
# Injected helper: backs up ./data/webui.db to a Hugging Face dataset repo
# and restores the newest backup on startup. Requires HF_TOKEN and DATASET_ID.
# This script is injected to handle data backups to Hugging Face.
# Backups need both the Hugging Face token and the target dataset repo id;
# when either is missing the feature is disabled gracefully instead of erroring.
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
echo "Warning: HF_TOKEN or DATASET_ID not set. Backup functionality is disabled."
else
# Create the Python sync utility
# NOTE: the quoted 'EOL' delimiter writes the Python source verbatim —
# $-variables inside the heredoc are NOT expanded by the shell.
cat > /tmp/hf_sync.py << 'EOL'
from huggingface_hub import HfApi
import sys
import os
# Keeps the latest 'max_files' backups and deletes older ones.
# Retention policy: cap the number of timestamped backups kept in the repo.
def manage_backups(api, repo_id, max_files=25):
    """Prune the oldest `webui_backup_*.db` files so that, after the upload
    that follows, at most `max_files` backups remain in the dataset repo.

    All errors are printed and swallowed; pruning must never break a backup.
    """
    try:
        repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        # Names embed a YYYYMMDD_HHMMSS timestamp, so lexicographic order
        # is chronological order: oldest first.
        backups = sorted(
            name for name in repo_files
            if name.startswith('webui_backup_') and name.endswith('.db')
        )
        if len(backups) < max_files:
            return
        surplus = len(backups) - max_files + 1
        print(f"Pruning {surplus} old backup(s)...")
        for stale in backups[:surplus]:
            try:
                api.delete_file(path_in_repo=stale, repo_id=repo_id, repo_type="dataset")
            except Exception as e:
                print(f'Error deleting {stale}: {str(e)}')
    except Exception as e:
        print(f"An error occurred during backup management: {str(e)}")
# Uploads a single backup file.
def upload_backup(file_path, file_name, token, repo_id):
    """Push one local backup file into the dataset repo as `file_name`,
    then enforce the retention limit. Failures are printed, never raised."""
    hub = HfApi(token=token)
    try:
        print(f"Uploading backup: {file_name}...")
        hub.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="dataset",
        )
        print("Upload successful.")
        # Prune only after a successful upload so a failed push
        # never deletes existing history.
        manage_backups(hub, repo_id)
    except Exception as e:
        print(f"File upload failed: {str(e)}")
# Downloads the most recent backup file.
def download_latest_backup(token, repo_id):
    """Restore ./data/webui.db from the newest backup in the dataset repo.

    Backup names embed a YYYYMMDD_HHMMSS timestamp, so the last entry of the
    sorted list is the most recent one. Any failure is printed and swallowed:
    a missing or unreachable backup must not block startup.
    """
    import shutil  # local import keeps the generated script's header untouched
    try:
        api = HfApi(token=token)
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        backup_files = sorted([f for f in files if f.startswith('webui_backup_') and f.endswith('.db')])
        if not backup_files:
            print("No existing backups found.")
            return
        latest_backup = backup_files[-1]
        print(f"Restoring from latest backup: {latest_backup}")
        filepath = api.hf_hub_download(
            repo_id=repo_id,
            filename=latest_backup,
            repo_type="dataset"
        )
        if filepath and os.path.exists(filepath):
            os.makedirs('./data', exist_ok=True)
            # shutil.copyfile instead of os.system('cp "..."'): no shell is
            # involved (filenames can't be re-interpreted), and a failed copy
            # raises — and is reported below — instead of being ignored like
            # an unchecked os.system() exit status.
            shutil.copyfile(filepath, './data/webui.db')
            print("Restore successful.")
    except Exception as e:
        print(f"Could not download backup: {str(e)}")
if __name__ == "__main__":
    # CLI contract (invoked by the surrounding shell script):
    #   hf_sync.py upload   <token> <repo_id> <file_path> <file_name>
    #   hf_sync.py download <token> <repo_id>
    # Validate argument count up front so a bad invocation produces a clear
    # usage message and exit code instead of an IndexError traceback.
    if len(sys.argv) < 4:
        print("Usage: hf_sync.py <upload|download> <token> <repo_id> [file_path file_name]")
        sys.exit(1)
    action = sys.argv[1]
    token = sys.argv[2]
    repo_id = sys.argv[3]
    if action == "upload":
        if len(sys.argv) < 6:
            print("upload requires <file_path> and <file_name> arguments")
            sys.exit(1)
        upload_backup(sys.argv[4], sys.argv[5], token, repo_id)
    elif action == "download":
        download_latest_backup(token, repo_id)
    else:
        # Previously unknown actions were silently ignored; fail loudly.
        print(f"Unknown action: {action}")
        sys.exit(1)
EOL
# On startup, restore the latest backup
# (the helper logs "No existing backups found." and returns if the repo
# holds no webui_backup_*.db files yet).
echo "Searching for existing backup to restore..."
python3 /tmp/hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}"
sync_data() {
while true; do
SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
sleep $SYNC_INTERVAL
echo "Starting data sync cycle..."
if [ -f "./data/webui.db" ]; then
timestamp=$(date +%Y%m%d_%H%M%S)
backup_file="webui_backup_${timestamp}.db"
temp_path="/tmp/${backup_file}"
cp ./data/webui.db "${temp_path}"
python3 /tmp/hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "${temp_path}" "${backup_file}"
rm -f "${temp_path}"
fi
done
}
# Start the sync process in the background
# (`&` forks a subshell; the enclosing script continues immediately and the
# loop keeps running for the lifetime of the container/process).
sync_data &
echo "Data backup process started in background."
fi