Mcanroe committed on
Commit
ed1b963
·
verified ·
1 Parent(s): e68bfca

Update data-sync.sh

Browse files
Files changed (1) hide show
  1. data-sync.sh +102 -81
data-sync.sh CHANGED
@@ -1,91 +1,112 @@
1
- #!/bin/bash
2
- # Note: Ensure that date, git, and other tools are installed in the system when the script is executed.
3
 
4
- # Check necessary environment variables
5
- if [ -z "$G_NAME" ] || [ -z "$G_TOKEN" ]; then
6
- echo "Missing required environment variables G_NAME or G_TOKEN"
7
- exit 1
8
- fi
9
-
10
- # Build GitHub repository clone URL with token
11
- REPO_URL="https://${G_TOKEN}@github.com/${G_NAME}.git"
12
- REPO_DIR="./data/github_data"
13
- mkdir -p "$REPO_DIR"
14
-
15
- # Clone repository
16
- echo "Cloning repository..."
17
- git clone --depth 1 "$REPO_URL" "$REPO_DIR" || {
18
- echo "Clone failed, please check if G_NAME and G_TOKEN are correct."
19
- exit 1
20
- }
21
-
22
- # Check if the initial webui.db exists and copy it
23
- if [ -f "$REPO_DIR/webui.db" ]; then
24
- cp "$REPO_DIR/webui.db" ./data/webui.db
25
- echo "Successfully pulled webui.db from GitHub repository"
26
  else
27
- echo "webui.db not found in GitHub repository, will push during sync"
28
- fi
 
 
 
29
 
30
- # Define sync function, performs a sync every 12 hours.
31
- sync_data() {
32
- while true; do
33
- # Output current London time during sync
34
- CURRENT_TIME=$(TZ=Europe/London date '+%Y-%m-%d %H:%M:%S')
35
- echo "Current time $CURRENT_TIME"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- # ---- Start sync process ----
38
- echo "Starting GitHub sync..."
39
- cd "$REPO_DIR" || { echo "Failed to change directory to $REPO_DIR"; exit 1; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- git config user.name "AutoSync Bot"
42
- git config user.email "autosync@bot.com"
 
 
 
 
 
43
 
44
- if [ -z "$(git branch --list main)" ]; then
45
- echo "Branch 'main' not found. Creating it for the initial commit."
46
- git checkout -b main
47
- else
48
- git switch main
49
- fi
 
 
 
 
50
 
51
- # Copy latest database file to repository directory
52
- if [ -f "../webui.db" ]; then
53
- cp ../webui.db ./webui.db
54
- else
55
- echo "Database file ../webui.db not yet initialized by the application."
56
- if [ ! -f "./webui.db" ] && [ -z "$(git ls-files)" ]; then
57
- echo "Creating .gitkeep to initialize the repository."
58
- touch .gitkeep
59
- git add .gitkeep
60
- fi
61
- fi
62
 
63
- # Check if there are changes
64
- if [[ -n $(git status --porcelain) ]]; then
65
- git add webui.db .gitkeep
66
- git commit -m "Auto sync webui.db: $(TZ=Europe/London date '+%Y-%m-%d %H:%M:%S')"
67
-
68
- echo "Pushing changes to GitHub..."
69
- git push --set-upstream origin main && {
70
- echo "GitHub push successful"
71
- } || {
72
- echo "Push failed, will retry once..."
73
- sleep 10
74
- git push origin main || {
75
- echo "Retry failed, abandoning GitHub push for this cycle."
76
- }
77
- }
78
- else
79
- echo "GitHub: No database changes detected"
80
- fi
81
- # Return to the original directory
82
- cd ../..
83
 
84
- # ---- Sync process complete ----
85
- echo "Sync complete. Waiting 12 hours for the next sync."
86
- sleep 43200
87
- done
88
- }
 
 
 
 
 
 
 
 
89
 
90
- # Start sync process in background
91
- sync_data &
 
 
 
1
+ # This script is injected to handle data backups to Hugging Face.
 
2
 
3
+ if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
4
+ echo "Warning: HF_TOKEN or DATASET_ID not set. Backup functionality is disabled."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  else
6
+ # Create the Python sync utility
7
+ cat > /tmp/hf_sync.py << 'EOL'
8
+ from huggingface_hub import HfApi
9
+ import sys
10
+ import os
11
 
12
+ # Keeps the latest 'max_files' backups and deletes older ones.
13
+ def manage_backups(api, repo_id, max_files=25):
14
+ try:
15
+ files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
16
+ backup_files = sorted([f for f in files if f.startswith('webui_backup_') and f.endswith('.db')])
17
+
18
+ if len(backup_files) >= max_files:
19
+ num_to_delete = len(backup_files) - max_files + 1
20
+ files_to_delete = backup_files[:num_to_delete]
21
+ print(f"Pruning {num_to_delete} old backup(s)...")
22
+ for file_to_delete in files_to_delete:
23
+ try:
24
+ api.delete_file(path_in_repo=file_to_delete, repo_id=repo_id, repo_type="dataset")
25
+ except Exception as e:
26
+ print(f'Error deleting {file_to_delete}: {str(e)}')
27
+ except Exception as e:
28
+ print(f"An error occurred during backup management: {str(e)}")
29
+
30
+ # Uploads a single backup file.
31
+ def upload_backup(file_path, file_name, token, repo_id):
32
+ api = HfApi(token=token)
33
+ try:
34
+ print(f"Uploading backup: {file_name}...")
35
+ api.upload_file(
36
+ path_or_fileobj=file_path,
37
+ path_in_repo=file_name,
38
+ repo_id=repo_id,
39
+ repo_type="dataset"
40
+ )
41
+ print("Upload successful.")
42
+ manage_backups(api, repo_id)
43
+ except Exception as e:
44
+ print(f"File upload failed: {str(e)}")
45
 
46
+ # Downloads the most recent backup file.
47
+ def download_latest_backup(token, repo_id):
48
+ try:
49
+ api = HfApi(token=token)
50
+ files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
51
+ backup_files = sorted([f for f in files if f.startswith('webui_backup_') and f.endswith('.db')])
52
+
53
+ if not backup_files:
54
+ print("No existing backups found.")
55
+ return
56
+
57
+ latest_backup = backup_files[-1]
58
+ print(f"Restoring from latest backup: {latest_backup}")
59
+
60
+ filepath = api.hf_hub_download(
61
+ repo_id=repo_id,
62
+ filename=latest_backup,
63
+ repo_type="dataset"
64
+ )
65
 
66
+ if filepath and os.path.exists(filepath):
67
+ os.makedirs('./data', exist_ok=True)
68
+ os.system(f'cp "{filepath}" ./data/webui.db')
69
+ print("Restore successful.")
70
+
71
+ except Exception as e:
72
+ print(f"Could not download backup: {str(e)}")
73
 
74
+ if __name__ == "__main__":
75
+ action = sys.argv[1]
76
+ token = sys.argv[2]
77
+ repo_id = sys.argv[3]
78
+
79
+ if action == "upload":
80
+ upload_backup(sys.argv[4], sys.argv[5], token, repo_id)
81
+ elif action == "download":
82
+ download_latest_backup(token, repo_id)
83
+ EOL
84
 
85
+ # On startup, restore the latest backup
86
+ echo "Searching for existing backup to restore..."
87
+ python3 /tmp/hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}"
 
 
 
 
 
 
 
 
88
 
89
+ # Define the continuous sync function
90
+ sync_data() {
91
+ while true; do
92
+ SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
93
+ sleep $SYNC_INTERVAL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ echo "Starting data sync cycle..."
96
+
97
+ if [ -f "./data/webui.db" ]; then
98
+ timestamp=$(date +%Y%m%d_%H%M%S)
99
+ backup_file="webui_backup_${timestamp}.db"
100
+ temp_path="/tmp/${backup_file}"
101
+
102
+ cp ./data/webui.db "${temp_path}"
103
+ python3 /tmp/hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "${temp_path}" "${backup_file}"
104
+ rm -f "${temp_path}"
105
+ fi
106
+ done
107
+ }
108
 
109
+ # Start the sync process in the background
110
+ sync_data &
111
+ echo "Data backup process started in background."
112
+ fi