Jaimodiji committed on
Commit
e5657d7
·
1 Parent(s): cdd3652

Upload hf_sync.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_sync.py +80 -34
hf_sync.py CHANGED
@@ -2,20 +2,64 @@ import os
2
  import sys
3
  import shutil
4
  import sqlite3
 
 
 
5
  from huggingface_hub import snapshot_download, HfApi
6
 
7
  # Configuration
8
  REPO_ID = os.environ.get("DATASET_REPO_ID")
9
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def verify_data():
12
- db_path = "data_repo/database.db"
13
- if not os.path.exists(db_path):
14
- print(f"VERIFICATION FAILED: {db_path} does not exist.")
15
  return False
16
 
17
  try:
18
- conn = sqlite3.connect(db_path)
19
  cursor = conn.cursor()
20
  cursor.execute("SELECT username FROM users WHERE username = ?", ("akshit",))
21
  row = cursor.fetchone()
@@ -25,7 +69,7 @@ def verify_data():
25
  print("VERIFICATION SUCCESS: User 'akshit' found in database.")
26
  return True
27
  else:
28
- print("VERIFICATION FAILED: User 'akshit' NOT found in database.")
29
  return False
30
  except Exception as e:
31
  print(f"VERIFICATION ERROR: {e}")
@@ -36,61 +80,63 @@ def download():
36
  print("DATASET_REPO_ID not set, skipping download.")
37
  return
38
 
39
- print(f"Downloading data from {REPO_ID}...")
40
  try:
41
- # snapshot_download is more efficient for many files than the CLI
42
  snapshot_download(
43
  repo_id=REPO_ID,
44
  repo_type="dataset",
45
- local_dir="data_repo",
46
  token=HF_TOKEN,
47
  max_workers=8
48
  )
49
  print("Download successful.")
 
50
  verify_data()
51
  except Exception as e:
52
  print(f"Download failed: {e}")
53
 
54
  def upload():
55
- if not REPO_ID:
56
- print("DATASET_REPO_ID not set, skipping upload.")
57
- return
58
- if not HF_TOKEN:
59
- print("HF_TOKEN not set, skipping upload.")
60
  return
61
 
62
- print(f"Uploading data to {REPO_ID}...")
 
 
 
 
 
 
63
  try:
 
 
 
 
 
 
64
  api = HfApi(token=HF_TOKEN)
65
  api.upload_folder(
66
- folder_path="data_repo",
67
  repo_id=REPO_ID,
68
  repo_type="dataset",
69
- delete_patterns="*", # Optional: sync deletion if needed
70
  )
71
  print("Upload successful.")
72
  except Exception as e:
73
  print(f"Upload failed: {e}")
 
 
 
74
 
75
  def init_local():
76
- """Ensure data_repo has the necessary structure if download failed or it's new."""
77
- os.makedirs("data_repo/output", exist_ok=True)
78
- os.makedirs("data_repo/processed", exist_ok=True)
79
- os.makedirs("data_repo/uploads", exist_ok=True)
80
 
81
  if __name__ == "__main__":
82
- if len(sys.argv) < 2:
83
- print("Usage: python hf_sync.py [download|upload|init|verify]")
84
- sys.exit(1)
85
-
86
- action = sys.argv[1]
87
- if action == "download":
88
- download()
89
- elif action == "upload":
90
- upload()
91
- elif action == "init":
92
- init_local()
93
- elif action == "verify":
94
- verify_data()
95
  else:
96
- print(f"Unknown action: {action}")
 
2
  import sys
3
  import shutil
4
  import sqlite3
5
+ import json
6
+ import time
7
+ from datetime import datetime
8
  from huggingface_hub import snapshot_download, HfApi
9
 
10
  # Configuration
11
  REPO_ID = os.environ.get("DATASET_REPO_ID")
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
DATA_DIR = "data_repo"  # local working copy of the HF dataset repo
DB_FILE = os.path.join(DATA_DIR, "database.db")  # SQLite DB synced to/from the repo
METADATA_FILE = os.path.join(DATA_DIR, "sync_metadata.json")  # version/provenance of last sync
LOCK_FILE = "/tmp/hf_sync.lock"  # best-effort guard against concurrent uploads
17
+
18
def get_metadata():
    """Load the sync metadata JSON, falling back to a default state.

    Returns:
        dict with keys "version" (int), "last_sync" (ISO timestamp or
        None) and "source" (host identifier or None). Defaults are
        returned when the file is missing, unreadable, or corrupt.
    """
    if os.path.exists(METADATA_FILE):
        try:
            with open(METADATA_FILE, 'r') as f:
                return json.load(f)
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; only I/O and parse errors mean
        # "treat the metadata as fresh".
        except (OSError, json.JSONDecodeError):
            pass
    return {"version": 0, "last_sync": None, "source": None}
26
+
27
def update_metadata(action):
    """Bump the sync version and record when, where, and why we synced.

    Args:
        action: short label for the operation ("backup", "restore", ...)
                stored as "last_action" in the metadata file.
    """
    meta = get_metadata()
    meta.update(
        version=meta["version"] + 1,
        last_sync=datetime.now().isoformat(),
        source=os.environ.get("HOSTNAME", "local"),
        last_action=action,
    )
    with open(METADATA_FILE, 'w') as f:
        json.dump(meta, f, indent=2)
35
+
36
def safe_db_backup():
    """Create a consistent snapshot of the SQLite database in place.

    Uses the sqlite3 online-backup API (Connection.backup) so that a
    copy taken while other processes hold the database open is still
    consistent, then swaps the snapshot over DB_FILE. On failure the
    temporary file is removed and the original database is untouched.

    Fixes over the previous version: both connections are now closed on
    every path (previously an exception during backup() leaked them,
    and a failure in the first connect() left `dest_conn` unbound).
    """
    if not os.path.exists(DB_FILE):
        return

    print("Creating consistent database snapshot...")
    backup_db = DB_FILE + ".tmp"
    try:
        source_conn = sqlite3.connect(DB_FILE)
        try:
            dest_conn = sqlite3.connect(backup_db)
            try:
                # `with` commits the backup as a single transaction.
                with dest_conn:
                    source_conn.backup(dest_conn)
            finally:
                dest_conn.close()
        finally:
            source_conn.close()
        # Handles are closed above, so the replace is safe even on
        # platforms that refuse to move a file with open handles.
        shutil.move(backup_db, DB_FILE)
    except Exception as e:
        print(f"Database backup failed: {e}")
        if os.path.exists(backup_db):
            os.remove(backup_db)
55
 
56
  def verify_data():
57
+ if not os.path.exists(DB_FILE):
58
+ print(f"VERIFICATION FAILED: {DB_FILE} does not exist.")
 
59
  return False
60
 
61
  try:
62
+ conn = sqlite3.connect(DB_FILE)
63
  cursor = conn.cursor()
64
  cursor.execute("SELECT username FROM users WHERE username = ?", ("akshit",))
65
  row = cursor.fetchone()
 
69
  print("VERIFICATION SUCCESS: User 'akshit' found in database.")
70
  return True
71
  else:
72
+ print("VERIFICATION FAILED: User 'akshit' NOT found.")
73
  return False
74
  except Exception as e:
75
  print(f"VERIFICATION ERROR: {e}")
 
80
  print("DATASET_REPO_ID not set, skipping download.")
81
  return
82
 
83
+ print(f"Downloading data from {REPO_ID} (Restore)...")
84
  try:
 
85
  snapshot_download(
86
  repo_id=REPO_ID,
87
  repo_type="dataset",
88
+ local_dir=DATA_DIR,
89
  token=HF_TOKEN,
90
  max_workers=8
91
  )
92
  print("Download successful.")
93
+ update_metadata("restore")
94
  verify_data()
95
  except Exception as e:
96
  print(f"Download failed: {e}")
97
 
98
def upload():
    """Back up DATA_DIR to the Hugging Face dataset repo.

    Takes a best-effort file lock so overlapping scheduled runs do not
    upload concurrently; a lock older than 10 minutes is treated as
    stale (e.g. a crashed run) and reclaimed. Failures are logged, not
    raised, and the lock is always released on exit from the upload.
    """
    if not REPO_ID or not HF_TOKEN:
        print("DATASET_REPO_ID or HF_TOKEN not set, skipping upload.")
        return

    # Reclaim a stale lock left behind by a run that died mid-upload.
    if os.path.exists(LOCK_FILE):
        if time.time() - os.path.getmtime(LOCK_FILE) < 600:
            print("Upload already in progress, skipping...")
            return
        try:
            os.remove(LOCK_FILE)
        except OSError:
            pass  # another process may have just removed/replaced it

    # O_CREAT | O_EXCL makes acquisition atomic: of two racing runs only
    # one open succeeds (the previous exists-check + open('w') was a
    # TOCTOU race that let both proceed).
    try:
        fd = os.open(LOCK_FILE, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
    except FileExistsError:
        print("Upload already in progress, skipping...")
        return
    with os.fdopen(fd, 'w') as f:
        f.write(str(os.getpid()))

    try:
        print(f"Starting scheduled backup to {REPO_ID}...")
        safe_db_backup()          # snapshot the DB before shipping it
        update_metadata("backup")

        api = HfApi(token=HF_TOKEN)
        api.upload_folder(
            folder_path=DATA_DIR,
            repo_id=REPO_ID,
            repo_type="dataset",
            commit_message=f"Automated backup v{get_metadata()['version']}"
        )
        print("Upload successful.")
    except Exception as e:
        print(f"Upload failed: {e}")
    finally:
        if os.path.exists(LOCK_FILE):
            os.remove(LOCK_FILE)
129
 
130
def init_local():
    """Ensure the local data directory skeleton exists (idempotent)."""
    for subdir in ("output", "processed", "uploads"):
        os.makedirs(f"{DATA_DIR}/{subdir}", exist_ok=True)
 
134
 
135
if __name__ == "__main__":
    # Dispatch on the first CLI argument. Anything unrecognised
    # (including a missing argument) prints usage and exits non-zero —
    # the previous version printed usage but still exited 0, which
    # hides misconfigured cron/CI invocations.
    actions = {
        "download": download,
        "upload": upload,
        "init": init_local,
        "verify": verify_data,
    }
    action = sys.argv[1] if len(sys.argv) > 1 else "help"
    handler = actions.get(action)
    if handler is not None:
        handler()
    else:
        print("Usage: python hf_sync.py [download|upload|init|verify]")
        sys.exit(1)