Jaimodiji committed on
Commit
d30571e
·
1 Parent(s): e5657d7

Upload hf_sync.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hf_sync.py +55 -67
hf_sync.py CHANGED
@@ -4,6 +4,7 @@ import shutil
4
  import sqlite3
5
  import json
6
  import time
 
7
  from datetime import datetime
8
  from huggingface_hub import snapshot_download, HfApi
9
 
@@ -22,24 +23,41 @@ def get_metadata():
22
  return json.load(f)
23
  except:
24
  pass
25
- return {"version": 0, "last_sync": None, "source": None}
26
 
27
- def update_metadata(action):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  meta = get_metadata()
29
  meta["version"] += 1
30
  meta["last_sync"] = datetime.now().isoformat()
31
  meta["source"] = os.environ.get("HOSTNAME", "local")
32
  meta["last_action"] = action
 
 
33
  with open(METADATA_FILE, 'w') as f:
34
  json.dump(meta, f, indent=2)
35
 
36
  def safe_db_backup():
37
- """Create a consistent snapshot of the SQLite database."""
38
- if not os.path.exists(DB_FILE):
39
- return
40
 
41
- print("Creating consistent database snapshot...")
42
- backup_db = DB_FILE + ".tmp"
43
  try:
44
  source_conn = sqlite3.connect(DB_FILE)
45
  dest_conn = sqlite3.connect(backup_db)
@@ -47,96 +65,66 @@ def safe_db_backup():
47
  source_conn.backup(dest_conn)
48
  source_conn.close()
49
  dest_conn.close()
50
- shutil.move(backup_db, DB_FILE)
51
  except Exception as e:
52
  print(f"Database backup failed: {e}")
53
- if os.path.exists(backup_db):
54
- os.remove(backup_db)
55
-
56
- def verify_data():
57
- if not os.path.exists(DB_FILE):
58
- print(f"VERIFICATION FAILED: {DB_FILE} does not exist.")
59
- return False
60
-
61
- try:
62
- conn = sqlite3.connect(DB_FILE)
63
- cursor = conn.cursor()
64
- cursor.execute("SELECT username FROM users WHERE username = ?", ("akshit",))
65
- row = cursor.fetchone()
66
- conn.close()
67
-
68
- if row:
69
- print("VERIFICATION SUCCESS: User 'akshit' found in database.")
70
- return True
71
- else:
72
- print("VERIFICATION FAILED: User 'akshit' NOT found.")
73
- return False
74
- except Exception as e:
75
- print(f"VERIFICATION ERROR: {e}")
76
- return False
77
 
78
  def download():
79
- if not REPO_ID:
80
- print("DATASET_REPO_ID not set, skipping download.")
81
- return
82
-
83
- print(f"Downloading data from {REPO_ID} (Restore)...")
84
  try:
85
- snapshot_download(
86
- repo_id=REPO_ID,
87
- repo_type="dataset",
88
- local_dir=DATA_DIR,
89
- token=HF_TOKEN,
90
- max_workers=8
91
- )
92
  print("Download successful.")
93
- update_metadata("restore")
94
- verify_data()
95
  except Exception as e:
96
  print(f"Download failed: {e}")
97
 
98
  def upload():
99
- if not REPO_ID or not HF_TOKEN:
100
- return
101
-
102
- # Simple lock check
103
  if os.path.exists(LOCK_FILE):
104
- # If lock is older than 10 mins, assume it's stale
105
- if time.time() - os.path.getmtime(LOCK_FILE) < 600:
106
- print("Upload already in progress, skipping...")
107
- return
108
-
109
  try:
110
  with open(LOCK_FILE, 'w') as f: f.write(str(os.getpid()))
111
 
112
- print(f"Starting scheduled backup to {REPO_ID}...")
113
- safe_db_backup()
114
- update_metadata("backup")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  api = HfApi(token=HF_TOKEN)
117
  api.upload_folder(
118
  folder_path=DATA_DIR,
119
  repo_id=REPO_ID,
120
  repo_type="dataset",
121
- commit_message=f"Automated backup v{get_metadata()['version']}"
122
  )
123
  print("Upload successful.")
124
  except Exception as e:
125
  print(f"Upload failed: {e}")
126
  finally:
127
- if os.path.exists(LOCK_FILE):
128
- os.remove(LOCK_FILE)
129
 
130
  def init_local():
131
- os.makedirs(f"{DATA_DIR}/output", exist_ok=True)
132
- os.makedirs(f"{DATA_DIR}/processed", exist_ok=True)
133
- os.makedirs(f"{DATA_DIR}/uploads", exist_ok=True)
134
 
135
  if __name__ == "__main__":
136
  action = sys.argv[1] if len(sys.argv) > 1 else "help"
137
  if action == "download": download()
138
  elif action == "upload": upload()
139
  elif action == "init": init_local()
140
- elif action == "verify": verify_data()
141
- else:
142
- print("Usage: python hf_sync.py [download|upload|init|verify]")
 
4
  import sqlite3
5
  import json
6
  import time
7
+ import hashlib
8
  from datetime import datetime
9
  from huggingface_hub import snapshot_download, HfApi
10
 
 
23
  return json.load(f)
24
  except:
25
  pass
26
+ return {"version": 0, "last_sync": None, "source": None, "last_db_hash": None}
27
 
28
def get_dir_stats():
    """Return the combined entry count of the data sub-directories.

    Used as a cheap change signal for sync: a different count means
    entries were added or removed since the last recorded sync.
    """
    subdirs = (os.path.join(DATA_DIR, name) for name in ('uploads', 'processed', 'output'))
    return sum(len(os.listdir(path)) for path in subdirs if os.path.exists(path))
36
+
37
def get_file_hash(path):
    """Return the hex MD5 digest of *path*, or None if it does not exist.

    Reads in 4 KiB chunks so large database files are never loaded
    into memory all at once.
    """
    if not os.path.exists(path):
        return None
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        while chunk := fh.read(4096):
            digest.update(chunk)
    return digest.hexdigest()
44
+
45
def update_metadata(action, db_hash):
    """Bump the sync metadata and persist it to METADATA_FILE.

    action  -- label for what triggered the update (e.g. "backup", "restore").
    db_hash -- MD5 of the database snapshot, or None when unknown.
    """
    meta = get_metadata()
    meta.update(
        version=meta["version"] + 1,
        last_sync=datetime.now().isoformat(),
        source=os.environ.get("HOSTNAME", "local"),
        last_action=action,
        last_db_hash=db_hash,
        file_count=get_dir_stats(),
    )
    with open(METADATA_FILE, 'w') as handle:
        json.dump(meta, handle, indent=2)
55
 
56
def safe_db_backup():
    """Snapshot the SQLite database to a sibling ".bak" file.

    Uses sqlite3's online backup API so the copy is consistent even if
    another connection is writing concurrently.

    Returns:
        The path of the snapshot file, or None when the database does
        not exist or the backup fails.
    """
    if not os.path.exists(DB_FILE):
        return None

    # We backup to a temp file to get a consistent hash/file
    backup_db = DB_FILE + ".bak"
    source_conn = None
    dest_conn = None
    try:
        source_conn = sqlite3.connect(DB_FILE)
        dest_conn = sqlite3.connect(backup_db)
        source_conn.backup(dest_conn)
        return backup_db
    except Exception as e:
        print(f"Database backup failed: {e}")
        # Remove any half-written snapshot so a later run never hashes
        # or uploads a corrupt file.
        if os.path.exists(backup_db):
            try:
                os.remove(backup_db)
            except OSError:
                pass
        return None
    finally:
        # Close on every path; the previous version leaked both handles
        # whenever connect()/backup() raised.
        if source_conn is not None:
            source_conn.close()
        if dest_conn is not None:
            dest_conn.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
def download():
    """Restore the local data directory from the Hub dataset repo."""
    if not REPO_ID:
        return
    print(f"Downloading data from {REPO_ID}...")
    try:
        snapshot_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            local_dir=DATA_DIR,
            token=HF_TOKEN,
            max_workers=8,
        )
        print("Download successful.")
        update_metadata("restore", get_file_hash(DB_FILE))
    except Exception as exc:
        print(f"Download failed: {exc}")
82
 
83
def upload():
    """Back up the data directory to the Hub when anything changed.

    Serialised by a lock file (considered stale after 10 minutes).
    Change detection compares the DB snapshot hash and the data-dir
    entry count against the values recorded by the previous sync.
    """
    if not REPO_ID or not HF_TOKEN:
        return

    # Simple lock check: skip if another upload started < 10 min ago.
    if os.path.exists(LOCK_FILE):
        if time.time() - os.path.getmtime(LOCK_FILE) < 600:
            return

    metadata_written = False
    prev_meta_raw = None
    try:
        with open(LOCK_FILE, 'w') as f:
            f.write(str(os.getpid()))

        meta = get_metadata()
        backup_path = safe_db_backup()
        if not backup_path:
            return

        current_hash = get_file_hash(backup_path)
        current_count = get_dir_stats()

        # ONLY UPLOAD IF CHANGED
        if current_hash == meta.get("last_db_hash") and current_count == meta.get("file_count", 0):
            print("No changes detected (DB hash and file count match). skipping upload.")
            os.remove(backup_path)
            return

        print(f"Changes detected. Syncing to {REPO_ID}...")
        # Replace the real db with the consistent backup for upload.
        # NOTE(review): any write landing between safe_db_backup() and this
        # move is lost — confirm writers are quiesced during upload.
        shutil.move(backup_path, DB_FILE)

        # Remember the pre-update metadata so a failed upload can be rolled
        # back. Previously the new hash was recorded before the upload, so
        # after a failure every later run saw "no changes" and silently
        # skipped the retry.
        if os.path.exists(METADATA_FILE):
            with open(METADATA_FILE) as f:
                prev_meta_raw = f.read()
        update_metadata("backup", current_hash)
        metadata_written = True

        api = HfApi(token=HF_TOKEN)
        api.upload_folder(
            folder_path=DATA_DIR,
            repo_id=REPO_ID,
            repo_type="dataset",
            commit_message=f"Auto-backup v{get_metadata()['version']}"
        )
        print("Upload successful.")
    except Exception as e:
        print(f"Upload failed: {e}")
        # Roll the metadata bump back so the next run retries the upload.
        if metadata_written:
            try:
                if prev_meta_raw is not None:
                    with open(METADATA_FILE, 'w') as f:
                        f.write(prev_meta_raw)
                else:
                    os.remove(METADATA_FILE)
            except OSError:
                pass
    finally:
        if os.path.exists(LOCK_FILE):
            os.remove(LOCK_FILE)
 
121
 
122
def init_local():
    """Ensure the local data sub-directories exist (idempotent)."""
    for name in ('output', 'processed', 'uploads'):
        os.makedirs(f"{DATA_DIR}/{name}", exist_ok=True)
 
 
124
 
125
if __name__ == "__main__":
    # Dispatch on the first CLI argument; anything unknown prints usage.
    action = sys.argv[1] if len(sys.argv) > 1 else "help"
    handlers = {"download": download, "upload": upload, "init": init_local}
    if action in handlers:
        handlers[action]()
    else:
        print("Usage: python hf_sync.py [download|upload|init]")