Yash030 Claude Opus 4.7 committed on
Commit
a7a7e59
·
1 Parent(s): e24267e

perf: skip backup when no files changed using mtime fingerprint

Browse files

Avoids 20-min upload cycles by comparing file size+mtime before staging.
Only uploads when something actually changed; saves state in .backup_state.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (1) hide show
  1. sync.py +38 -0
sync.py CHANGED
@@ -5,6 +5,7 @@ Usage:
5
  python3 sync.py restore -- download DB from HF on startup
6
  python3 sync.py backup -- upload DB to HF (called in loop)
7
  """
 
8
  import os
9
  import sys
10
  import shutil
@@ -24,6 +25,27 @@ SKIP_FILES = {".env"}
24
  ALLOW_HIDDEN = {".hmac"}
25
  SKIP_NAMES = {"LOCK"} # held open by Dolt — always skip
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def get_api():
28
  return HfApi(token=HF_TOKEN)
29
 
@@ -68,6 +90,17 @@ def backup():
68
  return
69
  api = get_api()
70
 
 
 
 
 
 
 
 
 
 
 
 
71
  # Ensure repo exists
72
  try:
73
  api.repo_info(REPO_ID, repo_type="dataset")
@@ -114,6 +147,11 @@ def backup():
114
  commit_message="sync: periodic backup",
115
  )
116
  print("[sync] backup complete")
 
 
 
 
 
117
  except Exception as e:
118
  print(f"[sync] backup error: {e}")
119
  finally:
 
5
  python3 sync.py restore -- download DB from HF on startup
6
  python3 sync.py backup -- upload DB to HF (called in loop)
7
  """
8
+ import json
9
  import os
10
  import sys
11
  import shutil
 
25
  ALLOW_HIDDEN = {".hmac"}
26
  SKIP_NAMES = {"LOCK"} # held open by Dolt — always skip
27
 
28
def _quick_hash(data_dir):
    """Return a JSON fingerprint of the backup-eligible files under *data_dir*.

    Each eligible file is recorded as ``relative/path -> [size, mtime]`` using
    only ``os.stat`` metadata, so no file contents are read. The mapping is
    serialized with sorted keys, so the walk order does not affect the result.

    Files named in ``SKIP_FILES`` or ``SKIP_NAMES`` are ignored, as are
    dot-files not listed in ``ALLOW_HIDDEN``. Files that cannot be stat'ed
    are silently left out of the fingerprint.
    """
    fingerprint = {}
    for current_dir, subdirs, filenames in os.walk(data_dir):
        path_parts = current_dir.replace("\\", "/").split("/")
        if ".dolt" not in path_parts:
            # Not under a .dolt component: prune hidden directories in place
            # so os.walk does not descend into them (.dolt itself is kept).
            subdirs[:] = [
                name
                for name in subdirs
                if name == ".dolt" or not name.startswith(".")
            ]
        for name in filenames:
            excluded = (
                name in SKIP_FILES
                or name in SKIP_NAMES
                or (name.startswith(".") and name not in ALLOW_HIDDEN)
            )
            if excluded:
                continue
            abs_path = os.path.join(current_dir, name)
            # Normalize separators so the key is the same on every platform.
            rel_key = os.path.relpath(abs_path, data_dir).replace("\\", "/")
            try:
                info = os.stat(abs_path)
            except OSError:
                # Best effort: a file that cannot be stat'ed is omitted.
                continue
            fingerprint[rel_key] = (info.st_size, info.st_mtime)
    return json.dumps(fingerprint, sort_keys=True)
48
+
49
  def get_api():
50
  return HfApi(token=HF_TOKEN)
51
 
 
90
  return
91
  api = get_api()
92
 
93
+ # Fast change detection — skip everything if nothing modified
94
+ state_file = os.path.join(DATA_DIR, ".backup_state")
95
+ current_state = _quick_hash(DATA_DIR)
96
+ if os.path.exists(state_file):
97
+ try:
98
+ if open(state_file).read() == current_state:
99
+ print("[sync] no changes — skipping backup")
100
+ return
101
+ except Exception:
102
+ pass
103
+
104
  # Ensure repo exists
105
  try:
106
  api.repo_info(REPO_ID, repo_type="dataset")
 
147
  commit_message="sync: periodic backup",
148
  )
149
  print("[sync] backup complete")
150
+ # Save state fingerprint so next cycle skips if nothing changed
151
+ try:
152
+ open(state_file, "w").write(current_state)
153
+ except Exception:
154
+ pass
155
  except Exception as e:
156
  print(f"[sync] backup error: {e}")
157
  finally: