zhuoranyang committed on
Commit
144d5cc
·
1 Parent(s): 2774c42

Improve HF result commit auth and increase training log heartbeat

Browse files
hf_app/app.py CHANGED
@@ -901,27 +901,83 @@ def _commit_results_to_repo(p):
901
  """
902
  try:
903
  from huggingface_hub import HfApi
 
904
  except ImportError:
905
  return False, "huggingface_hub not installed"
906
 
907
- space_id = os.environ.get("SPACE_ID") # e.g. "username/space-name"
908
- if not space_id:
909
- return False, "Not running on HF Spaces (SPACE_ID not set)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
910
 
911
  result_dir = os.path.join(RESULTS_DIR, f"p_{p:03d}")
912
  if not os.path.isdir(result_dir):
913
  return False, "No results directory found"
914
 
915
  try:
916
- api = HfApi()
 
 
917
  api.upload_folder(
918
  folder_path=result_dir,
919
  path_in_repo=f"precomputed_results/p_{p:03d}",
920
- repo_id=space_id,
921
  repo_type="space",
 
922
  commit_message=f"Add precomputed results for p={p}",
923
  )
924
- return True, f"Committed results for p={p} to {space_id}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
925
  except Exception as e:
926
  logger.warning(f"Failed to commit results for p={p}: {e}")
927
  return False, str(e)
@@ -996,10 +1052,13 @@ def run_pipeline_for_p_streaming(p):
996
 
997
  n_files = len(os.listdir(result_dir)) if os.path.isdir(result_dir) else 0
998
 
999
- # Try to commit results back to the HF repo
1000
  ok_commit, commit_msg = _commit_results_to_repo(p)
1001
  if ok_commit:
1002
- yield f"Results saved to HF repo.", False, False
 
 
 
1003
 
1004
  yield f"\nDone! Generated {n_files} files for p={p}.", False, True
1005
 
 
901
  """
902
  try:
903
  from huggingface_hub import HfApi
904
+ from huggingface_hub.utils import HfHubHTTPError
905
  except ImportError:
906
  return False, "huggingface_hub not installed"
907
 
908
+ repo_id = (
909
+ os.environ.get("HF_SPACE_REPO_ID")
910
+ or os.environ.get("HF_REPO_ID")
911
+ or os.environ.get("SPACE_ID")
912
+ or ""
913
+ ).strip()
914
+ if not repo_id:
915
+ return False, "No target space repo found (set HF_SPACE_REPO_ID or SPACE_ID)"
916
+
917
+ # Accept "spaces/owner/name" and full URL forms, normalize to "owner/name".
918
+ for prefix in ("https://huggingface.co/spaces/", "http://huggingface.co/spaces/"):
919
+ if repo_id.startswith(prefix):
920
+ repo_id = repo_id[len(prefix):]
921
+ if repo_id.startswith("spaces/"):
922
+ repo_id = repo_id[len("spaces/"):]
923
+ repo_id = repo_id.strip("/")
924
+ if repo_id.count("/") != 1:
925
+ return False, (
926
+ f"Invalid space repo id '{repo_id}'. "
927
+ "Expected 'owner/space-name'."
928
+ )
929
+
930
+ token = None
931
+ token_var = None
932
+ for var_name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
933
+ raw = os.environ.get(var_name, "").strip()
934
+ if raw:
935
+ token = raw
936
+ token_var = var_name
937
+ break
938
+ if not token:
939
+ return False, (
940
+ "Missing Hugging Face write token. Add a Space Secret named "
941
+ "HF_TOKEN with write access to this Space repo."
942
+ )
943
 
944
  result_dir = os.path.join(RESULTS_DIR, f"p_{p:03d}")
945
  if not os.path.isdir(result_dir):
946
  return False, "No results directory found"
947
 
948
  try:
949
+ api = HfApi(token=token)
950
+ who = api.whoami(token=token)
951
+ actor = who.get("name", "unknown-user")
952
  api.upload_folder(
953
  folder_path=result_dir,
954
  path_in_repo=f"precomputed_results/p_{p:03d}",
955
+ repo_id=repo_id,
956
  repo_type="space",
957
+ token=token,
958
  commit_message=f"Add precomputed results for p={p}",
959
  )
960
+ return True, (
961
+ f"Committed results for p={p} to {repo_id} "
962
+ f"(auth: {token_var}, user: {actor})"
963
+ )
964
+ except HfHubHTTPError as e:
965
+ status = getattr(getattr(e, "response", None), "status_code", None)
966
+ if status in (401, 403):
967
+ msg = (
968
+ f"HF auth failed ({status}) for {repo_id}. "
969
+ "Set HF_TOKEN Space Secret to a valid WRITE token that can "
970
+ "push to this Space."
971
+ )
972
+ elif status == 404:
973
+ msg = (
974
+ f"Space repo '{repo_id}' not found. "
975
+ "Confirm owner/name and set HF_SPACE_REPO_ID if needed."
976
+ )
977
+ else:
978
+ msg = f"Hugging Face Hub error ({status}): {e}"
979
+ logger.warning(f"Failed to commit results for p={p}: {msg}")
980
+ return False, msg
981
  except Exception as e:
982
  logger.warning(f"Failed to commit results for p={p}: {e}")
983
  return False, str(e)
 
1052
 
1053
  n_files = len(os.listdir(result_dir)) if os.path.isdir(result_dir) else 0
1054
 
1055
+ # Try to commit results back to the HF repo so they persist across restarts
1056
  ok_commit, commit_msg = _commit_results_to_repo(p)
1057
  if ok_commit:
1058
+ yield f"Results saved to HF repo (will persist across restarts).", False, False
1059
+ else:
1060
+ yield (f"Warning: could not save to HF repo: {commit_msg}. "
1061
+ f"Results are available now but will be lost on restart."), False, False
1062
 
1063
  yield f"\nDone! Generated {n_files} files for p={p}.", False, True
1064
 
hf_app/requirements.txt CHANGED
@@ -7,3 +7,5 @@ Pillow>=9.0
7
  plotly>=5.0
8
  einops>=0.6
9
  scipy>=1.10
 
 
 
7
  plotly>=5.0
8
  einops>=0.6
9
  scipy>=1.10
10
+ huggingface_hub>=0.24
11
+ hf_xet>=1.1.0
precompute/train_all.py CHANGED
@@ -22,6 +22,7 @@ import argparse
22
  import json
23
  import os
24
  import sys
 
25
 
26
  # Add src to path
27
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
@@ -165,8 +166,12 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
165
  config = Config(config_dict)
166
  trainer = Trainer(config=config, use_wandb=False)
167
 
168
- # Progress logging interval: print ~20 updates during training
169
- log_interval = max(1, num_epochs // 20)
 
 
 
 
170
 
171
  # Override save directory so checkpoints go into our output structure
172
  trainer.save_dir = output_dir
@@ -192,7 +197,12 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
192
  train_loss, test_loss = trainer.do_a_training_step(epoch)
193
 
194
  # Progress logging
195
- if epoch % log_interval == 0 or epoch == config.num_epochs - 1:
 
 
 
 
 
196
  pct = 100 * (epoch + 1) / config.num_epochs
197
  train_acc = trainer.train_accs[-1] if trainer.train_accs else 0
198
  test_acc = trainer.test_accs[-1] if trainer.test_accs else 0
@@ -203,6 +213,7 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
203
  f" train_acc={train_acc:.4f}"
204
  f" test_acc={test_acc:.4f}",
205
  flush=True)
 
206
 
207
  if test_loss.item() < config.stopping_thresh:
208
  print(f" Early stopping at epoch {epoch}: "
 
22
  import json
23
  import os
24
  import sys
25
+ import time
26
 
27
  # Add src to path
28
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
 
166
  config = Config(config_dict)
167
  trainer = Trainer(config=config, use_wandb=False)
168
 
169
+ # Progress logging:
170
+ # - keep epoch-based logs reasonably frequent
171
+ # - also enforce a wall-clock heartbeat so streaming UIs stay active
172
+ log_interval = min(max(1, num_epochs // 20), 100)
173
+ max_silence_sec = 20
174
+ last_log_time = time.time()
175
 
176
  # Override save directory so checkpoints go into our output structure
177
  trainer.save_dir = output_dir
 
197
  train_loss, test_loss = trainer.do_a_training_step(epoch)
198
 
199
  # Progress logging
200
+ now = time.time()
201
+ if (
202
+ epoch % log_interval == 0
203
+ or epoch == config.num_epochs - 1
204
+ or (now - last_log_time) >= max_silence_sec
205
+ ):
206
  pct = 100 * (epoch + 1) / config.num_epochs
207
  train_acc = trainer.train_accs[-1] if trainer.train_accs else 0
208
  test_acc = trainer.test_accs[-1] if trainer.test_accs else 0
 
213
  f" train_acc={train_acc:.4f}"
214
  f" test_acc={test_acc:.4f}",
215
  flush=True)
216
+ last_log_time = now
217
 
218
  if test_loss.item() < config.stopping_thresh:
219
  print(f" Early stopping at epoch {epoch}: "