zhuoranyang committed on
Commit
144d5cc
·
1 Parent(s): 2774c42

Improve HF result commit auth and increase training log heartbeat

Browse files
hf_app/app.py CHANGED
@@ -901,27 +901,83 @@ def _commit_results_to_repo(p):
901
  """
902
  try:
903
  from huggingface_hub import HfApi
 
904
  except ImportError:
905
  return False, "huggingface_hub not installed"
906
 
907
- space_id = os.environ.get("SPACE_ID") # e.g. "username/space-name"
908
- if not space_id:
909
- return False, "Not running on HF Spaces (SPACE_ID not set)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
910
 
911
  result_dir = os.path.join(RESULTS_DIR, f"p_{p:03d}")
912
  if not os.path.isdir(result_dir):
913
  return False, "No results directory found"
914
 
915
  try:
916
- api = HfApi()
 
 
917
  api.upload_folder(
918
  folder_path=result_dir,
919
  path_in_repo=f"precomputed_results/p_{p:03d}",
920
- repo_id=space_id,
921
  repo_type="space",
 
922
  commit_message=f"Add precomputed results for p={p}",
923
  )
924
- return True, f"Committed results for p={p} to {space_id}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
925
  except Exception as e:
926
  logger.warning(f"Failed to commit results for p={p}: {e}")
927
  return False, str(e)
@@ -996,10 +1052,13 @@ def run_pipeline_for_p_streaming(p):
996
 
997
  n_files = len(os.listdir(result_dir)) if os.path.isdir(result_dir) else 0
998
 
999
- # Try to commit results back to the HF repo
1000
  ok_commit, commit_msg = _commit_results_to_repo(p)
1001
  if ok_commit:
1002
- yield f"Results saved to HF repo.", False, False
 
 
 
1003
 
1004
  yield f"\nDone! Generated {n_files} files for p={p}.", False, True
1005
 
 
901
  """
902
  try:
903
  from huggingface_hub import HfApi
904
+ from huggingface_hub.utils import HfHubHTTPError
905
  except ImportError:
906
  return False, "huggingface_hub not installed"
907
 
908
+ repo_id = (
909
+ os.environ.get("HF_SPACE_REPO_ID")
910
+ or os.environ.get("HF_REPO_ID")
911
+ or os.environ.get("SPACE_ID")
912
+ or ""
913
+ ).strip()
914
+ if not repo_id:
915
+ return False, "No target space repo found (set HF_SPACE_REPO_ID or SPACE_ID)"
916
+
917
+ # Accept "spaces/owner/name" and full URL forms, normalize to "owner/name".
918
+ for prefix in ("https://huggingface.co/spaces/", "http://huggingface.co/spaces/"):
919
+ if repo_id.startswith(prefix):
920
+ repo_id = repo_id[len(prefix):]
921
+ if repo_id.startswith("spaces/"):
922
+ repo_id = repo_id[len("spaces/"):]
923
+ repo_id = repo_id.strip("/")
924
+ if repo_id.count("/") != 1:
925
+ return False, (
926
+ f"Invalid space repo id '{repo_id}'. "
927
+ "Expected 'owner/space-name'."
928
+ )
929
+
930
+ token = None
931
+ token_var = None
932
+ for var_name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
933
+ raw = os.environ.get(var_name, "").strip()
934
+ if raw:
935
+ token = raw
936
+ token_var = var_name
937
+ break
938
+ if not token:
939
+ return False, (
940
+ "Missing Hugging Face write token. Add a Space Secret named "
941
+ "HF_TOKEN with write access to this Space repo."
942
+ )
943
 
944
  result_dir = os.path.join(RESULTS_DIR, f"p_{p:03d}")
945
  if not os.path.isdir(result_dir):
946
  return False, "No results directory found"
947
 
948
  try:
949
+ api = HfApi(token=token)
950
+ who = api.whoami(token=token)
951
+ actor = who.get("name", "unknown-user")
952
  api.upload_folder(
953
  folder_path=result_dir,
954
  path_in_repo=f"precomputed_results/p_{p:03d}",
955
+ repo_id=repo_id,
956
  repo_type="space",
957
+ token=token,
958
  commit_message=f"Add precomputed results for p={p}",
959
  )
960
+ return True, (
961
+ f"Committed results for p={p} to {repo_id} "
962
+ f"(auth: {token_var}, user: {actor})"
963
+ )
964
+ except HfHubHTTPError as e:
965
+ status = getattr(getattr(e, "response", None), "status_code", None)
966
+ if status in (401, 403):
967
+ msg = (
968
+ f"HF auth failed ({status}) for {repo_id}. "
969
+ "Set HF_TOKEN Space Secret to a valid WRITE token that can "
970
+ "push to this Space."
971
+ )
972
+ elif status == 404:
973
+ msg = (
974
+ f"Space repo '{repo_id}' not found. "
975
+ "Confirm owner/name and set HF_SPACE_REPO_ID if needed."
976
+ )
977
+ else:
978
+ msg = f"Hugging Face Hub error ({status}): {e}"
979
+ logger.warning(f"Failed to commit results for p={p}: {msg}")
980
+ return False, msg
981
  except Exception as e:
982
  logger.warning(f"Failed to commit results for p={p}: {e}")
983
  return False, str(e)
 
1052
 
1053
  n_files = len(os.listdir(result_dir)) if os.path.isdir(result_dir) else 0
1054
 
1055
+ # Try to commit results back to the HF repo so they persist across restarts
1056
  ok_commit, commit_msg = _commit_results_to_repo(p)
1057
  if ok_commit:
1058
+ yield f"Results saved to HF repo (will persist across restarts).", False, False
1059
+ else:
1060
+ yield (f"Warning: could not save to HF repo: {commit_msg}. "
1061
+ f"Results are available now but will be lost on restart."), False, False
1062
 
1063
  yield f"\nDone! Generated {n_files} files for p={p}.", False, True
1064
 
hf_app/requirements.txt CHANGED
@@ -7,3 +7,5 @@ Pillow>=9.0
7
  plotly>=5.0
8
  einops>=0.6
9
  scipy>=1.10
 
 
 
7
  plotly>=5.0
8
  einops>=0.6
9
  scipy>=1.10
10
+ huggingface_hub>=0.24
11
+ hf_xet>=1.1.0
precompute/train_all.py CHANGED
@@ -22,6 +22,7 @@ import argparse
22
  import json
23
  import os
24
  import sys
 
25
 
26
  # Add src to path
27
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
@@ -165,8 +166,12 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
165
  config = Config(config_dict)
166
  trainer = Trainer(config=config, use_wandb=False)
167
 
168
- # Progress logging interval: print ~20 updates during training
169
- log_interval = max(1, num_epochs // 20)
 
 
 
 
170
 
171
  # Override save directory so checkpoints go into our output structure
172
  trainer.save_dir = output_dir
@@ -192,7 +197,12 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
192
  train_loss, test_loss = trainer.do_a_training_step(epoch)
193
 
194
  # Progress logging
195
- if epoch % log_interval == 0 or epoch == config.num_epochs - 1:
 
 
 
 
 
196
  pct = 100 * (epoch + 1) / config.num_epochs
197
  train_acc = trainer.train_accs[-1] if trainer.train_accs else 0
198
  test_acc = trainer.test_accs[-1] if trainer.test_accs else 0
@@ -203,6 +213,7 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
203
  f" train_acc={train_acc:.4f}"
204
  f" test_acc={test_acc:.4f}",
205
  flush=True)
 
206
 
207
  if test_loss.item() < config.stopping_thresh:
208
  print(f" Early stopping at epoch {epoch}: "
 
22
  import json
23
  import os
24
  import sys
25
+ import time
26
 
27
  # Add src to path
28
  sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
 
166
  config = Config(config_dict)
167
  trainer = Trainer(config=config, use_wandb=False)
168
 
169
+ # Progress logging:
170
+ # - keep epoch-based logs reasonably frequent
171
+ # - also enforce a wall-clock heartbeat so streaming UIs stay active
172
+ log_interval = min(max(1, num_epochs // 20), 100)
173
+ max_silence_sec = 20
174
+ last_log_time = time.time()
175
 
176
  # Override save directory so checkpoints go into our output structure
177
  trainer.save_dir = output_dir
 
197
  train_loss, test_loss = trainer.do_a_training_step(epoch)
198
 
199
  # Progress logging
200
+ now = time.time()
201
+ if (
202
+ epoch % log_interval == 0
203
+ or epoch == config.num_epochs - 1
204
+ or (now - last_log_time) >= max_silence_sec
205
+ ):
206
  pct = 100 * (epoch + 1) / config.num_epochs
207
  train_acc = trainer.train_accs[-1] if trainer.train_accs else 0
208
  test_acc = trainer.test_accs[-1] if trainer.test_accs else 0
 
213
  f" train_acc={train_acc:.4f}"
214
  f" test_acc={test_acc:.4f}",
215
  flush=True)
216
+ last_log_time = now
217
 
218
  if test_loss.item() < config.stopping_thresh:
219
  print(f" Early stopping at epoch {epoch}: "