Commit ·
144d5cc
1
Parent(s): 2774c42
Improve HF result commit auth and increase training log heartbeat
Browse files
- hf_app/app.py +67 -8
- hf_app/requirements.txt +2 -0
- precompute/train_all.py +14 -3
hf_app/app.py
CHANGED
|
@@ -901,27 +901,83 @@ def _commit_results_to_repo(p):
|
|
| 901 |
"""
|
| 902 |
try:
|
| 903 |
from huggingface_hub import HfApi
|
|
|
|
| 904 |
except ImportError:
|
| 905 |
return False, "huggingface_hub not installed"
|
| 906 |
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 910 |
|
| 911 |
result_dir = os.path.join(RESULTS_DIR, f"p_{p:03d}")
|
| 912 |
if not os.path.isdir(result_dir):
|
| 913 |
return False, "No results directory found"
|
| 914 |
|
| 915 |
try:
|
| 916 |
-
api = HfApi()
|
|
|
|
|
|
|
| 917 |
api.upload_folder(
|
| 918 |
folder_path=result_dir,
|
| 919 |
path_in_repo=f"precomputed_results/p_{p:03d}",
|
| 920 |
-
repo_id=
|
| 921 |
repo_type="space",
|
|
|
|
| 922 |
commit_message=f"Add precomputed results for p={p}",
|
| 923 |
)
|
| 924 |
-
return True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
except Exception as e:
|
| 926 |
logger.warning(f"Failed to commit results for p={p}: {e}")
|
| 927 |
return False, str(e)
|
|
@@ -996,10 +1052,13 @@ def run_pipeline_for_p_streaming(p):
|
|
| 996 |
|
| 997 |
n_files = len(os.listdir(result_dir)) if os.path.isdir(result_dir) else 0
|
| 998 |
|
| 999 |
-
# Try to commit results back to the HF repo
|
| 1000 |
ok_commit, commit_msg = _commit_results_to_repo(p)
|
| 1001 |
if ok_commit:
|
| 1002 |
-
yield f"Results saved to HF repo.", False, False
|
|
|
|
|
|
|
|
|
|
| 1003 |
|
| 1004 |
yield f"\nDone! Generated {n_files} files for p={p}.", False, True
|
| 1005 |
|
|
|
|
| 901 |
"""
|
| 902 |
try:
|
| 903 |
from huggingface_hub import HfApi
|
| 904 |
+
from huggingface_hub.utils import HfHubHTTPError
|
| 905 |
except ImportError:
|
| 906 |
return False, "huggingface_hub not installed"
|
| 907 |
|
| 908 |
+
repo_id = (
|
| 909 |
+
os.environ.get("HF_SPACE_REPO_ID")
|
| 910 |
+
or os.environ.get("HF_REPO_ID")
|
| 911 |
+
or os.environ.get("SPACE_ID")
|
| 912 |
+
or ""
|
| 913 |
+
).strip()
|
| 914 |
+
if not repo_id:
|
| 915 |
+
return False, "No target space repo found (set HF_SPACE_REPO_ID or SPACE_ID)"
|
| 916 |
+
|
| 917 |
+
# Accept "spaces/owner/name" and full URL forms, normalize to "owner/name".
|
| 918 |
+
for prefix in ("https://huggingface.co/spaces/", "http://huggingface.co/spaces/"):
|
| 919 |
+
if repo_id.startswith(prefix):
|
| 920 |
+
repo_id = repo_id[len(prefix):]
|
| 921 |
+
if repo_id.startswith("spaces/"):
|
| 922 |
+
repo_id = repo_id[len("spaces/"):]
|
| 923 |
+
repo_id = repo_id.strip("/")
|
| 924 |
+
if repo_id.count("/") != 1:
|
| 925 |
+
return False, (
|
| 926 |
+
f"Invalid space repo id '{repo_id}'. "
|
| 927 |
+
"Expected 'owner/space-name'."
|
| 928 |
+
)
|
| 929 |
+
|
| 930 |
+
token = None
|
| 931 |
+
token_var = None
|
| 932 |
+
for var_name in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
|
| 933 |
+
raw = os.environ.get(var_name, "").strip()
|
| 934 |
+
if raw:
|
| 935 |
+
token = raw
|
| 936 |
+
token_var = var_name
|
| 937 |
+
break
|
| 938 |
+
if not token:
|
| 939 |
+
return False, (
|
| 940 |
+
"Missing Hugging Face write token. Add a Space Secret named "
|
| 941 |
+
"HF_TOKEN with write access to this Space repo."
|
| 942 |
+
)
|
| 943 |
|
| 944 |
result_dir = os.path.join(RESULTS_DIR, f"p_{p:03d}")
|
| 945 |
if not os.path.isdir(result_dir):
|
| 946 |
return False, "No results directory found"
|
| 947 |
|
| 948 |
try:
|
| 949 |
+
api = HfApi(token=token)
|
| 950 |
+
who = api.whoami(token=token)
|
| 951 |
+
actor = who.get("name", "unknown-user")
|
| 952 |
api.upload_folder(
|
| 953 |
folder_path=result_dir,
|
| 954 |
path_in_repo=f"precomputed_results/p_{p:03d}",
|
| 955 |
+
repo_id=repo_id,
|
| 956 |
repo_type="space",
|
| 957 |
+
token=token,
|
| 958 |
commit_message=f"Add precomputed results for p={p}",
|
| 959 |
)
|
| 960 |
+
return True, (
|
| 961 |
+
f"Committed results for p={p} to {repo_id} "
|
| 962 |
+
f"(auth: {token_var}, user: {actor})"
|
| 963 |
+
)
|
| 964 |
+
except HfHubHTTPError as e:
|
| 965 |
+
status = getattr(getattr(e, "response", None), "status_code", None)
|
| 966 |
+
if status in (401, 403):
|
| 967 |
+
msg = (
|
| 968 |
+
f"HF auth failed ({status}) for {repo_id}. "
|
| 969 |
+
"Set HF_TOKEN Space Secret to a valid WRITE token that can "
|
| 970 |
+
"push to this Space."
|
| 971 |
+
)
|
| 972 |
+
elif status == 404:
|
| 973 |
+
msg = (
|
| 974 |
+
f"Space repo '{repo_id}' not found. "
|
| 975 |
+
"Confirm owner/name and set HF_SPACE_REPO_ID if needed."
|
| 976 |
+
)
|
| 977 |
+
else:
|
| 978 |
+
msg = f"Hugging Face Hub error ({status}): {e}"
|
| 979 |
+
logger.warning(f"Failed to commit results for p={p}: {msg}")
|
| 980 |
+
return False, msg
|
| 981 |
except Exception as e:
|
| 982 |
logger.warning(f"Failed to commit results for p={p}: {e}")
|
| 983 |
return False, str(e)
|
|
|
|
| 1052 |
|
| 1053 |
n_files = len(os.listdir(result_dir)) if os.path.isdir(result_dir) else 0
|
| 1054 |
|
| 1055 |
+
# Try to commit results back to the HF repo so they persist across restarts
|
| 1056 |
ok_commit, commit_msg = _commit_results_to_repo(p)
|
| 1057 |
if ok_commit:
|
| 1058 |
+
yield f"Results saved to HF repo (will persist across restarts).", False, False
|
| 1059 |
+
else:
|
| 1060 |
+
yield (f"Warning: could not save to HF repo: {commit_msg}. "
|
| 1061 |
+
f"Results are available now but will be lost on restart."), False, False
|
| 1062 |
|
| 1063 |
yield f"\nDone! Generated {n_files} files for p={p}.", False, True
|
| 1064 |
|
hf_app/requirements.txt
CHANGED
|
@@ -7,3 +7,5 @@ Pillow>=9.0
|
|
| 7 |
plotly>=5.0
|
| 8 |
einops>=0.6
|
| 9 |
scipy>=1.10
|
|
|
|
|
|
|
|
|
| 7 |
plotly>=5.0
|
| 8 |
einops>=0.6
|
| 9 |
scipy>=1.10
|
| 10 |
+
huggingface_hub>=0.24
|
| 11 |
+
hf_xet>=1.1.0
|
precompute/train_all.py
CHANGED
|
@@ -22,6 +22,7 @@ import argparse
|
|
| 22 |
import json
|
| 23 |
import os
|
| 24 |
import sys
|
|
|
|
| 25 |
|
| 26 |
# Add src to path
|
| 27 |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
|
@@ -165,8 +166,12 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
|
|
| 165 |
config = Config(config_dict)
|
| 166 |
trainer = Trainer(config=config, use_wandb=False)
|
| 167 |
|
| 168 |
-
# Progress logging
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
# Override save directory so checkpoints go into our output structure
|
| 172 |
trainer.save_dir = output_dir
|
|
@@ -192,7 +197,12 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
|
|
| 192 |
train_loss, test_loss = trainer.do_a_training_step(epoch)
|
| 193 |
|
| 194 |
# Progress logging
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
pct = 100 * (epoch + 1) / config.num_epochs
|
| 197 |
train_acc = trainer.train_accs[-1] if trainer.train_accs else 0
|
| 198 |
test_acc = trainer.test_accs[-1] if trainer.test_accs else 0
|
|
@@ -203,6 +213,7 @@ def run_training(p, run_name, output_base, d_mlp_override=None):
|
|
| 203 |
f" train_acc={train_acc:.4f}"
|
| 204 |
f" test_acc={test_acc:.4f}",
|
| 205 |
flush=True)
|
|
|
|
| 206 |
|
| 207 |
if test_loss.item() < config.stopping_thresh:
|
| 208 |
print(f" Early stopping at epoch {epoch}: "
|
|
|
|
| 22 |
import json
|
| 23 |
import os
|
| 24 |
import sys
|
| 25 |
+
import time
|
| 26 |
|
| 27 |
# Add src to path
|
| 28 |
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
|
|
|
| 166 |
config = Config(config_dict)
|
| 167 |
trainer = Trainer(config=config, use_wandb=False)
|
| 168 |
|
| 169 |
+
# Progress logging:
|
| 170 |
+
# - keep epoch-based logs reasonably frequent
|
| 171 |
+
# - also enforce a wall-clock heartbeat so streaming UIs stay active
|
| 172 |
+
log_interval = min(max(1, num_epochs // 20), 100)
|
| 173 |
+
max_silence_sec = 20
|
| 174 |
+
last_log_time = time.time()
|
| 175 |
|
| 176 |
# Override save directory so checkpoints go into our output structure
|
| 177 |
trainer.save_dir = output_dir
|
|
|
|
| 197 |
train_loss, test_loss = trainer.do_a_training_step(epoch)
|
| 198 |
|
| 199 |
# Progress logging
|
| 200 |
+
now = time.time()
|
| 201 |
+
if (
|
| 202 |
+
epoch % log_interval == 0
|
| 203 |
+
or epoch == config.num_epochs - 1
|
| 204 |
+
or (now - last_log_time) >= max_silence_sec
|
| 205 |
+
):
|
| 206 |
pct = 100 * (epoch + 1) / config.num_epochs
|
| 207 |
train_acc = trainer.train_accs[-1] if trainer.train_accs else 0
|
| 208 |
test_acc = trainer.test_accs[-1] if trainer.test_accs else 0
|
|
|
|
| 213 |
f" train_acc={train_acc:.4f}"
|
| 214 |
f" test_acc={test_acc:.4f}",
|
| 215 |
flush=True)
|
| 216 |
+
last_log_time = now
|
| 217 |
|
| 218 |
if test_loss.item() < config.stopping_thresh:
|
| 219 |
print(f" Early stopping at epoch {epoch}: "
|