import os
import sys
import subprocess
import multiprocessing
import threading
import http.server
import socketserver
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi, login
# 1. Configuration
# Both values are read once, when this module is loaded, from the process
# environment. On Hugging Face you must set them in
# Space Settings -> Variables and secrets.
# os.environ.get() yields None for a variable that is not set; main() checks
# both and exits with an explanatory message if either is missing or empty.
# HF_TOKEN is the token handed to login(); REPO_ID is the repository that
# main() creates/verifies and that upload_artifacts() uploads into.
HF_TOKEN = os.environ.get("HF_TOKEN")
REPO_ID = os.environ.get("REPO_ID") # e.g., "Vittal-M/DAHS-Models"
def upload_artifacts(api: HfApi) -> None:
    """Push the local artifact directories to the Hub repository REPO_ID.

    The directories ``data``, ``models`` and ``results`` (relative to the
    current working directory) are handled one after another, in that order.
    Each one that exists is uploaded with ``api.upload_folder`` as a
    ``"model"``-type repo upload, under the same path inside the repository.

    The pass is best-effort: a missing directory is reported and skipped, and
    any ``Exception`` raised while uploading one directory is printed and does
    not prevent the remaining directories from being attempted.

    Args:
        api: Client object whose ``upload_folder`` method performs the upload.
    """
    print(f"Uploading artifacts to {REPO_ID}...")

    artifact_dirs = ("data", "models", "results")
    for dirname in artifact_dirs:
        if os.path.exists(dirname):
            try:
                api.upload_folder(
                    folder_path=dirname,
                    repo_id=REPO_ID,
                    repo_type="model",
                    path_in_repo=dirname,
                )
                print(f"[SUCCESS] Uploaded {dirname}/")
            except Exception as err:
                # Deliberately broad: one failed folder must not stop the rest.
                print(f"[ERROR] Failed to upload {dirname}/: {err}")
        else:
            print(f"[SKIP] {dirname}/ does not exist")

    print("\n[DONE] Upload pass complete.")
def _start_health_server(port: int = 7860) -> None:
    """Answer every GET/HEAD on *port* with a constant ``200 OK``.

    The server runs in a daemon thread so it never keeps the process alive
    once ``main`` returns or exits. It exists only so that something is
    listening on the port while the long-running pipeline executes.

    A purpose-built handler is used instead of
    ``http.server.SimpleHTTPRequestHandler`` because the latter serves the
    contents of the current working directory (source code, data/, models/,
    results/) to anyone who can reach the port.
    """

    class _HealthHandler(http.server.BaseHTTPRequestHandler):
        """Minimal handler: fixed plain-text body, no filesystem access."""

        def _reply(self, include_body: bool) -> None:
            body = b"OK\n"
            self.send_response(200)
            self.send_header("Content-Type", "text/plain; charset=utf-8")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            if include_body:
                self.wfile.write(body)

        def do_GET(self) -> None:  # noqa: N802 - name mandated by http.server
            self._reply(include_body=True)

        def do_HEAD(self) -> None:  # noqa: N802 - name mandated by http.server
            self._reply(include_body=False)

    def _serve() -> None:
        with socketserver.TCPServer(("", port), _HealthHandler) as httpd:
            httpd.serve_forever()

    threading.Thread(target=_serve, daemon=True).start()


def main():
    """Run the DAHS pipeline on a Hugging Face Space and publish its outputs.

    Steps, in order:

    1. Verify that HF_TOKEN and REPO_ID are set; exit with status 1 if not.
    2. Log in and make sure the target repository exists / is reachable,
       exiting with status 1 on any error so no compute is spent on a run
       whose results could not be uploaded.
    3. Start a tiny HTTP server on port 7860 for the platform health check.
    4. Run ``scripts/run_pipeline.py`` as a subprocess and wait for it.
    5. Record the outcome and a UTC timestamp in ``results/run_status.txt``.
    6. Upload artifacts via ``upload_artifacts`` regardless of the outcome.
    7. Exit with status 1 if the pipeline failed; otherwise try to pause the
       Space, printing manual instructions if that request fails.

    Raises:
        SystemExit: with code 1 on missing configuration, an inaccessible
            repository, or a non-zero pipeline exit code.
    """
    # Local import: the file header only imports ``datetime`` from this module.
    from datetime import timezone

    print("--- DAHS HF RUNNER STARTING ---")
    if not HF_TOKEN or not REPO_ID:
        print("[FATAL ERROR] HF_TOKEN and REPO_ID environment variables are missing!")
        print("Please go to Space Settings -> Variables and secrets, and add:")
        print("1. HF_TOKEN (Must be a Fine-grained token with 'Write' access to models)")
        print("2. REPO_ID (The exact name of the dataset/model repo, e.g., Vittal-M/DAHS-Models)")
        sys.exit(1)

    print("Logging into Hugging Face...")
    login(token=HF_TOKEN)
    api = HfApi()

    # Fail fast if the repo can't be created or accessed: discovering this
    # only after the pipeline has finished would waste the whole run.
    try:
        api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
        print(f"[SUCCESS] Repository {REPO_ID} is accessible and ready.")
    except Exception as e:
        print(f"[FATAL ERROR] Failed to create or access the repository {REPO_ID}.")
        print(f"Reason: {e}")
        print("ABORTING: We will not start the training to prevent wasting your time/credits.")
        sys.exit(1)

    # Keep the platform health check satisfied while the pipeline runs.
    _start_health_server(7860)
    print("Started dummy web server on port 7860 to bypass health check timeouts.")

    # 2. Run the heavy pipeline
    # Sized for Q1 results within ~12h compute budget on HF:
    # 2000 scenarios -> ~120k selector training rows
    # 500 eval seeds -> 4500 sims, plenty for Friedman/Nemenyi/Wilcoxon
    cores = "8"
    print(f"\n--- STARTING DAHS PIPELINE (2000 Scenarios, 500 Eval Seeds, {cores} Workers) ---")
    result = subprocess.run([
        # Re-use the interpreter running this script so the pipeline sees the
        # same installed packages; fall back to PATH lookup only if unknown.
        sys.executable or "python", "scripts/run_pipeline.py",
        "--scenarios", "2000",
        "--eval-seeds", "500",
        "--workers", cores,
    ])

    status = "SUCCESS" if result.returncode == 0 else f"FAILED (exit {result.returncode})"
    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the text stays "<naive ISO timestamp>Z" exactly as before.
    stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    Path("results").mkdir(exist_ok=True)
    (Path("results") / "run_status.txt").write_text(
        f"{status}\n{stamp}Z\n",
        encoding="utf-8",
    )

    if result.returncode == 0:
        print("--- PIPELINE FINISHED SUCCESSFULLY ---\n")
    else:
        print(f"\n[ERROR] Pipeline exited with code {result.returncode}. Uploading partial artifacts anyway.\n")

    # 3. Upload trained artifacts (always — even on partial failure)
    upload_artifacts(api)

    if result.returncode != 0:
        # NOTE(review): on failure the process exits before the pause request
        # below; kept as in the original ordering — confirm this is intended.
        sys.exit(1)

    # 4. Pause the Space to save credits
    try:
        print("Pausing the Space to stop billing...")
        api.pause_space(repo_id=os.environ.get("SPACE_ID", REPO_ID))
    except Exception as e:
        print(f"Failed to pause space automatically: {e}")
        print("IMPORTANT: Please go to the Space Settings and pause it manually!")
# Script entry point: run the full workflow only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|