# DAHS/scripts/hf_runner.py
# Last change: "Update scripts/hf_runner.py" by Vittal-M (commit 70e82f1, verified)
import os
import sys
import subprocess
import multiprocessing
import threading
import http.server
import socketserver
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi, login
# 1. Configuration
# Both values are read from the process environment and are None when unset.
# On Hugging Face, define them under Space Settings -> Variables and secrets.
HF_TOKEN = os.getenv("HF_TOKEN")
REPO_ID = os.getenv("REPO_ID")  # e.g., "Vittal-M/DAHS-Models"
def upload_artifacts(api: HfApi) -> None:
    """Push the data/, models/ and results/ directories to the REPO_ID model repo.

    Each directory is handled independently: a path that does not exist is
    reported and skipped, and any exception raised while uploading one
    directory is printed without stopping the remaining uploads. The function
    itself therefore does not propagate upload errors.

    Args:
        api: Client whose ``upload_folder`` method performs the upload.
    """
    print(f"Uploading artifacts to {REPO_ID}...")

    for artifact_dir in ("data", "models", "results"):
        if os.path.exists(artifact_dir):
            # Mirror the local directory name inside the repo.
            upload_kwargs = {
                "folder_path": artifact_dir,
                "repo_id": REPO_ID,
                "repo_type": "model",
                "path_in_repo": artifact_dir,
            }
            try:
                api.upload_folder(**upload_kwargs)
                print(f"[SUCCESS] Uploaded {artifact_dir}/")
            except Exception as e:
                # Best-effort: report and move on to the next directory.
                print(f"[ERROR] Failed to upload {artifact_dir}/: {e}")
        else:
            print(f"[SKIP] {artifact_dir}/ does not exist")

    print("\n[DONE] Upload pass complete.")
class _HealthCheckHandler(http.server.BaseHTTPRequestHandler):
    """Answer every GET/HEAD request with a tiny ``200 OK`` and nothing else.

    ``SimpleHTTPRequestHandler`` would serve the files of the current working
    directory (source, data, models, results) to whoever can reach the port.
    The health check only needs *a* successful response, so no file content
    is ever exposed here.
    """

    _BODY = b"OK\n"

    def _send_ok(self, include_body):
        """Send the fixed 200 response; the body is omitted for HEAD."""
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(self._BODY)))
        self.end_headers()
        if include_body:
            self.wfile.write(self._BODY)

    def do_GET(self):
        """Reply 200 with a short plain-text body, whatever the path."""
        self._send_ok(include_body=True)

    def do_HEAD(self):
        """Reply 200 with headers only, whatever the path."""
        self._send_ok(include_body=False)

    def log_message(self, format, *args):
        """Drop per-request access logs so they do not bury pipeline output."""


def _start_health_server(port: int = 7860) -> "socketserver.TCPServer | None":
    """Bind the health-check server and serve it from a daemon thread.

    The socket is bound in the calling thread so that a bind failure is
    reported immediately instead of dying silently inside the thread.

    Args:
        port: TCP port to listen on (all interfaces).

    Returns:
        The running server, or ``None`` if the port could not be bound.
    """
    try:
        httpd = socketserver.TCPServer(("", port), _HealthCheckHandler)
    except OSError as exc:
        print(f"[WARN] Could not start health-check server on port {port}: {exc}")
        return None
    # Daemon thread: it must never keep the process alive after main() ends.
    threading.Thread(target=httpd.serve_forever, daemon=True).start()
    return httpd


def _pipeline_command(workers: str, scenarios: str = "2000", eval_seeds: str = "500") -> list:
    """Build the argv list that launches ``scripts/run_pipeline.py``.

    ``sys.executable`` is used so the pipeline runs under the same interpreter
    (and therefore the same installed packages) as this runner, rather than
    whatever ``python`` happens to resolve to on PATH.
    """
    return [
        sys.executable or "python",
        "scripts/run_pipeline.py",
        "--scenarios", scenarios,
        "--eval-seeds", eval_seeds,
        "--workers", workers,
    ]


def _utc_timestamp() -> str:
    """Return the current UTC time as naive ISO-8601 text with a ``Z`` suffix."""
    from datetime import timezone  # local import: only ``datetime`` is imported at file level

    # ``datetime.utcnow()`` is deprecated; take an aware UTC time and drop the
    # tzinfo so the text stays ``...T...Z`` instead of ``...+00:00Z``.
    now = datetime.now(timezone.utc).replace(tzinfo=None)
    return f"{now.isoformat()}Z"


def main():
    """Run the DAHS pipeline on a Hugging Face Space and upload its artifacts.

    Steps, in order:
      1. Abort with exit code 1 if HF_TOKEN or REPO_ID is unset.
      2. Log in and make sure the REPO_ID model repo exists; abort with exit
         code 1 if it cannot be created or accessed, before any compute is
         spent.
      3. Start a minimal HTTP server on port 7860 to satisfy health checks.
      4. Run ``scripts/run_pipeline.py`` as a subprocess and record the
         outcome in ``results/run_status.txt``.
      5. Upload artifacts regardless of the pipeline's exit code.
      6. Exit with code 1 if the pipeline failed; otherwise try to pause the
         Space (SPACE_ID, falling back to REPO_ID).

    Raises:
        SystemExit: with code 1 on missing configuration, repository access
            failure, or a non-zero pipeline exit code.
    """
    print("--- DAHS HF RUNNER STARTING ---")

    if not HF_TOKEN or not REPO_ID:
        print("[FATAL ERROR] HF_TOKEN and REPO_ID environment variables are missing!")
        print("Please go to Space Settings -> Variables and secrets, and add:")
        print("1. HF_TOKEN (Must be a Fine-grained token with 'Write' access to models)")
        print("2. REPO_ID (The exact name of the dataset/model repo, e.g., Vittal-M/DAHS-Models)")
        sys.exit(1)

    print("Logging into Hugging Face...")
    login(token=HF_TOKEN)
    api = HfApi()

    # Fail fast: if the repo can't be created or accessed, the later upload
    # cannot succeed either, so do not start the long training run at all.
    try:
        api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
        print(f"[SUCCESS] Repository {REPO_ID} is accessible and ready.")
    except Exception as e:
        print(f"[FATAL ERROR] Failed to create or access the repository {REPO_ID}.")
        print(f"Reason: {e}")
        print("ABORTING: We will not start the training to prevent wasting your time/credits.")
        sys.exit(1)

    # Keep the Hugging Face health check satisfied while the pipeline runs.
    if _start_health_server(7860) is not None:
        print("Started dummy web server on port 7860 to bypass health check timeouts.")

    # 2. Run the heavy pipeline
    # Sized for Q1 results within ~12h compute budget on HF:
    #   2000 scenarios -> ~120k selector training rows
    #   500 eval seeds -> 4500 sims, plenty for Friedman/Nemenyi/Wilcoxon
    cores = "8"
    print(f"\n--- STARTING DAHS PIPELINE (2000 Scenarios, 500 Eval Seeds, {cores} Workers) ---")
    result = subprocess.run(_pipeline_command(cores), check=False)

    # Record the outcome next to the results so it is uploaded with them.
    status = "SUCCESS" if result.returncode == 0 else f"FAILED (exit {result.returncode})"
    results_dir = Path("results")
    results_dir.mkdir(exist_ok=True)
    (results_dir / "run_status.txt").write_text(
        f"{status}\n{_utc_timestamp()}\n", encoding="utf-8"
    )

    if result.returncode == 0:
        print("--- PIPELINE FINISHED SUCCESSFULLY ---\n")
    else:
        print(f"\n[ERROR] Pipeline exited with code {result.returncode}. Uploading partial artifacts anyway.\n")

    # 3. Upload trained artifacts (always — even on partial failure)
    upload_artifacts(api)

    if result.returncode != 0:
        sys.exit(1)

    # 4. Pause the Space to save credits (only reached after a successful run)
    try:
        print("Pausing the Space to stop billing...")
        api.pause_space(repo_id=os.environ.get("SPACE_ID", REPO_ID))
    except Exception as e:
        print(f"Failed to pause space automatically: {e}")
        print("IMPORTANT: Please go to the Space Settings and pause it manually!")
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()