Update scripts/hf_runner.py
Browse files- scripts/hf_runner.py +72 -43
scripts/hf_runner.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
import subprocess
|
| 3 |
import multiprocessing
|
| 4 |
import threading
|
|
@@ -7,77 +8,105 @@ import socketserver
|
|
| 7 |
from huggingface_hub import HfApi, login
|
| 8 |
|
| 9 |
# 1. Configuration
|
| 10 |
-
#
|
| 11 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 12 |
-
REPO_ID = os.environ.get("REPO_ID") # e.g., "
|
| 13 |
|
| 14 |
def main():
|
|
|
|
|
|
|
| 15 |
if not HF_TOKEN or not REPO_ID:
|
| 16 |
-
print("ERROR
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def start_dummy_server():
|
| 22 |
-
Handler = http.server.SimpleHTTPRequestHandler
|
| 23 |
-
with socketserver.TCPServer(("", 7860), Handler) as httpd:
|
| 24 |
-
httpd.serve_forever()
|
| 25 |
-
threading.Thread(target=start_dummy_server, daemon=True).start()
|
| 26 |
-
print("Started dummy web server on port 7860 to bypass health check timeouts.")
|
| 27 |
|
| 28 |
print(f"Logging into Hugging Face...")
|
| 29 |
login(token=HF_TOKEN)
|
| 30 |
api = HfApi()
|
| 31 |
|
| 32 |
-
#
|
| 33 |
try:
|
| 34 |
api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
|
| 35 |
-
print(f"Repository {REPO_ID} is ready.")
|
| 36 |
except Exception as e:
|
| 37 |
-
print(f"Failed to create
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# 2. Run the heavy pipeline
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
print(f"\n--- STARTING DAHS PIPELINE (6000 Scenarios on {cores} Workers) ---")
|
| 42 |
-
|
| 43 |
-
result = subprocess.run([
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
if result.returncode != 0:
|
| 46 |
-
print("\
|
| 47 |
-
|
|
|
|
| 48 |
print("--- PIPELINE FINISHED SUCCESSFULY ---\n")
|
| 49 |
|
| 50 |
# 3. Upload the trained models and results back to Hugging Face
|
| 51 |
print(f"Uploading models and results to {REPO_ID}...")
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# 4. PAUSE THE SPACE TO SAVE CREDITS
|
| 76 |
-
# Since this is running in a Space, it will try to restart when the script finishes.
|
| 77 |
-
# We must pause it via the API to stop billing.
|
| 78 |
try:
|
| 79 |
print("Pausing the Space to stop billing...")
|
| 80 |
-
api.pause_space(repo_id=REPO_ID)
|
| 81 |
except Exception as e:
|
| 82 |
print(f"Failed to pause space automatically: {e}")
|
| 83 |
print("IMPORTANT: Please go to the Space Settings and pause it manually!")
|
|
|
|
| 1 |
import os
|
| 2 |
+
import sys
|
| 3 |
import subprocess
|
| 4 |
import multiprocessing
|
| 5 |
import threading
|
|
|
|
| 8 |
from huggingface_hub import HfApi, login
|
| 9 |
|
| 10 |
# 1. Configuration
# Both values are read from the process environment once, at import time.
# On a Hugging Face Space they are set under
# Space Settings -> Variables and secrets.
# Either one is None when its variable is not set; main() checks for that.
HF_TOKEN = os.environ.get("HF_TOKEN")
REPO_ID = os.environ.get("REPO_ID")  # e.g., "Vittal-M/DAHS-Models"
|
| 14 |
|
| 15 |
def _start_health_check_server():
    """Serve HTTP on port 7860 from a daemon thread.

    The server exists only so that something answers on port 7860 while the
    long-running pipeline executes (the script uses it to avoid health check
    timeouts). Being a daemon thread, it never keeps the process alive.
    """

    class _ReusableTCPServer(socketserver.TCPServer):
        # Permit an immediate rebind after a restart; without this the bind
        # can fail with "Address already in use" and the thread would die
        # without stopping the main script.
        allow_reuse_address = True

    def _serve():
        # NOTE(review): SimpleHTTPRequestHandler serves the current working
        # directory, so every file under it is readable by anyone who can
        # reach this port. Swap in a minimal handler if that is a concern.
        handler = http.server.SimpleHTTPRequestHandler
        with _ReusableTCPServer(("", 7860), handler) as httpd:
            httpd.serve_forever()

    threading.Thread(target=_serve, daemon=True).start()


def _upload_outputs(api):
    """Upload data/, models/ and results/ to REPO_ID, each under its own name.

    Args:
        api: The HfApi client used for the uploads.

    Directories that do not exist are reported and skipped. Any exception
    raised by an upload propagates to the caller.
    """
    for folder in ("data", "models", "results"):
        if not os.path.isdir(folder):
            print(f"[WARNING] {folder}/ not found; skipping upload.")
            continue
        api.upload_folder(
            folder_path=folder,
            repo_id=REPO_ID,
            repo_type="model",
            path_in_repo=folder,
        )
        print(f"[SUCCESS] Successfully uploaded {folder}/")


def main():
    """Run the DAHS pipeline and publish its outputs to the Hugging Face Hub.

    Steps, in order:
      1. Check that HF_TOKEN and REPO_ID are set, then log in.
      2. Create (or confirm access to) the model repo REPO_ID; abort before
         any training if that fails.
      3. Start the background HTTP server on port 7860.
      4. Run scripts/run_pipeline.py as a subprocess.
      5. Upload data/, models/ and results/ to the repo.
      6. Ask the Hub to pause the Space.

    Exits the process with status 1 when configuration is missing, the repo
    cannot be created or accessed, the pipeline returns a non-zero code, or
    an upload raises. A failure to pause the Space is only reported.
    """
    print("--- DAHS HF RUNNER STARTING ---")

    missing = [
        name
        for name, value in (("HF_TOKEN", HF_TOKEN), ("REPO_ID", REPO_ID))
        if not value
    ]
    if missing:
        # Name the variables that are actually absent instead of always
        # claiming both are.
        print(f"[FATAL ERROR] Missing required environment variable(s): {', '.join(missing)}")
        print("Please go to Space Settings -> Variables and secrets, and add:")
        print("1. HF_TOKEN (Must be a Fine-grained token with 'Write' access to models)")
        print("2. REPO_ID (The exact name of the dataset/model repo, e.g., Vittal-M/DAHS-Models)")
        sys.exit(1)

    print("Logging into Hugging Face...")
    login(token=HF_TOKEN)
    api = HfApi()

    # Fail fast: verify the destination repo before spending time on training.
    try:
        api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
    except Exception as e:
        print(f"[FATAL ERROR] Failed to create or access the repository {REPO_ID}.")
        print(f"Reason: {e}")
        print("ABORTING: We will not start the training to prevent wasting your time/credits.")
        sys.exit(1)
    else:
        print(f"[SUCCESS] Repository {REPO_ID} is accessible and ready.")

    _start_health_check_server()
    print("Started dummy web server on port 7860 to bypass health check timeouts.")

    # 2. Run the heavy pipeline
    # NOTE(review): "--no-eval" is NOT passed, so any evaluation/benchmark
    # stage in run_pipeline.py will run. Append "--no-eval" to `command` to
    # skip it, if the pipeline script supports that flag -- TODO confirm.
    cores = "8"
    print(f"\n--- STARTING DAHS PIPELINE (6000 Scenarios on {cores} Workers) ---")

    command = [
        # Use the interpreter running this script so the child sees the same
        # installed packages; fall back to PATH lookup if it is unknown.
        sys.executable or "python",
        "scripts/run_pipeline.py",
        "--scenarios", "6000",
        "--workers", cores,
    ]
    result = subprocess.run(command)

    if result.returncode != 0:
        print("\n[FATAL ERROR] Pipeline failed! Aborting upload.")
        sys.exit(1)

    print("--- PIPELINE FINISHED SUCCESSFULY ---\n")

    # 3. Upload the trained models and results back to Hugging Face
    print(f"Uploading models and results to {REPO_ID}...")

    try:
        _upload_outputs(api)
        print("\n[SUCCESS] ALL DONE! Your data, models, and results are safely stored on Hugging Face.")
    except Exception as e:
        print(f"\n[FATAL ERROR] DURING UPLOAD: {e}")
        print("The training succeeded, but uploading to Hugging Face failed.")
        sys.exit(1)

    # 4. PAUSE THE SPACE TO SAVE CREDITS
    # The Space to pause is taken from SPACE_ID, falling back to REPO_ID when
    # that variable is not set.
    try:
        print("Pausing the Space to stop billing...")
        api.pause_space(repo_id=os.environ.get("SPACE_ID", REPO_ID))
    except Exception as e:
        print(f"Failed to pause space automatically: {e}")
        print("IMPORTANT: Please go to the Space Settings and pause it manually!")
|