Vittal-M committed on
Commit
70e82f1
·
verified ·
1 Parent(s): 212eb34

Update scripts/hf_runner.py

Browse files
Files changed (1) hide show
  1. scripts/hf_runner.py +45 -51
scripts/hf_runner.py CHANGED
@@ -5,6 +5,8 @@ import multiprocessing
5
  import threading
6
  import http.server
7
  import socketserver
 
 
8
  from huggingface_hub import HfApi, login
9
 
10
  # 1. Configuration
@@ -12,6 +14,26 @@ from huggingface_hub import HfApi, login
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
  REPO_ID = os.environ.get("REPO_ID") # e.g., "Vittal-M/DAHS-Models"
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def main():
16
  print("--- DAHS HF RUNNER STARTING ---")
17
 
@@ -45,62 +67,34 @@ def main():
45
  print("Started dummy web server on port 7860 to bypass health check timeouts.")
46
 
47
  # 2. Run the heavy pipeline
48
- # I have added --no-eval here to skip the 14-hour benchmark.
49
- # This will train the 6000-scenario models in ~1 hour and upload them safely.
50
- # If you *want* the 16 hour benchmark, simply remove the "--no-eval" argument below.
51
- cores = "8"
52
- print(f"\n--- STARTING DAHS PIPELINE (3000 Scenarios on {cores} Workers) ---")
53
-
54
  result = subprocess.run([
55
- "python", "scripts/run_pipeline.py",
56
- "--scenarios", "3000",
57
- "--workers", cores
 
58
  ])
59
-
60
- if result.returncode != 0:
61
- print("\n[FATAL ERROR] Pipeline failed! Aborting upload.")
62
- sys.exit(1)
63
-
64
- print("--- PIPELINE FINISHED SUCCESSFULY ---\n")
65
 
66
- # 3. Upload the trained models and results back to Hugging Face
67
- print(f"Uploading models and results to {REPO_ID}...")
68
-
69
- try:
70
- # Upload data directory (raw datasets)
71
- if os.path.exists("data"):
72
- api.upload_folder(
73
- folder_path="data",
74
- repo_id=REPO_ID,
75
- repo_type="model",
76
- path_in_repo="data"
77
- )
78
- print("[SUCCESS] Successfully uploaded data/")
79
 
80
- # Upload models directory
81
- if os.path.exists("models"):
82
- api.upload_folder(
83
- folder_path="models",
84
- repo_id=REPO_ID,
85
- repo_type="model",
86
- path_in_repo="models"
87
- )
88
- print("[SUCCESS] Successfully uploaded models/")
89
 
90
- # Upload results directory
91
- if os.path.exists("results"):
92
- api.upload_folder(
93
- folder_path="results",
94
- repo_id=REPO_ID,
95
- repo_type="model",
96
- path_in_repo="results"
97
- )
98
- print("[SUCCESS] Successfully uploaded results/")
99
-
100
- print("\n[SUCCESS] ALL DONE! Your data, models, and results are safely stored on Hugging Face.")
101
- except Exception as e:
102
- print(f"\n[FATAL ERROR] DURING UPLOAD: {e}")
103
- print("The training succeeded, but uploading to Hugging Face failed.")
104
  sys.exit(1)
105
 
106
  # 4. PAUSE THE SPACE TO SAVE CREDITS
 
5
  import threading
6
  import http.server
7
  import socketserver
8
+ from datetime import datetime
9
+ from pathlib import Path
10
  from huggingface_hub import HfApi, login
11
 
12
  # 1. Configuration
 
14
  HF_TOKEN = os.environ.get("HF_TOKEN")
15
  REPO_ID = os.environ.get("REPO_ID") # e.g., "Vittal-M/DAHS-Models"
16
 
17
+ def upload_artifacts(api: HfApi) -> None:
18
+ """Upload data/, models/, results/ to REPO_ID. Best-effort — never raises."""
19
+ print(f"Uploading artifacts to {REPO_ID}...")
20
+ for folder in ("data", "models", "results"):
21
+ if not os.path.exists(folder):
22
+ print(f"[SKIP] {folder}/ does not exist")
23
+ continue
24
+ try:
25
+ api.upload_folder(
26
+ folder_path=folder,
27
+ repo_id=REPO_ID,
28
+ repo_type="model",
29
+ path_in_repo=folder,
30
+ )
31
+ print(f"[SUCCESS] Uploaded {folder}/")
32
+ except Exception as e:
33
+ print(f"[ERROR] Failed to upload {folder}/: {e}")
34
+ print("\n[DONE] Upload pass complete.")
35
+
36
+
37
  def main():
38
  print("--- DAHS HF RUNNER STARTING ---")
39
 
 
67
  print("Started dummy web server on port 7860 to bypass health check timeouts.")
68
 
69
  # 2. Run the heavy pipeline
70
+ # Sized for Q1 results within ~12h compute budget on HF:
71
+ # 2000 scenarios -> ~120k selector training rows
72
+ # 500 eval seeds -> 4500 sims, plenty for Friedman/Nemenyi/Wilcoxon
73
+ cores = "8"
74
+ print(f"\n--- STARTING DAHS PIPELINE (2000 Scenarios, 500 Eval Seeds, {cores} Workers) ---")
75
+
76
  result = subprocess.run([
77
+ "python", "scripts/run_pipeline.py",
78
+ "--scenarios", "2000",
79
+ "--eval-seeds", "500",
80
+ "--workers", cores,
81
  ])
 
 
 
 
 
 
82
 
83
+ status = "SUCCESS" if result.returncode == 0 else f"FAILED (exit {result.returncode})"
84
+ Path("results").mkdir(exist_ok=True)
85
+ (Path("results") / "run_status.txt").write_text(
86
+ f"{status}\n{datetime.utcnow().isoformat()}Z\n"
87
+ )
 
 
 
 
 
 
 
 
88
 
89
+ if result.returncode == 0:
90
+ print("--- PIPELINE FINISHED SUCCESSFULLY ---\n")
91
+ else:
92
+ print(f"\n[ERROR] Pipeline exited with code {result.returncode}. Uploading partial artifacts anyway.\n")
 
 
 
 
 
93
 
94
+ # 3. Upload trained artifacts (always — even on partial failure)
95
+ upload_artifacts(api)
96
+
97
+ if result.returncode != 0:
 
 
 
 
 
 
 
 
 
 
98
  sys.exit(1)
99
 
100
  # 4. PAUSE THE SPACE TO SAVE CREDITS