Vittal-M committed on
Commit
c2e41bf
·
verified ·
1 Parent(s): 9c1fdac

Update scripts/hf_runner.py

Browse files
Files changed (1) hide show
  1. scripts/hf_runner.py +72 -43
scripts/hf_runner.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import subprocess
3
  import multiprocessing
4
  import threading
@@ -7,77 +8,105 @@ import socketserver
7
  from huggingface_hub import HfApi, login
8
 
9
  # 1. Configuration
10
- # We will pass the HF_TOKEN as an environment variable in the HF Job settings
11
  HF_TOKEN = os.environ.get("HF_TOKEN")
12
- REPO_ID = os.environ.get("REPO_ID") # e.g., "your-username/DAHS-Models"
13
 
14
  def main():
 
 
15
  if not HF_TOKEN or not REPO_ID:
16
- print("ERROR: HF_TOKEN and REPO_ID environment variables must be set!")
17
- return
18
-
19
- # 0. Trick Hugging Face Health Checks
20
- # Spaces will kill the container if a web server isn't running on port 7860
21
- def start_dummy_server():
22
- Handler = http.server.SimpleHTTPRequestHandler
23
- with socketserver.TCPServer(("", 7860), Handler) as httpd:
24
- httpd.serve_forever()
25
- threading.Thread(target=start_dummy_server, daemon=True).start()
26
- print("Started dummy web server on port 7860 to bypass health check timeouts.")
27
 
28
  print(f"Logging into Hugging Face...")
29
  login(token=HF_TOKEN)
30
  api = HfApi()
31
 
32
- # Make sure the repository exists
33
  try:
34
  api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
35
- print(f"Repository {REPO_ID} is ready.")
36
  except Exception as e:
37
- print(f"Failed to create/check repo: {e}")
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  # 2. Run the heavy pipeline
40
- cores = "8" # Hardcoded to 8 because your Hugging Face CPU Upgrade gives exactly 8 vCPUs.
 
 
 
41
  print(f"\n--- STARTING DAHS PIPELINE (6000 Scenarios on {cores} Workers) ---")
42
- # Using subprocess to run the pipeline exactly as you would locally
43
- result = subprocess.run(["python", "scripts/run_pipeline.py", "--scenarios", "6000", "--workers", cores])
 
 
 
 
44
 
45
  if result.returncode != 0:
46
- print("\nPipeline failed! Aborting upload.")
47
- return
 
48
  print("--- PIPELINE FINISHED SUCCESSFULY ---\n")
49
 
50
  # 3. Upload the trained models and results back to Hugging Face
51
  print(f"Uploading models and results to {REPO_ID}...")
52
 
53
- # Upload models directory
54
- if os.path.exists("models"):
55
- api.upload_folder(
56
- folder_path="models",
57
- repo_id=REPO_ID,
58
- repo_type="model",
59
- path_in_repo="models"
60
- )
61
- print("Successfully uploaded models/")
 
62
 
63
- # Upload results directory
64
- if os.path.exists("results"):
65
- api.upload_folder(
66
- folder_path="results",
67
- repo_id=REPO_ID,
68
- repo_type="model",
69
- path_in_repo="results"
70
- )
71
- print("Successfully uploaded results/")
72
 
73
- print("\nALL DONE! Your models are safely stored on Hugging Face.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # 4. PAUSE THE SPACE TO SAVE CREDITS
76
- # Since this is running in a Space, it will try to restart when the script finishes.
77
- # We must pause it via the API to stop billing.
78
  try:
79
  print("Pausing the Space to stop billing...")
80
- api.pause_space(repo_id=REPO_ID)
81
  except Exception as e:
82
  print(f"Failed to pause space automatically: {e}")
83
  print("IMPORTANT: Please go to the Space Settings and pause it manually!")
 
1
  import os
2
+ import sys
3
  import subprocess
4
  import multiprocessing
5
  import threading
 
8
  from huggingface_hub import HfApi, login
9
 
10
  # 1. Configuration
11
+ # You must set these in Hugging Face Space Settings -> Variables and secrets
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
+ REPO_ID = os.environ.get("REPO_ID") # e.g., "Vittal-M/DAHS-Models"
14
 
15
  def main():
16
+ print("--- DAHS HF RUNNER STARTING ---")
17
+
18
  if not HF_TOKEN or not REPO_ID:
19
+ print("[FATAL ERROR] HF_TOKEN and REPO_ID environment variables are missing!")
20
+ print("Please go to Space Settings -> Variables and secrets, and add:")
21
+ print("1. HF_TOKEN (Must be a Fine-grained token with 'Write' access to models)")
22
+ print("2. REPO_ID (The exact name of the dataset/model repo, e.g., Vittal-M/DAHS-Models)")
23
+ sys.exit(1)
 
 
 
 
 
 
24
 
25
  print(f"Logging into Hugging Face...")
26
  login(token=HF_TOKEN)
27
  api = HfApi()
28
 
29
+ # 🚨 CRITICAL FIX: Fail FAST if the repo can't be created or accessed
30
  try:
31
  api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
32
+ print(f"[SUCCESS] Repository {REPO_ID} is accessible and ready.")
33
  except Exception as e:
34
+ print(f"[FATAL ERROR] Failed to create or access the repository {REPO_ID}.")
35
+ print(f"Reason: {e}")
36
+ print("ABORTING: We will not start the training to prevent wasting your time/credits.")
37
+ sys.exit(1)
38
+
39
+ # Trick Hugging Face Health Checks
40
+ def start_dummy_server():
41
+ Handler = http.server.SimpleHTTPRequestHandler
42
+ with socketserver.TCPServer(("", 7860), Handler) as httpd:
43
+ httpd.serve_forever()
44
+ threading.Thread(target=start_dummy_server, daemon=True).start()
45
+ print("Started dummy web server on port 7860 to bypass health check timeouts.")
46
 
47
  # 2. Run the heavy pipeline
48
+ # NOTE: "--no-eval" (which skips the 14-16 hour benchmark) is NOT currently passed in the command below.
49
+ # With "--no-eval" added, this will train the 6000-scenario models in ~1 hour and upload them safely.
50
+ # If you do *not* want the long benchmark, add the "--no-eval" argument to the list below.
51
+ cores = "8"
52
  print(f"\n--- STARTING DAHS PIPELINE (6000 Scenarios on {cores} Workers) ---")
53
+
54
+ result = subprocess.run([
55
+ "python", "scripts/run_pipeline.py",
56
+ "--scenarios", "6000",
57
+ "--workers", cores
58
+ ])
59
 
60
  if result.returncode != 0:
61
+ print("\n[FATAL ERROR] Pipeline failed! Aborting upload.")
62
+ sys.exit(1)
63
+
64
  print("--- PIPELINE FINISHED SUCCESSFULY ---\n")
65
 
66
  # 3. Upload the trained models and results back to Hugging Face
67
  print(f"Uploading models and results to {REPO_ID}...")
68
 
69
+ try:
70
+ # Upload data directory (raw datasets)
71
+ if os.path.exists("data"):
72
+ api.upload_folder(
73
+ folder_path="data",
74
+ repo_id=REPO_ID,
75
+ repo_type="model",
76
+ path_in_repo="data"
77
+ )
78
+ print("[SUCCESS] Successfully uploaded data/")
79
 
80
+ # Upload models directory
81
+ if os.path.exists("models"):
82
+ api.upload_folder(
83
+ folder_path="models",
84
+ repo_id=REPO_ID,
85
+ repo_type="model",
86
+ path_in_repo="models"
87
+ )
88
+ print("[SUCCESS] Successfully uploaded models/")
89
 
90
+ # Upload results directory
91
+ if os.path.exists("results"):
92
+ api.upload_folder(
93
+ folder_path="results",
94
+ repo_id=REPO_ID,
95
+ repo_type="model",
96
+ path_in_repo="results"
97
+ )
98
+ print("[SUCCESS] Successfully uploaded results/")
99
+
100
+ print("\n[SUCCESS] ALL DONE! Your data, models, and results are safely stored on Hugging Face.")
101
+ except Exception as e:
102
+ print(f"\n[FATAL ERROR] DURING UPLOAD: {e}")
103
+ print("The training succeeded, but uploading to Hugging Face failed.")
104
+ sys.exit(1)
105
 
106
  # 4. PAUSE THE SPACE TO SAVE CREDITS
 
 
107
  try:
108
  print("Pausing the Space to stop billing...")
109
+ api.pause_space(repo_id=os.environ.get("SPACE_ID", REPO_ID))
110
  except Exception as e:
111
  print(f"Failed to pause space automatically: {e}")
112
  print("IMPORTANT: Please go to the Space Settings and pause it manually!")