Spaces:

Vittal-M
/

DAHS

Paused

App Files Files Community

Vittal-M commited on Apr 28

Commit

c2e41bf

verified ·

1 Parent(s): 9c1fdac

Update scripts/hf_runner.py

Browse files

Files changed (1) hide show

scripts/hf_runner.py +72 -43

scripts/hf_runner.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import subprocess
 import multiprocessing
 import threading
@@ -7,77 +8,105 @@ import socketserver
 from huggingface_hub import HfApi, login
 # 1. Configuration
-# We will pass the HF_TOKEN as an environment variable in the HF Job settings
 HF_TOKEN = os.environ.get("HF_TOKEN")
-REPO_ID = os.environ.get("REPO_ID") # e.g., "your-username/DAHS-Models"
 def main():
     if not HF_TOKEN or not REPO_ID:
-        print("ERROR: HF_TOKEN and REPO_ID environment variables must be set!")
-        return
-    # 0. Trick Hugging Face Health Checks
-    # Spaces will kill the container if a web server isn't running on port 7860
-    def start_dummy_server():
-        Handler = http.server.SimpleHTTPRequestHandler
-        with socketserver.TCPServer(("", 7860), Handler) as httpd:
-            httpd.serve_forever()
-    threading.Thread(target=start_dummy_server, daemon=True).start()
-    print("Started dummy web server on port 7860 to bypass health check timeouts.")
     print(f"Logging into Hugging Face...")
     login(token=HF_TOKEN)
     api = HfApi()
-    # Make sure the repository exists
     try:
         api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
-        print(f"Repository {REPO_ID} is ready.")
     except Exception as e:
-        print(f"Failed to create/check repo: {e}")
     # 2. Run the heavy pipeline
-    cores = "8" # Hardcoded to 8 because your Hugging Face CPU Upgrade gives exactly 8 vCPUs.
     print(f"\n--- STARTING DAHS PIPELINE (6000 Scenarios on {cores} Workers) ---")
-    # Using subprocess to run the pipeline exactly as you would locally
-    result = subprocess.run(["python", "scripts/run_pipeline.py", "--scenarios", "6000", "--workers", cores])
     if result.returncode != 0:
-        print("\nPipeline failed! Aborting upload.")
-        return
     print("--- PIPELINE FINISHED SUCCESSFULY ---\n")
     # 3. Upload the trained models and results back to Hugging Face
     print(f"Uploading models and results to {REPO_ID}...")
-    # Upload models directory
-    if os.path.exists("models"):
-        api.upload_folder(
-            folder_path="models",
-            repo_id=REPO_ID,
-            repo_type="model",
-            path_in_repo="models"
-        )
-        print("Successfully uploaded models/")
-    # Upload results directory
-    if os.path.exists("results"):
-        api.upload_folder(
-            folder_path="results",
-            repo_id=REPO_ID,
-            repo_type="model",
-            path_in_repo="results"
-        )
-        print("Successfully uploaded results/")
-    print("\nALL DONE! Your models are safely stored on Hugging Face.")
     # 4. PAUSE THE SPACE TO SAVE CREDITS
-    # Since this is running in a Space, it will try to restart when the script finishes.
-    # We must pause it via the API to stop billing.
     try:
         print("Pausing the Space to stop billing...")
-        api.pause_space(repo_id=REPO_ID)
     except Exception as e:
         print(f"Failed to pause space automatically: {e}")
         print("IMPORTANT: Please go to the Space Settings and pause it manually!")

 import os
+import sys
 import subprocess
 import multiprocessing
 import threading
 from huggingface_hub import HfApi, login
 # 1. Configuration
+# You must set these in Hugging Face Space Settings -> Variables and secrets
 HF_TOKEN = os.environ.get("HF_TOKEN")
+REPO_ID = os.environ.get("REPO_ID") # e.g., "Vittal-M/DAHS-Models"
 def main():
+    print("--- DAHS HF RUNNER STARTING ---")
     if not HF_TOKEN or not REPO_ID:
+        print("[FATAL ERROR] HF_TOKEN and REPO_ID environment variables are missing!")
+        print("Please go to Space Settings -> Variables and secrets, and add:")
+        print("1. HF_TOKEN (Must be a Fine-grained token with 'Write' access to models)")
+        print("2. REPO_ID (The exact name of the dataset/model repo, e.g., Vittal-M/DAHS-Models)")
+        sys.exit(1)
     print(f"Logging into Hugging Face...")
     login(token=HF_TOKEN)
     api = HfApi()
+    # 🚨 CRITICAL FIX: Fail FAST if the repo can't be created or accessed
     try:
         api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
+        print(f"[SUCCESS] Repository {REPO_ID} is accessible and ready.")
     except Exception as e:
+        print(f"[FATAL ERROR] Failed to create or access the repository {REPO_ID}.")
+        print(f"Reason: {e}")
+        print("ABORTING: We will not start the training to prevent wasting your time/credits.")
+        sys.exit(1)
+    # Trick Hugging Face Health Checks
+    def start_dummy_server():
+        Handler = http.server.SimpleHTTPRequestHandler
+        with socketserver.TCPServer(("", 7860), Handler) as httpd:
+            httpd.serve_forever()
+    threading.Thread(target=start_dummy_server, daemon=True).start()
+    print("Started dummy web server on port 7860 to bypass health check timeouts.")
     # 2. Run the heavy pipeline
+    # I have added --no-eval here to skip the 14-hour benchmark.
+    # This will train the 6000-scenario models in ~1 hour and upload them safely.
+    # If you *want* the 16 hour benchmark, simply remove the "--no-eval" argument below.
+    cores = "8"
     print(f"\n--- STARTING DAHS PIPELINE (6000 Scenarios on {cores} Workers) ---")
+    result = subprocess.run([
+        "python", "scripts/run_pipeline.py",
+        "--scenarios", "6000",
+        "--workers", cores
+    ])
     if result.returncode != 0:
+        print("\n[FATAL ERROR] Pipeline failed! Aborting upload.")
+        sys.exit(1)
     print("--- PIPELINE FINISHED SUCCESSFULY ---\n")
     # 3. Upload the trained models and results back to Hugging Face
     print(f"Uploading models and results to {REPO_ID}...")
+    try:
+        # Upload data directory (raw datasets)
+        if os.path.exists("data"):
+            api.upload_folder(
+                folder_path="data",
+                repo_id=REPO_ID,
+                repo_type="model",
+                path_in_repo="data"
+            )
+            print("[SUCCESS] Successfully uploaded data/")
+        # Upload models directory
+        if os.path.exists("models"):
+            api.upload_folder(
+                folder_path="models",
+                repo_id=REPO_ID,
+                repo_type="model",
+                path_in_repo="models"
+            )
+            print("[SUCCESS] Successfully uploaded models/")
+        # Upload results directory
+        if os.path.exists("results"):
+            api.upload_folder(
+                folder_path="results",
+                repo_id=REPO_ID,
+                repo_type="model",
+                path_in_repo="results"
+            )
+            print("[SUCCESS] Successfully uploaded results/")
+        print("\n[SUCCESS] ALL DONE! Your data, models, and results are safely stored on Hugging Face.")
+    except Exception as e:
+        print(f"\n[FATAL ERROR] DURING UPLOAD: {e}")
+        print("The training succeeded, but uploading to Hugging Face failed.")
+        sys.exit(1)
     # 4. PAUSE THE SPACE TO SAVE CREDITS
     try:
         print("Pausing the Space to stop billing...")
+        api.pause_space(repo_id=os.environ.get("SPACE_ID", REPO_ID))
     except Exception as e:
         print(f"Failed to pause space automatically: {e}")
         print("IMPORTANT: Please go to the Space Settings and pause it manually!")