SParsh003 committed on
Commit
8e221e2
·
verified ·
1 Parent(s): 441713e

Upload runner.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. runner.py +15 -9
runner.py CHANGED
@@ -6,8 +6,9 @@ import sys
6
  import threading
7
  from http.server import HTTPServer, SimpleHTTPRequestHandler
8
  from huggingface_hub import HfApi
 
9
 
10
- # Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy' and doesn't kill it.
11
  def start_health_server():
12
  class HealthCheckHandler(SimpleHTTPRequestHandler):
13
  def do_GET(self):
@@ -28,12 +29,14 @@ def main():
28
 
29
  log("===== STARTING TRAINING RUNNER =====")
30
 
31
- # Verify GPU exists
32
- try:
33
- subprocess.run(["nvidia-smi"], check=True)
34
- log("GPU detected. Proceeding with training setup...")
35
- except:
36
- log("No GPU detected. Idling to prevent loops.")
 
 
37
  while True: time.sleep(3600)
38
 
39
  if os.path.exists("training_done.txt"):
@@ -70,8 +73,11 @@ def main():
70
  f.write("done")
71
 
72
  log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
73
- api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
74
- log("Hardware downgraded successfully.")
 
 
 
75
 
76
  while True:
77
  time.sleep(3600)
 
6
  import threading
7
  from http.server import HTTPServer, SimpleHTTPRequestHandler
8
  from huggingface_hub import HfApi
9
+ import torch
10
 
11
+ # Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy'.
12
  def start_health_server():
13
  class HealthCheckHandler(SimpleHTTPRequestHandler):
14
  def do_GET(self):
 
29
 
30
  log("===== STARTING TRAINING RUNNER =====")
31
 
32
+ # RELIABLE GPU DETECTION using PyTorch
33
+ is_gpu = torch.cuda.is_available()
34
+
35
+ if is_gpu:
36
+ gpu_name = torch.cuda.get_device_name(0)
37
+ log(f"GPU DETECTED: {gpu_name}. Proceeding with training setup...")
38
+ else:
39
+ log("No GPU detected by PyTorch! Idling to prevent loops on CPU.")
40
  while True: time.sleep(3600)
41
 
42
  if os.path.exists("training_done.txt"):
 
73
  f.write("done")
74
 
75
  log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
76
+ try:
77
+ api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
78
+ log("Hardware downgraded successfully.")
79
+ except Exception as e:
80
+ log(f"Failed to downgrade hardware: {e}")
81
 
82
  while True:
83
  time.sleep(3600)