Spaces:
Sleeping
Sleeping
Upload runner.py with huggingface_hub
Browse files
runner.py
CHANGED
|
@@ -6,8 +6,9 @@ import sys
|
|
| 6 |
import threading
|
| 7 |
from http.server import HTTPServer, SimpleHTTPRequestHandler
|
| 8 |
from huggingface_hub import HfApi
|
|
|
|
| 9 |
|
| 10 |
-
# Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy'
|
| 11 |
def start_health_server():
|
| 12 |
class HealthCheckHandler(SimpleHTTPRequestHandler):
|
| 13 |
def do_GET(self):
|
|
@@ -28,12 +29,14 @@ def main():
|
|
| 28 |
|
| 29 |
log("===== STARTING TRAINING RUNNER =====")
|
| 30 |
|
| 31 |
-
#
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
log("
|
|
|
|
|
|
|
| 37 |
while True: time.sleep(3600)
|
| 38 |
|
| 39 |
if os.path.exists("training_done.txt"):
|
|
@@ -70,8 +73,11 @@ def main():
|
|
| 70 |
f.write("done")
|
| 71 |
|
| 72 |
log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
while True:
|
| 77 |
time.sleep(3600)
|
|
|
|
| 6 |
import threading
|
| 7 |
from http.server import HTTPServer, SimpleHTTPRequestHandler
|
| 8 |
from huggingface_hub import HfApi
|
| 9 |
+
import torch
|
| 10 |
|
| 11 |
+
# Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy'.
|
| 12 |
def start_health_server():
|
| 13 |
class HealthCheckHandler(SimpleHTTPRequestHandler):
|
| 14 |
def do_GET(self):
|
|
|
|
| 29 |
|
| 30 |
log("===== STARTING TRAINING RUNNER =====")
|
| 31 |
|
| 32 |
+
# RELIABLE GPU DETECTION using PyTorch
|
| 33 |
+
is_gpu = torch.cuda.is_available()
|
| 34 |
+
|
| 35 |
+
if is_gpu:
|
| 36 |
+
gpu_name = torch.cuda.get_device_name(0)
|
| 37 |
+
log(f"GPU DETECTED: {gpu_name}. Proceeding with training setup...")
|
| 38 |
+
else:
|
| 39 |
+
log("No GPU detected by PyTorch! Idling to prevent loops on CPU.")
|
| 40 |
while True: time.sleep(3600)
|
| 41 |
|
| 42 |
if os.path.exists("training_done.txt"):
|
|
|
|
| 73 |
f.write("done")
|
| 74 |
|
| 75 |
log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
|
| 76 |
+
try:
|
| 77 |
+
api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
|
| 78 |
+
log("Hardware downgraded successfully.")
|
| 79 |
+
except Exception as e:
|
| 80 |
+
log(f"Failed to downgrade hardware: {e}")
|
| 81 |
|
| 82 |
while True:
|
| 83 |
time.sleep(3600)
|