Spaces:
Sleeping
Sleeping
File size: 2,877 Bytes
8202de9 9ed1791 8202de9 8e221e2 8202de9 8e221e2 9ed1791 8202de9 9ed1791 8e221e2 8202de9 9ed1791 8202de9 9ed1791 8202de9 9ed1791 8202de9 5821db8 8202de9 9ed1791 8202de9 9ed1791 8202de9 9ed1791 8202de9 9ed1791 8e221e2 8202de9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import os
import subprocess
import time
import sys
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from huggingface_hub import HfApi
import torch
# Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy'.
def start_health_server():
    """Serve a fixed HTTP 200 reply on port 7860 until the process exits.

    Every GET request, whatever its path, is answered with status 200 and
    the body ``Training in progress...``. The call blocks forever, so it is
    meant to be used as a thread target.
    """

    class _AlwaysOkHandler(SimpleHTTPRequestHandler):
        """Request handler that ignores the path and always answers 200."""

        def do_GET(self):
            # No extra headers are sent; only the status line and the body.
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b"Training in progress...")

    # Listen on every interface at port 7860.
    bind_address = ('0.0.0.0', 7860)
    HTTPServer(bind_address, _AlwaysOkHandler).serve_forever()
# Run the health-check server in the background. It is a daemon thread, so it
# never keeps the interpreter alive on its own.
threading.Thread(
    daemon=True,
    target=start_health_server,
).start()
def log(msg):
    """Print *msg* followed by a newline and flush stdout right away.

    Flushing on every call keeps the message from sitting in an output
    buffer while long-running steps execute.
    """
    print(msg, end="\n", flush=True)
def _idle_forever():
    """Sleep in one-hour steps and never return.

    Parks the process once there is nothing left to do, instead of exiting.
    """
    while True:
        time.sleep(3600)


def _run_checked(argv, **run_kwargs):
    """Run *argv* without a shell; raise ``CalledProcessError`` on a non-zero exit."""
    subprocess.run(argv, check=True, **run_kwargs)


def _downgrade_hardware(api, repo_id):
    """Request ``cpu-basic`` hardware for Space *repo_id*; log instead of raising on failure."""
    log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
    try:
        api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
    except Exception as e:  # best effort: the caller idles afterwards either way
        log(f"Failed to downgrade hardware: {e}")
    else:
        log("Hardware downgraded successfully.")


def _run_training_pipeline(checkout_dir):
    """Clone the project, install its dependencies and run GRPO training.

    All commands run with *checkout_dir* as their working directory, so the
    working directory of this process is never changed. The training
    command's combined stdout/stderr is streamed line by line.

    Raises:
        subprocess.CalledProcessError: if the clone, a pip install or the
            training command exits with a non-zero status.
        OSError: if an executable cannot be started.
    """
    # Use the interpreter running this script so that pip installs into the
    # same environment the training module is later started from.
    interpreter = sys.executable or "python"

    log("Cloning repository...")
    if os.path.isdir(os.path.join(checkout_dir, ".git")):
        # A previous run already cloned the project; cloning again would fail.
        log("Repository already cloned. Reusing existing checkout.")
    else:
        # NOTE(review): the Space name ends in "Agen" - confirm it is not a
        # truncated "Agent".
        _run_checked([
            "git", "clone",
            "https://huggingface.co/spaces/SParsh003/LifeOS-Personal-Chaos-Agen",
            checkout_dir,
        ])

    log("Installing Unsloth, TRL, and dependencies (this takes a few minutes)...")
    pip_install = [interpreter, "-m", "pip", "install"]
    _run_checked(pip_install + ["unsloth"], cwd=checkout_dir)
    _run_checked(
        pip_install + ["--no-deps", "trl", "peft", "accelerate", "bitsandbytes"],
        cwd=checkout_dir,
    )
    _run_checked(
        pip_install + [
            "datasets", "transformers", "sentencepiece", "pydantic",
            "fastapi", "requests", "matplotlib",
        ],
        cwd=checkout_dir,
    )

    log("Starting GRPO Training (50 episodes)...")
    cmd = [
        interpreter, "-m", "lifeos.training.train_grpo",
        "--real-gpu",
        "--episodes", "50",
        "--push-to-hub",
        "--hub-model-id", "SParsh003/LifeOS-Trained-Agent",
    ]
    # Stream the child's output as it is produced. errors="replace" keeps an
    # undecodable byte in the output from aborting the run.
    with subprocess.Popen(
        cmd,
        cwd=checkout_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        errors="replace",
    ) as process:
        for line in process.stdout:
            print(line, end="", flush=True)
    # Leaving the ``with`` block has waited for the child to exit.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, cmd)


def main():
    """Run the one-shot training job, then release the GPU and idle forever.

    Steps:
      1. Without a CUDA device, idle immediately.
      2. If ``training_done.txt`` exists, request the CPU downgrade again
         (a GPU is still attached at this point) and idle.
      3. Otherwise clone, install and train. Whether that succeeds or fails,
         write the marker file, request ``cpu-basic`` hardware, and idle.

    The function never returns. Failures of the pipeline are logged rather
    than raised so that the hardware downgrade is always attempted.
    """
    repo_id = "SParsh003/LifeOS-Trainer"
    done_marker = "training_done.txt"
    api = HfApi(token=os.environ.get("HF_TOKEN"))

    log("===== STARTING TRAINING RUNNER =====")

    # RELIABLE GPU DETECTION using PyTorch
    if not torch.cuda.is_available():
        log("No GPU detected by PyTorch! Idling to prevent loops on CPU.")
        _idle_forever()
    gpu_name = torch.cuda.get_device_name(0)
    log(f"GPU DETECTED: {gpu_name}. Proceeding with training setup...")

    if os.path.exists(done_marker):
        log("Training already completed. Idling.")
        # A GPU is still attached although the job has already run, so ask
        # for the downgrade again rather than idling on billed hardware.
        _downgrade_hardware(api, repo_id)
        _idle_forever()

    try:
        _run_training_pipeline("LifeOS")
    except Exception as err:  # top-level boundary: must still reach the downgrade
        log(f"Training pipeline failed: {err}")
        outcome = "failed"
    else:
        log("Training complete!")
        outcome = "done"

    # The marker is written on failure too, so a restart does not re-run the
    # whole pipeline; its content records how the run ended.
    with open(done_marker, "w") as f:
        f.write(outcome)

    _downgrade_hardware(api, repo_id)
    _idle_forever()
# Only start the training run when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|