File size: 2,877 Bytes
8202de9
 
 
 
9ed1791
 
 
8202de9
8e221e2
8202de9
8e221e2
9ed1791
 
 
 
 
 
 
 
 
 
 
 
 
 
8202de9
 
 
 
9ed1791
 
8e221e2
 
 
 
 
 
 
 
8202de9
 
 
9ed1791
8202de9
 
9ed1791
8202de9
 
 
9ed1791
8202de9
5821db8
 
8202de9
9ed1791
8202de9
 
 
 
 
 
 
 
 
 
 
9ed1791
8202de9
 
9ed1791
8202de9
 
 
 
9ed1791
8e221e2
 
 
 
 
8202de9
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87

import os
import subprocess
import time
import sys
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
from huggingface_hub import HfApi
import torch

# Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy'.
def start_health_server():
    """Serve a fixed 200 response on port 7860 and never return.

    Every GET request, regardless of path, is answered with status 200 and the
    body ``Training in progress...``. The call blocks inside ``serve_forever``,
    so a caller that needs to keep working must run it on a separate thread.
    """
    bind_address = ('0.0.0.0', 7860)
    status_body = b"Training in progress..."

    class HealthCheckHandler(SimpleHTTPRequestHandler):
        """Answers any GET with the static status body instead of serving files."""

        def do_GET(self):
            self.send_response(200)
            self.end_headers()
            self.wfile.write(status_body)

    HTTPServer(bind_address, HealthCheckHandler).serve_forever()

# serve_forever() blocks, so the health endpoint runs on its own thread; it is
# a daemon thread so that it does not keep the interpreter alive by itself.
threading.Thread(
    target=start_health_server,
    daemon=True,
).start()

def log(msg):
    """Print ``msg`` on its own line to standard output and flush at once.

    Flushing on every call makes each message visible immediately instead of
    waiting in the stdout buffer.
    """
    print(msg, file=sys.stdout, flush=True)

def _idle_forever():
    """Block the calling thread indefinitely, sleeping in one-hour intervals."""
    while True:
        time.sleep(3600)


def _run_checked(args, cwd=None):
    """Run ``args`` (an argv list, no shell) and raise if it fails.

    Raises:
        subprocess.CalledProcessError: the command exited with a non-zero status.
        OSError: the executable could not be started.
    """
    subprocess.run(args, cwd=cwd, check=True)


def _stream_command(args, cwd=None):
    """Run ``args``, echo its merged stdout/stderr line by line, return its exit code."""
    with subprocess.Popen(
        args,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    ) as process:
        for line in process.stdout:
            print(line, end="", flush=True)
    # Leaving the ``with`` block waits for the child, so returncode is set here.
    return process.returncode


def main():
    """Run one GRPO training job on the GPU, then request CPU hardware and idle.

    Steps, in order:

    1. Idle forever if PyTorch sees no CUDA device, or if ``training_done.txt``
       already exists in the current directory.
    2. Clone the LifeOS repository into ``LifeOS`` (skipped when that directory
       already exists) and install the training dependencies with pip.
    3. Run ``lifeos.training.train_grpo`` from inside the clone, streaming its
       output to stdout.
    4. Write ``training_done.txt`` only when training exited with status 0.
    5. Always ask Hugging Face to switch the Space to ``cpu-basic`` -- whether
       training succeeded, failed, or setup aborted -- then idle forever.

    Setup and training failures are logged rather than raised, so that a broken
    run still reaches the hardware downgrade instead of crashing on the GPU.
    This function never returns.
    """
    repo_id = "SParsh003/LifeOS-Trainer"
    clone_dir = "LifeOS"
    done_marker = "training_done.txt"
    api = HfApi(token=os.environ.get("HF_TOKEN"))

    log("===== STARTING TRAINING RUNNER =====")

    # RELIABLE GPU DETECTION using PyTorch
    if not torch.cuda.is_available():
        log("No GPU detected by PyTorch! Idling to prevent loops on CPU.")
        _idle_forever()

    gpu_name = torch.cuda.get_device_name(0)
    log(f"GPU DETECTED: {gpu_name}. Proceeding with training setup...")

    if os.path.exists(done_marker):
        log("Training already completed. Idling.")
        _idle_forever()

    # Use the interpreter running this script for both pip and training, so the
    # packages are installed into the environment the training process imports from.
    python = sys.executable or "python"
    pip_install = [python, "-m", "pip", "install"]

    try:
        log("Cloning repository...")
        if os.path.isdir(clone_dir):
            log(f"Directory '{clone_dir}' already exists; skipping clone.")
        else:
            _run_checked([
                "git", "clone",
                "https://huggingface.co/spaces/SParsh003/LifeOS-Personal-Chaos-Agen",
                clone_dir,
            ])

        log("Installing Unsloth, TRL, and dependencies (this takes a few minutes)...")
        _run_checked(pip_install + ["unsloth"])
        _run_checked(pip_install + ["--no-deps", "trl", "peft", "accelerate", "bitsandbytes"])
        _run_checked(pip_install + [
            "datasets", "transformers", "sentencepiece", "pydantic",
            "fastapi", "requests", "matplotlib",
        ])

        log("Starting GRPO Training (50 episodes)...")
        cmd = [
            python, "-m", "lifeos.training.train_grpo",
            "--real-gpu",
            "--episodes", "50",
            "--push-to-hub",
            "--hub-model-id", "SParsh003/LifeOS-Trained-Agent",
        ]

        # Run inside the clone via cwd= rather than os.chdir, so the marker path
        # below stays relative to the directory this script was started in.
        exit_code = _stream_command(cmd, cwd=clone_dir)

        if exit_code == 0:
            log("Training complete!")
            with open(done_marker, "w") as f:
                f.write("done")
        else:
            log(f"Training FAILED with exit code {exit_code}; not marking it as done.")
    except (subprocess.CalledProcessError, OSError) as e:
        # Do not re-raise: the downgrade below must still run to stop GPU billing.
        log(f"Training run aborted: {e}")

    log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
    try:
        api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
        log("Hardware downgraded successfully.")
    except Exception as e:
        # Top-level boundary: report any API/network failure and keep idling.
        log(f"Failed to downgrade hardware: {e}")

    _idle_forever()

# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()