SParsh003 committed on
Commit
9ed1791
·
verified ·
1 Parent(s): 04b2905

Upload runner.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. runner.py +30 -10
runner.py CHANGED
@@ -2,34 +2,54 @@
2
  import os
3
  import subprocess
4
  import time
 
 
 
5
  from huggingface_hub import HfApi
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def main():
8
  repo_id = "SParsh003/LifeOS-Trainer"
9
  api = HfApi(token=os.environ.get("HF_TOKEN"))
10
 
 
 
11
  # Verify GPU exists
12
  try:
13
  subprocess.run(["nvidia-smi"], check=True)
14
- print("GPU detected. Proceeding with training setup...")
15
  except:
16
- print("No GPU detected. Idling to prevent loops.")
17
  while True: time.sleep(3600)
18
 
19
  if os.path.exists("training_done.txt"):
20
- print("Training already completed. Idling.")
21
  while True: time.sleep(3600)
22
 
23
- print("Cloning repository...")
24
  os.system("git clone https://huggingface.co/spaces/SParsh003/LifeOS-Personal-Chaos-Agen LifeOS")
25
  os.chdir("LifeOS")
26
 
27
- print("Installing Unsloth, TRL, and dependencies (this takes a few minutes)...")
28
  os.system("pip install unsloth")
29
  os.system("pip install --no-deps trl peft accelerate bitsandbytes datasets transformers sentencepiece")
30
  os.system("pip install matplotlib pydantic fastapi requests")
31
 
32
- print("Starting GRPO Training (50 episodes)...")
33
  cmd = [
34
  "python", "-m", "lifeos.training.train_grpo",
35
  "--real-gpu",
@@ -41,17 +61,17 @@ def main():
41
  # Run and stream output
42
  process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
43
  for line in process.stdout:
44
- print(line, end="")
45
  process.wait()
46
 
47
- print("Training complete!")
48
  os.chdir("..")
49
  with open("training_done.txt", "w") as f:
50
  f.write("done")
51
 
52
- print("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
53
  api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
54
- print("Hardware downgraded successfully.")
55
 
56
  while True:
57
  time.sleep(3600)
 
2
  import os
3
  import subprocess
4
  import time
5
+ import sys
6
+ import threading
7
+ from http.server import HTTPServer, SimpleHTTPRequestHandler
8
  from huggingface_hub import HfApi
9
 
10
+ # Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy' and doesn't kill it.
11
def start_health_server(host="0.0.0.0", port=7860):
    """Serve a minimal HTTP health endpoint and block forever.

    Every GET or HEAD request, whatever its path, is answered with
    ``200``; GET responses carry a short plain-text status body.
    ``serve_forever`` never returns, so callers should run this function
    on a background (daemon) thread.

    Args:
        host: Interface to bind to. Defaults to all interfaces.
        port: TCP port to listen on. Defaults to 7860.

    Raises:
        OSError: If the address cannot be bound (for example, the port
            is already in use).
    """
    body = b"Training in progress..."

    class HealthCheckHandler(SimpleHTTPRequestHandler):
        """Answer health probes with a fixed 200 response."""

        def _send_ok_headers(self):
            # Explicit type and length so clients do not have to rely on
            # connection close to find the end of the body.
            self.send_response(200)
            self.send_header("Content-Type", "text/plain; charset=utf-8")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()

        def do_GET(self):
            self._send_ok_headers()
            self.wfile.write(body)

        def do_HEAD(self):
            # Without this override the base class would answer HEAD by
            # resolving the request path against the local filesystem.
            self._send_ok_headers()

    server = HTTPServer((host, port), HealthCheckHandler)
    server.serve_forever()
19
+
20
# Run the health endpoint in the background. The thread is a daemon so it
# does not keep the interpreter alive once the main thread finishes.
_health_thread = threading.Thread(target=start_health_server, daemon=True)
_health_thread.start()
21
+
22
def log(msg):
    """Write *msg* to standard output followed by a newline, then flush.

    Flushing on every call means each message reaches the output stream
    immediately instead of waiting in the stdout buffer.
    """
    text = str(msg)
    print(text, flush=True)
24
+
25
  def main():
26
  repo_id = "SParsh003/LifeOS-Trainer"
27
  api = HfApi(token=os.environ.get("HF_TOKEN"))
28
 
29
+ log("===== STARTING TRAINING RUNNER =====")
30
+
31
  # Verify GPU exists
32
  try:
33
  subprocess.run(["nvidia-smi"], check=True)
34
+ log("GPU detected. Proceeding with training setup...")
35
  except:
36
+ log("No GPU detected. Idling to prevent loops.")
37
  while True: time.sleep(3600)
38
 
39
  if os.path.exists("training_done.txt"):
40
+ log("Training already completed. Idling.")
41
  while True: time.sleep(3600)
42
 
43
+ log("Cloning repository...")
44
  os.system("git clone https://huggingface.co/spaces/SParsh003/LifeOS-Personal-Chaos-Agen LifeOS")
45
  os.chdir("LifeOS")
46
 
47
+ log("Installing Unsloth, TRL, and dependencies (this takes a few minutes)...")
48
  os.system("pip install unsloth")
49
  os.system("pip install --no-deps trl peft accelerate bitsandbytes datasets transformers sentencepiece")
50
  os.system("pip install matplotlib pydantic fastapi requests")
51
 
52
+ log("Starting GRPO Training (50 episodes)...")
53
  cmd = [
54
  "python", "-m", "lifeos.training.train_grpo",
55
  "--real-gpu",
 
61
  # Run and stream output
62
  process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
63
  for line in process.stdout:
64
+ print(line, end="", flush=True)
65
  process.wait()
66
 
67
+ log("Training complete!")
68
  os.chdir("..")
69
  with open("training_done.txt", "w") as f:
70
  f.write("done")
71
 
72
+ log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
73
  api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
74
+ log("Hardware downgraded successfully.")
75
 
76
  while True:
77
  time.sleep(3600)