Spaces:
Sleeping
Sleeping
| import os | |
| import subprocess | |
| import time | |
| import sys | |
| import threading | |
| from http.server import HTTPServer, SimpleHTTPRequestHandler | |
| from huggingface_hub import HfApi | |
| import torch | |
| # Start a dummy HTTP server on 7860 so Hugging Face marks the space as 'Healthy'. | |
def start_health_server():
    """Serve a trivial HTTP endpoint on 0.0.0.0:7860 and block forever.

    Every GET request, whatever its path, is answered with status 200 and
    the fixed body ``b"Training in progress..."``. The function ends in
    ``serve_forever()``, so it does not return normally; callers are expected
    to run it on its own thread.
    """
    bind_address = ('0.0.0.0', 7860)
    status_body = b"Training in progress..."

    class HealthCheckHandler(SimpleHTTPRequestHandler):
        """Handler that reports success for any GET instead of serving files."""

        def do_GET(self):
            # This handler adds no headers of its own; the body follows the
            # status line and the defaults written by send_response().
            self.send_response(200)
            self.end_headers()
            self.wfile.write(status_body)

    httpd = HTTPServer(bind_address, HealthCheckHandler)
    httpd.serve_forever()
# Launch the health endpoint on a daemon thread at import time: it runs
# alongside the rest of the script and does not keep the process alive.
_health_thread = threading.Thread(target=start_health_server, daemon=True)
_health_thread.start()
def log(msg):
    """Print *msg* on standard output and flush immediately.

    Flushing on every call keeps messages visible in a streamed log even
    when stdout is block-buffered.
    """
    print(msg, file=sys.stdout, flush=True)
def main():
    """Run the one-shot GRPO training job, then downgrade the Space to CPU.

    Flow:
      1. Without a CUDA device (per PyTorch), idle forever.
      2. If ``training_done.txt`` already exists, idle forever.
      3. Otherwise clone the project, install dependencies and run training.
      4. Whatever the outcome, write ``training_done.txt`` and request
         ``cpu-basic`` hardware for the Space, then idle forever.

    Step 4 is reached even when cloning, installing or training fails or
    raises, so a broken run cannot skip the hardware downgrade.

    This function never returns.
    """
    repo_id = "SParsh003/LifeOS-Trainer"
    api = HfApi(token=os.environ.get("HF_TOKEN"))
    log("===== STARTING TRAINING RUNNER =====")

    # RELIABLE GPU DETECTION using PyTorch
    if not torch.cuda.is_available():
        log("No GPU detected by PyTorch! Idling to prevent loops on CPU.")
        _idle_forever()
    gpu_name = torch.cuda.get_device_name(0)
    log(f"GPU DETECTED: {gpu_name}. Proceeding with training setup...")

    if os.path.exists("training_done.txt"):
        log("Training already completed. Idling.")
        _idle_forever()

    succeeded = False
    try:
        succeeded = _run_training_pipeline()
    except Exception as exc:
        # Top-level boundary: report the problem but still fall through to
        # the marker file and the hardware downgrade below.
        log(f"Training runner failed unexpectedly: {exc}")

    if succeeded:
        log("Training complete!")
    else:
        log("Training did NOT complete successfully; see output above.")

    # The marker is written on failure as well, so a restarted container
    # does not retry the job on GPU hardware.
    try:
        with open("training_done.txt", "w", encoding="utf-8") as marker:
            marker.write("done" if succeeded else "failed")
    except OSError as exc:
        log(f"Could not write training_done.txt: {exc}")

    log("DOWNGRADING HARDWARE TO CPU TO STOP BILLING...")
    try:
        api.request_space_hardware(repo_id=repo_id, hardware="cpu-basic")
    except Exception as e:
        log(f"Failed to downgrade hardware: {e}")
    else:
        log("Hardware downgraded successfully.")

    _idle_forever()


def _idle_forever():
    """Sleep in an endless loop so the process stays alive without working."""
    while True:
        time.sleep(3600)


def _run_streaming(cmd, cwd=None):
    """Run *cmd* (an argument list), echo its output, return its exit code.

    stdout and stderr are merged and forwarded line by line. *cwd* sets the
    child's working directory without changing this process's own.
    """
    with subprocess.Popen(
        cmd,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    ) as process:
        for line in process.stdout:
            print(line, end="", flush=True)
    # Leaving the ``with`` block closes the pipe and waits for the child.
    return process.returncode


def _run_training_pipeline():
    """Clone the project, install dependencies and run GRPO training.

    Returns:
        True when the clone (if needed) and the training command both exit
        with status 0, otherwise False. Failed ``pip`` installs are logged
        but do not abort the run.
    """
    checkout_dir = "LifeOS"
    # Use the interpreter running this script so packages are installed
    # into, and training runs under, the same environment.
    python = sys.executable or "python"

    log("Cloning repository...")
    if os.path.isdir(checkout_dir):
        log("Repository directory already exists; skipping clone.")
    else:
        # NOTE(review): the Space name ends in "Agen" -- confirm this is the
        # real repository name and not a truncated "Agent".
        clone_cmd = [
            "git", "clone",
            "https://huggingface.co/spaces/SParsh003/LifeOS-Personal-Chaos-Agen",
            checkout_dir,
        ]
        exit_code = _run_streaming(clone_cmd)
        if exit_code != 0:
            log(f"git clone failed with exit code {exit_code}.")
            return False

    log("Installing Unsloth, TRL, and dependencies (this takes a few minutes)...")
    install_groups = (
        ["unsloth"],
        ["--no-deps", "trl", "peft", "accelerate", "bitsandbytes"],
        ["datasets", "transformers", "sentencepiece", "pydantic", "fastapi",
         "requests", "matplotlib"],
    )
    for packages in install_groups:
        exit_code = _run_streaming([python, "-m", "pip", "install"] + packages)
        if exit_code != 0:
            # Best effort: training fails on its own if a required package
            # is genuinely missing.
            log(f"pip install {' '.join(packages)} exited with code {exit_code}.")

    log("Starting GRPO Training (50 episodes)...")
    train_cmd = [
        python, "-m", "lifeos.training.train_grpo",
        "--real-gpu",
        "--episodes", "50",
        "--push-to-hub",
        "--hub-model-id", "SParsh003/LifeOS-Trained-Agent",
    ]
    # Run inside the checkout via ``cwd`` rather than os.chdir, so the
    # process-wide working directory stays stable.
    exit_code = _run_streaming(train_cmd, cwd=checkout_dir)
    if exit_code != 0:
        log(f"Training process exited with code {exit_code}.")
        return False
    return True
# Script entry point: start the runner only when this file is executed
# directly, not when it is imported.
if "__main__" == __name__:
    main()