#!/usr/bin/env python3
"""Persistent launcher for the Feather HF pretrain job.

Builds the environment for the launch script, then invokes it as a child
process, retrying a bounded number of times when the child exits non-zero.
"""
import os
import subprocess
import sys
import time
from typing import Dict, Optional, Sequence

# Script that performs the actual job submission; run with this interpreter.
LAUNCH_SCRIPT = "/home/mikeb/work/feather/scripts/launch_feather_hf_job.py"

LOG_PREFIX = "[persistent-launcher]"


def _log(message: str) -> None:
    """Print a prefixed status line, flushed immediately.

    Flushing keeps our lines ordered relative to the child process, which
    writes to the same stdout while we are blocked waiting for it.
    """
    print(f"{LOG_PREFIX} {message}", flush=True)


def _build_env() -> Dict[str, str]:
    """Return a copy of the current environment plus the launcher settings."""
    env = os.environ.copy()

    # Use a fresh space to guarantee code refresh: the repo name carries the
    # current Unix timestamp, so each invocation gets a distinct name.
    unique_tag = int(time.time())

    env.update(
        {
            "FEATHER_HF_JOB_NAMESPACE": "GAInTech",
            "FEATHER_HF_OWNER": "GAInTech",
            "FEATHER_HF_SPACE_REPO": (
                f"GAInTech/feather-a10g-pretrain-v2-{unique_tag}"
            ),
            "HYDRA_BATCH_SIZE": "96",
            "HYDRA_TOTAL_BATCH": "196608",
            "HYDRA_GRAD_CKPT": "1",
            "HYDRA_SAMPLED_SOFTMAX": "256",
            "FEATHER_HF_FLAVOR": "a10g-large",
            "HYDRA_TARGET_SHARDS": "0",
            "HYDRA_USE_NEMOTRON": "1",
            "HYDRA_LOCAL_SHARDS_ONLY": "0",
            "FEATHER_HF_USE_SPACE_IMAGE": "1",
            "FEATHER_HF_SKIP_UPLOAD": "0",
        }
    )
    return env


def main(
    cmd: Optional[Sequence[str]] = None,
    max_retries: int = 3,
    retry_delay: float = 60,
) -> int:
    """Run the launch command, retrying on a non-zero exit status.

    Args:
        cmd: Command (argv list) to execute. Defaults to running
            ``LAUNCH_SCRIPT`` with the current Python interpreter.
        max_retries: Total number of attempts to make.
        retry_delay: Seconds to wait between a failed attempt and the next.

    Returns:
        0 if an attempt exited with status 0, otherwise 1 once every
        attempt has failed.
    """
    _log("Starting mission critical pretrain loop...")

    env = _build_env()
    if cmd is None:
        cmd = [sys.executable, LAUNCH_SCRIPT]

    for attempt in range(1, max_retries + 1):
        _log(f"Attempt {attempt}/{max_retries}...")
        try:
            # check_call blocks until the child exits and raises on a
            # non-zero status, so success here means the launch script ran
            # to completion.
            subprocess.check_call(cmd, env=env)
        except subprocess.CalledProcessError as e:
            if attempt < max_retries:
                _log(f"Attempt failed with {e}. Retrying in {retry_delay}s...")
                time.sleep(retry_delay)
            else:
                # Final attempt: nothing left to retry, so do not announce
                # a retry or sleep before reporting failure.
                _log(f"Attempt failed with {e}.")
        else:
            _log("Job submitted successfully.")
            return 0

    _log("All attempts failed.")
    return 1


if __name__ == "__main__":
    sys.exit(main())