Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| import os | |
| import subprocess | |
| import time | |
| import sys | |
def _build_launch_env():
    """Return a copy of the current environment with the launcher settings applied."""
    env = os.environ.copy()
    # A timestamp suffix gives every run a new space repo name; the original
    # author uses a fresh space to guarantee a code refresh.
    unique_tag = int(time.time())
    env.update(
        {
            "FEATHER_HF_JOB_NAMESPACE": "GAInTech",
            "FEATHER_HF_OWNER": "GAInTech",
            "FEATHER_HF_SPACE_REPO": f"GAInTech/feather-a10g-pretrain-v2-{unique_tag}",
            "HYDRA_BATCH_SIZE": "96",
            "HYDRA_TOTAL_BATCH": "196608",
            "HYDRA_GRAD_CKPT": "1",
            "HYDRA_SAMPLED_SOFTMAX": "256",
            "FEATHER_HF_FLAVOR": "a10g-large",
            "HYDRA_TARGET_SHARDS": "0",
            "HYDRA_USE_NEMOTRON": "1",
            "HYDRA_LOCAL_SHARDS_ONLY": "0",
            "FEATHER_HF_USE_SPACE_IMAGE": "1",
            "FEATHER_HF_SKIP_UPLOAD": "0",
        }
    )
    return env


def main(*, max_retries=3, retry_delay=60, cmd=None):
    """Run the launcher command, retrying when it exits with a non-zero status.

    The child process inherits the current environment plus the FEATHER_* and
    HYDRA_* settings produced by ``_build_launch_env``.

    Args:
        max_retries: Maximum number of times the command is run.
        retry_delay: Seconds to sleep between a failed attempt and the next one.
        cmd: Argument list passed to ``subprocess.check_call``. Defaults to the
            feather launch script run with the current Python interpreter.

    Returns:
        0 as soon as one attempt exits successfully, 1 if every attempt fails
        (or if ``max_retries`` is not positive, in which case nothing is run).

    Raises:
        OSError: If the command cannot be started at all; only non-zero exit
            statuses are treated as retryable failures.
    """
    print("[persistent-launcher] Starting mission critical pretrain loop...")
    env = _build_launch_env()
    if cmd is None:
        cmd = [sys.executable, "/home/mikeb/work/feather/scripts/launch_feather_hf_job.py"]

    for attempt in range(1, max_retries + 1):
        print(f"[persistent-launcher] Attempt {attempt}/{max_retries}...")
        try:
            # check_call blocks until the launcher exits and raises on a
            # non-zero status (per the original note, the launcher's own
            # wait_for_space step is part of that run).
            subprocess.check_call(cmd, env=env)
        except subprocess.CalledProcessError as e:
            if attempt < max_retries:
                print(f"[persistent-launcher] Attempt failed with {e}. Retrying in {retry_delay}s...")
                time.sleep(retry_delay)
            else:
                # Last attempt: nothing left to retry, so do not announce a
                # retry or sleep before reporting the overall failure.
                print(f"[persistent-launcher] Attempt failed with {e}.")
        else:
            print("[persistent-launcher] Job submitted successfully.")
            return 0

    print("[persistent-launcher] All attempts failed.")
    return 1
# Script entry point: run the launcher and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())