feather-a10g-gt80k-runtime-public / scripts /persistent_pretrain_launcher.py
icarus112's picture
Upload folder using huggingface_hub
422445b verified
#!/usr/bin/env python3
import os
import subprocess
import time
import sys
def main(*, max_retries: int = 3, retry_delay: float = 60) -> int:
    """Submit the Feather pretrain job, retrying when the launcher fails.

    Copies the current process environment, overrides the FEATHER_*/HYDRA_*
    settings listed below, and runs ``launch_feather_hf_job.py`` with the
    current interpreter until it exits with status 0 or the attempts are
    used up.

    Args:
        max_retries: Maximum number of launch attempts. A value <= 0 makes
            no attempt and reports failure immediately.
        retry_delay: Seconds to wait between two consecutive attempts. No
            wait happens after the final attempt.

    Returns:
        0 if an attempt succeeded, 1 if every attempt failed.
    """
    print("[persistent-launcher] Starting mission critical pretrain loop...")

    # Environment setup: start from the caller's environment and override
    # only the launcher settings below.
    env = os.environ.copy()
    # Use a fresh space to guarantee code refresh.
    # NOTE(review): the tag is computed once, so every retry targets the same
    # space name -- confirm that reuse across retries is intended.
    unique_tag = int(time.time())
    env.update(
        {
            "FEATHER_HF_JOB_NAMESPACE": "GAInTech",
            "FEATHER_HF_OWNER": "GAInTech",
            "FEATHER_HF_SPACE_REPO": f"GAInTech/feather-a10g-pretrain-v2-{unique_tag}",
            "HYDRA_BATCH_SIZE": "96",
            "HYDRA_TOTAL_BATCH": "196608",
            "HYDRA_GRAD_CKPT": "1",
            "HYDRA_SAMPLED_SOFTMAX": "256",
            "FEATHER_HF_FLAVOR": "a10g-large",
            "HYDRA_TARGET_SHARDS": "0",
            "HYDRA_USE_NEMOTRON": "1",
            "HYDRA_LOCAL_SHARDS_ONLY": "0",
            "FEATHER_HF_USE_SPACE_IMAGE": "1",
            "FEATHER_HF_SKIP_UPLOAD": "0",
        }
    )

    # Launch command.
    # NOTE(review): absolute, machine-specific path; the launcher only works
    # where this file exists -- consider resolving it relative to this script.
    cmd = [sys.executable, "/home/mikeb/work/feather/scripts/launch_feather_hf_job.py"]

    for attempt in range(1, max_retries + 1):
        print(f"[persistent-launcher] Attempt {attempt}/{max_retries}...")
        try:
            # check_call blocks until the launcher exits and raises on a
            # non-zero status, so reaching the else-branch means the launcher
            # ran to completion (the original note says this includes its
            # wait_for_space step).
            subprocess.check_call(cmd, env=env)
        except subprocess.CalledProcessError as e:
            if attempt < max_retries:
                print(
                    f"[persistent-launcher] Attempt failed with {e}. "
                    f"Retrying in {retry_delay:g}s..."
                )
                time.sleep(retry_delay)
            else:
                # Final attempt: nothing left to retry, so do not announce a
                # retry or delay the failure report.
                print(f"[persistent-launcher] Attempt failed with {e}.")
        else:
            print("[persistent-launcher] Job submitted successfully.")
            return 0

    print("[persistent-launcher] All attempts failed.")
    return 1
if __name__ == "__main__":
    # Run the launcher only when executed as a script and hand main()'s
    # return value to the shell as the process exit status.
    exit_status = main()
    sys.exit(exit_status)