feather-a10g-large-runtime / overlay /scripts /monitor_feather_cron.py
icarus112's picture
Upload folder using huggingface_hub
22741d9 verified
Raw
History Blame Contribute Delete
2.33 kB
#!/usr/bin/env python3
import os
import subprocess
import json
import time
# Hub namespace passed as ``--namespace`` to every ``hf jobs`` invocation below.
NAMESPACE = "GAInTech"

# Job to monitor; None when the variable is unset, in which case main() falls
# back to the first entry reported by ``hf jobs ps``.
JOB_ID = os.getenv("FEATHER_ACTIVE_JOB_ID")
def get_job_status(job_id):
    """Return the inspect record for *job_id*, or ``None`` if it cannot be read.

    Runs ``hf jobs inspect`` for the job inside ``NAMESPACE`` and decodes the
    JSON it prints. The payload is indexed with ``[0]``, i.e. it is treated as
    a sequence of records and the first one is returned.

    Args:
        job_id: Identifier of the job to inspect.

    Returns:
        The first element of the decoded payload, or ``None`` when the payload
        is empty, the CLI is missing or exits non-zero, or the output is not
        JSON that can be indexed with ``[0]``.
    """
    cmd = [
        "hf", "jobs", "inspect",
        "--namespace", NAMESPACE,
        job_id,
        "--format", "json",
    ]
    try:
        raw = subprocess.check_output(cmd, text=True)
        data = json.loads(raw)
        if not data:
            return None
        return data[0]
    except (OSError, subprocess.SubprocessError, ValueError, LookupError, TypeError):
        # Best-effort lookup: a missing/failed CLI (OSError, SubprocessError),
        # undecodable or non-JSON output (ValueError), or a payload that is not
        # an indexable sequence (LookupError, TypeError) all mean "no status".
        # KeyboardInterrupt/SystemExit and genuine programming errors are no
        # longer swallowed, unlike with the previous bare ``except``.
        return None
def get_job_logs(job_id, lines=50):
    """Return the tail of the job's logs as text, or ``""`` on failure.

    Runs ``hf jobs logs`` for the job inside ``NAMESPACE`` with
    ``--tail <lines>`` and returns whatever the command prints.

    Args:
        job_id: Identifier of the job whose logs are fetched.
        lines: Value passed to ``--tail`` (converted with ``str``).

    Returns:
        The command's standard output, or an empty string when the CLI is
        missing, exits non-zero, or its output cannot be decoded as text.
    """
    # NOTE(review): no timeout is set; if the CLI keeps streaming for a running
    # job this call blocks until it exits -- confirm against the CLI behavior.
    cmd = [
        "hf", "jobs", "logs",
        "--namespace", NAMESPACE,
        job_id,
        "--tail", str(lines),
    ]
    try:
        return subprocess.check_output(cmd, text=True)
    except (OSError, subprocess.SubprocessError, ValueError, TypeError):
        # Best-effort fetch: callers treat "" as "no logs available".
        # ValueError covers undecodable output (UnicodeDecodeError) and invalid
        # arguments; TypeError covers a non-string job id. Interrupts and
        # programming errors propagate instead of being silently swallowed.
        return ""
# Stages after which the job will not make further progress. "FAILED" and
# "CANCELLED" are kept from the original list; "CANCELED" (single L) and
# "DELETED" are the spellings used by huggingface_hub's job stage enum, so
# without them a canceled or deleted job would fall through to log scraping.
_TERMINAL_STAGES = frozenset(
    {"ERROR", "FAILED", "CANCELLED", "CANCELED", "COMPLETED", "DELETED"}
)

# Throughput below this value (and above zero) raises the CRITICAL alert.
# NOTE(review): the alert text names a 150k target while the check uses 100k;
# both are kept as found -- confirm which number is intended.
_CRITICAL_TPS = 100000

# BPB above this value raises the divergence WARNING.
_MAX_BPB = 3.5


def _read_metric(tokens, key):
    """Return the last parsable ``key=<number>`` value in *tokens*, else ``None``."""
    prefix = key + "="
    value = None
    for token in tokens:
        if not token.startswith(prefix):
            continue
        # Tolerate separators glued to the number, e.g. "tps=98000.0,".
        text = token[len(prefix):].strip(",;|()[]")
        try:
            value = float(text)
        except ValueError:
            print(f"WARNING: could not parse {key} from {token!r}")
    return value


def _discover_running_job():
    """Return the id of the first job listed by ``hf jobs ps``, or ``None``."""
    cmd = ["hf", "jobs", "ps", "--namespace", NAMESPACE, "--format", "json"]
    try:
        raw = subprocess.check_output(cmd, text=True)
        jobs = json.loads(raw)
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        print(f"Could not list running jobs: {exc}")
        return None
    if not jobs:
        print("No running jobs found.")
        return None
    try:
        return jobs[0]["id"]
    except (LookupError, TypeError):
        print("Could not read a job id from the running-jobs listing.")
        return None


def main():
    """Print a one-shot status report for the monitored job.

    Steps, each of which may end the report early:

    1. Pick the job: ``JOB_ID`` if set, otherwise the first job returned by
       ``hf jobs ps``.
    2. Fetch its status via ``get_job_status`` and print the stage.
    3. If the stage is terminal, print a notice and stop.
    4. Otherwise take the last log line containing ``step=`` and print it,
       then warn when ``tps=`` is low or ``bpb=`` is high.

    Returns:
        None. All results are written to standard output.
    """
    if JOB_ID:
        job_id = JOB_ID
    else:
        print("FEATHER_ACTIVE_JOB_ID not set. Checking for running jobs...")
        job_id = _discover_running_job()
        if job_id is None:
            return

    status_data = get_job_status(job_id)
    if not status_data:
        print(f"Job {job_id} not found.")
        return

    # "status" may be absent, null, or not a mapping; treat all as UNKNOWN
    # instead of raising AttributeError on the chained .get().
    status = status_data.get("status") if isinstance(status_data, dict) else None
    stage = status.get("stage", "UNKNOWN") if isinstance(status, dict) else "UNKNOWN"
    print(f"Job: {job_id} | Stage: {stage}")

    if stage in _TERMINAL_STAGES:
        print(f"TERMINAL STATE: {stage}. Intervention required.")
        return

    # The most recent telemetry is the last log line that mentions "step=".
    log_lines = get_job_logs(job_id).splitlines()
    last_step_line = next(
        (line for line in reversed(log_lines) if "step=" in line), ""
    )
    if not last_step_line:
        print("No telemetry found in logs yet.")
        return

    print(f"LATEST TELEMETRY: {last_step_line}")

    # Parse each metric independently so one malformed token cannot suppress
    # the check on the other.
    tokens = last_step_line.split()
    tps = _read_metric(tokens, "tps")
    bpb = _read_metric(tokens, "bpb")

    if tps is not None and 0 < tps < _CRITICAL_TPS:
        print(f"CRITICAL: TPS is {tps}, which is below 150k target. Checking bottlenecks...")
    if bpb is not None and bpb > _MAX_BPB:
        print(f"WARNING: BPB is {bpb}, high divergence risk.")
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()