| |
| import os |
| import subprocess |
| import json |
| import time |
|
|
# Namespace passed to every `hf jobs` command through --namespace.
NAMESPACE = "GAInTech"
# Job to inspect, read from the environment once at import time.
# None when FEATHER_ACTIVE_JOB_ID is unset; main() then falls back to
# the first job listed by `hf jobs ps`.
JOB_ID = os.environ.get("FEATHER_ACTIVE_JOB_ID")
|
|
def get_job_status(job_id):
    """Return the first record printed by ``hf jobs inspect`` for *job_id*.

    Runs ``hf jobs inspect --namespace NAMESPACE <job_id> --format json``
    and parses its standard output as JSON.

    Args:
        job_id: Identifier of the job to inspect.

    Returns:
        The first element of the decoded JSON payload, or ``None`` when
        *job_id* is empty, the command cannot be run or exits non-zero,
        the output is not valid JSON, or the payload is empty or cannot be
        indexed.
    """
    if not job_id:
        return None

    command = [
        "hf", "jobs", "inspect",
        "--namespace", NAMESPACE,
        job_id,
        "--format", "json",
    ]
    try:
        raw = subprocess.check_output(command, text=True)
        data = json.loads(raw)
    except (subprocess.CalledProcessError, OSError, ValueError):
        # CalledProcessError: non-zero exit; OSError: `hf` missing or not
        # executable; ValueError: undecodable output or invalid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        return None

    if not data:
        return None
    try:
        return data[0]
    except (IndexError, KeyError, TypeError):
        # The payload was not a non-empty sequence; treat it as "not found"
        # rather than crashing the caller.
        return None
|
|
def get_job_logs(job_id, lines=50):
    """Return the tail of the logs of *job_id* as text.

    Runs ``hf jobs logs --namespace NAMESPACE <job_id> --tail <lines>``.

    Args:
        job_id: Identifier of the job whose logs are fetched.
        lines: Value passed to ``--tail``; defaults to 50.

    Returns:
        The command's standard output, or an empty string when *job_id* is
        empty, the command cannot be run, it exits non-zero, or its output
        cannot be decoded as text.
    """
    if not job_id:
        return ""

    command = [
        "hf", "jobs", "logs",
        "--namespace", NAMESPACE,
        job_id,
        "--tail", str(lines),
    ]
    try:
        return subprocess.check_output(command, text=True)
    except (subprocess.CalledProcessError, OSError, ValueError):
        # Best effort: callers treat "" as "no logs available yet".
        # ValueError covers UnicodeDecodeError raised by text=True decoding.
        return ""
|
|
# Stages after which main() stops and reports instead of reading telemetry.
# The original four values are kept verbatim. "CANCELED" (one L) and
# "DELETED" are added because the Hub's JobStage enum is believed to use
# those spellings -- NOTE(review): confirm against real `hf jobs inspect`
# output.
_TERMINAL_STAGES = (
    "ERROR",
    "FAILED",
    "CANCELLED",
    "CANCELED",
    "COMPLETED",
    "DELETED",
)

# A positive TPS below this value is reported as critical.
# NOTE(review): the message text mentions a "150k target" while the check
# uses 100000; both are kept exactly as they were -- confirm which is
# intended.
_TPS_CRITICAL_THRESHOLD = 100000
# BPB above this value triggers the divergence warning.
_BPB_WARNING_THRESHOLD = 3.5


def _first_listed_job_id():
    """Return the id of the first job printed by ``hf jobs ps``, or None.

    Prints the reason when no id can be determined.
    """
    command = ["hf", "jobs", "ps", "--namespace", NAMESPACE, "--format", "json"]
    try:
        raw = subprocess.check_output(command, text=True)
        jobs = json.loads(raw)
    except (subprocess.CalledProcessError, OSError, ValueError) as exc:
        # Previously an unhandled traceback; report it like the other
        # "nothing to inspect" outcomes instead.
        print(f"Could not list running jobs: {exc}")
        return None

    if not jobs:
        print("No running jobs found.")
        return None
    try:
        return jobs[0]["id"]
    except (IndexError, KeyError, TypeError):
        print("Unexpected output from 'hf jobs ps'; no job id found.")
        return None


def _job_stage(status_data):
    """Return ``status_data["status"]["stage"]``, or "UNKNOWN" if absent."""
    status = status_data.get("status") if isinstance(status_data, dict) else None
    if not isinstance(status, dict):
        # Covers a missing key as well as `"status": null` or a non-dict value.
        return "UNKNOWN"
    return status.get("stage", "UNKNOWN")


def _latest_step_line(logs):
    """Return the last line of *logs* containing "step=", or ""."""
    latest = ""
    for line in logs.splitlines():
        if "step=" in line:
            latest = line
    return latest


def _parse_metric(tokens, name):
    """Return the last parseable ``name=<number>`` value in *tokens*, else 0.

    A token that starts with ``name=`` but does not hold a number is skipped,
    so one malformed metric no longer hides the others.
    """
    prefix = f"{name}="
    value = 0
    for token in tokens:
        if not token.startswith(prefix):
            continue
        try:
            # Tolerate a trailing separator such as "tps=1200," in the log line.
            value = float(token[len(prefix):].rstrip(",;"))
        except ValueError:
            continue
    return value


def main():
    """Print a one-shot health report for the active job.

    The job is taken from ``JOB_ID``; when that is unset, the first job
    listed by ``hf jobs ps`` is used. The report contains the job stage and,
    for a job that is not in a terminal stage, the most recent log line
    containing ``step=`` plus warnings derived from its ``tps=`` and ``bpb=``
    fields.

    Returns:
        None. All results are printed to standard output.
    """
    job_id = JOB_ID
    if not job_id:
        print("FEATHER_ACTIVE_JOB_ID not set. Checking for running jobs...")
        job_id = _first_listed_job_id()
        if job_id is None:
            return

    status_data = get_job_status(job_id)
    if not status_data:
        print(f"Job {job_id} not found.")
        return

    stage = _job_stage(status_data)
    print(f"Job: {job_id} | Stage: {stage}")

    if stage in _TERMINAL_STAGES:
        print(f"TERMINAL STATE: {stage}. Intervention required.")
        return

    last_step_line = _latest_step_line(get_job_logs(job_id))
    if not last_step_line:
        print("No telemetry found in logs yet.")
        return

    print(f"LATEST TELEMETRY: {last_step_line}")

    tokens = last_step_line.split()
    tps = _parse_metric(tokens, "tps")
    bpb = _parse_metric(tokens, "bpb")

    # tps == 0 means "not reported", so it is deliberately not flagged.
    if 0 < tps < _TPS_CRITICAL_THRESHOLD:
        print(f"CRITICAL: TPS is {tps}, which is below 150k target. Checking bottlenecks...")
    if bpb > _BPB_WARNING_THRESHOLD:
        print(f"WARNING: BPB is {bpb}, high divergence risk.")
|
|
# Run the report only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|