Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| import os | |
| import subprocess | |
| import json | |
| import time | |
# Namespace passed to every `hf jobs` invocation in this script.
NAMESPACE = "GAInTech"

# Job to monitor, read from the environment. None when the variable is unset,
# in which case main() looks for an active job in NAMESPACE instead.
JOB_ID = os.getenv("FEATHER_ACTIVE_JOB_ID")
def get_job_status(job_id):
    """Return the inspect record for *job_id*, or None when unavailable.

    Runs ``hf jobs inspect`` for the job in NAMESPACE with JSON output and
    returns the first element of the decoded list.

    Args:
        job_id: Identifier of the job to inspect.

    Returns:
        The first entry of the decoded JSON list, or None when *job_id* is
        empty, the command cannot be run or fails, the output is not valid
        JSON, or the payload is not a non-empty list.
    """
    # Nothing to look up; avoid spawning a process for an empty id.
    if not job_id:
        return None

    cmd = [
        "hf", "jobs", "inspect",
        "--namespace", NAMESPACE,
        job_id,
        "--format", "json",
    ]
    try:
        raw = subprocess.check_output(cmd, text=True)
        data = json.loads(raw)
    except (OSError, subprocess.SubprocessError, ValueError):
        # OSError: CLI missing or not executable. SubprocessError: non-zero
        # exit. ValueError: covers json.JSONDecodeError. Anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        return None

    # Only a non-empty list is a usable payload; other shapes mean "unknown".
    if isinstance(data, list) and data:
        return data[0]
    return None
def get_job_logs(job_id, lines=50):
    """Return the tail of the logs for *job_id* as text.

    Runs ``hf jobs logs`` for the job in NAMESPACE, passing ``--tail`` with
    the requested line count.

    Args:
        job_id: Identifier of the job whose logs are fetched.
        lines: Number of trailing log lines to request (default 50).

    Returns:
        The command's standard output, or an empty string when *job_id* is
        empty or the command cannot be run or exits with an error.
    """
    # Nothing to fetch; avoid spawning a process for an empty id.
    if not job_id:
        return ""

    cmd = [
        "hf", "jobs", "logs",
        "--namespace", NAMESPACE,
        job_id,
        "--tail", str(lines),
    ]
    try:
        return subprocess.check_output(cmd, text=True)
    except (OSError, subprocess.SubprocessError):
        # Best effort: a missing CLI or a failing command yields no logs.
        # Interrupts and programming errors are allowed to propagate.
        return ""
# Stages in which a job is still worth monitoring.
_ACTIVE_STAGES = frozenset({"RUNNING", "PENDING", "SCHEDULING", "INITIALIZING"})
# Stages after which the job will make no further progress.
_TERMINAL_STAGES = frozenset({"ERROR", "FAILED", "CANCELLED", "COMPLETED"})
# Throughput below this value (and above zero) raises the CRITICAL alert.
_TPS_CRITICAL_FLOOR = 100_000
# Throughput target quoted in the alert text.
# NOTE(review): the original alert fired below 100k while its message said
# "below 150k target". The numeric threshold is kept and the message now
# names both numbers; confirm which value is the intended trigger.
_TPS_TARGET = 150_000
# Bits-per-byte above this value raises the divergence WARNING.
_BPB_WARNING_CEILING = 3.5


def _job_stage(job, default=None):
    """Return ``job["status"]["stage"]``, or *default* if any level is missing."""
    status = job.get("status") if isinstance(job, dict) else None
    if not isinstance(status, dict):
        return default
    return status.get("stage", default)


def _find_active_job_id():
    """Return the id of the first active job listed in NAMESPACE, or None.

    Prints the reason whenever no id can be returned.
    """
    cmd = ["hf", "jobs", "ps", "-a", "--namespace", NAMESPACE, "--format", "json"]
    try:
        listing = json.loads(subprocess.check_output(cmd, text=True))
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        # Missing CLI, non-zero exit, or undecodable output: report, don't crash.
        print(f"Failed to list jobs: {exc}")
        return None

    # Ignore anything that is not a job record so later lookups cannot raise.
    jobs = [job for job in listing if isinstance(job, dict)] if isinstance(listing, list) else []
    if not jobs:
        print("No jobs found in namespace.")
        return None

    for job in jobs:
        if _job_stage(job) in _ACTIVE_STAGES and job.get("id"):
            return job["id"]

    latest = jobs[0]
    print(f"No active jobs found. Latest job: {latest.get('id')} ({_job_stage(latest, 'UNKNOWN')})")
    return None


def _parse_metric(line, name):
    """Return the last parseable ``name=<float>`` token on *line*, or None."""
    prefix = f"{name}="
    value = None
    for token in line.split():
        if not token.startswith(prefix):
            continue
        try:
            # Tolerate a trailing separator such as "tps=123," in the log line.
            value = float(token[len(prefix):].rstrip(",;"))
        except ValueError:
            continue
    return value


def _report_telemetry(line):
    """Print the telemetry line and any TPS/BPB alerts derived from it."""
    print(f"LATEST TELEMETRY: {line}")
    # Each metric is parsed on its own so a malformed one cannot hide the other.
    tps = _parse_metric(line, "tps")
    bpb = _parse_metric(line, "bpb")
    if tps is not None and 0 < tps < _TPS_CRITICAL_FLOOR:
        print(
            f"CRITICAL: TPS is {tps}, which is below the "
            f"{_TPS_CRITICAL_FLOOR // 1000}k alert floor "
            f"(target: {_TPS_TARGET // 1000}k). Checking bottlenecks..."
        )
    if bpb is not None and bpb > _BPB_WARNING_CEILING:
        print(f"WARNING: BPB is {bpb}, high divergence risk.")


def main():
    """Print the stage and latest telemetry of the monitored job.

    Uses JOB_ID when it is set; otherwise picks the first active job listed
    in NAMESPACE. Prints the job's stage, stops at terminal stages, and
    otherwise reports the last log line containing ``step=`` together with
    TPS/BPB alerts. All outcomes are reported via print; nothing is returned.
    """
    job_id = JOB_ID
    if not job_id:
        print("FEATHER_ACTIVE_JOB_ID not set. Checking for running/pending jobs...")
        job_id = _find_active_job_id()
        if not job_id:
            return

    status_data = get_job_status(job_id)
    if not status_data:
        print(f"Job {job_id} not found.")
        return

    stage = _job_stage(status_data, "UNKNOWN")
    print(f"Job: {job_id} | Stage: {stage}")
    if stage in _TERMINAL_STAGES:
        print(f"TERMINAL STATE: {stage}. Intervention required.")
        return

    logs = get_job_logs(job_id) or ""
    # The most recent telemetry is the last line that mentions a step counter.
    last_step_line = next(
        (entry for entry in reversed(logs.splitlines()) if "step=" in entry),
        "",
    )
    if last_step_line:
        _report_telemetry(last_step_line)
    else:
        print("No telemetry found in logs yet.")
# Run the monitor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()