bamboo-1 / src /watch_pod.py
rain1024's picture
Consolidate project: merge scripts/, bamboo1/ into src/, optimize training
24ec440
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "runpod>=1.6.0",
# "click>=8.0.0",
# ]
# ///
"""
Watch RunPod pod status.
Usage:
export $(cat .env | xargs) && uv run src/watch_pod.py
export $(cat .env | xargs) && uv run src/watch_pod.py --pod-id <id>
"""
import json
import os
import time

import click
import runpod
from runpod.api.graphql import run_graphql_query
def get_pod_status(pod_id):
    """Query the RunPod GraphQL API for one pod's status and runtime metrics.

    The query requests the pod's ``id``, ``name`` and ``desiredStatus``, plus
    ``runtime`` details: uptime in seconds, per-GPU utilisation and memory
    percentages, and container CPU and memory percentages.

    Args:
        pod_id: ID of the pod to look up.

    Returns:
        The value returned by ``run_graphql_query`` for this query.
    """
    # Emit the ID as a quoted, escaped string literal rather than splicing it
    # in raw: a value containing `"` or `\` would otherwise terminate the
    # literal early and corrupt (or rewrite) the query.
    pod_id_literal = json.dumps(str(pod_id))
    query = f"""
    query getPodStatus {{
        pod(input: {{ podId: {pod_id_literal} }}) {{
            id
            name
            desiredStatus
            runtime {{
                uptimeInSeconds
                gpus {{
                    gpuUtilPercent
                    memoryUtilPercent
                }}
                container {{
                    cpuPercent
                    memoryPercent
                }}
            }}
        }}
    }}
    """
    return run_graphql_query(query)
def _format_uptime(seconds):
    """Format a duration in seconds as ``"<H>h <M>m <S>s"``.

    A missing (``None``) value is treated as zero.
    """
    mins, secs = divmod(seconds or 0, 60)
    hours, mins = divmod(mins, 60)
    return f"{int(hours)}h {int(mins)}m {int(secs)}s"


def _print_pod_status(pod):
    """Clear the terminal and print one status snapshot for ``pod``.

    ``pod`` is the pod record from the status query; ``name``, ``id`` and
    ``desiredStatus`` are required, ``runtime`` and its children are optional.
    """
    click.clear()
    click.echo(f"=== {pod['name']} ({pod['id']}) ===")
    click.echo(f"Status: {pod['desiredStatus']}")

    # Metrics are read with ``.get(key) or 0`` instead of ``.get(key, 0)``:
    # the default only applies when the key is absent, so a key present with
    # a null value would otherwise reach divmod / ``:.1f`` and raise TypeError.
    runtime = pod.get("runtime") or {}
    click.echo(f"Uptime: {_format_uptime(runtime.get('uptimeInSeconds'))}")

    gpus = runtime.get("gpus") or []
    if gpus:
        # Only the first GPU is reported.
        gpu = gpus[0]
        click.echo(f"GPU Util: {(gpu.get('gpuUtilPercent') or 0):.1f}%")
        click.echo(f"GPU Mem: {(gpu.get('memoryUtilPercent') or 0):.1f}%")

    container = runtime.get("container") or {}
    click.echo(f"CPU: {(container.get('cpuPercent') or 0):.1f}%")
    click.echo(f"Memory: {(container.get('memoryPercent') or 0):.1f}%")

    click.echo(f"\nLast update: {time.strftime('%H:%M:%S')}")
    click.echo("Press Ctrl+C to stop")


# CLI entry point. Reads RUNPOD_API_KEY from the environment, picks the first
# pod returned by runpod.get_pods() when --pod-id is not given, then polls and
# redraws the pod's status every --interval seconds until the pod disappears,
# leaves the RUNNING/STARTING states, or the user presses Ctrl+C.
# The docstring below is kept to a single line because click shows it as the
# command's help text.
@click.command()
@click.option("--pod-id", default=None, help="Pod ID to watch")
@click.option("--interval", default=10, type=int, help="Refresh interval in seconds")
def main(pod_id, interval):
    """Watch RunPod pod status in real-time."""
    api_key = os.environ.get("RUNPOD_API_KEY")
    if not api_key:
        raise click.ClickException("Set RUNPOD_API_KEY")
    runpod.api_key = api_key

    # Get pod ID if not provided
    if not pod_id:
        pods = runpod.get_pods()
        if not pods:
            click.echo("No active pods found.")
            return
        pod_id = pods[0]["id"]
        click.echo(f"Watching pod: {pods[0].get('name', pod_id)}")

    click.echo(f"Refreshing every {interval}s. Press Ctrl+C to stop.\n")

    try:
        while True:
            result = get_pod_status(pod_id)
            # ``"data"`` may be present but null; ``or {}`` keeps the chained
            # lookup from raising AttributeError in that case.
            pod = (result.get("data") or {}).get("pod")
            if not pod:
                click.echo("Pod not found or terminated.")
                break

            _print_pod_status(pod)

            if pod["desiredStatus"] not in ("RUNNING", "STARTING"):
                click.echo(f"\nPod is {pod['desiredStatus']}. Stopping watch.")
                break

            time.sleep(interval)
    except KeyboardInterrupt:
        click.echo("\nStopped watching.")
# Run the click command only when executed as a script, not when imported.
if __name__ == "__main__":
    main()