atlasops / Makefile
Harikishanth R
fix: skip-kubectl + scroll + health — HF Space ready
7e9a520
# User-tunable settings. `?=` assigns only when the variable is not already
# defined, so each value can be overridden from the environment or on the
# command line, e.g. `make status PROJECT=my-project`.
# All three are passed to the infra scripts and combined into the kubectl
# context name used by `status`.
PROJECT ?= cloudsre-v3-amd
REGION ?= us-central1
CLUSTER ?= atlasops
# ── Cluster lifecycle ──────────────────────────────────────────────────────────
# `up` and `down` hand PROJECT, REGION and CLUSTER (in that order) to the
# infra scripts as positional arguments. `status` lists pods in every
# namespace (-A) through the kubectl context named
# gke_<PROJECT>_<REGION>_<CLUSTER>.
# NOTE(review): `up` looks like the first target in the file, which would make
# it the default goal for a bare `make` — confirm that is intended.
.PHONY: up
up:
	bash infra/setup.sh $(PROJECT) $(REGION) $(CLUSTER)

.PHONY: down
down:
	bash infra/teardown.sh $(PROJECT) $(REGION) $(CLUSTER)

.PHONY: status
status:
	kubectl get pods -A \
		--context=gke_$(PROJECT)_$(REGION)_$(CLUSTER)
# ── Chaos injection ────────────────────────────────────────────────────────────
# Apply the chaos manifest selected by SCENARIO.
#   Usage: make chaos SCENARIO=sf-001
# Searches bench/chaos_manifests (recursively) for "<SCENARIO>.yaml" and
# applies the first match with kubectl. Exits with status 1 when SCENARIO is
# empty or when no matching file is found; both messages go to stderr.
# The manifest path is quoted so a path containing spaces is passed to kubectl
# as a single argument.
# NOTE(review): unlike `status`, no --context is given here, so this acts on
# whichever kubectl context is currently active — confirm that is intended.
.PHONY: chaos chaos-reset
chaos:
	@if [ -z "$(SCENARIO)" ]; then echo "Usage: make chaos SCENARIO=sf-001" >&2; exit 1; fi
	@MANIFEST=$$(find bench/chaos_manifests -name "$(SCENARIO).yaml" | head -n 1); \
	if [ -z "$$MANIFEST" ]; then echo "Scenario $(SCENARIO) not found" >&2; exit 1; fi; \
	echo "Applying chaos: $$MANIFEST"; \
	kubectl apply -f "$$MANIFEST"
# Best-effort cleanup: delete every podchaos, networkchaos, stresschaos,
# dnschaos, iochaos and timechaos resource in all namespaces (--all -A).
# kubectl's stderr is discarded and `|| true` makes the target succeed even
# when the delete fails.
# NOTE(review): these kinds are presumably Chaos Mesh CRDs — verify.
chaos-reset:
	kubectl delete \
		podchaos,networkchaos,stresschaos,dnschaos,iochaos,timechaos \
		--all -A 2>/dev/null || true
# ── Historical replays ─────────────────────────────────────────────────────────
# Trigger a named historical replay: `make replay-<name>` applies
# bench/chaos_manifests/named_replays/<name>.yaml ($* is the pattern stem).
# A pattern target cannot be listed in .PHONY, so the always-out-of-date FORCE
# prerequisite keeps the recipe running even if a file called replay-<name>
# happens to exist in the working directory.
.PHONY: FORCE
FORCE:

replay-%: FORCE
	kubectl apply -f bench/chaos_manifests/named_replays/$*.yaml
	@echo "Replay $* triggered. Watch: make status"
# ── Agent runtime ──────────────────────────────────────────────────────────────
# Run agents/coordinator.py with whichever `python` is found on PATH.
.PHONY: coordinator
coordinator:
	python agents/coordinator.py
# ── Benchmark ─────────────────────────────────────────────────────────────────
# Run the benchmark runner against MODEL.
#   Usage: make bench MODEL=<model-or-checkpoint>
# MODEL has no default. An empty value would put `--model` directly in front
# of `--output`, so the guard fails early with a usage message (exit status 1,
# message on stderr), mirroring the SCENARIO check in `chaos`.
# $(shell date ...) is expanded by make when the recipe runs, so every
# invocation gets its own bench/results/<YYYYmmdd_HHMMSS> output path.
.PHONY: bench bench-baseline
bench:
	@if [ -z "$(MODEL)" ]; then echo "Usage: make bench MODEL=<model-or-checkpoint>" >&2; exit 1; fi
	python bench/runner.py --model $(MODEL) --output bench/results/$(shell date +%Y%m%d_%H%M%S)
# Benchmark the checkpoints/cloudsre_v2_baseline model, tagged baseline_v2,
# with the fixed output path bench/results/baseline_v2.
bench-baseline:
	python bench/runner.py \
		--model checkpoints/cloudsre_v2_baseline \
		--tag baseline_v2 \
		--output bench/results/baseline_v2
# ── Training ───────────────────────────────────────────────────────────────────
# Training pipeline. No prerequisites are declared between these targets, so
# each one runs only its own script.
# NOTE(review): the paths suggest the order trajectories -> sft -> grpo. sft
# is given the data/sft_corpus.jsonl path that trajectories passes as
# --output, and grpo is given the checkpoints/sft_v3 path that sft passes as
# --output. Confirm against the scripts.
.PHONY: trajectories
trajectories:
	python training/generate_trajectories.py \
		--output data/sft_corpus.jsonl

.PHONY: sft
sft:
	python training/sft.py \
		--model Qwen/Qwen2.5-7B-Instruct \
		--data data/sft_corpus.jsonl \
		--output checkpoints/sft_v3

.PHONY: grpo
grpo:
	python training/grpo.py \
		--model checkpoints/sft_v3 \
		--output checkpoints/grpo_v3 \
		--tiers cascade,multi_fault,named_replays
# ── Dashboard ─────────────────────────────────────────────────────────────────
# Run dashboard.py from the repository root with whichever `python` is found
# on PATH.
.PHONY: dashboard
dashboard:
	python dashboard.py
# ── Linting / tests ───────────────────────────────────────────────────────────
# Quality gates.
#   lint            - ruff check over the current directory.
#   test            - verbose pytest run of everything under tests/.
#   release-gate    - scripts/release_gate.py in --strict mode, with
#                     docs/RELEASE_READINESS.md passed as --output.
#   smoke-e2e-local - quiet pytest run limited to four test modules.
.PHONY: lint
lint:
	ruff check .

.PHONY: test
test:
	pytest tests/ -v

.PHONY: release-gate
release-gate:
	python scripts/release_gate.py --strict \
		--output docs/RELEASE_READINESS.md

.PHONY: smoke-e2e-local
smoke-e2e-local:
	pytest \
		tests/test_app_endpoints.py \
		tests/test_coordinator.py \
		tests/test_tools.py \
		tests/test_bench_runner.py \
		-q